In [22]:
import nltk
import numpy as np
import pandas as pd
import string

from pymongo import MongoClient
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer



Lecture: https://github.com/gSchool/DSI_Lectures/blob/master/nlp/moses_marsh/01_text_featurization.ipynb

Assignment: https://github.com/GalvanizeDataScience/nlp

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

In [2]:
client = MongoClient()
db = client.nyt_dump
coll = db.articles

In [3]:
SnowballStemmer("english")

<nltk.stem.snowball.SnowballStemmer at 0x7f87f81cf128>

In [4]:
stop_words = set(nltk.corpus.stopwords.words('english'))

## Text Processing Pipeline

**Goal:** Build a basic text processing pipeline to compare the documents. Let's play with nltk here. 


In [5]:
def text_pipeline(doc, stops={}, lemmatize=False, stem = False):
    '''
    Args:
        doc (str): the text to be tokenized
        stops (set): an optional set of words (tokens) to exclude
        lemmatize (bool): if True, lemmatize the words
    
    Returns: 
        tokens (list of strings)
    '''
    # sets all characters to lowercase and splits them into words
    doc = doc.lower().split()
    # antiquated but to remove punctuation
    punct = set(string.punctuation)
    # current to remove everything but ascii letters
    alphanum = set(string.ascii_lowercase)
    # grabs tokens character by character if they are lowercase letters
    tokens = [''.join([char for char in tok if char in alphanum]) 
              for tok in doc]
    tokens = [token for token in tokens if token]
    
    if stops:
        tokens = [tok for tok in tokens if (tok not in stops)]
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
    if stem:
        stemmer = SnowballStemmer("english")
        tokens = [stemmer.stem(tok) for tok in tokens]
    return tokens

## Tokenization and Stop words

In [6]:
docs_cleaned = []
for document in coll.find():
     for raw_content in document['content']:
        lines = raw_content.split('\n')
        lines = [line for line in lines if line]
        test = [text_pipeline(lines, stops = stop_words, lemmatize=False, stem=True)
        for lines in document['content']]
        print(test)
        docs_cleaned.extend(test)

[['hey', 'man', 'phone', 'said', 'still', 'come', 'tonight'], ['took', 'moment', 'realiz', 'call', 'distil', 'confirm', 'dinner', 'reserv', 'yes', 'repli', 'cool', 'said', 'sound', 'meant', 'distil', 'open', 'june', 'corner', 'franklin', 'street', 'west', 'broadway', 'tribeca', 'former', 'home', 'drew', 'niepor', 'layla', 'centrico', 'belli', 'dancer', 'frozenmargarita', 'machin', 'gone', 'certain', 'effervesc', 'remain', 'mr', 'niepor', 'hover', 'background', 'guru', 'distil', 'owner', 'firsttim', 'restaurateur', 'nick', 'iovacchini', 'shane', 'lyon', 'yearold', 'chef', 'space', 'bland', 'handsom', 'dark', 'wood', 'charcoal', 'banquett', 'breathless', 'high', 'ceil', 'quasimediev', 'wheel', 'chandeli', 'like', 'crown', 'fire', 'one', 'side', 'devot', 'bar', 'drink', 'benjamin', 'wood', 'ladykil', 'eleg', 'knife', 'twist', 'occasion', 'mope', 'rock', 'shimmer', 'speaker', 'servic', 'confound', 'friend', 'almost', 'coddl', 'stood', 'outsid', 'read', 'post', 'menu', 'someon', 'came', 'hu

[['question', 'futur', 'carmelo', 'anthoni', 'respond', 'four', 'word', 'practic', 'reassur'], ['im', 'go', 'nowher', 'told', 'bloomberg', 'televis', 'last', 'week', 'attempt', 'sooth', 'nerv', 'antsi', 'knick', 'fan', 'strain', 'syntax', 'asid', 'sensibl', 'thing', 'say', 'perhap', 'thing', 'say', 'might', 'even', 'true', 'although', 'predict', 'whim', 'nba', 'superstar', 'alway', 'dicey', 'proposit', 'ten', 'month', 'anthoni', 'probabl', 'opt', 'knick', 'contract', 'sign', 'fiveyear', 'deal', 'million', 'happili', 'resum', 'role', 'basketbal', 'princ', 'broadway', 'anthoni', 'love', 'new', 'york', 'new', 'york', 'love', 'anthoni', 'playoff', 'failur', 'notwithstand', 'everi', 'reason', 'stay', 'money', 'market', 'chanc', 'savior', 'end', 'knick', 'year', 'championship', 'drought', 'two', 'half', 'year', 'ago', 'anthoni', 'forc', 'denver', 'nugget', 'send', 'seem', 'unlik', 'would', 'leav', 'soon', 'lot', 'chang', 'month', 'specter', 'anthoni', 'free', 'agenc', 'shadow', 'knick', 'sea

[['actual', 'pay', 'simpl', 'omelet', 'coffe', 'room', 'servic', 'anaheim', 'hilton', 'hotel', 'southern', 'california', 'last', 'week'], ['yes', 'earli', 'laptop', 'misbehav', 'clock', 'tick', 'toward', 'deadlin', 'hadnt', 'dinner', 'night', 'still', 'that', 'first', 'time', 'ive', 'experienc', 'buyer', 'remors', 'omelet', 'though', 'arriv', 'quick', 'actual', 'pretti', 'good', 'busi', 'trip', 'peak', 'valley', 'thought', 'would', 'pass', 'along', 'exampl', 'recent', 'day', 'annual', 'airlin', 'passeng', 'experi', 'trade', 'show', 'anaheim', 'ad', 'stop', 'los', 'angel', 'airfar', 'cheap', 'round', 'trip', 'american', 'airlin', 'tucson', 'los', 'angel', 'peak', 'avi', 'rental', 'car', 'complimentari', 'upgrad', 'sporti', 'volvo', 'xc', 'cost', 'week', 'includ', 'annoy', 'tax', 'fee', 'valley', 'also', 'came', 'car', 'electron', 'dashboard', 'interfac', 'control', 'everyth', 'actual', 'make', 'car', 'go', 'baffl', 'drove', 'tri', 'figur', 'unfathom', 'symbol', 'notic', 'brian', 'iphon'

[['rooki', 'wil', 'myer', 'homer', 'drove', 'three', 'run', 'alex', 'cobb', 'pitch', 'eight', 'solid', 'inning', 'tampa', 'bay', 'ray', 'beat', 'texa', 'ranger', 'monday', 'night', 'take', 'lead', 'american', 'leagu', 'wildcard', 'race'], ['cobb', 'strikeout', 'allow', 'run', 'hit', 'one', 'walk', 'matt', 'garza', 'gave', 'six', 'run', 'eight', 'hit', 'four', 'onethird', 'inning', 'lost', 'third', 'straight', 'start', 'ranger', 'lost', 'seven', 'row', 'septemb', 'announc', 'crowd', 'ray', 'major', 'leagu', 'lowest', 'home', 'attend', 'averag', 'tiger', 'marin', 'rick', 'porcello', 'struck', 'six', 'inning', 'host', 'detroit', 'move', 'step', 'closer', 'third', 'straight', 'al', 'central', 'titl', 'victor', 'martinez', 'broke', 'tie', 'sixth', 'runscor', 'singl', 'omar', 'infant', 'follow', 'rbi', 'singl', 'joaquin', 'benoit', 'pitch', 'hitless', 'ninth', 'st', 'save', 'chanc', 'royal', 'indian', 'jame', 'shield', 'gave', 'one', 'run', 'six', 'inning', 'salvador', 'perez', 'three', 'hit

[['frankfurt', 'real', 'polit', 'drama', 'germani', 'year', 'may', 'get', 'way', 'chancellor', 'angela', 'merkel', 'face', 'voter'], ['state', 'elect', 'bavaria', 'underlin', 'sunday', 'ms', 'merkel', 'person', 'popular', 'conceal', 'high', 'degre', 'polit', 'fragment', 'could', 'make', 'tricki', 'assembl', 'govern', 'coalit', 'especi', 'centerright', 'christian', 'democrat', 'parti', 'perform', 'way', 'poll', 'suggest', 'win', 'sunday', 'nation', 'elect', 'fall', 'short', 'outright', 'major', 'current', 'coalit', 'partner', 'free', 'democrat', 'parti', 'poor', 'bavarian', 'elect', 'might', 'emerg', 'nation', 'vote', 'seat', 'could', 'forc', 'ms', 'merkel', 'seek', 'third', 'term', 'make', 'powershar', 'deal', 'main', 'opposit', 'parti', 'leftlean', 'social', 'democrat', 'meanwhil', 'popup', 'parti', 'notabl', 'antieuro', 'altern', 'germani', 'occupi', 'polit', 'space', 'vacat', 'ms', 'merkel', 'move', 'left', 'issu', 'like', 'nation', 'minimum', 'wage', 'nuclear', 'power', 'situat', '

[['cardiff', 'wale', 'donizetti', 'never', 'thought', 'three', 'opera', 'tudor', 'queen', 'trilog', 'scarc', 'anyon', 'els', 'either', 'soprano', 'bever', 'sill', 'sang', 'new', 'york', 'citi', 'opera', 'libretto', 'differ', 'author', 'premier', 'sevenyear', 'period', 'differ', 'italian', 'theater', 'differ', 'singer', 'royal', 'protagonist', 'giuditta', 'pasta', 'anna', 'bolena', 'maria', 'malibran', 'maria', 'stuarda', 'giuseppina', 'ronzi', 'de', 'begni', 'elizabeth', 'roberto', 'devereux'], ['stori', 'familiar', 'histori', 'literari', 'fabric', 'far', 'apart', 'chronolog', 'yet', 'culmin', 'execut', 'ann', 'boleyn', 'behest', 'husband', 'henri', 'viii', 'among', 'thing', 'alleg', 'infidel', 'mari', 'stuart', 'threat', 'pose', 'elizabeth', 'rule', 'robert', 'devereux', 'treason', 'heart', 'plot', 'one', 'love', 'triangl', 'whether', 'root', 'realiti', 'invent', 'authent', 'tudor', 'trilog', 'happili', 'us', 'fine', 'opera', 'choos', 'favorit', 'easi', 'mine', 'tend', 'one', 'encount

[['follow', 'summer', 'road', 'trip', 'baton', 'roug', 'la', 'fargo', 'nd', 'may', 'wonder', 'got', 'new', 'york', 'start', 'point', 'first', 'place', 'drove', 'hour', 'alon', 'far', 'cheaper', 'fli', 'louisiana', 'rent', 'car', 'pay', 'huge', 'dropoff', 'fee', 'north', 'dakota', 'fli', 'home'], ['hope', 'compani', 'split', 'drive', 'gas', 'toll', 'week', 'trip', 'post', 'rout', 'three', 'rideshar', 'site', 'zimridecom', 'eridesharecom', 'ridestercom', 'got', 'peep', 'econom', 'pressur', 'push', 'travel', 'share', 'resourc', 'whenev', 'possibl', 'various', 'onlin', 'servic', 'make', 'ever', 'easier', 'wonder', 'possibl', 'someth', 'wrong', 'site', 'inadequ', 'simpli', 'enough', 'peopl', 'look', 'ride', 'answer', 'turn', 'bit', 'rideshar', 'move', 'campus', 'bulletin', 'board', 'internet', 'less', 'success', 'expect', 'though', 'still', 'potenti', 'moneysav', 'travel', 'that', 'most', 'distanc', 'easili', 'cover', 'day', 'could', 'done', 'better', 'first', 'mistak', 'post', 'everywher',

[['washington', 'suspect', 'kill', 'peopl', 'washington', 'navi', 'yard', 'monday', 'testfir', 'ar', 'assault', 'rifl', 'virginia', 'gun', 'store', 'last', 'week', 'stop', 'buy', 'one', 'state', 'law', 'limit', 'sale', 'weapon', 'outofst', 'buyer', 'accord', 'two', 'senior', 'law', 'enforc', 'offici'], ['instead', 'suspect', 'aaron', 'alexi', 'texa', 'bought', 'lawenforcementstyl', 'shotgun', 'remington', 'pumpact', 'use', 'monday', 'rampag', 'navi', 'yard', 'said', 'offici', 'request', 'anonym', 'investig', 'continu', 'gun', 'broken', 'half', 'bag', 'one', 'offici', 'said', 'remington', 'went', 'insid', 'build', 'assembl', 'bathroom', 'gunman', 'perch', 'atrium', 'fire', 'peopl', 'eat', 'breakfast', 'offici', 'said', 'ad', 'use', 'shotgun', 'shell', 'rough', 'dozen', 'larg', 'ballbearinglik', 'shot', 'increas', 'lethal', 'natur', 'discharg', 'piec', 'lead', 'would', 'spread', 'farther', 'went', 'one', 'offici', 'said', 'similar', 'weapon', 'use', 'bird', 'shoot', 'serious', 'scale', '

[['miami', 'mani', 'state', 'prepar', 'introduc', 'linchpin', 'health', 'care', 'law', 'insur', 'exchang', 'design', 'make', 'health', 'care', 'afford', 'hand', 'other', 'take', 'opposit', 'tack', 'complic', 'enrol', 'effort', 'limit', 'inform', 'new', 'program'], ['chief', 'among', 'florida', 'gov', 'rick', 'scott', 'republicandomin', 'legislatur', 'made', 'difficult', 'floridian', 'obtain', 'cheapest', 'insur', 'rate', 'exchang', 'get', 'help', 'special', 'train', 'outreach', 'counselor', 'missouri', 'ohio', 'two', 'state', 'troubl', 'afford', 'care', 'act', 'also', 'move', 'undercut', 'law', 'insur', 'exchang', 'set', 'open', 'oct', 'georgia', 'state', 'insur', 'commission', 'ralph', 'hudgen', 'said', 'everyth', 'power', 'obstructionist', 'alarm', 'resist', 'secretari', 'health', 'human', 'servic', 'kathleen', 'sebelius', 'obama', 'administr', 'intensifi', 'effort', 'win', 'public', 'support', 'exchang', 'florida', 'elsewher', 'confront', 'critic', 'head', 'tuesday', 'ms', 'sebelius

[['barnyard', 'babybi', 'elis', 'broachillustr', 'cori', 'doerrfeld', 'pp', 'littl', 'brown', 'compani', 'board', 'book', 'age', 'month', 'year'], ['sweet', 'fresh', 'appl', 'cider', 'barnyard', 'babi', 'elis', 'broach', 'illustr', 'cori', 'doerrfeld', 'follow', 'irrepress', 'toddler', 'though', 'day', 'adventur', 'farm', 'broach', 'text', 'impressionist', 'funtoreadaloud', 'singsong', 'rhythm', 'hayrid', 'babi', 'feed', 'sheep', 'leafpil', 'babi', 'run', 'leap', 'write', 'book', 'spread', 'autumnhu', 'mani', 'cheer', 'detail', 'brown', 'puppi', 'wear', 'jaunti', 'yellow', 'scarf', 'la', 'rupert', 'bear', 'tini', 'mice', 'hold', 'hand', 'play', 'hay', 'bale', 'leav', 'fall', 'ground', 'form', 'outlin', 'heart', 'there', 'much', 'look', 'discuss', 'lifttheflap', 'give', 'littl', 'hand', 'someth', 'kiss', 'kiss', 'good', 'nightbi', 'kenn', 'nesbittillustr', 'rebecca', 'elliott', 'pp', 'cartwheel', 'booksscholast', 'pictur', 'book', 'age', 'month', 'year', 'kitten', 'lamb', 'bear', 'cub',

[['milan', 'said', 'stefano', 'pilati', 'first', 'collect', 'agnona', 'love', 'women', 'wear', 'univers', 'dont', 'know', 'man'], ['much', 'challeng', 'less', 'object', 'men', 'wear', 'design', 'said', 'fabric', 'differ', 'there', 'movement', 'criteria', 'design', 'kick', 'milan', 'collect', 'wednesday', 'open', 'select', 'fashion', 'women', 'door', 'discreet', 'brand', 'shop', 'instead', 'show', 'audienc', 'invit', 'visitor', 'encourag', 'buy', 'cloth', 'saw', 'mannequin', 'calf', 'hoov', 'instead', 'human', 'leg', 'reflect', 'mr', 'pilati', 'messag', 'agnona', 'start', 'wooli', 'lamb', 'might', 'ad', 'silk', 'also', 'play', 'strong', 'role', 'collect', 'zero', 'name', 'design', 'said', 'first', 'show', 'brand', 'european', 'art', 'movement', 'start', 'agnona', 'earli', 'year', 'year', 'young', 'agnona', 'found', 'part', 'ermenegildo', 'zegna', 'luxuri', 'group', 'begin', 'new', 'life', 'make', 'gildo', 'zegna', 'compani', 'chief', 'execut', 'call', 'complet', 'collect', 'fulli', 'tai

[['new', 'york', 'citi', 'ballet', 'product', 'swan', 'lake', 'shape', 'compani', 'artist', 'director', 'peter', 'martin', 'simpl', 'experi', 'especi', 'perform', 'tuesday', 'keep', 'tug', 'audienc', 'differ', 'direct', 'design', 'per', 'kirkebi', 'turn', 'pencil', 'cartoon', 'sketch', 'swan', 'lake', 'show', 'that', 'daub', 'paintbynumb', 'beginn', 'class', 'princ', 'blue', 'villag', 'green', 'jester', 'orang', 'tchaikovski', 'tempo', 'taken', 'terrif', 'lick', 'romant', 'linger', 'often', 'music', 'andrew', 'sill', 'conduct', 'compani', 'orchestra', 'tuesday', 'rush', 'past', 'without', 'deepen', 'action', 'yet', 'central', 'passag', 'swept', 'urgent', 'vortex'], ['secondari', 'charact', 'tiresom', 'queen', 'victoria', 'complain', 'gladston', 'prime', 'minist', 'speak', 'public', 'meet', 'howev', 'queen', 'gwyneth', 'muller', 'tuesday', 'vast', 'mime', 'gestur', 'suggest', 'think', 'public', 'meet', 'inde', 'jester', 'daniel', 'ulbricht', 'though', 'prodigi', 'speed', 'complet', 'unf

[['cairo', 'moham', 'morsi', 'egypt', 'oust', 'presid', 'spoken', 'telephon', 'famili', 'twice', 'recent', 'day', 'first', 'contact', 'sinc', 'detain', 'militari', 'juli', 'lawyer', 'told', 'associ', 'press', 'wednesday'], ['lawyer', 'mustafa', 'attiyah', 'told', 'news', 'agenc', 'first', 'call', 'took', 'place', 'last', 'week', 'second', 'one', 'two', 'day', 'later', 'mr', 'morsi', 'said', 'good', 'health', 'report', 'gave', 'date', 'either', 'call', 'mr', 'attiyah', 'respond', 'phone', 'call', 'mr', 'morsi', 'famili', 'spoken', 'public', 'month', 'mr', 'morsi', 'held', 'undisclos', 'locat', 'sinc', 'remov', 'offic', 'last', 'report', 'contact', 'came', 'earli', 'august', 'militari', 'allow', 'visit', 'diplomat', 'european', 'union', 'african', 'union', 'end', 'august', 'author', 'announc', 'first', 'legal', 'charg', 'accus', 'incit', 'murder', 'last', 'decemb', 'stoke', 'clash', 'outsid', 'presidenti', 'palac', 'thousand', 'support', 'oppon', 'unclear', 'egypt', 'new', 'author', 'all

[['newark', 'eager', 'fan', 'wait', 'brittney', 'griner', 'wait', 'marvel', 'footinch', 'frame', 'wait', 'give', 'hug', 'wait', 'give', 'gift', 'like', 'white', 'bow', 'tie', 'rhineston', 'wait', 'ask', 'question', 'first', 'dunk', 'advic', 'younger', 'player'], ['last', 'week', 'liberti', 'honor', 'kati', 'smith', 'twotim', 'wnba', 'champion', 'scorer', 'leagu', 'histori', 'final', 'home', 'game', 'year', 'career', 'clear', 'though', 'fan', 'went', 'prudenti', 'center', 'see', 'griner', 'rooki', 'center', 'phoenix', 'mercuri', 'receiv', 'loudest', 'ovat', 'player', 'introduct', 'fan', 'cheer', 'griner', 'began', 'game', 'beat', 'everyon', 'els', 'court', 'acrobat', 'alleyoop', 'layup', 'assist', 'star', 'teammat', 'diana', 'taurasi', 'ooh', 'griner', 'record', 'block', 'final', 'minut', 'secur', 'victori', 'griner', 'finish', 'point', 'understand', 'number', 'seat', 'vital', 'number', 'box', 'score', 'year', 'said', 'yes', 'request', 'pictur', 'autograph', 'game', 'accept', 'gift', 'a

[['new', 'studi', 'provid', 'best', 'evid', 'yet', 'wear', 'back', 'brace', 'slow', 'progress', 'common', 'form', 'scoliosi', 'adolesc', 'help', 'avoid', 'pain', 'spine', 'surgeri'], ['physician', 'recommend', 'brace', 'year', 'studi', 'effect', 'produc', 'mix', 'result', 'unit', 'state', 'prevent', 'servic', 'task', 'forc', 'recommend', 'scoliosi', 'screen', 'school', 'part', 'ground', 'insuffici', 'evid', 'brace', 'conserv', 'treatment', 'reliev', 'back', 'pain', 'improv', 'qualiti', 'life', 'children', 'new', 'random', 'studi', 'publish', 'thursday', 'new', 'england', 'journal', 'medicin', 'end', 'longstand', 'debat', 'sever', 'expert', 'said', 'may', 'spur', 'task', 'forc', 'reconsid', 'posit', 'trial', 'convinc', 'said', 'dr', 'b', 'stephen', 'richard', 'pediatr', 'orthoped', 'surgeon', 'texa', 'scottish', 'rite', 'hospit', 'dalla', 'scientif', 'prove', 'brace', 'treatment', 'work', 'adolesc', 'scoliosi', 'risk', 'curv', 'worsen', 'point', 'need', 'surgeri', 'adolesc', 'girl', 'li

[['interview', 'alan', 'dabbier', 'chairman', 'airwatch', 'mobil', 'devic', 'manag', 'compani', 'conduct', 'condens', 'adam', 'bryant'], ['q', 'youv', 'start', 'help', 'build', 'coupl', 'compani', 'famili', 'dna', 'didnt', 'grow', 'entrepreneuri', 'famili', 'head', 'medic', 'school', 'four', 'year', 'chemistri', 'colleg', 'father', 'got', 'sick', 'pass', 'away', 'senior', 'year', 'didnt', 'like', 'environ', 'hospit', 'matter', 'fact', 'probabl', 'plan', 'doctor', 'want', 'doctor', 'famili', 'end', 'get', 'mba', 'work', 'consult', 'firm', 'four', 'year', 'found', 'consult', 'often', 'told', 'like', 'nonstop', 'travel', 'want', 'littl', 'control', 'life', 'autonomi', 'met', 'develop', 'got', 'togeth', 'decid', 'start', 'compani', 'q', 'learn', 'experi', 'start', 'compani', 'call', 'meet', 'way', 'structur', 'compani', 'own', 'percent', 'capit', 'got', 'compani', 'run', 'joint', 'own', 'percent', 'said', 'dont', 'think', 'that', 'fair', 'anymor', 'go', 'divid', 'equal', 'your', 'go', 'per

[['display', 'antiqu', 'miniatur', 'portrait', 'serv', 'giant', 'diagram', 'connect', 'among', 'famili', 'fortun', 'artist', 'paint', 'work', 'sometim', 'there', 'whiff', 'scandal'], ['wall', 'caption', 'app', 'exhibit', 'galleri', 'explain', 'rosi', 'face', 'jewel', 'frame', 'reveal', 'person', 'slept', 'person', 'person', 'relat', 'person', 'person', 'enemi', 'person', 'cori', 'korkow', 'curat', 'cleveland', 'museum', 'art', 'said', 'phone', 'interview', 'fall', 'museum', 'bring', 'british', 'portrait', 'miniatur', 'larg', 'storag', 'sinc', 'show', 'among', 'sever', 'exhibit', 'imag', 'paint', 'bit', 'ivori', 'vellum', 'enamel', 'paper', 'plan', 'way', 'unit', 'state', 'britain', 'bumper', 'time', 'miniatur', 'jo', 'langston', 'christi', 'specialist', 'said', 'phone', 'interview', 'nov', 'christi', 'london', 'auction', 'miniatur', 'estim', 'eliot', 'widow', 'valeri', 'hung', 'london', 'apart', 'peppermint', 'green', 'wall', 'depict', 'georg', 'iv', 'daughter', 'charlott', 'brother', 

[['want', 'interest', 'perspect', 'polic', 'procedur', 'watch', 'real', 'polic', 'offic', 'want', 'insight', 'accuraci', 'medic', 'drama', 'watch', 'doctor', 'bad', 'lateth', 'earlythcenturi', 'british', 'king', 'around', 'watch', 'hollow', 'crown', 'fourpart', 'serv', 'shakespear', 'begin', 'friday', 'pbss', 'great', 'perform', 'think', 'question', 'could', 'ask'], ['guy', 'realli', 'make', 'habit', 'spout', 'eloqu', 'eulog', 'speech', 'improb', 'time', 'realli', 'practic', 'women', 'england', 'except', 'mrs', 'weasley', 'harri', 'potter', 'movi', 'king', 'ever', 'actual', 'govern', 'fight', 'fret', 'wasnt', 'tri', 'unseat', 'cours', 'pester', 'might', 'earn', 'behead', 'seem', 'method', 'prefer', 'king', 'wouldb', 'king', 'yore', 'make', 'annoy', 'go', 'away', 'coupl', 'head', 'graphic', 'lop', 'richard', 'ii', 'friday', 'open', 'instal', 'seri', 'also', 'includ', 'success', 'friday', 'henri', 'iv', 'part', 'henri', 'iv', 'part', 'henri', 'v', 'tumbl', 'head', 'worth', 'note', 'first

[['celebr', 'written', 'spoken', 'word', 'friday', 'sunday', 'perfect', 'weekend', 'love', 'good', 'stori', 'brooklyn', 'book', 'festiv', 'sunday', 'pm', 'featur', 'indoor', 'outdoor', 'read', 'discuss', 'event', 'adult', 'children', 'various', 'locat', 'includ', 'borough', 'hall', 'plaza', 'joralemon', 'street', 'downtown', 'brooklyn', 'visitor', 'previous', 'event'], ['writer', 'appear', 'festiv', 'present', 'brooklyn', 'borough', 'presid', 'offic', 'support', 'att', 'includ', 'edwidg', 'danticat', 'meg', 'wolitz', 'david', 'levithan', 'tom', 'wolf', 'need', 'wait', 'till', 'sunday', 'pre', 'postfestiv', 'event', 'continu', 'read', 'caribbean', 'caribbeanamerican', 'writer', 'friday', 'pm', 'delroy', 'cafe', 'duryea', 'place', 'flatbush', 'literari', 'salon', 'wil', 'haygood', 'butler', 'wit', 'histori', 'white', 'hous', 'butler', 'eugen', 'allen', 'saturday', 'pm', 'skylight', 'galleri', 'fulton', 'street', 'new', 'york', 'brooklyn', 'avenu', 'literari', 'tailgat', 'combin', 'sport'

[['firstfloor', 'galleri', 'wadsworth', 'atheneum', 'museum', 'art', 'home', 'recent', 'heart', 'collect', 'thcenturi', 'landscap', 'paint', 'thoma', 'cole', 'freder', 'church', 'albert', 'bierstadt', 'member', 'hudson', 'river', 'school', 'collect', 'commiss', 'direct', 'artist', 'museum', 'founder', 'earliest', 'patron', 'remov', 'last', 'fall', 'instal', 'new', 'heat', 'cool', 'system', 'replac', 'exhibit', 'contemporari', 'photograph', 'show', 'differ', 'version', 'vision', 'natur'], ['instead', 'forest', 'elud', 'creep', 'industri', 'mountain', 'sunset', 'full', 'romanticera', 'symbol', 'artifici', 'wilder', 'landscap', 'contemporari', 'photographi', 'featur', 'monument', 'postindustri', 'landscap', 'gas', 'station', 'park', 'lot', 'sign', 'build', 'stamp', 'corpor', 'logo', 'great', 'analyst', 'earli', 'postwar', 'urban', 'sprawl', 'like', 'ed', 'ruscha', 'repres', 'vitrin', 'selfpublish', 'artist', 'book', 'one', 'everi', 'build', 'sunset', 'strip', 'deadpan', 'represent', 'exac

[['almost', 'year', 'hiatus', 'unit', 'airlin', 'urg', 'travel', 'fli', 'friend', 'sky'], ['icon', 'taglin', 'creat', 'leo', 'burnett', 'use', 'carrier', 'part', 'way', 'agenc', 'resurrect', 'multimedia', 'ad', 'campaign', 'mcgarrybowen', 'unit', 'largest', 'decad', 'campaign', 'also', 'featur', 'georg', 'gershwin', 'rhapsodi', 'blue', 'music', 'unit', 'use', 'continu', 'advertis', 'sinc', 'campaign', 'begin', 'sunday', 'broadcast', 'nfl', 'footbal', 'game', 'pga', 'tour', 'championship', 'season', 'premier', 'minut', 'emmi', 'award', 'program', 'contain', 'refresh', 'stcenturi', 'version', 'taglin', 'unit', 'tell', 'travel', 'everyth', 'legroom', 'friend', 'onlin', 'friend', 'shutey', 'friend', 'ewr', 'friend', 'refer', 'hub', 'continent', 'airlin', 'newark', 'liberti', 'intern', 'airport', 'unit', 'inherit', 'two', 'airlin', 'merg', 'creat', 'world', 'largest', 'carrier', 'term', 'passeng', 'traffic', 'burnett', 'came', 'fli', 'friend', 'sky', 'taglin', 'mid', 'pitch', 'unit', 'busi'

[['fangirlbi', 'rainbow', 'rowel', 'st', 'martin', 'griffin'], ['first', 'semest', 'univers', 'nebraska', 'near', 'everyth', 'made', 'cath', 'feel', 'happi', 'safe', 'come', 'undon', 'father', 'manic', 'episod', 'get', 'wors', 'mother', 'walk', 'famili', 'year', 'ago', 'sudden', 'want', 'friend', 'ident', 'twin', 'sister', 'social', 'adept', 'stylish', 'one', 'live', 'anoth', 'dorm', 'parti', 'hard', 'thousand', 'onlin', 'reader', 'cath', 'unhappi', 'freshman', 'eat', 'protein', 'bar', 'room', 'avoid', 'cafeteria', 'shes', 'magicath', 'prolif', 'author', 'fan', 'fiction', 'remix', 'huge', 'popular', 'simon', 'snow', 'novel', 'bear', 'tongueincheek', 'resembl', 'harri', 'potter', 'seri', 'cath', 'eas', 'plot', 'live', 'young', 'wizard', 'doesnt', 'translat', 'manag', 'rowel', 'whose', 'last', 'young', 'adult', 'novel', 'eleanor', 'park', 'special', 'young', 'misfit', 'chart', 'way', 'world', 'doesnt', 'disappoint', 'though', 'theme', 'young', 'writer', 'find', 'voic', 'may', 'familiar',

[['washington', 'rough', 'hour', 'sever', 'day', 'defens', 'lawyer', 'three', 'former', 'unit', 'state', 'naval', 'academi', 'footbal', 'player', 'grill', 'femal', 'midshipman', 'sexual', 'habit', 'public', 'hear', 'ask', 'woman', 'accus', 'three', 'athlet', 'rape', 'whether', 'wore', 'bra', 'wide', 'open', 'mouth', 'oral', 'sex', 'whether', 'apolog', 'anoth', 'midshipman', 'intercours', 'ho'], ['aggress', 'tactic', 'display', 'month', 'last', 'part', 'case', 'generat', 'intens', 'public', 'scrutini', 'rais', 'alarm', 'call', 'articl', 'proceed', 'help', 'determin', 'whether', 'case', 'sent', 'courtsmarti', 'articl', 'hear', 'permit', 'question', 'allow', 'civilian', 'court', 'includ', 'crossexamin', 'wit', 'intens', 'legal', 'expert', 'say', 'frighten', 'mani', 'victim', 'come', 'forward', 'becom', 'trial', 'said', 'jonathan', 'luri', 'professor', 'emeritus', 'legal', 'histori', 'rutger', 'univers', 'author', 'two', 'book', 'militari', 'justic', 'articl', 'come', 'time', 'either', 'ge

[['los', 'angel', 'emmi', 'award', 'wit', 'mani', 'new', 'player', 'red', 'carpet', 'year', 'never', 'gatecrash', 'quit', 'like', 'netflix'], ['hous', 'card', 'nomin', 'outstand', 'drama', 'first', 'time', 'program', 'distribut', 'internet', 'compet', 'emmi', 'right', 'alongsid', 'program', 'distribut', 'rabbit', 'ear', 'satellit', 'dish', 'prospect', 'stream', 'video', 'servic', 'like', 'netflix', 'could', 'end', 'winner', 'emmi', 'ceremoni', 'sunday', 'night', 'cast', 'spotlight', 'profound', 'televis', 'landscap', 'chang', 'still', 'televis', 'critic', 'selfprofess', 'emmi', 'expert', 'suspect', 'cabl', 'channel', 'amc', 'netflix', 'celebr', 'award', 'show', 'break', 'bad', 'nomin', 'best', 'drama', 'four', 'time', 'never', 'clear', 'favorit', 'year', 'email', 'debra', 'birnbaum', 'editor', 'chief', 'tv', 'guid', 'magazin', 'borrow', 'phrase', 'seri', 'meth', 'lord', 'walter', 'white', 'break', 'bad', 'danger', 'emmi', 'season', 'that', 'first', 'half', 'show', 'final', 'season', 'c

[['washington', 'obama', 'administr', 'potenti', 'pathbreak', 'propos', 'carbon', 'emiss', 'limit', 'new', 'power', 'plant', 'face', 'polit', 'legal', 'challeng', 'oppon', 'argu', 'technolog', 'need', 'close', 'proven', 'law', 'requir'], ['draft', 'rule', 'announc', 'friday', 'nation', 'press', 'club', 'gina', 'mccarthi', 'administr', 'environment', 'protect', 'agenc', 'protect', 'industri', 'pieintheski', 'requir', 'current', 'law', 'limit', 'rule', 'agenc', 'make', 'epa', 'rule', 'sometim', 'demand', 'technolog', 'advanc', 'goal', 'agenc', 'establish', 'met', 'techniqu', 'exist', 'law', 'describ', 'adequ', 'demonstr', 'propos', 'would', 'limit', 'new', 'gasfir', 'power', 'plant', 'pound', 'carbon', 'dioxid', 'emiss', 'per', 'megawatthour', 'new', 'coal', 'plant', 'pound', 'carbon', 'dioxid', 'industri', 'offici', 'say', 'averag', 'advanc', 'coal', 'plant', 'current', 'emit', 'pound', 'carbon', 'dioxid', 'per', 'megawatthour', 'megawatthour', 'littl', 'typic', 'american', 'household',

[['nairobi', 'kenya', 'mask', 'gunmen', 'storm', 'fanci', 'crowd', 'mall', 'nairobi', 'saturday', 'shot', 'dead', 'least', 'peopl', 'wound', 'one', 'chill', 'terrorist', 'attack', 'east', 'africa', 'sinc', 'al', 'qaeda', 'blew', 'two', 'american', 'embassi'], ['parent', 'hurl', 'bodi', 'children', 'peopl', 'jump', 'ventil', 'shaft', 'save', 'shopper', 'huddl', 'behind', 'plastic', 'mannequin', 'design', 'cloth', 'store', 'two', 'squad', 'gunmen', 'believ', 'link', 'somali', 'terrorist', 'group', 'move', 'mall', 'shoot', 'shopper', 'head', 'hour', 'later', 'mall', 'gleam', 'floor', 'smear', 'blood', 'polic', 'offic', 'dash', 'corpsestrewn', 'corridor', 'tri', 'find', 'assail', 'standoff', 'attack', 'report', 'heavili', 'arm', 'hold', 'unknown', 'number', 'hostag', 'continu', 'sun', 'rose', 'sunday', 'mall', 'call', 'westgat', 'symbol', 'kenya', 'rise', 'prosper', 'impress', 'fivestori', 'build', 'kenyan', 'buy', 'expens', 'cup', 'frozen', 'yogurt', 'plate', 'sushi', 'saturday', 'especi'

In [7]:
print(len(docs_cleaned))

3579


In [15]:
def our_count_vectorizer(docs):
    '''
    Args:
        docs (list of lists of strings): corpus
    Returns:
        X_count (numpy array): count vectors
        vocab (list of strings): alphabetical list 
                                 of unique words
    '''
    vocab_set = set()
    for doc in docs:
        vocab_set.update(doc)

    vocab = sorted(vocab_set)

    X_count = np.zeros(shape=(len(docs), len(vocab)))

    for i, doc in enumerate(docs):
        for word in doc:
            j = vocab.index(word)
            X_count[i,j] += 1
    return X_count, vocab

In [16]:
test_docs = docs_cleaned[0:2]

In [17]:
X_count, vocab = count_vectorizer(test_docs)

In [18]:
vocab

['abandon',
 'abet',
 'accept',
 'access',
 'accompani',
 'adorn',
 'aerat',
 'almost',
 'ambit',
 'american',
 'apricot',
 'arriv',
 'background',
 'bacon',
 'bake',
 'bakeri',
 'banquett',
 'bar',
 'basil',
 'batter',
 'beef',
 'beer',
 'believ',
 'belli',
 'benjamin',
 'bite',
 'bland',
 'blt',
 'boast',
 'border',
 'borrow',
 'bouchon',
 'brazen',
 'bread',
 'breathless',
 'brewer',
 'brioch',
 'broadway',
 'broccoli',
 'broken',
 'brought',
 'brunch',
 'burger',
 'burn',
 'butteri',
 'cake',
 'call',
 'came',
 'ceil',
 'centrico',
 'certain',
 'chandeli',
 'char',
 'charcoal',
 'chaser',
 'chees',
 'chef',
 'chicken',
 'chile',
 'chill',
 'chocol',
 'classic',
 'coddl',
 'come',
 'condiment',
 'confirm',
 'confound',
 'confront',
 'cook',
 'cool',
 'copi',
 'corner',
 'countri',
 'cours',
 'coy',
 'cracker',
 'crackl',
 'crane',
 'crisp',
 'crown',
 'crunch',
 'cryovac',
 'cube',
 'cumin',
 'custard',
 'cut',
 'dancer',
 'dark',
 'deconstruct',
 'degre',
 'dehydr',
 'departur',
 '

In [19]:
count_vectors = pd.DataFrame(data=X_count, columns=vocab)
count_vectors

Unnamed: 0,abandon,abet,accept,access,accompani,adorn,aerat,almost,ambit,american,...,without,wondra,wood,work,wouldnt,year,yearold,yeast,yes,yuengl
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Bag Of Words and TFIDF

In [33]:
vocab_dict = {word: i for i, word in enumerate(docs_cleaned)}

TypeError: unhashable type: 'list'