In [1]:
import numpy as np
import pandas as pd
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.neighbors import NearestNeighbors

In [2]:
# read the raw Simpsons dialogue CSV into df0
DATA_PATH = 'simpsons_dataset.csv'
df0 = pd.read_csv(DATA_PATH)

In [52]:
# drops every ROW that contains a null value (dropna works on rows by default),
# then renumbers the index from 0 so row labels equal row positions.
# reset_index already returns a new frame, so no trailing .copy() is needed.
df = df0.dropna().reset_index(drop=True)

In [53]:
# preview the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [54]:
# load the large English spaCy model
nlp = spacy.load('en_core_web_lg')
# spaCy's default stop words plus the single-space and empty strings
STOPWORDS = nlp.Defaults.stop_words | {' ', ''}

In [55]:
_TOKENIZER_CACHE = {}


def _shared_tokenizer():
    '''
    return the spaCy tokenizer used by tokenize (built from nlp.vocab only),
    creating it on first use so it is not rebuilt for every document
    '''
    if 'tokenizer' not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE['tokenizer'] = spacy.tokenizer.Tokenizer(nlp.vocab)
    return _TOKENIZER_CACHE['tokenizer']


def tokenize(doc):
    '''
    function to tokenize, remove stop words, and return list of tokens from an input doc

    doc: the string to tokenize
    returns: list of spaCy tokens, in document order, leaving out tokens whose
             text is in STOPWORDS and tokens flagged as punctuation
    '''
    # keep only ASCII letters and spaces, then lowercase
    text = re.sub(r'[^a-zA-Z ]', '', doc).lower()
    tokens = _shared_tokenizer()(text)
    return [t for t in tokens if str(t) not in STOPWORDS and not t.is_punct]

In [56]:
# adds a 'tokens' column holding the tokenize() result for each spoken line
df['tokens'] = df['spoken_words'].map(tokenize)
df['tokens'].head(5)

0    [actually, little, disease, magazines, news, s...
1                              [wheres, mr, bergstrom]
2    [dont, know, id, sure, like, talk, didnt, touc...
3                                [life, worth, living]
4    [polls, open, end, recess, case, decided, thou...
Name: tokens, dtype: object

In [57]:
# creating vectorizor
# scikit-learn documents stop_words as 'english', a list, or None, and recent
# releases validate the type, so the STOPWORDS set is passed as a sorted list
vectorizor = tfidf(stop_words=sorted(STOPWORDS), lowercase=True, min_df=0.005, max_df=0.95)

In [58]:
# fit the vectorizer on the spoken lines and build their sparse tf-idf matrix
corpus = df['spoken_words']
sparse = vectorizor.fit_transform(corpus)

  'stop_words.' % sorted(inconsistent))


In [59]:
# .toarray() gives a plain ndarray; .todense() would return the discouraged np.matrix
dense = sparse.toarray()

In [60]:
# document-term matrix as a DataFrame, one column per vocabulary term.
# get_feature_names() was removed from newer scikit-learn in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vectorizor, 'get_feature_names_out'):
    feature_names = vectorizor.get_feature_names_out()
else:
    feature_names = vectorizor.get_feature_names()
dtm = pd.DataFrame(dense, columns=feature_names)
dtm.head()

Unnamed: 0,ah,away,baby,bad,bart,believe,best,better,big,boy,...,want,way,won,work,world,wow,ya,yeah,years,yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
# original columns followed by the tf-idf term columns, joined on the row index
df_vect = pd.concat((df, dtm), axis='columns')

In [62]:
# ball-tree nearest-neighbour model fitted on the tf-idf rows; queries return 3 neighbours
nn = NearestNeighbors(algorithm='ball_tree', n_neighbors=3).fit(dtm)
nn

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=3, p=2, radius=1.0)

In [63]:
# vectorizes a free-text query with the already-fitted tf-idf vectorizer
input_quote = 'homer go home you big idiot'
query_docs = [input_quote]
fake_vec = vectorizor.transform(query_docs)

In [64]:
# finds similar quotes
# recent scikit-learn rejects np.matrix (what .todense() returns), so query with
# a plain array, labelled with the same columns nn was fitted on
query = pd.DataFrame(fake_vec.toarray(), columns=dtm.columns)
quotes = nn.kneighbors(query)

In [68]:
# kneighbors returned a (distances, indices) pair; show the row indices of the matches
_distances, neighbor_rows = quotes
neighbor_rows

array([[116283,  13421,   6074]])

In [67]:
# looks at the text for one of the similar quotes (the last neighbour returned).
# The row index is read from the kneighbors result instead of being hard-coded,
# so the cell stays correct when the input quote or the data changes.
df.loc[quotes[1][0, -1], 'spoken_words']

"Homer, you're just a big sack of sugar."