In [2]:
import pandas as pd
import numpy as np
from nlpia.data.loaders import read_csv
from sklearn.metrics.pairwise import cosine_distances
import tqdm

df = read_csv('../../../10k_movie_lines.csv')
print(df.shape)
print(df.count())
df = df.fillna(' ')
print(df.count())
df.head()


(25149, 2)
x    25148
y    25148
dtype: int64
x    25149
y    25149
dtype: int64


Unnamed: 0,x,y
0,can we make this quick? roxanne korrine and a...,"well, i thought we'd start with pronunciation,..."
1,"well, i thought we'd start with pronunciation,...",not the hacking and gagging and spitting part....
2,not the hacking and gagging and spitting part....,okay... then how 'bout we try out some french ...
3,you're asking me out. that's so cute. what's ...,forget it.
4,"no, no, it's my fault -- we didn't have a prop...",cameron.


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [4]:
tfidf = TfidfVectorizer(min_df=1, max_df=.3, max_features=1000000, ngram_range=(1, 1), stop_words=None)
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=1000000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [5]:
tfidf.fit(pd.concat([df[df.columns[i]] for i in range(2)]))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=1000000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [6]:
print(list(tfidf.vocabulary_)[:10])
print(len(tfidf.vocabulary_))

['rage', 'vox', 'pioneer', 'gateway', 'disposal', 'apartments', 'tock', 'squeeze', 'speedball', 'concessions']
16597


In [7]:
def norm(vec):
    return np.divide(vec, np.power(np.dot(vec, vec.T), .5))

In [None]:
X = tfidf.transform(df.x).todense()
print(X[0])
X = [norm(row) for row in X]
X.shape

[[ 0.  0.  0. ...,  0.  0.  0.]]


  


In [140]:
y = tfidf.transform(df.y)
# y.todense()
# ~.8 GB @ 5000 words

In [9]:
norm(X[0])

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [141]:
def get_statement(s='hi'):
    q = tfidf.transform([s.lower()]).todense()[0]
    similarity = 0
    best_i = 0
    for i, v in enumerate(X):
        # print(i, q, v)
        sim = 2 - cosine_distances(q, v.todense())
        if sim > similarity:
            similarity = sim
            best_i = i
    return df.x.iloc[best_i], best_i

def get_reply(s='hi'):
    return df.y.iloc[get_statement(s)[1]]

In [99]:
s1 = 'hello ubuntu'
s2 = "use the force luke"
words = s2.split()
for i in range(len(words) - 2):
    print(words[i], words[i] in tfidf.vocabulary_)
    print(words[i], words[i+1], ' '.join([words[i], words[i+1]]) in tfidf.vocabulary_)

use True
use the True
the True
the force True


In [29]:
get_statement(s1)

('hello.', 1667)

In [30]:
df.iloc[1667]

x            hello.
y    hi, it's me...
Name: 1667, dtype: object

In [41]:
get_reply(s1)

"hi, it's me..."

In [42]:
get_statement(s2)

("doesn't work on you.  see ya around, little brother.", 8482)

In [43]:
get_reply(s2)

'not likely.'

In [142]:
from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components=50)
pca = pca.fit(X)   # tfidf.transform(df.Context) == X
X_100d = pca.transform(X)
y_100d = pca.transform(y)

In [143]:
def get_pca(s='Hi', qst=True, ans=True):
    q = pca.transform(tfidf.transform([s.lower()]).todense())[0]
    similarity = 0
    best_i = 0
    for i, v in enumerate(X_100d):
        # print(i, q.shape, v.shape)
        sim = 2 - cosine_distances(pd.np.array([q]), pd.np.array([v]))
        if sim > similarity:
            similarity = sim
            best_i = i
    
    if qst:
        print(s)
        print(df.x.iloc[best_i])
    
    if ans:
        return df.y.iloc[best_i], best_i

In [144]:
s3 = 'did you see the new movie'
t = tfidf.transform([s3])
t

<1x496674 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [145]:
get_pca(s1)

hello ubuntu
hello.


("hi, it's me...", 1667)

In [146]:
get_pca(s2)

use the force luke
didja use the commode story?


("fuckin' a. i tell it real good, too.", 64771)

In [147]:
get_pca(s3)

did you see the new movie
did you see the papers today?


('yes, sir, i did.', 49113)

In [148]:
get_pca("which way to the store?")

which way to the store?
which way to the hollywood bowl?


('down that way.', 128854)

In [149]:
get_pca('luke i am your father')

luke i am your father
laura, i am your friend.


("i know you are and you don't have to do anything crazy to prove it.", 83336)

In [150]:
get_pca('i have a bridge in brooklyn to sell you')

i have a bridge in brooklyn to sell you
you have to put things in perspective.


('i know, i know.', 75134)

In [155]:
get_pca("a samurai makes every decision in the span of seven breaths")

a samurai makes every decision in the span of seven breaths
in the immortal words of derrick coleman, whooopdeedamndoo!!!


("derrick coleman, he possessed all the talent in the world, coulda, shoulda, been a great ballplayer but alas d.c. didn't want it bad enough.  delacroix, do you want it? bad enough to kill for it?  do you want it that much.",
 9160)

In [152]:
get_pca('do you feel lucky, punk? do you')

do you feel lucky, punk? do you
do you feel comforted?


('yes.', 68006)

In [154]:
get_pca("i don't think we're in kansas anymore, toto")

i don't think we're in kansas anymore, toto
i don't think we're ready.


("you're not ready.", 19497)

In [156]:
get_pca("elephant")

elephant
a girl with a body like mine?


("you're breaking my concentration!", 106573)

In [158]:
get_pca('you are an elephant')

you are an elephant
you are an animal.


("really? wow! that's that's terrific to hear from someone so... feminine, so female.",
 34555)