In [10]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import SparsePCA
from sklearn.manifold import TSNE 
from sklearn.decomposition import TruncatedSVD

from sklearn.neighbors import BallTree
from sklearn.base import BaseEstimator

from sklearn.pipeline import make_pipeline


In [3]:
subt = [line.rstrip('\n').replace('\\n',' ').replace('>','') for line in open('subtitles.txt')]

In [5]:
df = pd.DataFrame(columns=['context', 'reply'])
df['context'] = subt
df['context'] = df['context'].apply(lambda x: x.lower())
df['reply'] = subt[1:] + ['...']
df['reply'] = df['reply'].apply(lambda x: x.lower())

In [6]:
for sign in ['!', '?', ',', '.', ':']:
    df['context'] = df['context'].apply(lambda x: x.replace(sign,' '))
    df['reply'] = df['reply'].apply(lambda x: x.replace(sign,' '))

In [7]:
df.head(5)

Unnamed: 0,context,reply
0,obey the rules and manners to have a fun duel,light so glaring that it can't be hidden shatt...
1,light so glaring that it can't be hidden shatt...,the violently quaking and thirsty earth stakes...
2,the violently quaking and thirsty earth stakes...,just one more time is enough to create a miracle
3,just one more time is enough to create a miracle,for the power to regain the pages of memory lo...
4,for the power to regain the pages of memory lo...,the believing heart that penetrates the darkne...


In [8]:

vectorizer = TfidfVectorizer()
vectorizer.fit(df.context)
print(vectorizer.vocabulary_)
print(vectorizer.idf_)
vector = vectorizer.transform(df.head(10))
print(vector.shape)
print(vector.toarray())
vector_large = vectorizer.transform(df.context)

[ 9.72938751  8.11698371 14.34450803 ... 13.24589574 13.93904292
 14.34450803]
(2, 59407)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [11]:
print(vector_large.shape)
TS = TruncatedSVD(n_components=300, algorithm='randomized')

TS.fit(vector_large)
vector_small = TS.transform(vector_large)

# Print new dimensionality and explained variance ratio
print(vector_small.shape)



(1248751, 59407)
(1248751, 300)


In [28]:
def sigmoid(x):
    return (1 / (1 + np.exp(-x)))

class NeighborSampler(BaseEstimator):
    def __init__(self, k=10, temperature = 2.0):
        self.k = k
        self.temperature = temperature
    
    def fit(self, X, y):
        self.tree_ = BallTree(X)
        self.y_ = np.array(y)
        
    def predict(self, X, random_state = None):
        dis, ind = self.tree_.query(X, return_distance = True, k = self.k)
        result = []
        for d, i in zip(dis, ind):
            result.append(np.random.choice(i, p = sigmoid(d * self.temperature)))
            
        return self.y_[result]

In [29]:
nei_sam = NeighborSampler()
nei_sam.fit(vector_small, df.reply)

pipe = make_pipeline(vectorizer, TS, nei_sam)

In [48]:
print('To stop chatting type \'BYE(All uppercase)\'')
x= input()
while x!='BYE':
      print('JenBo: ',pipe.predict([x]))
      x=input('You: ')

To stop chatting type 'BYE(All uppercase)'
Hi
JenBo:  ['thanks  nako ']
You: How do you feel
JenBo:  ['the seed i shot into you will soon take root and control your body ']
You: you are rude
JenBo:  ['g-good grief  what the hell are  you talking about  polnareff ']
You: Do you know about haki?
JenBo:  ['no  would you like me to investigate ']
You: yes
JenBo:  ['take care on your trip ']
You: nice prank
JenBo:  ['tanaka is scoring consistently  though ']
You: do you want meat? 
JenBo:  ['no ']
You: why?
JenBo:  ["why does it matter  so you're doing something for christmas "]
You: BYE
