In [10]:
import os
import re

import tqdm
import aiml_bot

In [53]:
from nlpia.data.loaders import get_data
df = get_data('ubuntu_dialog')

In [54]:
df.head(4)

Unnamed: 0,Context,Utterance
0,i think we could import the old comments via r...,basically each xfree86 upload will NOT force u...
1,I'm not suggesting all - only the ones you mod...,oh? oops. __eou__
2,afternoon all __eou__ not entirely related to ...,we'll have a BOF about this __eou__ so you're ...
3,interesting __eou__ grub-install worked with /...,i fully endorse this suggestion </quimby> __eo...


In [4]:
def split_turns(s, splitter=re.compile('__eot__')):
    """ Split a string on __eot__ markders (turns) """
    for utterance in splitter.split(s):
        utterance = utterance.replace('__eou__', '\n')
        utterance = utterance.replace('__eot__', '')
        if len(utterance.strip()):
            yield utterance

In [5]:
for i, record in df.head(3).iterrows():
    statement = list(split_turns(record.Context))[-1]
    reply = list(split_turns(record.Utterance))[-1]
    print('Statement: {}'.format(statement))
    print()
    print('Reply: {}'.format(reply))

Statement:  I would prefer to avoid it at this stage.  this is something that has gone into XSF svn, I assume? 
 

Reply: basically each xfree86 upload will NOT force users to upgrade 100Mb of fonts for nothing 
 no something i did in my spare time. 

Statement:  ok, it sounds like you're agreeing with me, then 
 though rather than "the ones we modify", my idea is "the ones we need to merge" 
 

Reply: oh? oops. 

Statement:  should g2 in ubuntu do the magic dont-focus-window tricks? 
 join the gang, get an x-series thinkpad 
 sj has hung on my box, again. 
 what is monday mornings discussion actually about? 
 

Reply: we'll have a BOF about this 
 so you're coming tomorrow ? 



In [14]:
from tqdm import tqdm
def preprocess_ubuntu_corpus(df):
    """Split all strings in df.Context and df.Utterance on __eot__ (turn) markers """
    statements = []
    replies = []
    for i, record in tqdm(df.iterrows()):
        turns = list(split_turns(record.Context))
        statement = turns[-1] if len(turns) else '\n'  # <1>
        statements.append(statement)
        turns = list(split_turns(record.Utterance))
        reply = turns[-1] if len(turns) else '\n'
        replies.append(reply)
    df['statement'] = statements
    df['reply'] = replies
    return df

In [55]:
split = int(len(df)/2)
sub_df = df.iloc[0:split, :]

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
proc_df = preprocess_ubuntu_corpus(sub_df)
tfidf = TfidfVectorizer(min_df=8, max_df=.3, max_features=50000)
tfidf.fit(proc_df.statement)


0it [00:00, ?it/s][A
707it [00:00, 7032.12it/s][A
243939it [00:40, 6080.86it/s]A
2282it [00:00, 7568.55it/s][A
3060it [00:00, 7635.78it/s][A
3852it [00:00, 7692.07it/s][A
4623it [00:00, 7694.11it/s][A
5406it [00:00, 7713.62it/s][A
6165it [00:00, 7697.19it/s][A
6951it [00:00, 7715.05it/s][A
7697it [00:01, 7635.55it/s][A
8471it [00:01, 7644.64it/s][A
9244it [00:01, 7651.12it/s][A
9997it [00:01, 7615.04it/s][A
10766it [00:01, 7620.41it/s][A
11551it [00:01, 7635.46it/s][A
12358it [00:01, 7662.19it/s][A
13155it [00:01, 7679.89it/s][A
13942it [00:01, 7690.09it/s][A
14725it [00:01, 7691.57it/s][A
15505it [00:02, 7672.13it/s][A
16271it [00:02, 7669.39it/s][A
17036it [00:02, 7665.77it/s][A
17799it [00:02, 7657.82it/s][A
18602it [00:02, 7673.23it/s][A
19374it [00:02, 7653.98it/s][A
20159it [00:02, 7661.16it/s][A
20932it [00:02, 7663.49it/s][A
21716it [00:02, 7669.54it/s][A
22505it [00:02, 7676.84it/s][A
23283it [00:03, 7672.06it/s][A
24053it [00:03, 7670.21it/s][A

197633it [00:25, 7714.35it/s][A
198412it [00:25, 7714.63it/s][A
199253it [00:25, 7717.30it/s][A
200100it [00:25, 7720.17it/s][A
200912it [00:26, 7720.55it/s][A
201745it [00:26, 7722.83it/s][A
202585it [00:26, 7725.40it/s][A
203412it [00:26, 7727.45it/s][A
204247it [00:26, 7729.79it/s][A
205081it [00:26, 7732.09it/s][A
205910it [00:26, 7733.99it/s][A
206737it [00:26, 7735.51it/s][A
207560it [00:26, 7735.67it/s][A
208370it [00:26, 7736.87it/s][A
209179it [00:27, 7737.23it/s][A
209980it [00:27, 7737.28it/s][A
210789it [00:27, 7738.60it/s][A
211615it [00:27, 7740.48it/s][A
212432it [00:27, 7742.03it/s][A
213267it [00:27, 7744.21it/s][A
214085it [00:27, 7745.76it/s][A
214916it [00:27, 7747.78it/s][A
215738it [00:27, 7748.27it/s][A
216564it [00:27, 7750.08it/s][A
217380it [00:28, 7749.89it/s][A
218203it [00:28, 7751.59it/s][A
219036it [00:28, 7753.63it/s][A
219852it [00:28, 7755.01it/s][A
220674it [00:28, 7756.63it/s][A
221519it [00:28, 7759.05it/s][A
222363it [

401870it [00:50, 7908.68it/s][A
402682it [00:50, 7908.86it/s][A
403490it [00:51, 7908.84it/s][A
404297it [00:51, 7909.14it/s][A
405101it [00:51, 7907.03it/s][A
405873it [00:51, 7905.03it/s][A
406661it [00:51, 7904.95it/s][A
407466it [00:51, 7905.24it/s][A
408297it [00:51, 7906.01it/s][A
409112it [00:51, 7906.46it/s][A
409942it [00:51, 7907.22it/s][A
410783it [00:51, 7908.17it/s][A
411606it [00:52, 7908.73it/s][A
412425it [00:52, 7909.30it/s][A
413244it [00:52, 7909.82it/s][A
414063it [00:52, 7910.25it/s][A
414880it [00:52, 7910.47it/s][A
415701it [00:52, 7911.03it/s][A
416522it [00:52, 7911.59it/s][A
417341it [00:52, 7912.11it/s][A
418160it [00:52, 7912.63it/s][A
418978it [00:52, 7913.10it/s][A
419796it [00:53, 7913.15it/s][A
420611it [00:53, 7913.59it/s][A
421423it [00:53, 7913.96it/s][A
422235it [00:53, 7914.26it/s][A
423046it [00:53, 7914.61it/s][A
423862it [00:53, 7915.07it/s][A
424682it [00:53, 7915.59it/s][A
425497it [00:53, 7915.74it/s][A
426307it [

605527it [01:15, 7973.33it/s][A
606348it [01:16, 7973.20it/s][A
607182it [01:16, 7973.67it/s][A
608015it [01:16, 7974.14it/s][A
608837it [01:16, 7974.43it/s][A
609659it [01:16, 7974.58it/s][A
610477it [01:16, 7974.68it/s][A
611291it [01:16, 7974.78it/s][A
612105it [01:16, 7974.96it/s][A
612927it [01:16, 7975.30it/s][A
613744it [01:16, 7975.55it/s][A
614560it [01:17, 7975.41it/s][A
615382it [01:17, 7975.72it/s][A
616194it [01:17, 7975.83it/s][A
617006it [01:17, 7976.01it/s][A
617847it [01:17, 7976.57it/s][A
618692it [01:17, 7977.17it/s][A
619528it [01:17, 7977.66it/s][A
620365it [01:17, 7978.16it/s][A
621202it [01:17, 7978.66it/s][A
622048it [01:17, 7979.28it/s][A
622886it [01:18, 7979.24it/s][A
623711it [01:18, 7979.41it/s][A
624532it [01:18, 7979.50it/s][A
625348it [01:18, 7979.29it/s][A
626154it [01:18, 7979.32it/s][A
626973it [01:18, 7979.58it/s][A
627810it [01:18, 7980.06it/s][A
628651it [01:18, 7980.61it/s][A
629488it [01:18, 7981.10it/s][A
630316it [

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.3, max_features=50000, min_df=8,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [59]:
import pandas as pd
X = tfidf.transform(proc_df.statement)
X = pd.DataFrame(X.todense(), columns=tfidf.get_feature_names())

MemoryError: 

In [None]:
x = tfidf.transform(['This is an example statement that we want to retrieve the best reply for.'])
cosine_similarities = x.dot(X.T)
reply = df.loc[cosine_similarities.argmax()]