### Import Data + Modules

In [1]:
# These commands download the necessary data and modules from Luuk's github account and place them in the expected folders.
  # (NOTE: we used python 3 and a notebook on Google Colab to run this)

!rm -rf Bot
!rm -rf raw_data
!git clone --recursive https://github.com/luukru/Bot.git
!cp -r Bot/raw_data .

Cloning into 'Bot'...
remote: Counting objects: 39, done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 39 (delta 14), reused 33 (delta 8), pack-reused 0[K
Unpacking objects: 100% (39/39), done.


In [0]:
import tensorflow as tf
import numpy as np
import pandas as pd

from Bot import data
from Bot import data_utils
from Bot import seq2seq_wrapper

### Seq2seq

In [3]:
# Preprocess data.
# Runs the preprocessing entry point of Bot.data. Judging by the log it prints,
# it gathers the id2line dictionary and the conversations, filters the lines
# (in two passes) and segments them into words.
# NOTE(review): presumably this also writes the pickle/npy files that
# data.load_data() reads later -- confirm in Bot/data.py.
data.process_data()

>> gathered id2line dictionary.

[['L447', 'L448'], ['L490', 'L491'], ['L716', 'L717', 'L718', 'L719', 'L720', 'L721'], ['L750', 'L751', 'L752', 'L753', 'L754', 'L755']]
>> gathered conversations.


>> Filter lines

>> 2nd layer of filtering
28% filtered from original data
q : [you hate me dont you]; a : [i dont really think you warrant that strong an emotion]
q : [then say youll spend dollar night at the track with me]; a : [and why would i do that]
q : [come on  the ponies the flat beer you with money in your eyes me with my hand on your ass]; a : [you  covered in my vomit]
q : [are you following me]; a : [i was in the laundromat i saw your car thought id say hi]

>> Segment lines into words

:: Sample from segmented list of words
q : [['you', 'hate', 'me', 'dont', 'you']]; a : [['i', 'dont', 'really', 'think', 'you', 'warrant', 'that', 'strong', 'an', 'emotion']]
q : [['then', 'say', 'youll', 'spend', 'dollar', 'night', 'at', 'the', 'track', 'with', 'me']]; a : [['and', 'why', 'woul

In [4]:
# Re-execute the already-imported Bot.data module so the `data` name refers to
# its current on-disk code.
# NOTE(review): presumably needed because Bot/data.py was edited after the
# first import -- confirm whether this cell is still required.
import importlib
importlib.reload(data)

<module 'Bot.data' from '/content/Bot/data.py'>

In [0]:
# Read the preprocessed dataset back from disk (pickle and npy files).
metadata, idx_q, idx_a = data.load_data(PATH='')

# Partition idx_q / idx_a into training, test and validation subsets.
# split_dataset hands back three (X, Y) pairs, unpacked one at a time below.
train_pair, test_pair, valid_pair = data_utils.split_dataset(idx_q, idx_a)
trainX, trainY = train_pair
testX, testY = test_pair
validX, validY = valid_pair

In [6]:
# Hyper-parameters for the seq2seq model.

# Sequence lengths are read off the last axis of the training arrays.
xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]

# Both vocabulary sizes come from the same idx2w lookup, so compute it once.
vocab_size = len(metadata['idx2w'])
print(vocab_size)
xvocab_size = vocab_size
yvocab_size = vocab_size

batch_size = 16
emb_dim = 1024

8002


In [7]:
# Build the seq2seq model. The keyword arguments are collected in one dict so
# every setting handed to the wrapper is visible at a glance.
model_kwargs = {
    'xseq_len': xseq_len,
    'yseq_len': yseq_len,
    'xvocab_size': xvocab_size,
    'yvocab_size': yvocab_size,
    # NOTE(review): presumably the directory used for saving/restoring
    # checkpoints -- confirm in Bot/seq2seq_wrapper.py.
    'ckpt_path': 'ckpt/',
    'emb_dim': emb_dim,
    'num_layers': 3,
    'epochs': 100000,
}
model = seq2seq_wrapper.Seq2Seq(**model_kwargs)

<log> Building Graph </log>

In [0]:
# Create the batch generators that sample from the validation, test and
# training sets. Validation and test use their own fixed batch sizes; training
# uses the batch_size hyper-parameter.
VAL_BATCH_SIZE = 32
TEST_BATCH_SIZE = 256

val_batch_gen = data_utils.rand_batch_gen(validX, validY, VAL_BATCH_SIZE)
test_batch_gen = data_utils.rand_batch_gen(testX, testY, TEST_BATCH_SIZE)
train_batch_gen = data_utils.rand_batch_gen(trainX, trainY, batch_size)

In [9]:
# Train the model.
# Set RESUME_TRAINING to True to restore the most recent checkpoint and keep
# training from it; leave it False to start training from scratch. An explicit
# flag replaces the previous comment/uncomment switch, so the choice is visible
# and the cell can be re-run without editing code.
RESUME_TRAINING = False

if RESUME_TRAINING:
    # Restore session and continue training
    sess = model.restore_last_session()
    sess = model.train(train_batch_gen, val_batch_gen, sess)
else:
    # Start training from scratch
    sess = model.train(train_batch_gen, val_batch_gen)


<log> Training started </log>

Model saved to disk at iteration #1000
val   loss : 2.021551
Interrupted by user at iteration 1132


In [0]:
# Restore variables from disk.
sess = model.restore_last_session()

# Draw one batch from the test generator and keep only its first element, which
# is what gets fed to the model. The builtin next() is the idiomatic way to
# advance an iterator; calling __next__() directly is not.
input_ = next(test_batch_gen)[0]
output = model.predict(sess, input_)
print(output.shape)

In [0]:
# Show one question/answer pair per distinct answer produced by the model,
# skipping every answer that contains the 'unk' token.
replies = []
idx2w = metadata['idx2w']
# input_ is transposed so that it is iterated in step with the rows of output.
for question_ids, answer_ids in zip(input_.T, output):
    question = data_utils.decode(sequence=question_ids, lookup=idx2w, separator=' ')
    answer_tokens = data_utils.decode(sequence=answer_ids, lookup=idx2w, separator=' ').split(' ')
    # Guard clause: drop answers with unknown words and answers already shown.
    if 'unk' in answer_tokens or answer_tokens in replies:
        continue
    print('q : [{0}]; a : [{1}]'.format(question, ' '.join(answer_tokens)))
    replies.append(answer_tokens)

q : [hes lying]; a : [i cant believe my triangle says triangle and balls up the coon and then everyone has to]
q : [very nice okay last one]; a : [wait hello you guys like ass with him in a bad shot]
q : [cartman come on]; a : [yeah this is awesome]
q : [they told us he has a mmuscular disease an and that and that he he might die]; a : [yeah i have those have some more ho]
q : [oh hey kid good luck good luck]; a : [yes yes am how much its a little piggy]
q : [and then give me the twenty and ill give you the pubes]; a : [fuck you fucking tricked]
q : [i think if we try kennys neighborhood we might find a]; a : [yes i hate you guys]
q : [maybe you got brain cancer]; a : [okay butters you wanna go butters]
q : [well the prostate seems to be normal no swelling of the hemorrhoidal gland]; a : [yeah is the new mom is not in the future]
q : [go fuck yourself cartman]; a : [hey guys just a good you guys have to get a gun bronco]
q : [were sorry butters our mind is made up]; a : [yeah so gonna 