### Import Data + Modules

In [1]:
# These commands download the necessary data and modules from Luuk's github account and place them in the expected folders.
  # (NOTE: we used python 3 and a notebook on Google Colab to run this)

!rm -rf Bot-sp
!rm -rf raw_data
!git clone --recursive https://github.com/luukru/Bot-sp.git
!mv Bot-sp Bot_sp # rename 
!cp -r Bot_sp/raw_data .

Cloning into 'Bot-sp'...
remote: Counting objects: 38, done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 38 (delta 13), reused 38 (delta 13), pack-reused 0[K
Unpacking objects: 100% (38/38), done.


In [0]:
import tensorflow as tf
import numpy as np
import pandas as pd

from Bot_sp import seq2seq_wrapper
from Bot_sp import data
from Bot_sp import data_utils

### Load South Park Data

In [0]:
# Take the questions and answers from the Kaggle tv-series dataset.
def get_tv_series_q_and_a(path='raw_data/All-seasons.csv'):
  """Build (question, answer) pairs from consecutive lines of the TV-series CSV.

  Every line except the last one is used as a question, and the line that
  directly follows it in the file is used as its answer.

  Args:
    path: CSV file to read. The spoken line is taken from the fourth column
      (positional index 3).

  Returns:
    A tuple (questions, answers) of two equally long lists, where
    answers[k] is the line that follows questions[k] in the file.
  """
  # Called `lines` rather than `data` to avoid confusion with the imported
  # Bot_sp `data` module.
  lines = pd.read_csv(path).iloc[:, 3].tolist()

  # Pair every line with its successor. The first line gets no question of its
  # own: looking one step back from index 0 would wrap around to index -1 and
  # wrongly pair the *last* line of the file with the first one.
  questions = lines[:-1]
  answers = lines[1:]

  return questions, answers

In [0]:
# Take the questions and answers from the Cornell Movie dataset.
def get_movie_q_and_a(lines_path='raw_data/southpark_lines.txt',
                      convs_path='raw_data/southpark_conversations.txt'):
    """Build (question, answer) pairs from the movie lines and conversations files.

    Both files use ' +++$+++ ' as field separator. In the lines file, field 0
    is the line ID, field 3 the character and field 4 the text. In the
    conversations file, field 3 is a Python-style list literal of line IDs
    that make up one conversation.

    Args:
        lines_path: Path of the file holding one line of dialogue per row.
        convs_path: Path of the file holding one conversation per row.

    Returns:
        A tuple (questions, answers) of two equally long lists. Within each
        conversation, every line is the question for the line following it.

    Raises:
        KeyError: If a conversation refers to a line ID missing from the lines file.
        IndexError: If a non-blank row has fewer fields than expected.
    """
    # Local import keeps this cell self-contained; literal_eval only accepts
    # Python literals, so file contents can never execute code (unlike eval).
    import ast

    # Context managers make sure both file handles are closed again.
    with open(lines_path) as lines_file:
        line_data = lines_file.read().split('\n')
    with open(convs_path) as convs_file:
        conv_data = convs_file.read().split('\n')

    # Put the movie lines into a dictionary, with key = line-ID and value = [character, text].
    lines = {}
    for line in line_data:
        if not line.strip():
            # Blank rows (e.g. from a trailing newline) carry no fields.
            continue
        fields = line.split(' +++$+++ ')
        lines[fields[0]] = [fields[3], fields[4]]

    # Get the lines for each conversation.
    convs = []
    for conv in conv_data:
        if not conv.strip():
            continue
        line_ids = ast.literal_eval(conv.split(' +++$+++ ')[3])
        convs.append([lines[line_id] for line_id in line_ids])

    # Extract questions and answers: each line paired with its successor.
    questions = []
    answers = []
    for conv in convs:
        for current, following in zip(conv, conv[1:]):
            questions.append(current[1])
            answers.append(following[1])

    return questions, answers

In [0]:
# Return the questions and answers from both the tv_series and the movie.
def get_combined_q_and_a():
  """Concatenate the TV-series and movie question/answer pairs.

  Returns:
    A tuple (questions, answers); in both lists the TV-series entries come
    first, followed by the movie entries.
  """
  tv_q, tv_a = get_tv_series_q_and_a()
  movie_q, movie_a = get_movie_q_and_a()
  return tv_q + movie_q, tv_a + movie_a

### Seq2seq

In [6]:
# Load the South Park data: TV-series and movie question/answer pairs combined
questions, answers = get_combined_q_and_a()
# Preprocess the South Park data with the Bot_sp `data` module.
# NOTE(review): presumably this also writes the files that data.load_data reads later - confirm in data.py.
data.process_data(questions, answers)


>> Filter lines

>> 2nd layer of filtering
30% filtered from original data
q : [hey children everybody im back  ow]; a : [great shot william hit him with another ]
q : [great shot william hit him with another ]; a : [oh ]
q : [tally ho lads i must say youre starting to become quite a thorn in my balls]; a : [wheres chef what have you done with him]
q : [wheres chef what have you done with him]; a : [hes safe  hes fasting in the deprivation room and being read the super adventure club manual weve got to undo the damage youve done]

>> Segment lines into words

:: Sample from segmented list of words
q : [['hey', 'children', 'everybody', 'im', 'back', 'ow']]; a : [['great', 'shot', 'william', 'hit', 'him', 'with', 'another']]
q : [['great', 'shot', 'william', 'hit', 'him', 'with', 'another']]; a : [['oh']]
q : [['tally', 'ho', 'lads', 'i', 'must', 'say', 'youre', 'starting', 'to', 'become', 'quite', 'a', 'thorn', 'in', 'my', 'balls']]; a : [['wheres', 'chef', 'what', 'have', 'you', 'done

In [7]:
# Re-import Bot_sp.data so the module object reflects the current contents of
# data.py (useful after editing the file without restarting the kernel).
import importlib
importlib.reload(data)

<module 'Bot_sp.data' from '/content/Bot_sp/data.py'>

In [0]:
# Load preprocessed data from pickle and npy files.
# PATH='' is passed as the location prefix (NOTE(review): presumably the working directory - confirm in data.py).
metadata, idx_q, idx_a = data.load_data(PATH='')

# Divide data into training, test and validation sets.
# No split ratios are passed here, so whatever split_dataset uses by default applies.
(trainX, trainY), (testX, testY), (validX, validY) = data_utils.split_dataset(idx_q, idx_a)

In [9]:
# Hyper-parameters of the seq2seq model.
# Sequence lengths are read off the last axis of the training arrays.
xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]
batch_size = 16
emb_dim = 1024
# Input and output share one vocabulary: its size is the number of entries in idx2w.
xvocab_size = yvocab_size = len(metadata['idx2w'])
print(xvocab_size)

8002


In [10]:
# Build the seq2seq model from the parameters defined above.
# NOTE(review): the meaning of `epochs` is defined inside seq2seq_wrapper - confirm there.
model = seq2seq_wrapper.Seq2Seq(
    xseq_len=xseq_len,
    yseq_len=yseq_len,
    xvocab_size=xvocab_size,
    yvocab_size=yvocab_size,
    ckpt_path='ckpt/',
    emb_dim=emb_dim,
    num_layers=3,
    epochs=100000,
)

<log> Building Graph </log>

In [0]:
# Generate validation, test and train batches by sampling from their respective datasets.
# The third argument is the batch size: 32 for validation, 256 for test and
# `batch_size` for training.
val_batch_gen = data_utils.rand_batch_gen(validX, validY, 32)
test_batch_gen = data_utils.rand_batch_gen(testX, testY, 256)
train_batch_gen = data_utils.rand_batch_gen(trainX, trainY, batch_size)

In [12]:
# Option A (disabled): restore the last saved session and continue training from it.
# Uncomment the two lines below and comment out Option B to use it.

#sess = model.restore_last_session()
#sess = model.train(train_batch_gen, val_batch_gen, sess)

# Option B (active): start training from scratch.
sess = model.train(train_batch_gen, val_batch_gen)


<log> Training started </log>

Model saved to disk at iteration #1000
val   loss : 2.361985
Interrupted by user at iteration 1095


In [13]:
# Reload the most recently saved model variables from disk into a session.
sess = model.restore_last_session()

# Take the first element (the inputs) of the next test batch and let the
# model predict a reply for every input sequence in it.
input_ = next(test_batch_gen)[0]
output = model.predict(sess, input_)
print(output.shape)

INFO:tensorflow:Restoring parameters from ckpt/seq2seq_model.ckpt-1000
(256, 25)


In [14]:
# Show one question/answer pair for every distinct reply the model produced.
# `input_` is transposed so that it is iterated in step with the rows of `output`.
replies = []
for ii, oi in zip(input_.T, output):
    q = data_utils.decode(sequence=ii, lookup=metadata['idx2w'], separator=' ')
    decoded = data_utils.decode(sequence=oi, lookup=metadata['idx2w'], separator=' ').split(' ')
    # Skip replies that contain the 'unk' token and replies already shown.
    if 'unk' in decoded or decoded in replies:
        continue
    replies.append(decoded)
    print('q : [{0}]; a : [{1}]'.format(q, ' '.join(decoded)))

q : [thats right doggie style we went over mhm]; a : [i i you you you]
q : [well well well guess we learned something new about you jimbo you freakin fag you wanna make out or something]; a : [oh i you you you]
q : [the boys are going to give a presentation at our rally about how the founding fathers would agree with our right to protest]; a : [i i i you you you]
q : [all right congratulations to those of you selected to stay in the end one of you will be the new kenny good luck]; a : [i i you you you you]
q : [i know i know well look ill stay over anyway so i can play for eighteen hours and then ill go to the lake]; a : [you you you you you]
