In [31]:
import pickle
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import text,sequence

# Each pickle stores a list of tuples; later cells read every tuple as
# (story tokens, question tokens, answer word).
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open("chatbot_train.txt", "rb") as train_file:
    train = pickle.load(train_file)
with open("chatbot_test.txt", "rb") as test_file:
    test = pickle.load(test_file)

In [2]:
# Number of examples in the training split.
len(train)

10000

In [3]:
# Number of examples in the test split.
len(test)

1000

In [4]:
# Concatenate both splits so that vocabulary and length statistics cover every example.
data = train + test

# Token lists are joined with single spaces; the answer is stored as-is.
df_dict = {
    "story": [' '.join(example[0]) for example in data],
    "question": [' '.join(example[1]) for example in data],
    "answer": [example[2] for example in data],
}
df = pd.DataFrame(df_dict)

In [5]:
# One string per example (story + question + answer), used only to derive the vocabulary.
corpus = [
    story_text + ' ' + question_text + ' ' + answer_text
    for story_text, question_text, answer_text in zip(df['story'], df['question'], df['answer'])
]

In [6]:
# The token pattern matches words of two or more word characters, plus each of
# the punctuation marks ! ? " ' . as a standalone token.
# Only the fitted vocabulary is used by the cells shown below; X is the
# resulting document-term matrix.
vectorizer = CountVectorizer(token_pattern = r"(?u)\b\w\w+\b|!|\?|\"|\'|\.")
X = vectorizer.fit_transform(corpus)

In [18]:
# Show the tokens found in the corpus (the keys of the fitted vocabulary_ mapping).
list(vectorizer.vocabulary_.keys())

['mary',
 'moved',
 'to',
 'the',
 'bathroom',
 '.',
 'sandra',
 'journeyed',
 'bedroom',
 'is',
 'in',
 'hallway',
 '?',
 'no',
 'went',
 'back',
 'daniel',
 'kitchen',
 'office',
 'picked',
 'up',
 'football',
 'there',
 'yes',
 'john',
 'travelled',
 'garden',
 'got',
 'apple',
 'put',
 'down',
 'grabbed',
 'left',
 'dropped',
 'took',
 'milk',
 'discarded']

In [19]:
# list() over the vocabulary_ mapping yields its keys, i.e. the token strings.
vocabulary = list(vectorizer.vocabulary_)

In [20]:
# Add one extra slot as the placeholder index used when padding with keras pad_sequences.
# NOTE(review): presumably index 0, since Keras word indices start at 1 - confirm against the docs.

vocab_len = len(vocabulary) + 1
vocab_len

38

In [None]:
# Alternative (currently unused): the Keras Tokenizer can also build the vocabulary directly

#from tensorflow.keras.preprocessing import text, sequence

#tokenizer = Tokenizer(filters=[])

#for text in [df.story.values,df.question.values,df.answer.values]:
#    tokenizer.fit_on_texts(text)

In [11]:
# Longest story and longest question (in tokens) over train + test,
# needed as the target lengths when padding sequences.

max_story_len = max(len(example[0]) for example in data)
max_question_len = max(len(example[1]) for example in data)

In [None]:
#np.argmax(np.array([len(tp[0]) for tp in data]))

In [12]:
# Longest story, in tokens.
max_story_len

156

In [13]:
# Longest question, in tokens.
max_question_len

6

In [21]:
# Integer-encode the words: every vocabulary entry is fed to the tokenizer as a
# one-word text so that each token receives its own index.
# filters=[] passes no characters to filter out, so the '.' and '?' tokens in
# the vocabulary are kept (NOTE(review): Keras's default filters are believed
# to strip punctuation - confirm against the Tokenizer docs).
tokenizer = Tokenizer(filters=[])
tokenizer.fit_on_texts(vocabulary)

In [41]:
# Mapping from token to integer index, read by compute_answer below.
word_index = tokenizer.word_index # lower-cased automatically

In [25]:
def _examples_to_columns(examples):
    """Turn a list of (story tokens, question tokens, answer) tuples into a column dict."""
    return {
        "story": [' '.join(example[0]) for example in examples],
        "question": [' '.join(example[1]) for example in examples],
        "answer": [example[2] for example in examples],
    }


# One DataFrame per split, each with story / question / answer columns.
train_dict = _examples_to_columns(train)
train_df = pd.DataFrame(train_dict)

test_dict = _examples_to_columns(test)
test_df = pd.DataFrame(test_dict)

In [26]:
# Preview the first training rows.
train_df.head()

Unnamed: 0,story,question,answer
0,Mary moved to the bathroom . Sandra journeyed ...,Is Sandra in the hallway ?,no
1,Mary moved to the bathroom . Sandra journeyed ...,Is Daniel in the bathroom ?,no
2,Mary moved to the bathroom . Sandra journeyed ...,Is Daniel in the office ?,no
3,Mary moved to the bathroom . Sandra journeyed ...,Is Daniel in the bedroom ?,yes
4,Mary moved to the bathroom . Sandra journeyed ...,Is Daniel in the bedroom ?,yes


In [27]:
# Preview the first test rows.
test_df.head()

Unnamed: 0,story,question,answer
0,Mary got the milk there . John moved to the be...,Is John in the kitchen ?,no
1,Mary got the milk there . John moved to the be...,Is John in the kitchen ?,no
2,Mary got the milk there . John moved to the be...,Is John in the garden ?,yes
3,Mary got the milk there . John moved to the be...,Is Daniel in the bathroom ?,yes
4,Mary got the milk there . John moved to the be...,Is Daniel in the bedroom ?,no


In [67]:
def compute_story_and_questions(dframe, tokenizer, story_maxlen=156, question_maxlen=6):
    """Encode the stories and questions of ``dframe`` as padded integer sequences.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame with ``story`` and ``question`` columns holding the text to encode.
    tokenizer : object
        Fitted tokenizer exposing ``texts_to_sequences`` (the notebook passes a
        Keras ``Tokenizer``).
    story_maxlen : int, optional
        Length every encoded story is padded to. Defaults to 156, the
        ``max_story_len`` value computed earlier in the notebook.
    question_maxlen : int, optional
        Length every encoded question is padded to. Defaults to 6, the
        ``max_question_len`` value computed earlier in the notebook.

    Returns
    -------
    tuple
        ``(story, question)`` as returned by ``sequence.pad_sequences``.
    """
    # Text -> lists of word indices.
    story_ids = tokenizer.texts_to_sequences(dframe.story.values)
    question_ids = tokenizer.texts_to_sequences(dframe.question.values)

    # Bring every sequence to a common length so the results are rectangular.
    story = sequence.pad_sequences(story_ids, maxlen=story_maxlen)
    question = sequence.pad_sequences(question_ids, maxlen=question_maxlen)

    return story, question

In [68]:
# Encode and pad the stories and questions of both splits with the shared tokenizer.
inputs_train, questions_train = compute_story_and_questions(train_df,tokenizer)
inputs_test, questions_test = compute_story_and_questions(test_df,tokenizer)

In [62]:
def compute_answer(dframe, index_map=None):
    """One-hot encode the ``answer`` column of ``dframe``.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame with an ``answer`` column whose values are keys of the mapping.
    index_map : dict, optional
        Mapping from answer word to integer index. When omitted, the
        notebook-level ``word_index`` is used, so existing calls keep working.

    Returns
    -------
    list of numpy.ndarray
        One 1-D float array per row, of length ``len(mapping) + 1``, holding a
        1 at the answer's index and 0 elsewhere.

    Raises
    ------
    KeyError
        If an answer is not a key of the mapping.
    """
    # Passing the mapping explicitly avoids relying on hidden notebook state.
    mapping = word_index if index_map is None else index_map

    # The vector width is the same for every row, so compute it once.
    width = len(mapping) + 1

    answers = []
    for label in dframe.answer.values:
        one_hot = np.zeros(width)
        one_hot[mapping[label]] = 1
        answers.append(one_hot)

    return answers

In [70]:
# One-hot encode the answers of both splits.
answers_train = compute_answer(train_df)
answers_test = compute_answer(test_df)

In [72]:
# Element-wise sum of the one-hot vectors: each non-zero entry counts one answer word.
sum(answers_train) # number of yes and no in training set

array([   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0., 4988.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0., 5012.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.])