<a href="https://colab.research.google.com/github/myrondza/Data-Science-Machine-Learning-Deep-Learning-AI-Guide-Algorithms/blob/master/ChatBot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ChatBot (Keras & Tensorflow)

In [0]:
import pickle
import numpy as np

In [0]:
with open('train_qa.txt', 'rb') as f:
    train_data = pickle.load(f)

In [0]:
with open('test_qa.txt', 'rb') as f:
    test_data = pickle.load(f)

In [0]:
train_data[10]


(['Sandra',
  'went',
  'back',
  'to',
  'the',
  'hallway',
  '.',
  'Sandra',
  'moved',
  'to',
  'the',
  'office',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'office', '?'],
 'yes')

In [0]:
' '.join(train_data[10][0])

'Sandra went back to the hallway . Sandra moved to the office .'

In [0]:
vocab = set()
for story, question, answer in train_data:
    vocab = vocab.union(set(story)) #Set returns unique words in the sentence
                                    #Union returns the unique common elements from a two sets
    vocab = vocab.union(set(question))

In [0]:
vocab.add('no')
vocab.add('yes')

In [0]:
all_data = test_data + train_data

In [0]:
all_story_lens = [len(data[0]) for data in all_data]
max_story_len = (max(all_story_lens))
max_question_len = max([len(data[1]) for data in all_data])

In [0]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [0]:
tokenizer = Tokenizer(filters = [])
tokenizer.fit_on_texts(vocab)

In [0]:
tokenizer.word_index

{'.': 14,
 '?': 36,
 'apple': 1,
 'back': 35,
 'bathroom': 34,
 'bedroom': 10,
 'daniel': 37,
 'discarded': 2,
 'down': 9,
 'dropped': 18,
 'football': 17,
 'garden': 13,
 'got': 33,
 'grabbed': 23,
 'hallway': 3,
 'in': 27,
 'is': 28,
 'john': 15,
 'journeyed': 16,
 'kitchen': 26,
 'left': 11,
 'mary': 7,
 'milk': 29,
 'moved': 8,
 'no': 20,
 'office': 31,
 'picked': 19,
 'put': 5,
 'sandra': 30,
 'the': 12,
 'there': 4,
 'to': 22,
 'took': 24,
 'travelled': 21,
 'up': 32,
 'went': 25,
 'yes': 6}

In [0]:
train_story_text = []
train_question_text = []
train_answers = []

In [0]:
for story,question,answer in train_data:
    train_story_text.append(story)
    train_question_text.append(question) 
    train_answers.append(answer)

In [0]:
train_story_seq = tokenizer.texts_to_sequences(train_story_text)

In [0]:
def vectorize_stories(data,word_index = tokenizer.word_index, max_story_len = max_story_len, max_question_len = max_question_len):
    #vectorized stories:
    X = []
    #vectorized questions:
    Xq = []
    #vectorized answers:
    Y = []
    
    for story, question, answer in data:
        #Getting indexes for each word in the story
        x = [word_index[word.lower()] for word in story]
        #Getting indexes for each word in the story
        xq = [word_index[word.lower()] for word in question]
        #For the answers
        y = np.zeros(len(word_index) + 1) #Index 0 Reserved when padding the sequences
        y[word_index[answer]] = 1
        
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        
    #Now we have to pad these sequences:
    return(pad_sequences(X,maxlen=max_story_len), pad_sequences(Xq, maxlen=max_question_len), np.array(Y))

In [0]:
inputs_train, questions_train, answers_train = vectorize_stories(train_data)

In [0]:
inputs_test, questions_test, answers_test = vectorize_stories(test_data)

In [0]:
train_story_text[0]

['Mary',
 'moved',
 'to',
 'the',
 'bathroom',
 '.',
 'Sandra',
 'journeyed',
 'to',
 'the',
 'bedroom',
 '.']

In [0]:
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

In [0]:
input_sequence = Input((max_story_len,)) #As we dont know batch size yet
question = Input((max_question_len,))

W0719 07:20:52.323915 140499023890304 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0719 07:20:52.370224 140499023890304 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.



In [0]:
vocab_len = len(vocab) + 1

input_encoder_m = Sequential()
input_encoder_m.add(Embedding(input_dim=vocab_len,output_dim = 64)) #From paper
input_encoder_m.add(Dropout(0.3))

input_encoder_c = Sequential()
input_encoder_c.add(Embedding(input_dim=vocab_len,output_dim = max_question_len)) #From paper
input_encoder_c.add(Dropout(0.3))

question_encoder = Sequential()
question_encoder.add(Embedding(input_dim=vocab_len,output_dim = 64,input_length=max_question_len)) #From paper
question_encoder.add(Dropout(0.3))

input_encoded_m = input_encoder_m(input_sequence)
input_encoded_c = input_encoder_c(input_sequence)
question_encoded = question_encoder(question)

match = dot([input_encoded_m,question_encoded], axes = (2,2))
match = Activation('softmax')(match)

response = add([match,input_encoded_c])
response = Permute((2,1))(response)

answer = concatenate([response, question_encoded])

W0719 07:23:07.049050 140499023890304 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0719 07:23:07.082239 140499023890304 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0719 07:23:07.097162 140499023890304 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
answer

<tf.Tensor 'concatenate_1/concat:0' shape=(?, 6, 220) dtype=float32>

In [0]:
answer = LSTM(32)(answer)
answer = Dropout(0.5)(answer)
answer = Dense(vocab_len)(answer)
answer = Activation('softmax')(answer)
model = Model([input_sequence,question], answer)
model.compile(optimizer='rmsprop', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()

W0719 07:24:56.935152 140499023890304 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0719 07:24:56.957403 140499023890304 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 156)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
sequential_2 (Sequential)       multiple             2432        input_1[0][0]                    
__________________________________________________________________________________________________
sequential_4 (Sequential)       (None, 6, 64)        2432        input_2[0][0]                    
__________________________________________________________________________________________________
dot_1 (Dot

In [0]:
history = model.fit([inputs_train,questions_train],answers_train, batch_size = 32, epochs = 10, validation_data = ([inputs_test,questions_test],answers_test))

Train on 10000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
filename = 'Z_chatbot_100_epochs.h5'
model.save(filename)
model.load_weights('Z_chatbot_100_epochs.h5')

In [0]:
pred_results = model.predict(([inputs_test,questions_test]))

In [0]:
test_data[0]

(['Mary',
  'got',
  'the',
  'milk',
  'there',
  '.',
  'John',
  'moved',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'John', 'in', 'the', 'kitchen', '?'],
 'no')

In [0]:
pred_results[0]

array([1.2206699e-12, 9.7833222e-13, 9.4700842e-13, 1.3117576e-12,
       9.0123026e-13, 9.8853618e-13, 4.0486011e-01, 9.8127181e-13,
       9.0073879e-13, 8.1931272e-13, 1.1237084e-12, 1.1067217e-12,
       1.0089085e-12, 1.1088811e-12, 9.8250791e-13, 9.6046782e-13,
       9.4744752e-13, 1.3115499e-12, 9.6985083e-13, 1.4125007e-12,
       5.9513986e-01, 1.0346540e-12, 1.1478644e-12, 9.7233647e-13,
       9.5466170e-13, 1.2818720e-12, 1.1742337e-12, 8.2611202e-13,
       1.1756837e-12, 1.0138101e-12, 1.1078347e-12, 1.0953019e-12,
       1.1534081e-12, 7.9476578e-13, 1.2792682e-12, 1.2058115e-12,
       1.0310764e-12, 8.3276756e-13], dtype=float32)

In [0]:
val_max = np.argmax(pred_results[0])

In [0]:
for key,val in tokenizer.word_index.items():
    if val == val_max:
        k = key
print(k)

no


In [0]:
pred_results[0][val_max]

0.59513986

In [0]:
my_story = 'John travelled to office'

In [0]:
my_story.split()

['John', 'travelled', 'to', 'office']

In [0]:
my_question = 'John travelled to office ?'

In [0]:
my_question.split()

['John', 'travelled', 'to', 'office', '?']

In [0]:
my_data = [(my_story.split(), my_question.split(),'yes')]

In [0]:
my_story, my_ques, my_ans = vectorize_stories(my_data)

In [0]:
pred_results = model.predict(([my_story,my_ques]))

In [0]:
val_max = np.argmax(pred_results[0])

In [0]:
for key,val in tokenizer.word_index.items():
    if val == val_max:
        k = key
print(k)

no


In [0]:
pred_results[0][val_max]

0.61280787