In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# import all the necessary libraries
import codecs
import io
import os
import re
import numpy as np
import requests
from gensim.models import Word2Vec
from keras import Input, Model
from keras.activations import softmax
from keras.layers import Embedding, LSTM, Dense
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras_preprocessing.text import Tokenizer

In [3]:
# Get the data from file
def get_all_conversations():
  all_conversations = []
  with codecs.open("/content/drive/My Drive/2020_Intern_03_VIIT_03_Chatbot/Coding Part (Implementation)/Seq2Seq Transformer/Cornell Movie Dataset/cornell movie-dialogs corpus/movie_lines.txt", "rb", encoding="utf-8", errors="ignore") as f:
  
    # split corpus line line
    lines = f.read().split("\n")
    
    # get each conversation
    for line in lines:
    
      # each line has multiple columns divided by '+++$+++'
      all_conversations.append(line.split(" +++$+++ "))
  
  # return all conversation
  return all_conversations

In [4]:
# Dataset is too big hence taking only first 10000 lines
# create a function to get all sorted conversation
def get_all_sorted_chats(all_conversations):
    all_chats = {}
    for tokens in all_conversations[:2000]:

        # if the line is valid - it contains all the metadata
        if len(tokens) > 4:

            # save the line number and the text itself
            # 4 th is the index where actual dialogue is present    
            all_chats[int(tokens[0][1:])] = tokens[4]

    # then sort the result and return list of tuples
    return sorted(all_chats.items(), key=lambda x: x[0])

In [5]:
# create a function to clean the text
def clean_text(text_to_clean):

  # apply all these conditions to clean the text
  res = text_to_clean.lower()
  res = re.sub(r"i'm", "i am", res)
  res = re.sub(r"he's", "he is", res)
  res = re.sub(r"she's", "she is", res)
  res = re.sub(r"it's", "it is", res)
  res = re.sub(r"that's", "that is", res)
  res = re.sub(r"what's", "what is", res)
  res = re.sub(r"where's", "where is", res)
  res = re.sub(r"how's", "how is", res)
  res = re.sub(r"\'ll", " will", res)
  res = re.sub(r"\'ve", " have", res)
  res = re.sub(r"\'re", " are", res)
  res = re.sub(r"\'d", " would", res)
  res = re.sub(r"\'re", " are", res)
  res = re.sub(r"won't", "will not", res)
  res = re.sub(r"can't", "cannot", res)
  res = re.sub(r"n't", " not", res)
  res = re.sub(r"n'", "ng", res)
  res = re.sub(r"'bout", "about", res)
  res = re.sub(r"'til", "until", res)
  res = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", res)
  
  # return the clean text
  return res

In [6]:
# create a function to group the lines into conversations
def get_conversation_dict(sorted_chats):
  
  # create a conversation dictionary to store the index and dialouge
  conversation_dict = {}
  
  # create a temporary counter
  counter = 1

  # store all index to one list
  conversation_ids = []

  # iterate through all sorted conversations
  for i in range(1, len(sorted_chats) + 1):

    # for all conversations index range between 1 to len(sorted_chats)
    if i < len(sorted_chats):

      # if the current line number differs from the previous only by 1
      if (sorted_chats[i][0] - sorted_chats[i - 1][0]) == 1:
        
        # then this line is a part of the current conversation
        # if the previous line was not added before,
        # then we should add it now
        if sorted_chats[i - 1][1] not in conversation_ids:
          conversation_ids.append(sorted_chats[i - 1][1])
        
        # or just append the current line
        conversation_ids.append(sorted_chats[i][1])
            
      # If the difference is more than 1
      # it means new conversation has started and we should clear conversation_ids
      elif (sorted_chats[i][0] - sorted_chats[i - 1][0]) > 1:
        conversation_dict[counter] = conversation_ids
        conversation_ids = []
        counter += 1
      else:
        continue

  # return conversation dictionary with all conversations   
  return conversation_dict

In [7]:
# create a function to prepare the list of questions and answers
def get_clean_q_and_a(conversations_dictionary):

  # Create an questions and answers list
  questions_and_answer = []
  
  # iterate through each conversation
  for current_conversation in conversations_dictionary.values():
    
    # make sure that each conversation contains an even number of lines
    if len(current_conversation) % 2 != 0:
      current_conversation = current_conversation[:-1]

    # convert questions and answers to the list of tuples
    for i in range(0, len(current_conversation), 2):
      questions_and_answer.append((current_conversation[i], current_conversation[i + 1]))

  # zip with * operator unzips tuples into independent lists
  questions, answers = zip(*questions_and_answer)
  
  # get the list of the questions
  questions_list = list(questions)

  # clear questions from contracted forms, non-letter symbols and convert it to lowercase
  clean_questions = list()
    
  for i in range(len(questions_list)):
    clean_questions.append(clean_text(questions_list[i]))

  # get the list of the answers
  answer_list = list(answers)

  # do the same with the answers, but now we need to add 'start' and 'end' words
  clean_answers = list()
  
  for i in range(len(answer_list)):
    clean_answers.append('<START> ' + clean_text(answer_list[i]) + ' <END>')
  
  # return clean answers and clean questions
  return clean_questions, clean_answers

In [8]:
# run all the function to check for progress

# get all the conversation from dataset
conversations = get_all_conversations()

# get the total conversation length
total = len(conversations)
print("Total conversations in dataset: {}".format(total))

# get all the sorted conversation
all_sorted_chats = get_all_sorted_chats(conversations)

# get the conversation dictionary
conversation_dictionary = get_conversation_dict(all_sorted_chats)

# get the list of questions and answers
questions, answers = get_clean_q_and_a(conversation_dictionary)

# print total number of questions and answers
print("Questions in dataset: {}".format(len(questions)))
print("Answers in dataset: {}".format(len(answers)))

Total conversations in dataset: 304714
Questions in dataset: 928
Answers in dataset: 928


In [9]:
# main regular expression
target_regex = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n\'0123456789'

# Tokenizer allows to vectorize our corpus by turning each sentence into a sequence of integers where each integer is an index
# of a token in an internal dictionary
tokenizer = Tokenizer(filters=target_regex)
tokenizer.fit_on_texts(questions + answers)

# get the vocab size
VOCAB_SIZE = len(tokenizer.word_index) + 1
print('Vocabulary size : {}'.format(VOCAB_SIZE))

Vocabulary size : 2664


In [10]:
# Prepare two matrices from the lists of questions and answers
# use OHE for padding

# tokenized and add padding to questions
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max([len(x) for x in tokenized_questions])
print("\nMax length of questions is :- ",maxlen_questions)

# pad each question with zeros at the end to be 223 words long
encoder_input_data = pad_sequences(tokenized_questions, maxlen=maxlen_questions, padding='post')

# matrix of 4709x223 integers - 4709 questions 223 words each
print(encoder_input_data.shape)

# tokenized and add padding to questions
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max([len(x) for x in tokenized_answers])
print("\nMax length of answers is :- ",maxlen_answers)

# pad each answer with zeros at the end to be 132 words long
decoder_input_data = pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding='post')

# matrix of 4709x132 integers - 4709 answers 132 words each
print(decoder_input_data.shape)


Max length of questions is :-  106
(928, 106)

Max length of answers is :-  132
(928, 132)


In [11]:
# Create one-hot encoded answers

# remove the first 'start' word from every answer
for i in range(len(tokenized_answers)):
  tokenized_answers[i] = tokenized_answers[i][1:]

# pad answers with zeros
padded_answers = pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding='post')

# tensor of size (4709, 132, 7910)
# 4709 answers 132 words each, and each word is one-hot encoded using our vocabulary
decoder_output_data = to_categorical(padded_answers, VOCAB_SIZE)

In [14]:
# encoder will be used to capture space-dependent 
# relations between words from the questions
enc_inputs = Input(shape=(None,), name="E_input_1")
enc_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True, name="E_embedding_1")(enc_inputs)
enc_outputs, state_h, state_c = LSTM(200, return_state=True, name="E_lstm_1")(enc_embedding)
enc_states = [state_h, state_c]

# decoder will be used to capture space-dependent relations between words from the answers using encoder's internal state as a context
dec_inputs = Input(shape=(None,), name="D_input_1")
dec_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True, name="D_embedding_1")(dec_inputs)
dec_lstm = LSTM(200, return_state=True, return_sequences=True, name="D_lstm_1")
dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=enc_states)

# decoder is connected to the output Dense layer
dec_dense = Dense(VOCAB_SIZE, activation=softmax, name="Dense_1")
output = dec_dense(dec_outputs)
model = Model([enc_inputs, dec_inputs], output)

# output of this network will look like this:
# y_true = [0.05, 0.95, 0...]
# and expected one-hot encoded output like this:
# y_pred = [0, 1, 0...]
model.compile(optimizer=RMSprop(), loss='categorical_crossentropy')
model.summary()

Model: "functional_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
E_input_1 (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
D_input_1 (InputLayer)          [(None, None)]       0                                            
__________________________________________________________________________________________________
E_embedding_1 (Embedding)       (None, None, 200)    532800      E_input_1[0][0]                  
__________________________________________________________________________________________________
D_embedding_1 (Embedding)       (None, None, 200)    532800      D_input_1[0][0]                  
_______________________________________________________________________________________

In [14]:
model.fit([encoder_input_data, decoder_input_data], decoder_output_data, batch_size=50, epochs=300)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f7f30492d68>

In [15]:
# save the model
model.save('model_2.h5')

In [26]:
# create an inference model encoder
def make_inference_models():
  # two inputs for the state vectors returned by encoder
  dec_state_input_h = Input(shape=(200,))
  dec_state_input_c = Input(shape=(200,))
  dec_states_inputs = [dec_state_input_h, dec_state_input_c]
    
  # these state vectors are used as an initial state 
  # for LSTM layer in the inference decoder
  # third input is the Embedding layer as explained above   
  dec_outputs, state_h, state_c = dec_lstm(dec_embedding, initial_state=dec_states_inputs)
  dec_states = [state_h, state_c]
    
  # Dense layer is used to return OHE predicted word
  dec_outputs = dec_dense(dec_outputs)
  dec_model = Model(inputs=[dec_inputs] + dec_states_inputs, outputs=[dec_outputs] + dec_states)
   
  # single encoder input is a question, represented as a sequence 
  # of integers padded with zeros
  enc_model = Model(inputs=enc_inputs, outputs=enc_states)
   
  return enc_model, dec_model

# run the above function to get the encoding and decoding sequence
enc_model, dec_model = make_inference_models()

In [17]:
# Create a function to convert the string into tokens
def str_to_tokens(sentence: str):
  # convert input string to lowercase, then split it by whitespaces
  words = sentence.lower().split()
    
  # then convert to a sequence of integers padded with zeros
  tokens_list = list()
  for current_word in words:
    result = tokenizer.word_index.get(current_word, '')

    # if list is not empty then append the result into token_list
    if result != '':
      tokens_list.append(result)

  # return One Hot Encodding of input string
  return pad_sequences([tokens_list], maxlen=maxlen_questions, padding='post')

In [18]:
def Predictions(inputText):
  # main chatbot questions and answers
  # encode the input sequence into state vectors
  input_query = inputText
  # input_query = input('\nEnter question : ')

  # to continue the conversation
  states_values = enc_model.predict(str_to_tokens(input_query))

  # start with a target sequence of size 1 - word 'start'   
  empty_target_seq = np.zeros((1, 1))
  empty_target_seq[0, 0] = tokenizer.word_index['start']
  stop_condition = False
  decoded_translation = ''

  # loop until true to apply text generation algorithm
  while not stop_condition:
            
    # feed the state vectors and 1-word target sequence to the decoder to produce predictions for the next word
    dec_outputs, h, c = dec_model.predict([empty_target_seq] + states_values)         
            
    # sample the next word using these predictions
    sampled_word_index = np.argmax(dec_outputs[0, -1, :])
    sampled_word = None
            
    # append the sampled word to the target sequence
    for word, index in tokenizer.word_index.items():
      if sampled_word_index == index:
        if word != 'end':
          decoded_translation += ' {}'.format(word)
        sampled_word = word
            
    # repeat until we generate the end-of-sequence word 'end' or we hit the length of answer limit
    if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
      stop_condition = True
            
    # prepare next iteration
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = sampled_word_index
    states_values = [h, c]
        
  # print("Chatbot        :",decoded_translation)
  return decoded_translation

In [22]:
# testting for pickle
input = "who"
print(Predictions(input))

 bianca


In [23]:
import numpy as np
from flask import Flask, request, jsonify, render_template
import pickle
import threading

In [None]:
app = Flask(__name__)

In [None]:
@app.route('/', methods = ['POST'])
def response():

	user_query = request.json.get('user_query')
	output_response = Prediction(user_query)
 
	return jsonify({'user_query' : output_response})

In [None]:
if __name__ == '__main__':
	app.run()