## `Importing Libraries`

In [None]:
# import all the necessary libraries
import codecs
import io
import os
import re
import numpy as np
import requests
from gensim.models import Word2Vec
from keras import Input, Model
from keras.activations import softmax
from keras.layers import Embedding, LSTM, Dense
from keras.optimizers import RMSprop
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras_preprocessing.text import Tokenizer

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# `Data Preprocessing and Data Handling`

In [None]:
# Get the data from file
def get_all_conversations(corpus_path=None):
  """Read the movie-lines corpus and split every line into its columns.

  Args:
    corpus_path: Path of the corpus file to read. When omitted, the
      notebook's original Google Drive location of ``movie_lines.txt`` is
      used, so existing calls without arguments keep working.

  Returns:
    A list with one entry per line of the file. Each entry is the list of
    fields obtained by splitting that line on " +++$+++ ". A file that ends
    with a newline therefore produces a final ``[""]`` entry.
  """
  if corpus_path is None:
    corpus_path = "/content/drive/My Drive/2020_Intern_03_VIIT_03_Chatbot/Coding Part (Implementation)/Seq2Seq Transformer/Cornell Movie Dataset/cornell movie-dialogs corpus/movie_lines.txt"

  # decode as UTF-8 and silently drop bytes that cannot be decoded
  with codecs.open(corpus_path, "rb", encoding="utf-8", errors="ignore") as f:
    # split corpus line by line
    lines = f.read().split("\n")

  # each line has multiple columns divided by ' +++$+++ '
  return [line.split(" +++$+++ ") for line in lines]

In [None]:
# The dataset is too big to use in full, so by default only the first 6500
# corpus lines are considered (see the `limit` parameter).
def get_all_sorted_chats(all_conversations, limit=6500):
    """Collect the dialogue text of the leading corpus lines, sorted by line number.

    Args:
        all_conversations: List of column lists, one per corpus line. Column 0
            is the line id written as a letter followed by a number (e.g.
            "L1045"); column 4 is the dialogue text.
        limit: How many leading entries of ``all_conversations`` to consider.
            Pass ``None`` to use all of them. Defaults to 6500.

    Returns:
        A list of ``(line_number, text)`` tuples in ascending line-number order.
        Entries with fewer than five columns are skipped.
    """
    all_chats = {}
    for tokens in all_conversations[:limit]:

        # a valid line contains all the metadata columns plus the text
        if len(tokens) > 4:

            # strip the leading letter of the id to get the numeric line number;
            # index 4 is where the actual dialogue is stored
            all_chats[int(tokens[0][1:])] = tokens[4]

    # sort by line number and return a list of tuples
    return sorted(all_chats.items(), key=lambda x: x[0])

In [None]:
# create a function to clean the text
def clean_text(text_to_clean):
  """Lower-case a sentence, expand common contractions and strip punctuation.

  The substitutions are applied in the listed order, which matters: the
  specific "won't" / "can't" rules must run before the generic "n't" rule.

  A dropped final "g" (as in "doin'") is restored only when the apostrophe
  ends the word, so possessives such as "john's" are left untouched instead
  of being rewritten to "johngs".

  Args:
    text_to_clean: The raw sentence.

  Returns:
    The cleaned, lower-cased sentence. Apostrophes that are not part of a
    handled contraction are kept.
  """
  # ordered (pattern, replacement) pairs
  substitutions = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # "doin'" -> "doing", but only when no letter follows the apostrophe
    (r"in'(?![a-z])", "ing"),
    (r"'bout", "about"),
    (r"'til", "until"),
    # finally drop the remaining punctuation characters
    (r"[-()\"#/@;:<>{}`+=~|.!?,]", ""),
  )

  res = text_to_clean.lower()
  for pattern, replacement in substitutions:
    res = re.sub(pattern, replacement, res)

  # return the clean text
  return res

In [None]:
# create a function to group the lines into conversations
def get_conversation_dict(sorted_chats):
  """Group consecutively numbered lines into conversations.

  Two neighbouring entries belong to the same conversation when their line
  numbers differ by exactly 1. A larger gap closes the current conversation.

  Args:
    sorted_chats: List of ``(line_number, text)`` tuples in ascending
      line-number order.

  Returns:
    A dict mapping a counter (starting at 1) to the list of texts of one
    conversation. A gap that directly follows another gap stores an empty
    list, because an isolated line cannot form a conversation. The
    conversation still open when the input ends is stored as well.
  """
  # conversation counter -> list of dialogue texts
  conversation_dict = {}

  # key of the conversation currently being collected
  counter = 1

  # texts of the conversation currently being collected
  conversation_ids = []

  # compare every entry with its predecessor
  for i in range(1, len(sorted_chats)):
    gap = sorted_chats[i][0] - sorted_chats[i - 1][0]

    if gap == 1:
      # the previous line is only missing when it opened the conversation;
      # in every other case it was appended by the previous iteration
      if not conversation_ids:
        conversation_ids.append(sorted_chats[i - 1][1])
      conversation_ids.append(sorted_chats[i][1])

    elif gap > 1:
      # a new conversation starts: store the finished one and reset
      conversation_dict[counter] = conversation_ids
      conversation_ids = []
      counter += 1

  # store the conversation that was still open when the input ended
  if conversation_ids:
    conversation_dict[counter] = conversation_ids

  # return conversation dictionary with all conversations
  return conversation_dict

In [None]:
# create a function to prepare the list of questions and answers
def get_clean_q_and_a(conversations_dictionary):
  """Turn grouped conversations into cleaned question / answer lists.

  Within each conversation the lines are paired up as (question, answer):
  lines 0 and 1, lines 2 and 3, and so on. A trailing unpaired line is
  dropped. Both sides are normalised with clean_text(); every answer is
  additionally wrapped in '<START> ' and ' <END>' markers.

  Args:
    conversations_dictionary: Dict whose values are lists of dialogue lines.

  Returns:
    A tuple ``(clean_questions, clean_answers)`` of two equally long lists.
    Both lists are empty when no conversation contains a complete pair.
  """
  # list of (question, answer) tuples over all conversations
  questions_and_answer = []

  for current_conversation in conversations_dictionary.values():
    # only an even number of lines can be paired; ignore a trailing odd line
    paired_length = len(current_conversation) - len(current_conversation) % 2

    for i in range(0, paired_length, 2):
      questions_and_answer.append((current_conversation[i], current_conversation[i + 1]))

  # expand contractions, strip punctuation and lower-case the questions
  clean_questions = [clean_text(question) for question, _ in questions_and_answer]

  # same for the answers, plus the start / end markers for the decoder
  clean_answers = ['<START> ' + clean_text(answer) + ' <END>' for _, answer in questions_and_answer]

  # return clean questions and clean answers
  return clean_questions, clean_answers

In [None]:
# Run the whole preprocessing pipeline and report the resulting dataset sizes

# read the corpus: one entry (list of columns) per line of the file
conversations = get_all_conversations()

# NOTE: this is the number of raw corpus lines, not of grouped conversations
total = len(conversations)
print("Total conversations in dataset: {}".format(total))

# keep (line number, text) for the leading corpus lines, sorted by line number
all_sorted_chats = get_all_sorted_chats(conversations)

# group consecutively numbered lines into conversations
conversation_dictionary = get_conversation_dict(all_sorted_chats)

# pair the lines up and clean them into question and answer lists
questions, answers = get_clean_q_and_a(conversation_dictionary)

# print total number of questions and answers
print("Questions in dataset: {}".format(len(questions)))
print("Answers in dataset: {}".format(len(answers)))

Total conversations in dataset: 304714
Questions in dataset: 3041
Answers in dataset: 3041


## `Base Model Training`

In [None]:
# Characters the tokenizer removes before splitting the text into words.
# This is a plain set of filter characters, not a regular expression:
# punctuation, tab, newline, apostrophe and digits. The backslash is written
# as "\\" because "\]" is an invalid escape sequence in a normal string.
# Since '<' and '>' are filtered, the '<START>' / '<END>' markers end up in
# the vocabulary as the plain words 'start' and 'end'.
target_regex = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'0123456789'

# Tokenizer vectorizes the corpus by turning each sentence into a sequence of
# integers, where each integer is the index of a token in an internal dictionary
tokenizer = Tokenizer(filters=target_regex)
tokenizer.fit_on_texts(questions + answers)

# vocabulary size: number of known words plus one for the padding index 0
VOCAB_SIZE = len(tokenizer.word_index) + 1
print('Vocabulary size : {}'.format(VOCAB_SIZE))

Vocabulary size : 6077


In [None]:
# Build the two integer input matrices: questions for the encoder and
# answers for the decoder, each zero-padded to its longest sequence

# questions -> sequences of word indices
tokenized_questions = tokenizer.texts_to_sequences(questions)
maxlen_questions = max(len(sequence) for sequence in tokenized_questions)
print("\nMax length of questions is :- ",maxlen_questions)

# append zeros so that every question has maxlen_questions entries
encoder_input_data = pad_sequences(tokenized_questions, maxlen=maxlen_questions, padding='post')

# (number of questions, maxlen_questions)
print(encoder_input_data.shape)

# answers -> sequences of word indices (start / end markers included)
tokenized_answers = tokenizer.texts_to_sequences(answers)
maxlen_answers = max(len(sequence) for sequence in tokenized_answers)
print("\nMax length of answers is :- ",maxlen_answers)

# append zeros so that every answer has maxlen_answers entries
decoder_input_data = pad_sequences(tokenized_answers, maxlen=maxlen_answers, padding='post')

# (number of answers, maxlen_answers)
print(decoder_input_data.shape)


Max length of questions is :-  223
(3041, 223)

Max length of questions is :-  132
(3041, 132)


In [None]:
# Create one-hot encoded answers (the decoder targets)

# Drop the leading 'start' token from every answer, so that the target at
# step t is the token the decoder receives as input at step t + 1.
# A new list is built instead of editing tokenized_answers in place:
# re-running this cell then no longer strips one more token each time.
shifted_answers = [answer_tokens[1:] for answer_tokens in tokenized_answers]

# pad answers with zeros
padded_answers = pad_sequences(shifted_answers, maxlen=maxlen_answers, padding='post')

# tensor of shape (number of answers, maxlen_answers, VOCAB_SIZE):
# every target word is one-hot encoded using our vocabulary
# NOTE(review): this dense tensor grows with answers x length x vocabulary;
# integer targets with a sparse cross-entropy loss would avoid it - confirm
decoder_output_data = to_categorical(padded_answers, VOCAB_SIZE)

In [None]:
# Encoder: embeds the question tokens and runs them through an LSTM.
# Only its final hidden and cell state are kept; they summarize the question.
# mask_zero=True makes the padding index 0 be ignored.
enc_inputs = Input(shape=(None,))
enc_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True)(enc_inputs)
enc_outputs, state_h, state_c = LSTM(200, return_state=True)(enc_embedding)
enc_states = [state_h, state_c]

# Decoder: embeds the answer tokens and runs an LSTM over them, starting from the encoder's final states as context
dec_inputs = Input(shape=(None,))
dec_embedding = Embedding(VOCAB_SIZE, 200, mask_zero=True)(dec_inputs)
dec_lstm = LSTM(200, return_state=True, return_sequences=True)
dec_outputs, _, _ = dec_lstm(dec_embedding, initial_state=enc_states)

# the decoder output of every step goes through a softmax Dense layer over the vocabulary
dec_dense = Dense(VOCAB_SIZE, activation=softmax)
output = dec_dense(dec_outputs)
model = Model([enc_inputs, dec_inputs], output)

# per step the network predicts a probability distribution such as:
# y_pred = [0.05, 0.95, 0...]
# which is compared with the expected one-hot encoded target:
# y_true = [0, 1, 0...]
model.compile(optimizer=RMSprop(), loss='categorical_crossentropy')
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 200)    1215400     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 200)    1215400     input_2[0][0]                    
_______________________________________________________________________________________

In [None]:
# Train the model: the inputs are the padded questions and the padded answers,
# the targets are the one-hot encoded answers.
model.fit([encoder_input_data, decoder_input_data], decoder_output_data, batch_size=50, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fa417c36da0>

## `Inference Model`

In [None]:
# create an inference model encoder
def make_inference_models():
  """Build the encoder and decoder models used for word-by-word inference.

  Both models are assembled from objects created at notebook level for the
  training network: enc_inputs, enc_states, dec_inputs, dec_embedding,
  dec_lstm and dec_dense.

  Returns:
    A tuple ``(enc_model, dec_model)``. ``enc_model`` maps a question to the
    encoder's two state tensors. ``dec_model`` takes the decoder input plus
    two state tensors and returns the Dense-layer output together with the
    two updated LSTM states.
  """
  # placeholders for the two state vectors handed to the decoder
  state_in_h = Input(shape=(200,))
  state_in_c = Input(shape=(200,))
  state_inputs = [state_in_h, state_in_c]

  # run the decoder LSTM from the supplied states instead of the encoder's
  lstm_out, next_h, next_c = dec_lstm(dec_embedding, initial_state=state_inputs)

  # the Dense layer turns the LSTM output into scores over the vocabulary
  word_scores = dec_dense(lstm_out)

  dec_model = Model(
    inputs=[dec_inputs] + state_inputs,
    outputs=[word_scores] + [next_h, next_c],
  )

  # the encoder takes a single question as a zero-padded integer sequence
  enc_model = Model(inputs=enc_inputs, outputs=enc_states)

  return enc_model, dec_model

# build the encoder and decoder inference models used by the chatbot loop
enc_model, dec_model = make_inference_models()

In [None]:
# Create a function to convert the string into tokens
def str_to_tokens(sentence: str):
  """Convert a user sentence into the padded integer sequence fed to the encoder.

  The sentence goes through the same steps as the training questions:
  clean_text() first, then the fitted tokenizer with its lower-casing and
  character filters. Input such as "What's up?" is therefore matched against
  the vocabulary instead of being dropped because of its punctuation.
  Words that are not in the tokenizer's vocabulary are skipped.

  Args:
    sentence: The raw user input.

  Returns:
    An integer array of shape (1, maxlen_questions), zero-padded at the end.
  """
  # normalise the input exactly like the training data
  cleaned_sentence = clean_text(sentence)

  # map the words to their vocabulary indices
  tokens_list = tokenizer.texts_to_sequences([cleaned_sentence])[0]

  # pad with zeros up to the question length used in training
  return pad_sequences([tokens_list], maxlen=maxlen_questions, padding='post')

# `Chatbot Loop`

In [None]:
def Predictions(inputText):
  """Generate the chatbot's reply to ``inputText`` by greedy decoding.

  The question is encoded into the encoder's state vectors. The decoder is
  then fed one token at a time, beginning with 'start'; at every step the
  most probable next word is chosen and fed back in. Decoding stops at the
  word 'end' or after maxlen_answers + 1 steps.

  Args:
    inputText: The user's message. ``None`` is treated as an empty message.

  Returns:
    The reply as a string in which every word is preceded by one space, so
    a non-empty reply starts with a space. Empty if no word was produced.
  """
  input_query = inputText or ''

  # encode the input sequence into state vectors
  states_values = enc_model.predict(str_to_tokens(input_query))

  # start with a target sequence of size 1 - the word 'start'
  target_seq = np.zeros((1, 1))
  target_seq[0, 0] = tokenizer.word_index['start']
  decoded_translation = ''

  # Counting steps rather than generated words guarantees termination even
  # when the model keeps predicting an index that has no word (e.g. 0).
  for _ in range(maxlen_answers + 1):

    # feed the state vectors and the 1-word target sequence to the decoder
    dec_outputs, h, c = dec_model.predict([target_seq] + states_values)

    # greedy choice: the most probable next word
    sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))

    # direct reverse lookup; None when the index has no word
    sampled_word = tokenizer.index_word.get(sampled_word_index)

    # stop at the end-of-sequence word
    if sampled_word == 'end':
      break

    if sampled_word is not None:
      decoded_translation += ' {}'.format(sampled_word)

    # prepare next iteration: feed back the chosen word and the new states
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = sampled_word_index
    states_values = [h, c]

  return decoded_translation

In [None]:
# quick smoke test of the inference loop
# (the variable is not called `input`, which would shadow the built-in input())
sample_query = "hi"
print(Predictions(sample_query))

 you are not a big talker are you


In [None]:
# Save the trained model to disk
import pickle
from threading import Thread

# Keras models are stored through their own API. Pickling the file name
# string 'model_1.pkl' would only store that string, not the network.
model.save('model_1.h5')

In [None]:
!pip install flask-ngrok

Collecting flask-ngrok
  Downloading https://files.pythonhosted.org/packages/af/6c/f54cb686ad1129e27d125d182f90f52b32f284e6c8df58c1bae54fa1adbc/flask_ngrok-0.0.25-py3-none-any.whl
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
import numpy as np
from flask import Flask, request, jsonify, render_template
import pickle
import threading
from flask_ngrok import run_with_ngrok

In [None]:
# print the IP address that this machine's hostname resolves to
import socket
print(socket.gethostbyname(socket.gethostname()))

172.28.0.2


In [None]:
# Google Drive folder that holds the Flask templates and the pickled artifact
FLASK_DIR = '/content/drive/My Drive/2020_Intern_03_VIIT_03_Chatbot/Coding Part (Implementation)/Seq2Seq Transformer/Flask'

app = Flask(__name__,template_folder=FLASK_DIR + '/templates')
run_with_ngrok(app)

# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
# The file is opened in a `with` block so the handle is closed again, and the
# result gets its own name so the trained Keras `model` is not overwritten.
with open(FLASK_DIR + '/model_1.pkl', 'rb') as model_file:
  pickled_model = pickle.load(model_file)

In [None]:
@app.route('/')
def home():
  """Render the index.html template for the root URL."""
  return render_template('index.html')

In [None]:
@app.route('/predict',methods=['POST'])
def predict():
  '''
  For rendering results on HTML GUI.

  Reads the "user_query" form field, asks the chatbot for a reply and
  renders index.html with the reply passed as ``prediction_text``.
  '''

  # a POST without the "user_query" field yields "" instead of None,
  # so the text handling further down does not fail on a missing value
  user_input = request.form.get("user_query", "")
  prediction = Predictions(user_input)

  return render_template('index.html', prediction_text=' {}'.format(prediction))

In [None]:
# start the Flask server; run_with_ngrok(app) was applied to `app` beforehand
if __name__ == "__main__":
  app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://5cc6ed40e811.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [25/Sep/2020 12:02:15] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Sep/2020 12:02:16] "[33mGET /static/css/style.css HTTP/1.1[0m" 404 -
127.0.0.1 - - [25/Sep/2020 12:02:17] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [25/Sep/2020 12:02:29] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Sep/2020 12:02:30] "[33mGET /static/css/style.css HTTP/1.1[0m" 404 -
127.0.0.1 - - [25/Sep/2020 12:02:52] "[37mPOST /predict HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/Sep/2020 12:02:52] "[33mGET /static/css/style.css HTTP/1.1[0m" 404 -
