# CS5100 - Conversational Agent Training Script

Using Seq2Seq LSTM models

In [None]:
# Run configuration: dataset location and the names used for saved artifacts.
dataset_name = 'dialogs'
dataset_path = 'dialogs.json'
file_id_train = '16io-SYA30hVsNlAV5kPaIaHjRrjyrcjf'  # id handed to gdown when downloading the dataset
model_name = f'model{dataset_name}'  # base file name of the saved model

## 1. Importing Packages

In [None]:
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras import layers , activations , models , preprocessing
from typing import Tuple, List

## 2. Preprocessing Dataset

**Cornell Movie--Dialogs Corpus**

Distributed together with:  Chameleons in Imagined Conversations.

Data and Code available in ConvoKit: a toolkit for analyzing conversations

Related corpus: Cornell Movie-Quotes Corpus

**DESCRIPTION:**
                                    
This corpus contains a large metadata-rich collection of fictional conversations extracted from raw movie scripts:

- 220,579 conversational exchanges between 10,292 pairs of movie characters

- involves 9,035 characters from 617 movies

- in total 304,713 utterances

- movie metadata included:

    - genres

    - release year

    - IMDB rating

    - number of IMDB votes

    - IMDB rating

- character metadata included:

    - gender (for 3,774 characters)

    - position on movie credits (3,321 characters)

- see the [documentation](https://convokit.cornell.edu/documentation/movie.html) for details

*    Read the JSON file



In [None]:
# Download and load the dataset
import json

!gdown $file_id_train

with open(dataset_path, 'r') as file:
    data = json.load(file)

# Filter out responses that are over 100 characters long
data = [item for item in data if len(item['answer']) <= 100 and len(item['question']) <= 100]

# Using only a portion of the dataset
# Limit the size of the dataset if lack system RAM
data = data[:15000]

Downloading...
From: https://drive.google.com/uc?id=16io-SYA30hVsNlAV5kPaIaHjRrjyrcjf
To: /content/dialogs.json
  0% 0.00/356k [00:00<?, ?B/s]100% 356k/356k [00:00<00:00, 129MB/s]


In [None]:
# Show the first three question/answer pairs as a quick sanity check
for sample_no in (1, 2, 3):
    print(f"Sample {sample_no}: {data[sample_no - 1]}")

Sample 1: {'question': 'hi, how are you doing?', 'answer': "i'm fine. how about yourself?"}
Sample 2: {'question': "i'm fine. how about yourself?", 'answer': "i'm pretty good. thanks for asking."}
Sample 3: {'question': "i'm pretty good. thanks for asking.", 'answer': 'no problem. so how have you been?'}


*    Remove unwanted data types which are produced while parsing the data.
*    Prepend the `<BOS>` (beginning-of-sentence) token and append the `<EOS>` (end-of-sentence) token to every answer.
*    Create a Tokenizer and load the whole vocabulary ( questions + answers ) into it.

In [None]:
from tensorflow.keras import preprocessing, utils

# Questions are used unchanged; each answer is wrapped in '<BOS> ... <EOS>' markers
questions_list = [item['question'] for item in data]
answers_list = ['<BOS> ' + item['answer'] + ' <EOS>' for item in data]

# A single tokenizer is fitted on the '<UNK>' placeholder plus all questions and answers
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(['<UNK>', *questions_list, *answers_list])

# One more than the number of known words (the printed vocab starts at index 1,
# so index 0 needs its own slot)
vocab_size = 1 + len(tokenizer.word_index)

# Persist the fitted tokenizer next to the model
with open(f'tokenizer{dataset_name}.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Report the vocabulary size followed by the full word -> index mapping
print(f"Vocab size: {vocab_size}", f"Vocab: {tokenizer.word_index}", sep="\n")

Vocab size: 2523
Vocab: {'bos': 1, 'eos': 2, 'i': 3, 'you': 4, 'the': 5, 'to': 6, 'a': 7, 'it': 8, 'that': 9, 'do': 10, 'what': 11, 'is': 12, 'of': 13, 'and': 14, 'have': 15, 'are': 16, 'in': 17, 'they': 18, 'was': 19, "it's": 20, 'did': 21, 'so': 22, 'like': 23, 'yes': 24, 'for': 25, 'my': 26, "don't": 27, 'but': 28, 'he': 29, 'no': 30, 'me': 31, 'be': 32, "i'm": 33, "that's": 34, 'about': 35, 'we': 36, 'on': 37, 'how': 38, 'go': 39, 'not': 40, 'think': 41, 'too': 42, 'why': 43, 'your': 44, 'good': 45, 'going': 46, 'will': 47, 'with': 48, 'want': 49, 'really': 50, 'get': 51, 'know': 52, 'well': 53, 'all': 54, 'at': 55, 'there': 56, 'one': 57, "i'll": 58, 'just': 59, 'can': 60, 'this': 61, 'would': 62, 'if': 63, "you're": 64, 'people': 65, 'see': 66, 'then': 67, 'she': 68, 'right': 69, 'nice': 70, "didn't": 71, 'out': 72, 'should': 73, "what's": 74, 'time': 75, 'oh': 76, 'need': 77, 'her': 78, 'money': 79, 'maybe': 80, 'him': 81, 'course': 82, 'lot': 83, 'up': 84, 'when': 85, 'or': 86,

`encoder_input_data`: tokenize the list of questions. Pad them to their maximum length.

`decoder_input_data`: tokenize the list of answers. Pad them to their maximum length.

`decoder_output_data`: tokenize the list of answers. Remove the first element `<BOS>` from all the `tokenized_answers`.

In [None]:
from gensim.models import Word2Vec
import re

# All words known to the tokenizer, in the iteration order of word_index
vocab = list(tokenizer.word_index)

def tokenize(sentences: list) -> Tuple[List[List[str]], List[str]]:
    """
    Lower-case, clean and whitespace-split each sentence.

    Every character other than an ASCII letter, a digit or an apostrophe is
    replaced by a space before the sentence is split.

    Parameters:
        sentences: list of sentences (strings)

    Returns:
        tokens_list: one list of tokens per input sentence, in input order
        vocab_list: flat list of every token encountered, in order of
            appearance (duplicates are kept)
    """
    tokens_list = []
    vocab_list = []
    for sentence in sentences:
        # The class must end at 'Z': an 'A-z' range also covers the ASCII
        # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then
        # survive the clean-up and end up glued to tokens.
        cleaned = re.sub(r"[^a-zA-Z0-9']", ' ', sentence.lower())
        tokens = cleaned.split()
        vocab_list.extend(tokens)
        tokens_list.append(tokens)

    return tokens_list, vocab_list

# encoder_input_data: question token ids, zero-padded at the end to the longest question
tokenized_questions = tokenizer.texts_to_sequences(questions_list)
questions_max_len = max(map(len, tokenized_questions))
padded_questions = preprocessing.sequence.pad_sequences(
    tokenized_questions,
    maxlen=questions_max_len,
    padding='post',
)
encoder_input_data = np.array(padded_questions)
print(f"Encoder input data shape: (# Samples, Max Sequence Length): {encoder_input_data.shape}")

# decoder_input_data: answer token ids (including the leading <bos>), padded the same way
tokenized_answers = tokenizer.texts_to_sequences(answers_list)
answers_max_len = max(map(len, tokenized_answers))
padded_answers = preprocessing.sequence.pad_sequences(
    tokenized_answers,
    maxlen=answers_max_len,
    padding='post',
)
decoder_input_data = np.array(padded_answers)
print(f"Decoder input data shape: (# Samples, Max Sequence Length): {decoder_input_data.shape}")

# decoder_output_data: the same answers with the first token (<bos>) dropped,
# padded back to answers_max_len and one-hot encoded over the vocabulary
tokenized_answers = [sequence[1:] for sequence in tokenizer.texts_to_sequences(answers_list)]
padded_answers = preprocessing.sequence.pad_sequences(
    tokenized_answers,
    maxlen=answers_max_len,
    padding='post',
)
onehot_answers = utils.to_categorical(padded_answers, vocab_size)
decoder_output_data = np.array(onehot_answers)
print(f"Decoder output data shape: (# Samples, Max Sequence Length, Vocab Size): {decoder_output_data.shape}")

Encoder input data shape: (# Samples, Max Sequence Length): (3725, 19)
Decoder input data shape: (# Samples, Max Sequence Length): (3725, 21)
Decoder output data shape: (# Samples, Max Sequence Length, Vocab Size): (3725, 21, 2523)


The model will have Embedding, LSTM and Dense layers. The basic configuration is as follows.


*    2 Input Layers : One for `encoder_input_data` and another for `decoder_input_data`.
*    Embedding layer : For converting token vectors to fixed-size dense vectors. **( Note :  Don't forget the `mask_zero=True` argument here )**
*    LSTM layer : Provides Long Short-Term Memory (LSTM) cells.

Working :

1.    The `encoder_input_data` comes in the Embedding layer (  `encoder_embedding` ).
2.    The output of the Embedding layer goes to the LSTM cell which produces 2 state vectors ( `h` and `c` which are `encoder_states` )
3.    These states are set in the LSTM cell of the decoder.
4.    The decoder_input_data comes in through the Embedding layer.
5.    The embeddings go into the decoder LSTM cell ( which was initialised with the encoder states ) to produce sequences.

In [None]:
# Encoder: question tokens -> embedding -> LSTM; only the final h/c states are kept
encoder_inputs = layers.Input(shape=(questions_max_len,), name='Encoder_Inputs')
encoder_embedding = layers.Embedding(
    vocab_size, 200, mask_zero=True, name='Encoder_Embedding'
)(encoder_inputs)
encoder_ouputs, state_h, state_c = layers.LSTM(
    200, return_state=True, name='Encoder_LSTM'
)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: answer tokens -> embedding -> LSTM started from the encoder states
decoder_inputs = layers.Input(shape=(answers_max_len,), name='Decoder_Inputs')
decoder_embedding = layers.Embedding(
    vocab_size, 200, mask_zero=True, name='Decoder_Embedding'
)(decoder_inputs)
decoder_lstm = layers.LSTM(
    200, return_state=True, return_sequences=True, name='Decoder_LSTM'
)
decoder_ouputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Softmax over the vocabulary at every decoder time step
decoder_dense = layers.Dense(vocab_size, activation=activations.softmax, name='Output_Layer')
output = decoder_dense(decoder_ouputs)

model = models.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=output,
    name='Encoder_Decoder_Model',
)
model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.RMSprop(),
)

model.summary()

Model: "Encoder_Decoder_Model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Encoder_Inputs (InputLayer  [(None, 19)]                 0         []                            
 )                                                                                                
                                                                                                  
 Decoder_Inputs (InputLayer  [(None, 21)]                 0         []                            
 )                                                                                                
                                                                                                  
 Encoder_Embedding (Embeddi  (None, 19, 200)              504600    ['Encoder_Inputs[0][0]']      
 ng)                                                                          

## 3. Training Model

Train the model for a number of epochs with `RMSprop` optimizer and `categorical_crossentropy` loss function.

In [None]:
# Fit on (question, answer-input) pairs against the one-hot shifted answers,
# then write the trained model to '<model_name>.keras'
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_output_data,
    batch_size=50,
    epochs=600,
)
model.save(f"{model_name}.keras")

Epoch 1/600
Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78

## 4. Define Inference Models

**Encoder inference model**: Takes the question as input and outputs LSTM states ( `h` and `c` ).

**Decoder inference model**: Takes in 2 inputs: the first is the LSTM states ( the output of the encoder model ), the second is the answer input sequence ( starting with the `<bos>` token ). It outputs the predicted answer tokens for the question we fed to the encoder model, together with its updated state values.

In [None]:
def make_inference() -> Tuple[tf.keras.models.Model, tf.keras.models.Model]:
    """
    Constructs separate encoder and decoder models for inference based on the trained Encoder-Decoder model.

    The trained layers are reused through the notebook-level names `encoder_inputs`, `encoder_states`,
    `model` (for its 'Decoder_Embedding' layer), `decoder_lstm` and `decoder_dense`, so this must be
    called after the model has been built and trained.

    Returns:
        encoder_model (tf.keras.models.Model): A Keras model representing the encoder component for inference.
            This model takes encoder inputs (sequences of tokens) and outputs the encoder states (hidden state
            and cell state) produced by the encoder LSTM layer.

        decoder_model (tf.keras.models.Model): A Keras model representing the decoder component for inference.
            This model takes a token sequence of any length along with the initial decoder states (hidden
            state and cell state) as inputs and outputs the decoder outputs and updated decoder states.
            It consists of the decoder embedding layer, the decoder LSTM layer and the decoder dense layer.
    """
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    # Read the state size from the trained decoder LSTM instead of repeating a literal
    latent_dim = decoder_lstm.units
    decoder_state_input_h = tf.keras.layers.Input(shape=(latent_dim,), name='Input_Layer_h')
    decoder_state_input_c = tf.keras.layers.Input(shape=(latent_dim,), name='Input_Layer_c')
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    # Generation feeds the decoder one token per call, so a dedicated
    # variable-length input is routed through the trained embedding layer
    # rather than reusing the training input fixed at `answers_max_len` tokens.
    decoder_step_inputs = tf.keras.layers.Input(shape=(None,), name='Decoder_Step_Inputs')
    decoder_step_embedding = model.get_layer('Decoder_Embedding')(decoder_step_inputs)

    decoder_outs, state_h, state_c = decoder_lstm(decoder_step_embedding, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outs)
    decoder_model = tf.keras.models.Model(
        [decoder_step_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states,
    )

    return encoder_model, decoder_model

## 5. Talking with Chatbot

Convert `str` into tokens with paddings.

In [None]:
def str_to_tokens(sentence: str) -> np.ndarray:
    """
    Convert a raw sentence into a padded array of token ids for the encoder.

    The sentence is lower-cased, every character other than an ASCII letter,
    a digit or an apostrophe is replaced by a space, and the result is split
    on whitespace. Words missing from `tokenizer.word_index` are mapped to
    the id of 'unk'.

    Parameters:
        sentence: raw input text

    Returns:
        The array produced by `pad_sequences` for this single sentence:
        one row, post-padded to `questions_max_len`.
    """
    words = re.sub(r"[^a-zA-Z0-9']", ' ', sentence.lower()).split()
    word_index = tokenizer.word_index
    # Out-of-vocabulary words fall back to the 'unk' entry
    tokens_list = [word_index[word if word in word_index else 'unk'] for word in words]
    padded_sequence = preprocessing.sequence.pad_sequences([tokens_list], maxlen=questions_max_len, padding='post')
    return padded_sequence

1.    First, we take a question as input and predict the state values using `encoder_model`.
2.    We set the state values in the decoder's LSTM.
3.    Then, we generate a sequence which contains the `<bos>` element.
4.    We input this sequence in the `decoder_model`.
5.    We replace the `<bos>` element with the element which was predicted by the `decoder_model` and update the state values.
6.    We carry out the above steps iteratively till we hit the `<eos>` tag or the maximum answer length.

In [None]:
encoder_model, decoder_model = make_inference()

input_strs = [
    "What is your name?",
    "How are you feeling today?",
    "What is the weather like outside?",
    "Can you tell me a joke?",
    "How old are you?",
    "What is the capital of France?",
    "How do I bake a cake?",
    "What time is it?",
    "Who is the president of the United States?",
    "What is the meaning of life?"]


def decode_reply(question: str) -> str:
    """
    Greedily decode the chatbot's answer to a question.

    The question is encoded into LSTM states, then the decoder is run one
    token at a time, starting from 'bos' and always feeding back the most
    probable token, until 'eos' is produced or `answers_max_len + 1` steps
    have been taken.

    Parameters:
        question: raw question text

    Returns:
        The generated words, each preceded by a single space (so a non-empty
        reply starts with a space). 'eos' and 'unk' are never included.
    """
    # Encode the question into the initial decoder states
    states_values = encoder_model.predict(str_to_tokens(question), verbose=0)

    # Initialize target sequence with 'bos' (beginning of sentence) token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['bos']

    decoded_words = []

    # Bound the number of decoding steps rather than the number of emitted
    # words: tokens that emit nothing ('unk', or an index with no word) would
    # otherwise never advance a word-count guard and the loop could spin forever.
    for _ in range(answers_max_len + 1):
        # Predict next word using the decoder model
        decoder_outputs, h, c = decoder_model.predict([target_seq] + states_values, verbose=0)

        # Get index of the most probable word and find the word corresponding to the index
        sampled_word_index = int(np.argmax(decoder_outputs[0, -1, :]))
        # index_word is the tokenizer's reverse mapping; .get returns None for an index with no word
        sampled_word = tokenizer.index_word.get(sampled_word_index)

        if sampled_word == 'eos':
            break
        if sampled_word is not None and sampled_word != 'unk':
            decoded_words.append(sampled_word)

        # Feed the sampled token and the updated states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_word_index
        states_values = [h, c]

    return ''.join(f" {word}" for word in decoded_words)


for input_str in input_strs:
    decoded_translation = decode_reply(input_str)

    # Print the decoded translation
    print(f"Question: {input_str}")
    print(f"Answer: {decoded_translation}\n")

Question: What is your name?
Answer:  i don't know the tag is missing

Question: How are you feeling today?
Answer:  i'm not sure yet is i having enough

Question: What is the weather like outside?
Answer:  i don't know it's the old things

Question: Can you tell me a joke?
Answer:  i've been to practice around you

Question: How old are you?
Answer:  they say about other

Question: What is the capital of France?
Answer:  i feel like chinese

Question: How do I bake a cake?
Answer:  it's only 10 minutes than a little while

Question: What time is it?
Answer:  english

Question: Who is the president of the United States?
Answer:  i think it will only take you a year or two

Question: What is the meaning of life?
Answer:  i'll use our problems with their



## 6. Downloading the Model and Tokenizer

In [None]:
# zip the files and download
import os
import shutil
from google.colab import files
from datetime import datetime

temp_dir = 'output'
# Reuse model_name so this always matches the file written by model.save
files_to_include = [f'{model_name}.keras', f'tokenizer{dataset_name}.pickle']

# Create the staging directory if needed (no error when it already exists)
os.makedirs(temp_dir, exist_ok=True)

# Copy each artifact independently so that one failing file is reported
# without preventing the remaining files from being staged.
for file in files_to_include:
    try:
        shutil.copy(file, temp_dir)
    except OSError as e:
        print(f"Error copying files: {e}")

# Archive the staging directory under a timestamped name and hand it to the browser
file_name = f"Model{dataset_name}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"
shutil.make_archive(file_name, 'zip', temp_dir)
files.download(file_name + '.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>