# CS5100 - Conversational Agent

Using Seq2Seq LSTM models

In [1]:
# Notebook configuration: dataset identity first, then the names derived from it.
dataset_name = 'WebQuestions'
dataset_path = 'train.json'
file_id_train = '14y--fBqtFxWmEwNVzD2gjF85HxQYZPMV'  # Google Drive id of train.json
model_name = f'model{dataset_name}'  # base name used for the saved model file

## 1. Importing Packages

In [2]:
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras import layers , activations , models , preprocessing
from typing import Tuple, List

## 2. Preprocessing Dataset

Berant et al.

PDF: https://www.aclweb.org/anthology/D13-1160.pdf


Dataset: https://github.com/brmson/dataset-factoid-webquestions


Year of Publication: 2013


Size: 5,810


Data Collection: Berant et al. use the Google Suggest API as basis for generating questions. They start with a single question ("Where was Barack Obama born") and feed the Google Suggest API with three query fragments generated from the source question: the question without the phrase before the entity, the question without the entity and the question without the phrase after the entity. Each of these queries generates 5 candidate questions. The candidate questions are added to a queue of questions and the process is repeated for each of the questions in the queue. They stop after 1M questions are generated. Due to the nature of the approach, the generated questions start with a wh-word and revolve around a single entity. 100K of these questions are given to crowdsourcing workers who select a Freebase entity, value or list of entities as answer. Questions that are answered identically by at least two workers are included in the dataset.

*    Read the JSON file



In [3]:
# Download and load the dataset
import json

!gdown $file_id_train

with open(dataset_path, 'r') as file:
    data = json.load(file)

# Using only a portion of the dataset
# Limit the size of the dataset if lack system RAM
# data = data[:5000]

Downloading...
From: https://drive.google.com/uc?id=14y--fBqtFxWmEwNVzD2gjF85HxQYZPMV
To: /content/train.json
  0% 0.00/568k [00:00<?, ?B/s]100% 568k/568k [00:00<00:00, 134MB/s]


In [4]:
# Show the first three records so the question/answer structure is visible.
for sample_position in range(3):
    print(f"Sample {sample_position + 1}: {data[sample_position]}")

Sample 1: {'question': 'what character did natalie portman play in star wars?', 'answer': 'Padmé Amidala'}
Sample 2: {'question': 'what state does selena gomez?', 'answer': 'New York City'}
Sample 3: {'question': 'what country is the grand bahama island in?', 'answer': 'Bahamas'}


*    Remove unwanted data types which are produced while parsing the data.
*    Append `<BOS>`, beginning-of-sentence token, and `<EOS>`, end-of-sentence token, to all the answers.
*    Create a Tokenizer and load the whole vocabulary ( questions + answers ) into it.

In [5]:
from tensorflow.keras import preprocessing, utils

# Questions are kept verbatim; every answer is wrapped in the sentence-boundary
# markers (the fitted vocabulary stores them as 'bos' / 'eos').
questions_list = [record['question'] for record in data]
answers_list = ['<BOS> ' + record['answer'] + ' <EOS>' for record in data]

# One tokenizer is fitted on questions and answers together, plus an '<UNK>'
# entry that is later used for out-of-vocabulary words.
tokenizer = preprocessing.text.Tokenizer()
fitting_corpus = ['<UNK>'] + questions_list + answers_list
tokenizer.fit_on_texts(fitting_corpus)

# One more than the number of known words: the printed word indices start at 1.
vocab_size = 1 + len(tokenizer.word_index)

# Save the tokenizer
tokenizer_file = f'tokenizer{dataset_name}.pickle'
with open(tokenizer_file, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
# Report the vocabulary size, then the full word -> index mapping on its own line.
print(f"Vocab size: {vocab_size}", f"Vocab: {tokenizer.word_index}", sep="\n")

Vocab size: 7297
Vocab: {'bos': 1, 'eos': 2, 'what': 3, 'the': 4, 'in': 5, 'did': 6, 'is': 7, 'of': 8, 'to': 9, 'who': 10, 'where': 11, 'does': 12, 'do': 13, 'are': 14, 'language': 15, 'was': 16, 'play': 17, 'for': 18, 'countries': 19, 'movies': 20, 'and': 21, 'speak': 22, 'have': 23, 'has': 24, 'school': 25, 'new': 26, 'from': 27, 'go': 28, 'a': 29, 'united': 30, 'when': 31, 'government': 32, 'states': 33, 'university': 34, 'see': 35, 'people': 36, 'state': 37, 'live': 38, 'world': 39, 'played': 40, 'which': 41, 'star': 42, 'time': 43, 'america': 44, 'country': 45, 'city': 46, 'with': 47, 'located': 48, 'national': 49, 'team': 50, 'on': 51, 'type': 52, 'college': 53, 'john': 54, 'border': 55, 'san': 56, 'they': 57, 'president': 58, 'all': 59, 'there': 60, 'canada': 61, 'china': 62, 'republic': 63, 'york': 64, 'michael': 65, 'south': 66, 'series': 67, 'airport': 68, 'kind': 69, 'party': 70, 'english': 71, 'system': 72, 'won': 73, 'been': 74, 'american': 75, 'win': 76, 'museum': 77, 'wr

`encoder_input_data`: tokenize the list of questions. Pad them to their maximum length.

`decoder_input_data`: tokenize the list of answers. Pad them to their maximum length.

`decoder_output_data`: tokenize the list of answers. Remove the first element `<BOS>` from all the `tokenized_answers`.

In [7]:
from gensim.models import Word2Vec
import re

# Flat list of every word known to the tokenizer, in word_index iteration order.
vocab = list(tokenizer.word_index)

def tokenize(sentences: list) -> Tuple[List[List[str]], List[str]]:
    """
    Tokenize the sentences

    Each sentence is lower-cased, every character other than an ASCII letter,
    a digit or an apostrophe is replaced by a space, and the result is split
    on whitespace.

    Parameters:
        sentences: list of sentences

    Returns:
        tokenized_sentences: list of tokenized sentences (one token list per sentence)
        vocab_list: flat list of every token in order of appearance (duplicates kept)
    """
    tokens_list = []
    vocab_list = []
    for sentence in sentences:
        sentence = sentence.lower() # Convert to lower case
        # The class must be A-Z, not A-z: the A-z range also covers the ASCII
        # characters [ \ ] ^ _ ` and would let them survive the clean-up.
        sentence = re.sub(r"[^a-zA-Z0-9']", ' ', sentence) # Remove special characters
        tokens = sentence.split() # Tokenize the sentence
        vocab_list.extend(tokens)
        tokens_list.append(tokens)

    return tokens_list, vocab_list

# encoder_input_data: question token ids, post-padded to the longest question
tokenized_questions = tokenizer.texts_to_sequences(questions_list)
questions_max_len = max(len(sequence) for sequence in tokenized_questions)
padded_questions = preprocessing.sequence.pad_sequences(
    tokenized_questions,
    maxlen=questions_max_len,
    padding='post',
)
encoder_input_data = np.array(padded_questions)
print(f"Encoder input data shape: (# Samples, Max Sequence Length): {encoder_input_data.shape}")

# decoder_input_data: answer token ids (including the leading <bos>), post-padded
tokenized_answers = tokenizer.texts_to_sequences(answers_list)
answers_max_len = max(len(sequence) for sequence in tokenized_answers)
padded_answers = preprocessing.sequence.pad_sequences(
    tokenized_answers,
    maxlen=answers_max_len,
    padding='post',
)
decoder_input_data = np.array(padded_answers)
print(f"Decoder input data shape: (# Samples, Max Sequence Length): {decoder_input_data.shape}")

# decoder_output_data: the same answers with the first token (<bos>) dropped,
# padded to the same length and one-hot encoded over the vocabulary
tokenized_answers = [
    sequence[1:] for sequence in tokenizer.texts_to_sequences(answers_list)
]
padded_answers = preprocessing.sequence.pad_sequences(
    tokenized_answers,
    maxlen=answers_max_len,
    padding='post',
)
onehot_answers = utils.to_categorical(padded_answers, vocab_size)
decoder_output_data = np.array(onehot_answers)
print(f"Decoder output data shape: (# Samples, Max Sequence Length, Vocab Size): {decoder_output_data.shape}")

Encoder input data shape: (# Samples, Max Sequence Length): (6668, 14)
Decoder input data shape: (# Samples, Max Sequence Length): (6668, 78)
Decoder output data shape: (# Samples, Max Sequence Length, Vocab Size): (6668, 78, 7297)


The model will have Embedding, LSTM and Dense layers. The basic configuration is as follows.


*    2 Input Layers : One for `encoder_input_data` and another for `decoder_input_data`.
*    Embedding layer : For converting token vectors to fixed-size dense vectors. **( Note :  Don't forget the `mask_zero=True` argument here )**
*    LSTM layer : Provides access to Long Short-Term Memory cells.

Working :

1.    The `encoder_input_data` comes in the Embedding layer (  `encoder_embedding` ).
2.    The output of the Embedding layer goes to the LSTM cell which produces 2 state vectors ( `h` and `c` which are `encoder_states` )
3.    These states are set in the LSTM cell of the decoder.
4.    The decoder_input_data comes in through the Embedding layer.
5.    The embeddings go into the LSTM cell ( which had the states ) to produce sequences.

In [8]:
# --- Encoder: question ids -> embeddings -> LSTM; only the final h/c states are kept ---
encoder_inputs = layers.Input(name='Encoder_Inputs', shape=(questions_max_len,))
encoder_embedding = layers.Embedding(
    input_dim=vocab_size,
    output_dim=200,
    mask_zero=True,
    name='Encoder_Embedding',
)(encoder_inputs)
encoder_ouputs, state_h, state_c = layers.LSTM(
    units=200,
    return_state=True,
    name='Encoder_LSTM',
)(encoder_embedding)
encoder_states = [state_h, state_c]

# --- Decoder: answer ids -> embeddings -> LSTM seeded with the encoder states ---
decoder_inputs = layers.Input(name='Decoder_Inputs', shape=(answers_max_len,))
decoder_embedding = layers.Embedding(
    input_dim=vocab_size,
    output_dim=200,
    mask_zero=True,
    name='Decoder_Embedding',
)(decoder_inputs)
# The layer object is kept in a variable because the inference decoder reuses it.
decoder_lstm = layers.LSTM(
    units=200,
    return_state=True,
    return_sequences=True,
    name='Decoder_LSTM',
)
decoder_ouputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# --- Output: a softmax over the vocabulary at every decoder time step ---
decoder_dense = layers.Dense(
    units=vocab_size,
    activation=activations.softmax,
    name='Output_Layer',
)
output = decoder_dense(decoder_ouputs)

model = models.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=output,
    name='Encoder_Decoder_Model',
)
model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.RMSprop())

model.summary()

Model: "Encoder_Decoder_Model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Encoder_Inputs (InputLayer  [(None, 14)]                 0         []                            
 )                                                                                                
                                                                                                  
 Decoder_Inputs (InputLayer  [(None, 78)]                 0         []                            
 )                                                                                                
                                                                                                  
 Encoder_Embedding (Embeddi  (None, 14, 200)              1459400   ['Encoder_Inputs[0][0]']      
 ng)                                                                          

## 3. Training Model

Train the model for a number of epochs with `RMSprop` optimizer and `categorical_crossentropy` loss function.

In [9]:
# Fit on (question, answer-with-<bos>) input pairs against the one-hot,
# <bos>-stripped targets, then persist the trained model to disk.
training_inputs = [encoder_input_data, decoder_input_data]
model.fit(x=training_inputs, y=decoder_output_data, batch_size=50, epochs=300)
model.save(model_name + ".keras")

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

## 4. Define Inference Models

**Encoder inference model**: Takes the question as input and outputs LSTM states ( `h` and `c` ).

**Decoder inference model**: Takes two inputs: the LSTM states ( the output of the encoder model ) and the answer input sequence ( which starts from the `<bos>` token ). It outputs the answer for the question we fed to the encoder model, together with its updated state values.

In [10]:
def make_inference() -> Tuple[tf.keras.models.Model, tf.keras.models.Model]:
    """
    Constructs separate encoder and decoder models for inference based on a trained Encoder-Decoder model.

    Both models are built from the module-level tensors and layers of the training
    graph (`encoder_inputs`, `encoder_states`, `decoder_inputs`, `decoder_embedding`,
    `decoder_lstm`, `decoder_dense`), so they share the trained layers.

    Returns:
        encoder_model (tf.keras.models.Model): A Keras model representing the encoder component for inference.
            This model takes encoder inputs (sequences of tokens) and outputs the encoder states (hidden state
            and cell state) produced by the encoder LSTM layer.

        decoder_model (tf.keras.models.Model): A Keras model representing the decoder component for inference.
            This model takes decoder inputs (sequences of tokens) along with the initial decoder states (hidden
            state and cell state) as inputs and outputs the decoder outputs and updated decoder states.
            It reuses the decoder embedding output, the decoder LSTM layer and the decoder dense layer.
    """
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    # Size the state inputs from the decoder LSTM itself instead of repeating its
    # width as a literal, so the two cannot drift apart if the layer size changes.
    state_size = decoder_lstm.units
    decoder_state_input_h = tf.keras.layers.Input(shape=(state_size,), name='Input_Layer_h')
    decoder_state_input_c = tf.keras.layers.Input(shape=(state_size,), name='Input_Layer_c')
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    # NOTE(review): decoder_inputs was declared with a fixed length of answers_max_len,
    # while the decoding loops feed one token per step — confirm the installed Keras
    # version accepts that shape mismatch.
    decoder_outs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]

    # Local name chosen so it does not shadow the module-level decoder output tensor.
    decoder_probabilities = decoder_dense(decoder_outs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_probabilities] + decoder_states,
    )

    return encoder_model, decoder_model

## 5. Talking with Chatbot

Convert `str` into tokens with paddings.

In [11]:
def str_to_tokens(sentence: str) -> np.ndarray:
    """
    Convert a raw question into a padded sequence of token ids for the encoder.

    The sentence is lower-cased, every character other than an ASCII letter,
    a digit or an apostrophe is replaced by a space, and each remaining word is
    looked up in `tokenizer.word_index`. Words that are not in the vocabulary
    are mapped to the id of the 'unk' entry.

    Parameters:
        sentence: the question text

    Returns:
        The result of `pad_sequences` applied to the single id sequence,
        post-padded to `questions_max_len` (an array holding one sequence,
        not a flat list of ints).
    """
    words = re.sub('[^a-zA-Z0-9\']', ' ', sentence.lower()).split()

    word_index = tokenizer.word_index
    # The 'unk' id is only looked up when an unknown word is actually met.
    tokens_list = [
        word_index[word] if word in word_index else word_index['unk']
        for word in words
    ]

    padded_sequence = preprocessing.sequence.pad_sequences([tokens_list], maxlen=questions_max_len, padding='post')
    return padded_sequence

1.    First, we take a question as input and predict the state values using `encoder_model`.
2.    We set the state values in the decoder's LSTM.
3.    Then, we generate a sequence which contains the `<bos>` element.
4.    We input this sequence in the `decoder_model`.
5.    We replace the `<bos>` element with the element which was predicted by the `decoder_model` and update the state values.
6.    We carry out the above steps iteratively till we hit the `<eos>` tag or the maximum answer length.

In [12]:
encoder_model, decoder_model = make_inference()


def decode_answer(input_str: str) -> str:
    """
    Greedily decode an answer for one question with the inference models.

    The question is encoded into LSTM states, then the decoder is run one token
    at a time starting from 'bos'; at every step the most probable word is kept
    and fed back in. Decoding stops at 'eos' or after `answers_max_len` steps.

    Parameters:
        input_str: the question text

    Returns:
        The decoded words, each preceded by a single space ('eos' and 'unk'
        are not included). Empty string if nothing was decoded.
    """
    # Convert input string to tokens for the encoder
    states_values = encoder_model.predict(str_to_tokens(input_str), verbose=0)

    # Initialize target sequence with 'bos' (beginning of sentence) token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['bos']

    decoded_translation = ''

    # Bound the number of decoding steps rather than the number of emitted words:
    # a run of padding (index 0) or 'unk' predictions adds no words, so a
    # word-count limit alone would never be reached.
    for _step in range(answers_max_len):
        # Predict next word using the decoder model
        decoder_outputs, h, c = decoder_model.predict([target_seq] + states_values, verbose=0)

        # Get index of the most probable word and find the word corresponding to the index
        sampled_word_index = int(np.argmax(decoder_outputs[0, -1, :]))
        # index_word is the tokenizer's reverse mapping; index 0 has no word -> None
        sampled_word = tokenizer.index_word.get(sampled_word_index)

        if sampled_word == 'eos':
            break
        if sampled_word is not None and sampled_word != 'unk':
            decoded_translation += f" {sampled_word}"

        # Update target sequence with sampled word index
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_word_index

        # Update states for the next iteration
        states_values = [h, c]

    return decoded_translation


input_strs = [
    "What is your name?",
    "How are you feeling today?",
    "What is the weather like outside?",
    "Can you tell me a joke?",
    "How old are you?",
    "What is the capital of France?",
    "How do I bake a cake?",
    "What time is it?",
    "Who is the president of the United States?",
    "What is the meaning of life?"]

for input_str in input_strs:
    # Print the decoded translation
    print(f"Answer: {decode_answer(input_str)}")

Answer:  racecar driver
Answer:  black windsor
Answer:  american zealand
Answer:  the fabulous zoo
Answer:  kim acoustic guitar
Answer:  atlanta
Answer:  heart
Answer:  yearly
Answer:  franklin d
Answer:  the conservative of gathas and the world and the world


In [13]:
# Rebuild the inference models so this cell can be run on its own.
encoder_model, decoder_model = make_inference()

input_strs = ['Where is Barack Obama born?',
              'What is the capital of Canada?',
              'Who is the Prime Minister of India?']

for input_str in input_strs:

    # Convert input string to tokens for the encoder
    states_values = encoder_model.predict(str_to_tokens(input_str), verbose=0)

    # Initialize target sequence with 'bos' (beginning of sentence) token
    empty_target_seq = np.zeros((1, 1))
    empty_target_seq[0, 0] = tokenizer.word_index['bos']

    decoded_translation = ''

    # Bound the number of decoding steps rather than the number of emitted words:
    # a run of padding (index 0) or 'unk' predictions adds no words, so a
    # word-count limit alone would never be reached.
    for _step in range(answers_max_len):
        # Predict next word using the decoder model
        decoder_outputs, h, c = decoder_model.predict([empty_target_seq] + states_values, verbose=0)

        # Get index of the most probable word and find the word corresponding to the index
        sampled_word_index = int(np.argmax(decoder_outputs[0, -1, :]))
        # index_word is the tokenizer's reverse mapping; index 0 has no word -> None
        sampled_word = tokenizer.index_word.get(sampled_word_index)

        if sampled_word == 'eos':
            break
        if sampled_word is not None and sampled_word != 'unk':
            decoded_translation += f" {sampled_word}"

        # Update target sequence with sampled word index
        empty_target_seq = np.zeros((1, 1))
        empty_target_seq[0, 0] = sampled_word_index

        # Update states for the next iteration
        states_values = [h, c]

    # Print the decoded translation
    print(f"Question: {input_str}")
    print(f"Answer: {decoded_translation}\n")

Question: Where is Barack Obama born?
Answer:  unitary state

Question: What is the capital of Canada?
Answer:  salem

Question: Who is the Prime Minister of India?
Answer:  hailemariam desalegn



## 6. Downloading the Model and Tokenizer

In [14]:
# zip the files and download
import os
import shutil
from google.colab import files
from datetime import datetime

temp_dir = 'output'
# Use model_name so the archived model is the file written by model.save.
files_to_include = [f'{model_name}.keras', f'tokenizer{dataset_name}.pickle']

os.makedirs(temp_dir, exist_ok=True)

# Copy each artifact on its own so that one missing file does not prevent the
# remaining ones from being staged; failures are reported, not raised.
for file in files_to_include:
    try:
        shutil.copy(file, temp_dir)
    except OSError as e:
        print(f"Error copying files: {e}")

# Timestamped archive name so repeated downloads do not overwrite each other.
file_name = f"Model{dataset_name}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"
shutil.make_archive(file_name, 'zip', temp_dir)
files.download(file_name + '.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>