# Generative-Based Chatbot Using Sequence to Sequence Algorithm

In [None]:
import re
import os
import time
import csv
import math
import random
import codecs
import itertools

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import optim
from torch.jit import script, trace

In [None]:
from utils import normalize_string, filter_pairs, indexes_from_sentence, time_since, moving_average, show_plot_evaluation

## Set Configs

In [None]:
# indexes reserved for the special tokens at the start of every vocabulary
PAD_token = 0  # padding for short sequences
SOS_token = 1  # start-of-sentence marker fed to the decoder first
EOS_token = 2  # end-of-sentence marker
MAX_LENGTH = 10  # passed to filter_pairs; also the greedy decoding step limit
MIN_COUNT = 3  # words counted fewer times than this are trimmed from the vocabulary

# model and training configs
LR = 0.0001  # encoder learning rate
N_EPOCHS = 4000  # number of training iterations; one random batch is drawn per iteration
TEACHER_FORCING_RATIO = 1.0  # probability of using teacher forcing in an iteration
HIDDEN_SIZE = 500  # embedding dimension and GRU hidden size
DROPOUT = 0.1
CLIP = 50.0  # max gradient norm used when clipping
BATCH_SIZE = 64

ENCODER_N_LAYERS = 2
DECODER_N_LAYERS = 2
DECODER_LEARNING_RATIO = 5.0  # decoder learning rate is LR * DECODER_LEARNING_RATIO

In [None]:
# select the GPU when one is available, otherwise fall back to the CPU
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

## Set Helpers

In [None]:
class Vocabulary:
    """Bidirectional word <-> index mapping with per-word occurrence counts.

    Indexes 0-2 are reserved for the PAD, SOS and EOS tokens, so the first
    real word receives index 3.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        super(Vocabulary, self).__init__()

        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: 'PAD', SOS_token: 'SOS', EOS_token: 'EOS'}
        self.num_words = 3  # count constant tokens PAD, SOS and EOS

    def add_sentence(self, sentence):
        """Register every space-separated word of ``sentence``."""
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        """Assign the next free index to a new word, or bump the count of a known one."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):
        """Drop every word seen fewer than ``min_count`` times.

        Only the first call has an effect; later calls return immediately.
        Kept words are re-indexed from 3 and their counts restart at 1.
        """
        if self.trimmed:
            return

        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        total_words = len(self.word2index)
        # an empty vocabulary has nothing to keep; report 0 instead of dividing by zero
        keep_ratio = len(keep_words) / total_words if total_words else 0.0
        print('Keep Words: {}/{} = {:.4f}'.format(len(keep_words), total_words, keep_ratio))

        # reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: 'PAD', SOS_token: 'SOS', EOS_token: 'EOS'}
        self.num_words = 3  # count constant tokens PAD, SOS and EOS

        for word in keep_words:
            self.add_word(word)

In [None]:
def read_vocabulary(data_file, corpus_name):
    """Read tab-separated sentence pairs and create an empty vocabulary.

    Args:
        data_file: path of a UTF-8 text file with one tab-separated pair per line.
        corpus_name: name given to the new ``Vocabulary``.

    Returns:
        ``(vocabulary, pairs)`` where ``pairs`` is a list of lists of
        normalized strings, one inner list per line of the file.
    """
    print('Reading lines...')

    # the context manager closes the file handle instead of leaking it
    with open(data_file, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    # split every line into pairs and normalize
    pairs = [[normalize_string(string) for string in l.split('\t')] for l in lines]
    vocabulary = Vocabulary(corpus_name)

    return vocabulary, pairs

In [None]:
def load_prepare_data(corpus, corpus_name, data_file, save_dir):
    """Load the formatted pair file, filter the pairs and fill the vocabulary.

    ``corpus`` and ``save_dir`` are accepted but not used by this function.

    Returns:
        ``(vocabulary, pairs)`` with the vocabulary populated from both
        sentences of every pair that passed ``filter_pairs``.
    """
    print('Starting training data preparation...')
    vocabulary, raw_pairs = read_vocabulary(data_file, corpus_name)
    print('Reading {!s} sentence pairs'.format(len(raw_pairs)))

    # filtering is delegated to the helper with the configured MAX_LENGTH
    kept_pairs = filter_pairs(raw_pairs, MAX_LENGTH)
    print('Trimmed to {!s} sentence pairs'.format(len(kept_pairs)))

    print('Counting words...')
    for entry in kept_pairs:
        vocabulary.add_sentence(entry[0])
        vocabulary.add_sentence(entry[1])
    print(f'Counted words: {vocabulary.num_words}')

    return vocabulary, kept_pairs

## Load Datasets

In [None]:
# dataset source: www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
corpus_name = 'cornell_movie_dialogs'
# directory expected to hold the raw corpus files
corpus = os.path.join('datasets', corpus_name)

In [None]:
def print_lines(file, n=10):
    """Print the first ``n`` lines of ``file`` as raw bytes objects.

    Args:
        file: path of the file to preview.
        n: maximum number of lines to print; must be non-negative
            (``None`` prints every line).
    """
    with open(file, 'rb') as data:
        # islice stops after n lines, so a large corpus file is never read in full
        for line in itertools.islice(data, n):
            print(line)

In [None]:
# preview the first raw lines of the corpus to see the field layout
print_lines(os.path.join(corpus, 'movie_lines.txt'))

## Preprocess Datasets

In [None]:
def load_lines(file_name, fields):
    """Parse a ' +++$+++ '-delimited file into a dict of records.

    Each line is split on the delimiter and its columns are mapped, in
    order, onto ``fields``. Records are keyed by their 'lineID' value.
    The last column keeps its trailing newline.
    """
    records = {}
    with open(file_name, 'r', encoding='iso-8859-1') as handle:
        for raw_line in handle:
            columns = raw_line.split(' +++$+++ ')
            record = {name: columns[position] for position, name in enumerate(fields)}
            records[record['lineID']] = record
    return records

In [None]:
def load_conversations(file_name, lines, fields):
    """Parse the conversations file and attach the referenced line records.

    Args:
        file_name: path of a ' +++$+++ '-delimited conversations file.
        lines: dict mapping a line id (e.g. 'L194') to its record, as
            produced by ``load_lines``.
        fields: column names, in file order; must include 'utteranceIDs'.

    Returns:
        A list with one dict per conversation. Each dict holds the parsed
        columns plus a 'lines' list of the records referenced by its
        'utteranceIDs' column, in order.

    Raises:
        KeyError: if a referenced line id is missing from ``lines``.
    """
    # compiled once: the pattern is identical for every row of the file
    utterance_id_pattern = re.compile(r'L[0-9]+')

    conversations = []
    with open(file_name, 'r', encoding='iso-8859-1') as f:
        for line in f:
            values = line.split(' +++$+++ ')
            conv_obj = {field: values[i] for i, field in enumerate(fields)}

            # the ids column looks like "['L194', 'L195']"; pull out each id
            line_ids = utterance_id_pattern.findall(conv_obj['utteranceIDs'])
            conv_obj['lines'] = [lines[line_id] for line_id in line_ids]

            conversations.append(conv_obj)
    return conversations

In [None]:
def extract_sentence_pairs(conversations):
    """Build [question, answer] pairs from consecutive lines of each conversation.

    Every line is paired with the line that follows it. Text is stripped
    of surrounding whitespace, and a pair is dropped when either side is
    empty after stripping.
    """
    qa_pairs = []
    for conversation in conversations:
        utterances = conversation['lines']
        # walk the conversation as (line, next line) couples
        for current, following in zip(utterances, utterances[1:]):
            question = current['text'].strip()
            answer = following['text'].strip()
            if not question or not answer:
                continue
            qa_pairs.append([question, answer])

    return qa_pairs

In [None]:
# path of the tab-separated question/answer file generated from the raw corpus
data_file = os.path.join(corpus, 'formatted_movie_lines.txt')

delimiter = '\t'
# run the delimiter through the unicode_escape codec (a literal tab is left unchanged)
delimiter = str(codecs.decode(delimiter, 'unicode_escape'))

In [None]:
lines = {}
conversations = []
MOVIE_LINES_FIELDS = ['lineID', 'characterID', 'movieID', 'character', 'text']
MOVIE_CONVERSATIONS_FIELDS = ['character1ID', 'character2ID', 'movieID', 'utteranceIDs']

# parse every utterance of the corpus, keyed by its line id
print('\nProcessing corpus...')
movie_lines_path = os.path.join(corpus, 'movie_lines.txt')
lines = load_lines(movie_lines_path, MOVIE_LINES_FIELDS)

# group the utterances into conversations
print('\nLoading conversations...')
movie_conversations_path = os.path.join(corpus, 'movie_conversations.txt')
conversations = load_conversations(movie_conversations_path, lines, MOVIE_CONVERSATIONS_FIELDS)

# dump one question/answer pair per row into the formatted data file
print('\nWriting newly formatted file...')
with open(data_file, 'w', encoding='utf-8') as output_file:
    writer = csv.writer(output_file, delimiter=delimiter, lineterminator='\n')
    writer.writerows(extract_sentence_pairs(conversations))

print('\nSample lines from file:')
print_lines(data_file)

In [None]:
# build the vocabulary and the filtered sentence pairs from the formatted file
save_dir = os.path.join('datasets', 'save')
vocabulary, pairs = load_prepare_data(corpus, corpus_name, data_file, save_dir)

# show a few pairs as a sanity check
print('\nPairs:')
for pair in pairs[:10]: print(pair)

In [None]:
def trim_rare_words(vocabulary, pairs, min_count):
    """Trim rare words from the vocabulary and drop the pairs that use them.

    Args:
        vocabulary: object exposing ``trim(min_count)`` and ``word2index``.
        pairs: sequence of ``[input_sentence, output_sentence]`` pairs.
        min_count: minimum occurrence count for a word to be kept.

    Returns:
        The pairs whose input and output sentences contain only words
        still present in ``vocabulary.word2index`` after trimming.
    """
    vocabulary.trim(min_count)
    known_words = vocabulary.word2index

    def only_known_words(sentence):
        return all(word in known_words for word in sentence.split(' '))

    # only keep pairs that do not contain trimmed word(s) in their input or output sentence
    keep_pairs = [pair for pair in pairs
                  if only_known_words(pair[0]) and only_known_words(pair[1])]

    # an empty input would otherwise raise ZeroDivisionError in the report below
    kept_ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
    print('Trimmed from {} pairs to {}, {:.4f} of total.'.format(len(pairs), len(keep_pairs), kept_ratio))
    return keep_pairs

In [None]:
# drop rare words from the vocabulary and every pair that contains one
pairs = trim_rare_words(vocabulary, pairs, MIN_COUNT)

# show a few of the remaining pairs
print('\nPairs:')
for pair in pairs[:10]: print(pair)

## Prepare Datasets for Model

In [None]:
def zero_padding(l, fillvalue=PAD_token):
    """Transpose a batch of sequences, padding the short ones.

    The result is a list of tuples: tuple ``t`` holds the ``t``-th item of
    every sequence in ``l``, with ``fillvalue`` standing in for sequences
    that are already exhausted.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(columns)

In [None]:
def binary_matrix(l, value=PAD_token):
    """Build a 0/1 mask with the same layout as ``l``.

    Args:
        l: iterable of token sequences.
        value: token to mask out; defaults to the padding token.

    Returns:
        A list of lists holding 0 where the token equals ``value`` and 1
        everywhere else.
    """
    # compare against `value` so the parameter is honoured rather than ignored
    return [[0 if token == value else 1 for token in sequence] for sequence in l]

In [None]:
# returns padded input sequence tensor and lengths
def input_var(l, vocabulary):
    """Encode input sentences into a padded index tensor plus their lengths.

    Returns:
        ``(pad_var, lengths)``: a LongTensor built from the transposed,
        padded index lists, and a tensor holding the number of indexes
        produced for each sentence.
    """
    token_ids = []
    sizes = []
    for sentence in l:
        ids = indexes_from_sentence(vocabulary, sentence, EOS_token)
        token_ids.append(ids)
        sizes.append(len(ids))

    lengths = torch.tensor(sizes)
    pad_var = torch.LongTensor(zero_padding(token_ids))

    return pad_var, lengths

In [None]:
# returns padded target sequence tensor, padding mask and max target length
def output_var(l, vocabulary):
    """Encode target sentences into a padded tensor, a padding mask and a max length.

    Returns:
        ``(pad_var, mask, max_target_len)``: the padded LongTensor of
        indexes, a BoolTensor that is False at padded positions, and the
        length of the longest encoded sentence.
    """
    encoded = [indexes_from_sentence(vocabulary, sentence, EOS_token) for sentence in l]
    max_target_len = max(len(ids) for ids in encoded)

    padded = zero_padding(encoded)
    mask = torch.BoolTensor(binary_matrix(padded))
    pad_var = torch.LongTensor(padded)

    return pad_var, mask, max_target_len

In [None]:
# returns all items for a given batch of pairs
def batch_to_train_data(vocabulary, pair_batch):
    """Convert a batch of sentence pairs into the tensors used for training.

    The pairs are ordered by descending input word count before encoding.
    The caller's list is left untouched.

    Args:
        vocabulary: vocabulary used to turn words into indexes.
        pair_batch: list of ``[input_sentence, output_sentence]`` pairs.

    Returns:
        ``(inp, lengths, output, mask, max_target_len)`` as produced by
        ``input_var`` and ``output_var``.
    """
    # sorted() works on a copy; list.sort() would reorder the caller's list in place
    ordered = sorted(pair_batch, key=lambda pair: len(pair[0].split(' ')), reverse=True)
    input_batch = [pair[0] for pair in ordered]
    output_batch = [pair[1] for pair in ordered]

    inp, lengths = input_var(input_batch, vocabulary)
    output, mask, max_target_len = output_var(output_batch, vocabulary)

    return inp, lengths, output, mask, max_target_len

In [None]:
# size of the small sample batch used to inspect the prepared tensors
SMALL_BATCH_SIZE = 5

In [None]:
# build one small random batch and print every tensor as a sanity check
batches = batch_to_train_data(vocabulary, [random.choice(pairs) for _ in range(SMALL_BATCH_SIZE)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("Input Variable:", input_variable)
print("Lengths:", lengths)
print("Target Variable:", target_variable)
print("Mask:", mask)
print("Max Target Len:", max_target_len)

## Build [Seq2seq](https://arxiv.org/pdf/1409.3215.pdf) Network

In [None]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder over embedded word indexes."""

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        """
        Args:
            hidden_size: GRU hidden size; must equal the embedding dimension.
            embedding: embedding module mapping word indexes to vectors.
            n_layers: number of stacked GRU layers.
            dropout: dropout between GRU layers (ignored when ``n_layers == 1``).
        """
        super(Encoder, self).__init__()

        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding_layer = embedding

        # initialize GRU: the input size and hidden size params are both set to 'hidden_size'
        # because our input is a word embedding whose number of features equals 'hidden_size'
        self.gru_layer = nn.GRU(hidden_size, hidden_size, n_layers,
                                dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch.

        Args:
            input_seq: word indexes, shape (max_length, batch_size).
            input_lengths: true length of each sequence, in descending order.
            hidden: optional initial hidden state.

        Returns:
            ``(outputs, hidden)``: outputs has shape
            (max_length, batch_size, hidden_size) with both directions
            summed; hidden has shape
            (n_layers x num_directions, batch_size, hidden_size).
        """
        # convert word indexes to embeddings
        embeds = self.embedding_layer(input_seq)

        # pack_padded_sequence requires the lengths on the CPU, even when the
        # batch itself lives on the GPU
        if torch.is_tensor(input_lengths):
            input_lengths = input_lengths.cpu()

        # pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embeds, input_lengths)

        # forward pass through GRU layer
        gru_out, hidden = self.gru_layer(packed, hidden)

        # unpack the padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(gru_out)

        # sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]

        # return output and final hidden state
        return outputs, hidden

## Build [Attention](https://arxiv.org/pdf/1706.03762.pdf) Decoder Network

In [None]:
class GlobalAttention(nn.Module):
    """Global attention with a 'dot', 'general' or 'concat' score function.

    Produces, for each batch element, a softmax-normalized weight for
    every encoder time step.
    """

    def __init__(self, method, hidden_size):
        """
        Args:
            method: one of 'dot', 'general' or 'concat'.
            hidden_size: feature size of the decoder state and encoder outputs.

        Raises:
            ValueError: if ``method`` is not a supported score function.
        """
        super(GlobalAttention, self).__init__()

        self.method = method

        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(f'{self.method} is not an appropriate attention method.')

        self.hidden_size = hidden_size

        if self.method == 'general':
            # 'general' projects the encoder outputs alone: hidden -> hidden
            self.attention_layer = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            # 'concat' projects [decoder state; encoder output]: 2 * hidden -> hidden
            self.attention_layer = nn.Linear(self.hidden_size * 2, hidden_size)
            # explicit init: torch.FloatTensor(n) would expose uninitialized memory
            bound = hidden_size ** -0.5
            self.v = nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        """Score each time step by the dot product with the decoder state."""
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        """Score each time step by the dot product with the projected encoder output."""
        energy = self.attention_layer(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        """Score each time step through a tanh layer over the concatenated states."""
        energy = self.attention_layer(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape (batch_size, 1, max_length).

        Args:
            hidden: current decoder output, shape (1, batch_size, hidden_size).
            encoder_outputs: shape (max_length, batch_size, hidden_size).
        """
        # calculate the attention energies based on the given method
        if self.method == 'general':
            attention_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attention_energies = self.concat_score(hidden, encoder_outputs)
        else:  # 'dot' -- the constructor rejects every other value
            attention_energies = self.dot_score(hidden, encoder_outputs)

        # transpose max_length and batch_size dimensions
        attention_energies = attention_energies.t()

        # return the softmax normalized probability scores (with added dimension)
        return F.softmax(attention_energies, dim=1).unsqueeze(1)

In [None]:
class GlobalAttentionDecoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Tensor shapes, per the original author's notes:
        input_step:      (1, batch_size)
        last_hidden:     (n_layers x num_directions, batch_size, hidden_size)
        encoder_outputs: (max_length, batch_size, hidden_size)
    """

    def __init__(self, embedding, hidden_size, output_size, n_layers=1, dropout=0.1, attention_model='dot'):
        super(GlobalAttentionDecoder, self).__init__()

        # keep the configuration around for inspection
        self.attention_model = attention_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # layers
        self.embedding_layer = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        inter_layer_dropout = 0 if n_layers == 1 else dropout
        self.gru_layer = nn.GRU(hidden_size, hidden_size, n_layers, dropout=inter_layer_dropout)
        self.concat_layer = nn.Linear(hidden_size * 2, hidden_size)
        self.fc_layer = nn.Linear(hidden_size, output_size)

        self.attention_layer = GlobalAttention(attention_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """Decode one time step for the whole batch.

        Returns:
            ``(output, hidden)``: softmax probabilities over the output
            vocabulary, shape (batch_size, output_size), and the updated
            GRU hidden state.
        """
        # embed the current input word and apply dropout
        embedded = self.embedding_dropout(self.embedding_layer(input_step))

        # one step through the unidirectional GRU
        rnn_output, hidden = self.gru_layer(embedded, last_hidden)

        # attention weights computed from the current GRU output
        attention_weights = self.attention_layer(rnn_output, encoder_outputs)

        # weighted sum of the encoder outputs gives the context vector
        context = attention_weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)

        # merge GRU output and context, then squash
        merged = torch.cat((rnn_output.squeeze(0), context), 1)
        concat_output = torch.tanh(self.concat_layer(merged))

        # probability distribution over the next word
        output = F.softmax(self.fc_layer(concat_output), dim=1)

        return output, hidden

#### Initialize Seq2seq Network

In [None]:
# single embedding table shared by the encoder and the decoder
EMBEDDING = nn.Embedding(vocabulary.num_words, HIDDEN_SIZE)

In [None]:
# build the encoder on top of the shared embedding and move it to the selected device
encoder = Encoder(HIDDEN_SIZE, EMBEDDING, ENCODER_N_LAYERS, DROPOUT)
encoder.to(device)

In [None]:
# build the attention decoder (dot-product scoring) with one output per vocabulary word
decoder = GlobalAttentionDecoder(EMBEDDING, HIDDEN_SIZE, vocabulary.num_words, DECODER_N_LAYERS, DROPOUT, attention_model='dot')
decoder.to(device)

## Set Loss Function

In [None]:
def mask_nll_loss(input_seq, target_seq, mask):
    """Average negative log-likelihood over the unmasked targets.

    Args:
        input_seq: per-class probabilities; indexed along dim 1 by the target.
        target_seq: target class index for each row of ``input_seq``.
        mask: boolean tensor selecting the rows that contribute to the loss.

    Returns:
        ``(loss, n_total)``: the mean loss over selected rows (moved to the
        global ``device``) and the number of selected rows as a Python number.
    """
    n_total = mask.sum()

    # probability the model assigned to each target class
    target_probs = torch.gather(input_seq, 1, target_seq.view(-1, 1)).squeeze(1)
    cross_entropy = -torch.log(target_probs)

    loss = cross_entropy.masked_select(mask).mean().to(device)

    return loss, n_total.item()

## Set Optimizer

In [None]:
# separate Adam optimizers; the decoder learning rate is scaled by DECODER_LEARNING_RATIO
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=LR)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=LR * DECODER_LEARNING_RATIO)

## Train Seq2seq Network

Sequence of Operations:

1. Forward pass entire input batch through encoder
2. Initialize decoder inputs as SOS_token, and hidden state as the encoder’s final hidden state
3. Forward input batch sequence through decoder one time step at a time
4. If teacher forcing: set next decoder input as the current target; else: set next decoder input as current decoder output
5. Calculate and accumulate loss
6. Perform backpropagation
7. Clip gradients
8. Update encoder and decoder model parameters

In [None]:
tick = time.time()
losses_history = []
total_loss_print = 0
total_loss_plot = 0

# NOTE(review): print_every is larger than the configured N_EPOCHS (4000), so
# no progress line is printed with the current settings -- confirm intent
print_every = 5000
plot_every = 100

# pre-sample one random batch per training iteration
training_batches = [batch_to_train_data(vocabulary, [random.choice(pairs) for _ in range(BATCH_SIZE)])
                    for _ in range(N_EPOCHS)]

# make sure dropout is active even if the models were switched to eval mode earlier
encoder.train()
decoder.train()

print('Training the network...')
for epoch in range(1, N_EPOCHS + 1):

    training_batch = training_batches[epoch - 1]

    # extract fields from batch
    input_variable, lengths, target_variable, mask, max_target_len = training_batch

    # zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # lengths stays on the CPU: pack_padded_sequence (used by the encoder)
    # rejects a lengths tensor that lives on the GPU
    lengths = lengths.to('cpu')

    # initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(BATCH_SIZE)]])
    decoder_input = decoder_input.to(device)

    # set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # determine if we are using teacher forcing this iteration
    USE_TEACHER_FORCING = random.random() < TEACHER_FORCING_RATIO

    # forward batch of sequences through decoder one time step at a time
    for t in range(max_target_len):

        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)

        if USE_TEACHER_FORCING:
            # teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
        else:
            # no teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = topi.detach().reshape(1, -1)

        # calculate and accumulate loss
        mask_loss, n_total = mask_nll_loss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * n_total)
        n_totals += n_total

    # perform backpropagation
    loss.backward()

    # clip gradients
    nn.utils.clip_grad_norm_(encoder.parameters(), CLIP)
    nn.utils.clip_grad_norm_(decoder.parameters(), CLIP)

    # adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    # per-token average loss of this iteration
    batch_loss = sum(print_losses) / n_totals
    total_loss_print += batch_loss
    total_loss_plot += batch_loss

    if epoch % print_every == 0:
        # fixed: the accumulator is named total_loss_print (print_loss_total was undefined)
        avg_loss_print = total_loss_print / print_every
        total_loss_print = 0
        print('%d | %d epochs, Average Loss: %.4f, Times Taken: %s' % (epoch, N_EPOCHS, avg_loss_print,
                                                                        time_since(tick, epoch / N_EPOCHS)))

    # fixed: modulo, not bitwise AND -- `epoch & plot_every == 0` was never true
    if epoch % plot_every == 0:
        avg_loss_plot = total_loss_plot / plot_every
        losses_history.append(avg_loss_plot)
        total_loss_plot = 0

In [None]:
# plot the recorded loss history; the meaning of the second argument (10) is
# defined by utils.show_plot_evaluation -- TODO confirm
show_plot_evaluation(losses_history, 10)

## Evaluate The Network

Sequence of Operations:

1. Forward input through encoder model
2. Prepare encoder’s final hidden layer to be first hidden input to the decoder
3. Initialize decoder’s first input as SOS_token
4. Initialize tensors to append decoded words to
5. Iteratively decode one word token at a time: forward pass through the decoder, obtain the most likely word token and its softmax score, record the token and score, then prepare the current token to be the next decoder input
6. Return collections of word tokens and scores

In [None]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step pick the single most probable next word."""

    def __init__(self, encoder, decoder):
        """Wrap the encoder and decoder used for decoding."""
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode a response for a single input sequence.

        Args:
            input_seq: word indexes, shape (sequence_length, 1).
            input_length: tensor holding the length of the input sequence.
            max_length: number of decoding steps to run.

        Returns:
            ``(all_tokens, all_scores)``: the chosen word index and its
            probability for each of the ``max_length`` steps.
        """
        # run on whichever device the input lives on instead of a global
        run_device = input_seq.device

        # forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)

        # prepare encoder's final hidden layer to be first hidden input to the decoder
        # (uses the wrapped decoder, not a module-level variable of the same name)
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]

        # initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=run_device, dtype=torch.long) * SOS_token

        # initialize tensors to append decoded words
        all_tokens = torch.zeros([0], device=run_device, dtype=torch.long)
        all_scores = torch.zeros([0], device=run_device)

        # iteratively decode one word token at a time, honouring the caller's limit
        for _ in range(max_length):

            # forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)

            # obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)

            # record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)

            # prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)

        # return collections of word tokens and scores
        return all_tokens, all_scores

#### Initialize Search Module

In [None]:
# wrap the trained encoder and decoder in the greedy search module
search_decoder = GreedySearchDecoder(encoder, decoder)
search_decoder.to(device)

In [None]:
def evaluate_network(encoder, decoder, search_decoder, vocabulary, input_sequence, max_length=MAX_LENGTH):
    """Decode a response for one normalized input sentence.

    ``encoder`` and ``decoder`` are accepted but not used directly; the
    decoding runs through ``search_decoder``.

    Args:
        vocabulary: vocabulary used to encode the input and decode the output.
        input_sequence: normalized input sentence.
        max_length: number of decoding steps passed to ``search_decoder``.

    Returns:
        The decoded words, one per decoding step.
    """
    # format input sentence as a batch
    indexes_batch = [indexes_from_sentence(vocabulary, input_sequence, EOS_token)]

    # create lengths tensor; it stays on the CPU because pack_padded_sequence
    # (used by the encoder) rejects a lengths tensor that lives on the GPU
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])

    # transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    input_batch = input_batch.to(device)

    # decode sentence with search decoder; no gradients are needed for inference
    with torch.no_grad():
        tokens, scores = search_decoder(input_batch, lengths, max_length)
    decoded_words = [vocabulary.index2word[token.item()] for token in tokens]

    return decoded_words

In [None]:
def input_sentence(encoder, decoder, search_decoder, vocabulary):
    """Run an interactive chat loop on standard input.

    Puts both models in eval mode, then reads sentences until the user
    types 'q' or 'quit', printing the decoded response for each one.
    """
    encoder.eval()
    decoder.eval()
    while True:
        # get input sentence
        user_text = input('💻: ')

        # check if it is quit case (the old check tested a variable that was always '')
        if user_text in ('q', 'quit'):
            break

        # normalize sentence
        user_text = normalize_string(user_text)

        try:
            # evaluate sentence
            output_words = evaluate_network(encoder, decoder, search_decoder, vocabulary, user_text)
        except KeyError:
            # presumably raised when a word is missing from the vocabulary --
            # confirm against utils.indexes_from_sentence; keep the chat alive
            print('Error: Encountered unknown word.')
            continue

        # format and print response sentence
        output_words = [x for x in output_words if x not in ('EOS', 'PAD')]
        print('🔥: ', ' '.join(output_words))

## Chat with Chatbot!

In [None]:
# start the interactive chat loop; type 'q' to leave
input_sentence(encoder, decoder, search_decoder, vocabulary)

---