<a href="https://colab.research.google.com/github/lu1993/DeepLearning/blob/master/Build_a_Chat_Bot_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Building a Chatbot**

In [0]:
import ast
import codecs
import csv
import itertools
import os
import random
import re
import unicodedata

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

In [0]:
# Select the GPU when one is available, otherwise fall back to the CPU.
CUDA = torch.cuda.is_available()
device = torch.device('cuda' if CUDA else 'cpu')
print(device)
# get_device_name() raises on machines without CUDA, so only query it on GPU runs.
if CUDA:
  print(torch.cuda.get_device_name(device))

cuda
Tesla P4


#### **Part 1: Data Preprocessing**

In [0]:
# Colab-only upload helper; the recorded output below shows the chosen archive
# (cornell_movie_dialogs_corpus.zip) being saved under its own name.
from google.colab import files
uploaded = files.upload()

Saving cornell_movie_dialogs_corpus.zip to cornell_movie_dialogs_corpus.zip


In [0]:
%%time
# unzip dataset in colab
# -p (mkdir) and -o (unzip) keep this cell re-runnable: no error when the directory
# already exists and no interactive overwrite prompt when files were extracted before.
!mkdir -p ./cornell_movie_dialogs_corpus
!unzip -o -q cornell_movie_dialogs_corpus.zip -d ./cornell_movie_dialogs_corpus

CPU times: user 16.2 ms, sys: 14.9 ms, total: 31.1 ms
Wall time: 3.8 s


In [0]:
# list the extracted corpus files (the inner folder name contains spaces, hence the quotes)
ls ./cornell_movie_dialogs_corpus/'cornell movie-dialogs corpus'/ 

chameleons.pdf                 movie_lines.txt            README.txt
movie_characters_metadata.txt  movie_titles_metadata.txt
movie_conversations.txt        raw_script_urls.txt


In [0]:
# Resolve the two corpus files used below; both live in the extracted inner folder.
dataset_dir = './cornell_movie_dialogs_corpus/'
lines_filepath, conv_filepath = (
    os.path.join(dataset_dir, 'cornell movie-dialogs corpus', filename)
    for filename in ('movie_lines.txt', 'movie_conversations.txt')
)
print(lines_filepath)

./cornell_movie_dialogs_corpus/cornell movie-dialogs corpus/movie_lines.txt


In [0]:
# visualize some lines
# Peek at the first raw records of movie_lines.txt to see the ' +++$+++ ' field layout.
with open(lines_filepath, 'r', encoding='iso-8859-1') as file:
  lines = list(file)
for line in lines[0:8]:
  print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [0]:
# Peek at the first raw records of movie_conversations.txt; the last field is a
# Python-style list of line IDs.
with open(conv_filepath, 'r', encoding='iso-8859-1') as file:
  lines = list(file)
for line in lines[0:8]:
  print(line.strip())

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']


In [0]:
# split each line into a dictionary of fields
# Result: `lines` maps lineID -> {field: value}. The 'text' value still carries the
# trailing newline here; it is stripped when the sentence pairs are built.
line_fields = ['lineID', 'characterID', 'movieID', 'character', 'text']
lines = {}
with open(lines_filepath, 'r', encoding='iso-8859-1') as f:
  for line in f:
    values = line.split(' +++$+++ ')
    # Index by position so a row with too few fields still raises IndexError.
    lineObj = {field: values[i] for i, field in enumerate(line_fields)}
    lines[lineObj['lineID']] = lineObj

In [0]:
# group lines into conversations
# Each row of movie_conversations.txt names the utterance IDs of one conversation;
# they are resolved against `lines` so every conversation carries its full line records.
conv_fields = ['character1ID', 'character2ID', 'movieID', 'utteranceIDs']
conversations = []
with open(conv_filepath, 'r', encoding='iso-8859-1') as f:
  for line in f:
    values = line.split(' +++$+++ ')
    convObj = {field: values[i] for i, field in enumerate(conv_fields)}
    # The field is a list literal such as "['L194', 'L195']". literal_eval parses it
    # without executing arbitrary expressions from the data file the way eval() would.
    lineIds = ast.literal_eval(convObj['utteranceIDs'].strip())
    convObj['lines'] = [lines[lineId] for lineId in lineIds]
    conversations.append(convObj)

In [0]:
# extract pairs of sentences from conversations
# Every utterance is paired with the one that follows it in the same conversation;
# pairs where either side is empty after stripping are skipped.
qa_pairs = []
for conversation in conversations:
  utterances = [entry['text'].strip() for entry in conversation['lines']]
  for question, answer in zip(utterances, utterances[1:]):
    if question and answer:
      qa_pairs.append([question, answer])

In [0]:
# show the first [input, reply] pair as a sanity check of the extraction above
qa_pairs[0]

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you."]

In [0]:
# mount google drive to colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# write transformed data to file in google drive
datafile = './drive/My Drive/formatted_movie_lines.txt'
delimiter = '\t'
# Kept from the original recipe: turns an escaped delimiter string into the real
# character (a no-op for a literal tab).
delimiter = str(codecs.decode(delimiter, 'unicode_escape'))
# newline='' is how the csv module expects its output file to be opened, and an explicit
# '\n' terminator keeps '\r' out of the rows that are later recovered with split('\n').
with open(datafile, 'w', encoding='utf-8', newline='') as outputfile:
  writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
  writer.writerows(qa_pairs)
print('Done writing to file')

In [0]:
# visualize some lines
# Read back as bytes so the tab delimiter and the line terminators are visible.
with open(datafile, 'rb') as file:
  lines = list(file)
for line in lines[0:8]:
  print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\n"
b'Why?\tUnsolved myster

In [0]:
# create the vocabulary
# Reserved token ids; real words are numbered from 3 upwards.
PAD_token = 0
SOS_token = 1
EOS_token = 2

class Vocabulary:
  """Word <-> index mapping with per-word occurrence counts.

  Attributes:
    name: label given at construction.
    word2index: word -> integer id (ids start at 3; 0-2 are PAD/SOS/EOS).
    word2count: word -> number of times the word was added.
    index2word: integer id -> word, including the three reserved tokens.
    num_words: size of index2word (reserved tokens included).
  """

  def __init__(self, name):
    self.name = name
    self._reset()

  def _reset(self):
    """Empty the mappings, leaving only the three reserved tokens."""
    self.word2index = {}
    self.word2count = {}
    self.index2word = {PAD_token: 'PAD', SOS_token: 'SOS', EOS_token: 'EOS'}
    self.num_words = 3

  def addSentence(self, sentence):
    """Add every space-separated token of `sentence` to the vocabulary."""
    for word in sentence.split(' '):
      self.addWord(word)

  def addWord(self, word):
    """Register `word` with the next free id, or bump its count if already known."""
    if word not in self.word2index:
      self.word2index[word] = self.num_words
      self.word2count[word] = 1
      self.index2word[self.num_words] = word
      self.num_words += 1
    else:
      self.word2count[word] += 1

  def trim(self, min_count):
    """Drop words seen fewer than `min_count` times and renumber the rest.

    Kept words are re-indexed in their original insertion order. Their real counts
    are preserved (previously they were all reset to 1, so a second call to trim()
    with min_count > 1 emptied the vocabulary).
    """
    kept_counts = {word: count for word, count in self.word2count.items() if count >= min_count}
    self._reset()
    for word, count in kept_counts.items():
      self.addWord(word)
      self.word2count[word] = count

In [0]:
# convert a unicode string to plain ASCII
def unicodeToAscii(s):
  """Strip accents: NFD-decompose `s` and drop every combining mark (category 'Mn').

  Characters that are not combining marks are returned unchanged, so non-ASCII
  characters without a decomposition pass through as-is.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)

In [0]:
def normalizeString(s):
  """Lowercase, de-accent and simplify a sentence for tokenisation.

  Steps: lowercase/strip and remove accents, put a space before '.', '!' and '?',
  replace every run of other non-letter characters with a single space, then
  collapse whitespace.
  """
  s = unicodeToAscii(s.lower().strip())
  s = re.sub(r"([.!?])", r" \1", s)
  # The range must be A-Z: the previous 'A-z' also matched [ \ ] ^ _ ` and let them through.
  s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
  s = re.sub(r"\s+", r" ", s).strip()
  return s

In [0]:
# quick check: digits and apostrophes become spaces, '!' and '?' become separate tokens
normalizeString("aa123aa!s's  dd?")

'aa aa !s s dd ?'

In [0]:
# load data from file
# datafile = os.path.join(dataset_dir, 'cornell movie-dialogs corpus', 'formatted_movie_lines.txt')
datafile = './drive/My Drive/formatted_movie_lines.txt'
# Read inside a context manager so the file handle is closed as soon as the text is loaded.
with open(datafile, encoding='utf-8') as formatted_file:
  lines = formatted_file.read().strip().split('\n')
# One row per line, tab-separated into [input, reply]; both sides are normalised.
pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines]
voc = Vocabulary('cornell movie-dialogs corpus')

In [0]:
# a fresh vocabulary holds only the three reserved tokens (PAD, SOS, EOS)
voc.num_words

3

In [0]:
# first three normalised [input, reply] pairs
pairs[:3]

[['can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
  'well i thought we d start with pronunciation if that s okay with you .'],
 ['well i thought we d start with pronunciation if that s okay with you .',
  'not the hacking and gagging and spitting part . please .'],
 ['not the hacking and gagging and spitting part . please .',
  'okay . . . then how bout we try out some french cuisine . saturday ? night ?']]

In [0]:
# keep only pairs in which both sentences have fewer than MAX_LENGTH words
MAX_LENGTH = 10
def filterPair(p):
  """Return True when both sentences of pair `p` have fewer than MAX_LENGTH tokens."""
  return len(p[0].split()) < MAX_LENGTH and len(p[1].split()) < MAX_LENGTH

def filterPairs(p):
  """Return the pairs of `p` that pass filterPair.

  Filters the list that is passed in; previously the argument was ignored and the
  notebook-level `pairs` variable was read instead.
  """
  return [pair for pair in p if filterPair(pair)]

In [0]:
# Drop rows that did not split into at least two fields, then apply the length filter.
pairs = [pair for pair in pairs if len(pair) >= 2]
print('There are {} pairs before filtering'.format(len(pairs)))
pairs = filterPairs(pairs)
print('There are {} pairs after filtering'.format(len(pairs)))

There are 221282 pairs before filtering
There are 64266 pairs after filtering


In [0]:
# add words to vocabulary
# Both sides of every pair contribute words and counts.
for pair in pairs:
  for sentence in (pair[0], pair[1]):
    voc.addSentence(sentence)
print('Counted words:', voc.num_words)
for pair in pairs[0:10]:
  print(pair)

Counted words: 18077
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [0]:
# vocabulary size before rare words are trimmed (includes the 3 reserved tokens)
voc.num_words

18077

In [0]:
# trim words by count
MIN_COUNT = 3
def trimRareWords(voc, pairs, MIN_COUNT):
  """Trim `voc` to words seen at least MIN_COUNT times and drop pairs using other words.

  Calls voc.trim(MIN_COUNT) first (this mutates `voc`), then returns a new list with
  only the pairs whose input and output sentences consist entirely of kept words.
  """
  voc.trim(MIN_COUNT)

  def all_known(sentence):
    # True when every space-separated token still has an index after trimming.
    return all(voc.word2index.get(token) is not None for token in sentence.split(' '))

  survivors = []
  for pair in pairs:
    source, reply = pair[0], pair[1]
    if all_known(source) and all_known(reply):
      survivors.append(pair)
  return survivors

In [0]:
# trims `voc` in place and collects the pairs that only use surviving words
keep_pairs = trimRareWords(voc, pairs, MIN_COUNT)

In [0]:
# report how many pairs survived the rare-word trimming
print('There are {} pairs after trimming words'.format(len(keep_pairs)))

There are 53115 pairs after trimming words


In [0]:
# from here on `pairs` refers to the trimmed pair list
pairs = keep_pairs

In [0]:
# convert one sentence into a list of word indexes terminated by EOS_token
# (sentences have different lengths, so a batch of these lists is ragged until padded)
def indexesFromSentence(voc, sentence):
  """Map each space-separated word of `sentence` to its index and append EOS_token.

  Raises KeyError when a word is not in voc.word2index.
  """
  token_ids = [voc.word2index[word] for word in sentence.split(' ')]
  token_ids.append(EOS_token)
  return token_ids

In [0]:
# test on some samples
# `indexes` is a ragged list: one index list (ending in EOS) per input sentence.
inp = [pair[0] for pair in pairs[:10]]
out = [pair[1] for pair in pairs[:10]]
print(inp)
indexes = [indexesFromSentence(voc, sentence) for sentence in inp]
print(indexes)

['there .', 'you have my word . as a gentleman', 'hi .', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?', 'wow']
[[3, 4, 2], [7, 8, 9, 10, 4, 11, 12, 13, 2], [16, 4, 2], [8, 31, 22, 6, 2], [33, 34, 4, 4, 4, 2], [35, 36, 37, 38, 7, 39, 40, 41, 4, 2], [42, 2], [47, 7, 48, 40, 45, 49, 6, 2], [50, 51, 52, 6, 2], [58, 2]]


In [0]:
# zero padding
# output matrix size: max(sequence_length) * batch_size
def zeroPadding(l, fillvalue = 0):
  """Transpose a ragged batch of index lists into equal-length rows.

  Row t holds the t-th token of every sequence; sequences shorter than the longest
  one are filled with `fillvalue`. Returns a list of tuples.
  """
  time_major = itertools.zip_longest(*l, fillvalue = fillvalue)
  return [step for step in time_major]

In [0]:
# pad the sample index lists; each printed tuple is one time step across the batch
padded_indexes = zeroPadding(indexes)
print(padded_indexes)

[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58), (4, 8, 4, 31, 34, 36, 2, 7, 51, 2), (2, 9, 2, 22, 4, 37, 0, 48, 52, 0), (0, 10, 0, 6, 4, 38, 0, 40, 6, 0), (0, 4, 0, 2, 4, 7, 0, 45, 2, 0), (0, 11, 0, 0, 2, 39, 0, 49, 0, 0), (0, 12, 0, 0, 0, 40, 0, 6, 0, 0), (0, 13, 0, 0, 0, 41, 0, 2, 0, 0), (0, 2, 0, 0, 0, 4, 0, 0, 0, 0), (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]


In [0]:
def binaryMatrix(l):
  """Build a 0/1 mask with the same layout as `l`: 0 where the token is PAD_token, else 1."""
  return [[0 if token == PAD_token else 1 for token in seq] for seq in l]

In [0]:
# mask for the padded samples: 1 marks a real token, 0 marks padding
binary_result = binaryMatrix(padded_indexes)
print(binary_result)

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 1, 1, 0], [0, 1, 0, 1, 1, 1, 0, 1, 1, 0], [0, 1, 0, 1, 1, 1, 0, 1, 1, 0], [0, 1, 0, 0, 1, 1, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0, 1, 0, 0], [0, 1, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]


In [0]:
def inputVar(l, voc):
  """Turn a list of input sentences into a padded index tensor plus their lengths.

  Returns:
    padVar: LongTensor laid out as (max length, batch size), because zeroPadding
      returns one row per time step.
    lengths: 1-D tensor with the unpadded length (EOS included) of each sentence.
  """
  indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
  lengths = torch.tensor(list(map(len, indexes_batch)))
  padVar = torch.LongTensor(zeroPadding(indexes_batch))
  return padVar, lengths

In [0]:
def outputVar(l, voc):
  """Turn a list of target sentences into a padded index tensor, its mask and max length.

  Returns:
    padVar: LongTensor laid out as (max length, batch size).
    mask: ByteTensor with the same layout; 1 for real tokens, 0 for padding.
    max_target_len: length (EOS included) of the longest target sentence.
  """
  indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
  max_target_len = max(map(len, indexes_batch))
  padList = zeroPadding(indexes_batch)
  padVar = torch.LongTensor(padList)
  mask = torch.ByteTensor(binaryMatrix(padList))
  return padVar, mask, max_target_len

In [0]:
def batch2TrainData(voc, pair_batch):
  """Convert a batch of [input, reply] pairs into the tensors used for training.

  Pairs are ordered by descending input length (number of space-separated words),
  which is the order the packed-sequence encoder expects. The ordering is done on a
  copy, so the caller's `pair_batch` list is left untouched.

  Returns:
    (inp, lengths, output, mask, max_target_len) as produced by inputVar/outputVar.
  """
  ordered = sorted(pair_batch, key = lambda pair: len(pair[0].split(' ')), reverse=True)
  input_batch = [pair[0] for pair in ordered]
  output_batch = [pair[1] for pair in ordered]
  inp, lengths = inputVar(input_batch, voc)
  output, mask, max_target_len = outputVar(output_batch, voc)
  return inp, lengths, output, mask, max_target_len

In [0]:
# test function
# Build one small random batch and print every tensor batch2TrainData returns.
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
inp, lengths, output, mask, max_target_len = batches
for label, value in (
    ('input variables', inp),
    ('lengths', lengths),
    ('output variables', output),
    ('mask', mask),
    ('maximum output length', max_target_len),
):
  print(label)
  print(value)

input variables
tensor([[  51,   34,   47, 5331,  625],
        [ 180, 2981,    7, 5332,    4],
        [7446,   27,   18,    6,    2],
        [  66,   14,  618,    2,    0],
        [2862,  187,    6,    0,    0],
        [   4,    4,    2,    0,    0],
        [   2,    2,    0,    0,    0]])
lengths
tensor([7, 7, 6, 4, 3])
output variables
tensor([[2862,   25,  177,  122,  869],
        [7641,  118,   12,   34,  684],
        [  66,   40,  810, 2323,    4],
        [   2,  380,  234, 3699,    2],
        [   0,  187,  810,   98,    0],
        [   0,  349, 3663,  157,    0],
        [   0,    4,    4,  573,    0],
        [   0,    2,    4,  252,    0],
        [   0,    0,    4,    4,    0],
        [   0,    0,    2,    2,    0]])
mask
tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [0, 1, 1, 1, 0],
        [0, 1, 1, 1, 0],
        [0, 1, 1, 1, 0],
        [0, 1, 1, 1, 0],
        [0, 0, 1, 1, 0],
        [0, 0, 1, 1, 0]

#### **Part 2: Defining the Models**

In [0]:
# implement encoder layer
class EncoderRNN(nn.Module):
  """Bidirectional GRU encoder over embedded, padded token batches.

  Args:
    hidden_size: width of the GRU hidden state. The GRU input size is set to the
      same value, so `embedding` must produce vectors of this size.
    embedding: module that maps token ids to vectors.
    n_layer: number of stacked GRU layers.
    dropout: dropout between GRU layers; forced to 0 when n_layer == 1.
  """
  def __init__(self, hidden_size, embedding, n_layer = 1, dropout = 0):
    super(EncoderRNN, self).__init__()
    self.n_layer = n_layer
    self.hidden_size = hidden_size # dimensions of RNN cells in a hidden layer
    self.embedding = embedding
    self.gru = nn.GRU(hidden_size, hidden_size, n_layer, dropout=(0 if n_layer == 1 else dropout), bidirectional=True)

  def forward(self, input_seq, input_length, hidden = None):
    """Encode a padded batch.

    Args:
      input_seq: token ids laid out as (max length, batch size).
      input_length: unpadded length of each sequence, in descending order
        (pack_padded_sequence is called with its default enforce_sorted=True).
      hidden: optional initial GRU hidden state.

    Returns:
      outputs: (max length, batch size, hidden_size), forward and backward
        direction outputs summed.
      hidden: final GRU state, (n_layer * 2, batch size, hidden_size).
    """
    # convert word indexes to embeddings
    embedded = self.embedding(input_seq)
    # pack_padded_sequence requires the lengths on the CPU; callers in this notebook
    # move them to the training device, so bring a tensor back before packing.
    if isinstance(input_length, torch.Tensor):
      input_length = input_length.cpu()
    # pack padded sequence together to save computation power
    packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_length)
    outputs, hidden = self.gru(packed, hidden)
    # unpack outputs: (max length, batch size, 2 * hidden_size)
    outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
    # add bidirectional outputs together
    outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
    return outputs, hidden

In [0]:
# implement attention layer
class Attn(torch.nn.Module):
  """Dot-product attention over the encoder outputs.

  `method` and `hidden_size` are stored, but only the dot score is implemented:
  forward() always uses dot_score regardless of `method`.
  """
  def __init__(self, method, hidden_size):
    super().__init__()
    self.method = method
    self.hidden_size = hidden_size

  def dot_score(self, hidden, encoder_output):
    """Dot product of the decoder state with every encoder state.

    Broadcasting (1, batch, hidden) against (max length, batch, hidden) and summing
    over the hidden axis yields energies of shape (max length, batch).
    """
    return (hidden * encoder_output).sum(dim = 2)

  def forward(self, hidden, encoder_outputs):
    """Return attention weights of shape (batch size, 1, max length).

    Args:
      hidden: current decoder output, (1, batch size, hidden size).
      encoder_outputs: (max length, batch size, hidden size).
    """
    # energies come out as (max length, batch); transpose so each row is one batch item
    energies = self.dot_score(hidden, encoder_outputs).t()
    # normalise over the sequence axis, then add the middle axis expected by bmm
    weights = F.softmax(energies, dim = 1)
    return weights.unsqueeze(1)

In [0]:
# implement decoder layer
class LuongAttnDecoderRNN(nn.Module):
  """One-step GRU decoder with attention over the encoder outputs.

  Args:
    attn_model: attention method name, passed through to Attn.
    embedding: module that maps token ids to vectors of size hidden_size.
    hidden_size: width of the GRU state and of the attention context.
    output_size: number of output classes (vocabulary size).
    n_layers: number of stacked GRU layers.
    dropout: embedding dropout, also used between GRU layers when n_layers > 1.
  """
  def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers = 1, dropout = 0.1):
    super().__init__()
    # keep the configuration around
    self.attn_model, self.hidden_size, self.output_size = attn_model, hidden_size, output_size
    self.n_layers, self.dropout = n_layers, dropout

    # define layers
    gru_dropout = dropout if n_layers != 1 else 0
    self.embedding = embedding
    self.embedding_dropout = nn.Dropout(dropout)
    self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout = gru_dropout)
    self.concat = nn.Linear(hidden_size * 2, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)
    self.attn = Attn(attn_model, hidden_size)

  def forward(self, input_step, last_hidden, encoder_outputs):
    """Decode a single time step.

    Args:
      input_step: token ids for this step, (1, batch size).
      last_hidden: previous GRU state, (n_layers, batch size, hidden size).
      encoder_outputs: (max length, batch size, hidden size).

    Returns:
      output: softmax probabilities over the vocabulary, (batch size, output_size).
      hidden: updated GRU state.
    """
    embedded = self.embedding_dropout(self.embedding(input_step))
    rnn_output, hidden = self.gru(embedded, last_hidden)
    # (batch, 1, max length) x (batch, max length, hidden) -> (batch, 1, hidden)
    attn_weights = self.attn(rnn_output, encoder_outputs)
    context = torch.bmm(attn_weights, encoder_outputs.transpose(0, 1)).squeeze(1)  # (batch, hidden)
    # (1, batch, hidden) -> (batch, hidden), then join with the context vector
    features = torch.cat((rnn_output.squeeze(0), context), 1)
    projected = torch.tanh(self.concat(features))
    # softmax across each row, so every batch item gets a probability distribution
    return F.softmax(self.out(projected), dim = 1), hidden

In [0]:
# implement loss function
def maskNLLLoss(decoder_out, target, mask):
  """Mean negative log-likelihood of the target tokens, ignoring padded positions.

  Args:
    decoder_out: probabilities, (batch size, vocabulary size).
    target: target token ids, (batch size,).
    mask: per-item flags for this time step; entries > 0 count toward the loss.

  Returns:
    (loss, nTotal): the mean loss over the unmasked items (moved to `device`) and
    the number of unmasked items as a Python number.
  """
  nTotal = mask.sum()
  # probability assigned to the correct token of each batch item
  picked = decoder_out.gather(1, target.view(-1, 1)).squeeze(1)
  per_item = -picked.log()
  # keep only positions flagged by the mask, then average
  loss = per_item.masked_select(mask > 0).mean().to(device)
  return loss, nTotal.item()

In [0]:
# import torch
# decoder_out = torch.FloatTensor([[0.1, 0.8, 0.1], 
#                                 [0.9, 0.05, 0.05]])
# target = torch.LongTensor([1, 0])
# target = target.view(-1, 1)
# gathered_tensor = torch.gather(decoder_out, 1, target)
# -torch.log(gathered_tensor)
# gathered_tensor.squeeze(1)
# gathered_tensor.dtype

torch.float32

In [0]:
# visualize model training
# Draw one small random batch and report the shape of each piece.
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

for label, value in (
    ('input_variable shape:', input_variable.shape),
    ('lengths shape:', lengths.shape),
    ('target_variabel shape', target_variable.shape),
    ('mask shape:', mask.shape),
    ('max_target_len:', max_target_len),
):
  print(label, value)

input_variable shape: torch.Size([7, 5])
lengths shape: torch.Size([5])
target_variabel shape torch.Size([10, 5])
mask shape: torch.Size([10, 5])
max_target_len: 10


In [0]:
# Move the batch tensors to the selected device.
input_variable = input_variable.to(device)
target_variable = target_variable.to(device)
mask = mask.to(device)
# `lengths` is deliberately kept on the CPU: it is only consumed by
# pack_padded_sequence inside the encoder, which requires CPU lengths.
lengths = lengths.to('cpu')

In [0]:
# define parameters
hidden_size = 500        # GRU state width; also the embedding dimension (see nn.Embedding below)
encoder_n_layers = 2     # stacked GRU layers in the encoder
decoder_n_layers = 2     # stacked GRU layers in the decoder
dropout = 0.1
attn_model = 'dot'       # the Attn class in this notebook only implements the dot score
# one embedding table, handed to both the encoder and the decoder below
embedding = nn.Embedding(voc.num_words, hidden_size)

In [0]:
# define encoder and decoder
# Both models share the same embedding module and are put in training mode.
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout).to(device)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout).to(device)
encoder.train()
decoder.train()

LuongAttnDecoderRNN(
  (embedding): Embedding(7840, 500)
  (embedding_dropout): Dropout(p=0.1, inplace=False)
  (gru): GRU(500, 500, num_layers=2, dropout=0.1)
  (concat): Linear(in_features=1000, out_features=500, bias=True)
  (out): Linear(in_features=500, out_features=7840, bias=True)
  (attn): Attn()
)

In [0]:
# initialize optimizers
# One Adam optimizer per model, both starting from cleared gradients.
encoder_optimizer = optim.Adam(encoder.parameters(), lr = 0.0001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr = 0.0001)
for _optimizer in (encoder_optimizer, decoder_optimizer):
  _optimizer.zero_grad()

In [0]:
# accumulators for the step-by-step walkthrough below
loss = 0           # running sum of the per-step masked losses
print_losses = []  # per-step loss multiplied by the number of unmasked targets
n_totals = 0       # number of unmasked targets seen so far

In [0]:
# Run the encoder once and prepare the decoder's first input and hidden state.
encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
print('Encoder Outputs Shape:', encoder_outputs.shape)
print('Last Encoder Hidden Shape:', encoder_hidden.shape)

# every sequence in the batch starts decoding from the SOS token: shape (1, batch size)
decoder_input = torch.LongTensor([[SOS_token] * small_batch_size]).to(device)
print('Initial Decoder Input Shape:', decoder_input.shape)
print(decoder_input)

# encoder hidden state is (number of layers * number of directions, batch size, hidden size);
# the decoder is seeded with its first decoder.n_layers entries
decoder_hidden = encoder_hidden[:decoder.n_layers]
print('Initial Decoder Hidden State Shape:', decoder_hidden.shape)

Encoder Outputs Shape: torch.Size([9, 5, 500])
Last Encoder Hidden Shape: torch.Size([4, 5, 500])
Initial Decoder Input Shape: torch.Size([1, 5])
tensor([[1, 1, 1, 1, 1]], device='cuda:0')
Initial Decoder Hidden State Shape: torch.Size([2, 5, 500])


In [0]:
# banner for the verbose per-timestep walkthrough in the next cell
print("Take a look at what's happening in every timestep of the GRU")

In [0]:
# assume we use teacher forcing
# Inspection-only walkthrough of the decoding loop: it prints shapes and the running
# loss but never calls backward(). The optimizer.step() calls that used to sit inside
# this loop were removed: no gradient is computed here, so they were either no-ops or
# (after an earlier training call) re-applied stale gradients once per timestep.
for t in range(max_target_len):
  decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
  print('Decoder Output Shape:', decoder_output.shape)
  print('Decoder Hidden Shape:', decoder_hidden.shape)

  # teacher forcing: the ground-truth token of this step is the next decoder input
  decoder_input = target_variable[t].view(1, -1)
  print('The target variable at the current timestep before reshaping:', target_variable[t])
  print('The target variable at the current timestep shape before reshaping:', target_variable[t].shape)
  print('The decoder input shape (reshape the target variable):', decoder_input.shape)

  print('The mask at the current timestep:', mask[t])
  print('The mask shape at the current timestep:', mask[t].shape)
  mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
  print('Mask loss:', mask_loss)
  print('Total:', nTotal)
  loss += mask_loss
  # weight the mean step loss by the number of unmasked targets so the running
  # average below is per target token
  print_losses.append(mask_loss.item() * nTotal)
  print(print_losses)
  n_totals += nTotal
  print(n_totals)
  returned_loss = sum(print_losses) / n_totals
  print('Returned Loss:', returned_loss)
  print('Done One Timestep')

Decoder Output Shape: torch.Size([5, 7840])
Decoder Hidden Shape: torch.Size([2, 5, 500])
The target variable at the current timestep before reshaping: tensor([  25,  511,  401,  575, 1334], device='cuda:0')
The target variable at the current timestep shape before reshaping: torch.Size([5])
The decoder input shape (reshape the target variable): torch.Size([1, 5])
The mask at the current timestep: tensor([1, 1, 1, 1, 1], device='cuda:0', dtype=torch.uint8)
The mask shape at the current timestep: torch.Size([5])
Mask loss: tensor(8.9358, device='cuda:0', grad_fn=<MeanBackward0>)
Total: 5
[44.67446804046631, 44.678993225097656]
10
Returned Loss: 8.935346126556396
Done One Timestep
Decoder Output Shape: torch.Size([5, 7840])
Decoder Hidden Shape: torch.Size([2, 5, 500])
The target variable at the current timestep before reshaping: tensor([102,   4, 159,  40,  66], device='cuda:0')
The target variable at the current timestep shape before reshaping: torch.Size([5])
The decoder input shape (r

In [0]:
# shapes left over from the last timestep of the walkthrough loop above
print('Decoder Output Shape:', decoder_output.shape)
print('Decoder Hidden Shape:', decoder_hidden.shape)

In [0]:
# NOTE(review): this cell calls train() and reads batch_size, but in notebook order
# train() is defined in a later cell and batch_size is only assigned in the
# "run model" cell near the end - on a fresh kernel with Run All this raises
# NameError. The recorded output comes from out-of-order execution.
train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding, 
      encoder_optimizer, decoder_optimizer, batch_size, clip=50)

Returned Loss: 8.975261838812576
Done One Timestep


In [0]:
# NOTE(review): same loop as the body of trainIters() below, run at notebook level.
# It relies on names that are not defined in any earlier cell shown here
# (train, batch_size, n_iteration, clip) - confirm where they come from before
# relying on Run All.
training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)]) for _ in range(n_iteration)]
# initialize
start_iteration = 1
print_loss = 0

# training
for iteration in range(start_iteration, n_iteration + 1):
  # iterations are 1-based, the batch list is 0-based
  training_batch = training_batches[iteration - 1]
  input_variable, lengths, target_variable, mask, max_target_len = training_batch
  loss = train(input_variable, lengths, target_variable, mask, max_target_len, 
                encoder, decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
  print_loss += loss

In [0]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length = MAX_LENGTH):
  """Run one optimisation step on a single batch and return its average token loss.

  Args:
    input_variable: padded input ids, (max input length, batch size).
    lengths: unpadded input lengths, one per batch item.
    target_variable: padded target ids, (max_target_len, batch size).
    mask: target mask with the same layout as target_variable.
    max_target_len: number of decoding steps to run.
    encoder, decoder: the models being trained.
    embedding: unused here; kept for interface compatibility.
    encoder_optimizer, decoder_optimizer: optimizers for the two models.
    batch_size: number of sequences in the batch.
    clip: maximum gradient norm.
    max_length: unused here; kept for interface compatibility.

  Returns:
    Sum of (step loss * unmasked targets) divided by the total unmasked targets.

  Reads the notebook-level names `device`, `SOS_token` and `teacher_forcing_ratio`.
  NOTE(review): `teacher_forcing_ratio` is not assigned in any cell shown before this
  one - confirm it is defined before train() is called.
  """
  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  input_variable = input_variable.to(device)
  target_variable = target_variable.to(device)
  mask = mask.to(device)
  # pack_padded_sequence (used by the encoder) requires the lengths on the CPU
  lengths = lengths.to('cpu')

  loss = 0
  print_losses = []
  n_totals = 0

  encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
  # every sequence starts decoding from SOS: shape (1, batch size)
  decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
  decoder_input = decoder_input.to(device)
  decoder_hidden = encoder_hidden[:decoder.n_layers]

  # decided once per batch: feed ground truth or the decoder's own predictions
  use_teacher_forcing = random.random() < teacher_forcing_ratio

  for t in range(max_target_len):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
    if use_teacher_forcing:
      # next input is the ground-truth token of this step
      decoder_input = target_variable[t].view(1, -1)
    else:
      # next input is the decoder's most probable token; topk indices are already on
      # the right device, so reshape (batch, 1) -> (1, batch) without a host round-trip
      _, topi = decoder_output.topk(1)
      decoder_input = topi.view(1, -1).detach()
    mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
    loss += mask_loss
    print_losses.append(mask_loss.item() * nTotal)
    n_totals += nTotal

  # perform backpropagation
  loss.backward()

  # prevent gradient from becoming too large using clipping
  _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
  _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

  # adjust model weights
  encoder_optimizer.step()
  decoder_optimizer.step()
  returned_loss = sum(print_losses) / n_totals
  print('Returned Loss:', returned_loss)
  print('Done One Timestep')
  return returned_loss


In [0]:
# decoder_output.topk(1)

torch.return_types.topk(values=tensor([[0.0002],
        [0.0002],
        [0.0002],
        [0.0002],
        [0.0002]], device='cuda:0', grad_fn=<TopkBackward>), indices=tensor([[1771],
        [ 358],
        [1361],
        [4019],
        [ 295]], device='cuda:0'))

In [0]:
# run n iterations of training
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
               embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
               print_every, save_every, clip, corpus_name, loadFilename):
  """Train for up to `n_iteration` iterations, printing progress and saving checkpoints.

  One random batch of `batch_size` pairs is pre-sampled per iteration. When
  `loadFilename` is truthy, training resumes after the iteration stored in that
  checkpoint file. Every `print_every` iterations the mean loss since the last report
  is printed; every `save_every` iterations a checkpoint is written to
  save_dir/model_name/corpus_name/<enc layers>-<dec layers>_<hidden_size>/.

  Reads the notebook-level `hidden_size` when building the checkpoint directory name.
  """
  training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)]) for _ in range(n_iteration)]
  # initialize
  start_iteration = 1
  print_loss = 0
  if loadFilename:
    # Read the resume point from the checkpoint file itself instead of relying on a
    # notebook-level `checkpoint` variable that may not exist; map to the CPU because
    # only the iteration counter is needed here.
    checkpoint = torch.load(loadFilename, map_location='cpu')
    start_iteration = checkpoint['iteration'] + 1

  # training
  for iteration in range(start_iteration, n_iteration + 1):
    # iterations are 1-based, the batch list is 0-based
    training_batch = training_batches[iteration - 1]
    input_variable, lengths, target_variable, mask, max_target_len = training_batch
    loss = train(input_variable, lengths, target_variable, mask, max_target_len,
                 encoder, decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
    print_loss += loss

    # print progress
    if iteration % print_every == 0:
        print_loss_avg = print_loss / print_every
        print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
        print_loss = 0

    # save checkpoint
    if (iteration % save_every == 0):
      directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
      # exist_ok avoids the check-then-create race of a separate os.path.exists test
      os.makedirs(directory, exist_ok=True)
      torch.save({
          'iteration': iteration,
          'en': encoder.state_dict(),
          'de': decoder.state_dict(),
          'en_opt': encoder_optimizer.state_dict(),
          'de_opt': decoder_optimizer.state_dict(),
          'loss': loss,
          'voc_dict': voc.__dict__,
          'embedding': embedding.state_dict()
      }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))


In [0]:
# apply the model on sentence
class GreedySearchDecoder(nn.Module):
    """Greedy decoding: at every step pick the most probable token and feed it back in."""

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode `max_length` tokens for a single input sequence.

        Args:
            input_seq: token ids, (sequence length, 1).
            input_length: lengths tensor for the encoder.
            max_length: number of tokens to generate (decoding does not stop at EOS).

        Returns:
            all_tokens: 1-D tensor of the chosen token ids.
            all_scores: 1-D tensor of the probability of each chosen token.
        """
        # Work on whatever device the input lives on rather than a notebook-level global
        device = input_seq.device
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder.
        # Uses the wrapped decoder (self.decoder), not a notebook-level `decoder` variable.
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [0]:
# evaluate my text
def evaluate(encoder, decoder, searcher, voc, sentence, max_length = MAX_LENGTH):
  """Decode a reply for one already-normalized sentence.

  Args:
    encoder: unused here; kept so existing callers keep working.
    decoder: unused here; kept so existing callers keep working.
    searcher: callable invoked as ``searcher(input_batch, lengths, max_length)``
      that returns ``(tokens, scores)``.
    voc: vocabulary object; passed to ``indexesFromSentence`` and read through
      ``voc.index2word`` to map indices back to words.
    sentence: input sentence as a string.
    max_length: number of decoding steps requested from the searcher.

  Returns:
    List of decoded words, one per token returned by the searcher.

  Raises:
    KeyError: if a returned token index is missing from ``voc.index2word``
      (and presumably if ``indexesFromSentence`` meets an unknown word --
      that helper is defined elsewhere; confirm there).
  """
  # convert words to indices (a batch containing this one sentence)
  indexes_batch = [indexesFromSentence(voc, sentence)]

  # create lengths tensor
  lengths = torch.tensor([len(indexes) for indexes in indexes_batch])

  # transpose (batch size, max length) to (max length, batch size)
  input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
  input_batch = input_batch.to(device)
  # NOTE(review): recent PyTorch releases require the lengths passed to
  # pack_padded_sequence to live on the CPU. If the encoder packs its input,
  # this move may have to be dropped -- verify against the encoder definition.
  lengths = lengths.to(device)

  # decode sentence with searcher; this is pure inference, so skip autograd
  # bookkeeping instead of building a graph that is never back-propagated
  with torch.no_grad():
    tokens, _ = searcher(input_batch, lengths, max_length)

  # convert indexes to words
  decoded_words = [voc.index2word[token.item()] for token in tokens]
  return decoded_words

In [0]:
def evaluateInput(encoder, decoder, searcher, voc):
    """Run an interactive chat loop on standard input.

    Each line typed at the ``> `` prompt is normalized with ``normalizeString``,
    decoded with ``evaluate`` and printed after a ``Bot:`` prefix with the
    ``EOS`` and ``PAD`` words removed. Typing ``q`` or ``quit`` ends the loop.
    A ``KeyError`` raised while handling a line is reported as an unknown word
    and the loop continues with the next prompt.
    """
    quit_commands = ('q', 'quit')
    hidden_words = ('EOS', 'PAD')
    while True:
        try:
            user_text = input('> ')
            # Leave the loop on an explicit quit command
            if user_text in quit_commands:
                break
            cleaned_text = normalizeString(user_text)
            reply_words = evaluate(encoder, decoder, searcher, voc, cleaned_text)
            # Drop the special words before showing the reply
            visible_words = [word for word in reply_words if word not in hidden_words]
            print('Bot:', ' '.join(visible_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [0]:
# Run configuration: checkpoint location and model hyperparameters.

# Naming used for checkpoints; the path template in the next cell joins them
# as <save_dir>/<model_name>/<corpus_name>/...
model_name, corpus_name = 'cb_model', 'cornell movie-dialogs corpus'
save_dir = './drive/My Drive/'

# Attention variant handed to the decoder; the alternatives this cell listed
# were 'general' and 'concat'.
attn_model = 'dot'

# Network size
encoder_n_layers = decoder_n_layers = 2
hidden_size = 500

# Dropout rate and mini-batch size
dropout = 0.1
batch_size = 64

In [0]:
# checkpoint_iter = 4000
# loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))
# print(loadFilename)

./drive/My Drive/cb_model/cornell movie-dialogs corpus/2-2_500/4000_checkpoint.tar


In [0]:
# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
# Only consumed by the commented-out path template below
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # map_location=device remaps the stored tensors onto the device chosen at
    # the top of the notebook, so a checkpoint saved on a GPU runtime can also
    # be restored on a CPU-only runtime (a plain torch.load would fail there).
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Restore the vocabulary before building the embedding below, because the
    # embedding is sized from voc.num_words
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings (one hidden_size vector per vocabulary entry)
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models; both receive the same embedding object
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [0]:
# run training
# Configure training/optimization
clip = 50.0                   # forwarded to trainIters
teacher_forcing_ratio = 1.0   # not passed below; presumably read as a global by the training step -- confirm
learning_rate = 0.0001        # encoder learning rate
decoder_learning_ratio = 5.0  # decoder learning rate = learning_rate * this factor
n_iteration = 4000
print_every = 1               # forwarded to trainIters; the captured log shows one report per iteration
save_every = 500              # forwarded to trainIters

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Move any restored optimizer state tensors onto the active device.
# `.to(device)` follows the device selected at the top of the notebook, so this
# also works on a CPU-only runtime where an unconditional `.cuda()` would raise.
for optimizer in (encoder_optimizer, decoder_optimizer):
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Iteration: 2334; Percent complete: 58.4%; Average loss: 3.0051
Returned Loss: 2.976948558909318
Done One Timestep
Iteration: 2335; Percent complete: 58.4%; Average loss: 2.9769
Returned Loss: 3.445969024466357
Done One Timestep
Iteration: 2336; Percent complete: 58.4%; Average loss: 3.4460
Returned Loss: 3.1500064609512743
Done One Timestep
Iteration: 2337; Percent complete: 58.4%; Average loss: 3.1500
Returned Loss: 2.9226317298028897
Done One Timestep
Iteration: 2338; Percent complete: 58.5%; Average loss: 2.9226
Returned Loss: 2.879020522515625
Done One Timestep
Iteration: 2339; Percent complete: 58.5%; Average loss: 2.8790
Returned Loss: 3.000420889900629
Done One Timestep
Iteration: 2340; Percent complete: 58.5%; Average loss: 3.0004
Returned Loss: 3.039529778227024
Done One Timestep
Iteration: 2341; Percent complete: 58.5%; Average loss: 3.0395
Returned Loss: 2.776061857991678
Done One Timestep
Iteration: 2342; Perc

In [0]:
# run evaluation

# Put both networks in eval mode before chatting
for network in (encoder, decoder):
    network.eval()

# Greedy decoder wrapped around the two networks
searcher = GreedySearchDecoder(encoder, decoder)

# Start the interactive loop (type 'q' or 'quit' to leave)
evaluateInput(encoder, decoder, searcher, voc)