In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools

In [2]:
CUDA = torch.cuda.is_available()
device = torch.device("cuda" if CUDA else "cpu")

In [3]:
lines_filepath = os.path.join("cornell movie-dialogs corpus", "movie_lines.txt")
conv_filepath = os.path.join("cornell movie-dialogs corpus", "movie_conversations.txt")

In [4]:
with open (lines_filepath, 'r', encoding='utf-8') as file:
    lines = file.readlines()
    for line in lines[:8]:
        print(line.strip()) 

L1045	u0	m0	BIANCA	They do not!
L1044	u2	m0	CAMERON	They do to!
L985	u0	m0	BIANCA	I hope so.
L984	u2	m0	CAMERON	She okay?
L925	u0	m0	BIANCA	Let's go.
L924	u2	m0	CAMERON	Wow
L872	u0	m0	BIANCA	Okay -- you're gonna need to learn how to lie.
L871	u2	m0	CAMERON	No


In [5]:
line_fields = ["lineID", "characterID","movieID","character","text"]
lines = {}
with open (lines_filepath, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split("\t")
        lineObj = {}
        for i, field in enumerate(line_fields):
            lineObj[field] = values[i]
        lines[lineObj['lineID']] = lineObj

In [6]:
conv_fields = ["character1ID", "character2ID","movieID","utteranceIDs"]
conversations = []
with open (conv_filepath, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split("\t")
        convObj = {}
        for i,field in enumerate(conv_fields):
            convObj[field] = values[i]        
        lineIds = eval(convObj["utteranceIDs"])
        split_lineIds = [re.findall(r'L\d+', string[0]) for string in lineIds]
        temp = []
        for string in lineIds:
            temp = re.findall(r'L\d+', string)
        
        convObj["lines"] = []
        for lineId in temp:
            convObj["lines"].append(lines[lineId])
        conversations. append(convObj)

In [7]:
qa_pairs = []
for conversation in conversations:
    for i in range(len(conversation["lines"]) -1):
        inputLine = conversation["lines"][i]["text"].replace("\r\r\n","").strip()
        targetLine = conversation["lines"][i+1]["text"].replace("\r\r\n","").strip()
        if inputLine and targetLine:
            qa_pairs.append([inputLine,targetLine])

In [8]:
datafile = os.path.join("cornell movie-dialogs corpus", "formated_movie_lines.txt")
delimiter = '\t'
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

print("\nWriting newly formatted file...")
with open(datafile,'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter)
    for pair in qa_pairs:
        writer.writerow(pair)
print("Done writing file")


Writing newly formatted file...
Done writing file


In [9]:
with open(datafile, 'rb') as file:
    lines = file.readlines()
    for line in lines[:8]:
        print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell I thought we'd start with pronunciation if that's okay with you.\r\r\n"
b"Well I thought we'd start with pronunciation if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\r\n"
b"No no it's my fault -- we didn't have a proper introduction ---\tCameron.\r\r\n"
b"Cameron.\tThe thing is Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\r\n"
b"The thing is Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\r\n"
b'Why?\tUnsolved 

In [10]:
PAD_token = 0 # used for padding short sentences
SOS_token = 1 # start of sentence token <START
EOS_token = 2 # end of sentence token <END>

class Vocabulary:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def trim(self, min_count):
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print("keep words {} / {} = {:.4f}".format(len(keep_words), len(self.word2index), len(keep_words)/len(self.word2index)))
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3
        for word in keep_words:
            self.addWord(word)

In [11]:
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [12]:
unicodeToAscii("Montréal,Françoise....")

'Montreal,Francoise....'

In [13]:
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])",r" ", s)
    s = re.sub(r"[^a-zA-Z.!?]+",r" ",s)
    s = re.sub(r"\s", r" ", s).strip()
    return s

In [14]:
normalizeString("aa123aa!s's  dd?")

'aa aa s s dd'

In [15]:
print("Reading and processing file....Please wait")
lines = open(datafile, 'r', encoding='utf-8').read().strip().split('\n')
pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines ]
print("Done reading")
voc = Vocabulary("cornell movie-dialogs corpus")

Reading and processing file....Please wait
Done reading


In [16]:
MAX_LENGTH = 10
# return true if the pairs are under the threshhold
def filterPair(p):
    return len(p[0].split()) < MAX_LENGTH and len(p[1].split()) < MAX_LENGTH
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [17]:
pairs = [pair for pair in pairs if len(pair)>1]
print("There are {} pairs/conversations in the dataset)".format(len(pairs)))
pairs = filterPairs(pairs)
print("After filtering, there are {} pairs/conversations".format(len(pairs)))

There are 221275 pairs/conversations in the dataset)
After filtering, there are 84210 pairs/conversations


In [18]:
for pair in pairs:
    voc.addSentence(pair[0])
    voc.addSentence(pair[1])
print("Counted words:", voc.num_words)
for pair in pairs[:10]:
    print(pair)

Counted words: 21492
['gosh if only we could find kat a boyfriend', 'let me see what i can do']
['c esc ma tete this is my head', 'right see you re ready for the quiz']
['that s because it s such a nice one', 'forget french']
['there', 'where']
['you have my word as a gentleman', 'you re sweet']
['hi', 'looks like things worked out tonight huh']
['you know chastity', 'i believe we share an art instructor']
['have fun tonight', 'tons']
['well no', 'then that s all you had to say']
['then that s all you had to say', 'but']


In [19]:
MIN_COUNT = 3
def trimRareWords(voc,pairs, MIN_COUNT):
    voc.trim(MIN_COUNT)
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        for word in input_sentence.split(' '):
            if word not in voc.word2index: 
                keep_input =False
                break
        for word in output_sentence.split(' '):
            if word not in voc.word2index: 
                keep_output =False
                break
        if keep_input and keep_output:
            keep_pairs.append(pair)
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs),  len(keep_pairs)/len(pairs)))
    return keep_pairs

pairs = trimRareWords(voc,pairs,MIN_COUNT)

keep words 9978 / 21489 = 0.4643
Trimmed from 84210 pairs to 71445, 0.8484 of total


In [20]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

In [21]:
indexesFromSentence(voc, pairs[1][0])

[31, 32, 33, 34, 32, 35, 10, 36, 37, 2]

In [22]:
inp = []
out = []
i = 0
for pair in pairs[:10]:
    inp.append(pair[0])
    out.append(pair[1])
print(inp)
print(len(inp))
indexes = [indexesFromSentence(voc, sentence) for sentence in inp]
indexes

['gosh if only we could find kat a boyfriend', 'that s because it s such a nice one', 'there', 'you have my word as a gentleman', 'hi', 'have fun tonight', 'well no', 'then that s all you had to say', 'but', 'do you listen to this crap']
10


[[3, 4, 5, 6, 7, 8, 9, 10, 11, 2],
 [31, 32, 33, 34, 32, 35, 10, 36, 37, 2],
 [40, 2],
 [26, 42, 23, 43, 44, 10, 45, 2],
 [47, 2],
 [42, 60, 53, 2],
 [62, 63, 2],
 [64, 31, 32, 65, 26, 66, 67, 68, 2],
 [69, 2],
 [18, 26, 73, 67, 21, 74, 2]]

In [23]:
def zeroPadding(l, fillvalue = 0):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

In [24]:
leng = [len(ind) for ind in indexes]
max(leng)

10

In [25]:
test_result = zeroPadding(indexes)
print(len(test_result))
test_result

10


[(3, 31, 40, 26, 47, 42, 62, 64, 69, 18),
 (4, 32, 2, 42, 2, 60, 63, 31, 2, 26),
 (5, 33, 0, 23, 0, 53, 2, 32, 0, 73),
 (6, 34, 0, 43, 0, 2, 0, 65, 0, 67),
 (7, 32, 0, 44, 0, 0, 0, 26, 0, 21),
 (8, 35, 0, 10, 0, 0, 0, 66, 0, 74),
 (9, 10, 0, 45, 0, 0, 0, 67, 0, 2),
 (10, 36, 0, 2, 0, 0, 0, 68, 0, 0),
 (11, 37, 0, 0, 0, 0, 0, 2, 0, 0),
 (2, 2, 0, 0, 0, 0, 0, 0, 0, 0)]

In [26]:
def binaryMatrix(l, values=0):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token: 
                m[i].append(0)
            else: 
                m[i].append(1)
    return m

In [27]:
binary_result = binaryMatrix(test_result)
binary_result

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 0, 1, 0, 1, 1, 1, 0, 1],
 [1, 1, 0, 1, 0, 1, 0, 1, 0, 1],
 [1, 1, 0, 1, 0, 0, 0, 1, 0, 1],
 [1, 1, 0, 1, 0, 0, 0, 1, 0, 1],
 [1, 1, 0, 1, 0, 0, 0, 1, 0, 1],
 [1, 1, 0, 1, 0, 0, 0, 1, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 1, 0, 0],
 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]

In [28]:
# return padded input sequence tensor and as well as a tensor of lengths for each of the sequences in the batch
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

In [29]:
# returns the padded target sequence tensor, paddinbg mask and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

In [30]:
# return all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch,voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

In [31]:
small = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small)])
inputvar, lengths, targetvar, mask, maxtar = batches
print("input variable:")
print(inputvar)
print("lengths variable:")
print(lengths)
print("target variable:")
print(targetvar)
print("max variable:")
print(maxtar)
print("mask variable:")
print(mask)

input variable:
tensor([[  31,  431,   26,  212,   26],
        [  32,  179,   27,   18,   27],
        [  65,   96,  111,   26,  184],
        [  26,   30,   10,   49, 2693],
        [ 638, 1425, 2097,   34,   25],
        [ 809, 1347,    2,    2,    2],
        [ 621,    2,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths variable:
tensor([8, 7, 6, 6, 6])
target variable:
tensor([[ 242,   16, 1006,   34,  212],
        [ 187,  145, 5571,   48,    2],
        [ 191,   34,   22,   75,    0],
        [ 621,   98,  361,  221,    0],
        [  18,   58,    2,   26,    0],
        [  26, 1390,    0,    2,    0],
        [ 260,   49,    0,    0,    0],
        [   2, 1428,    0,    0,    0],
        [   0, 1429,    0,    0,    0],
        [   0,    2,    0,    0,    0]])
max variable:
10
mask variable:
tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 0, 1, 0],
        [1, 1, 0, 0, 

In [32]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size parameters are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

In [33]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [34]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

In [35]:
def maskNLLLose(decoder_out, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(decoder_out, 1, target.view(-1, 1)).squeeze(1))

    # Convert mask to the appropriate type based on device (CPU or CUDA)
    mask_dtype = torch.bool if crossEntropy.device.type == 'cpu' else torch.bool
    mask = mask.to(dtype=mask_dtype)

    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()


In [36]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths for RNN packing should always be on the CPU
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLose(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLose(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [37]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))


In [38]:
batch_size = 64
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
input_variable, lenghts, target_variable, mask, max_target_len = batches

hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
attn_model = 'dot'
embedding = nn.Embedding(voc.num_words, hidden_size)

# define encoder and decoder
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)
# ensure dropout layers are in train mode
encoder.train()
decoder.train()

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.0001)
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

input_variable = input_variable.to(device)
lengths = lengths.to(device)
target_variable = target_variable.to(device)
mask = mask.to(device)

teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500
clip = 50.0
loadFilename = None
# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters("movie quotes chatbot", voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, "cornell movie-dialogs corpus", n_iteration, batch_size,
           print_every, save_every, clip, "", loadFilename)



Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1; Percent complete: 0.0%; Average loss: 9.2060
Iteration: 2; Percent complete: 0.1%; Average loss: 9.1384
Iteration: 3; Percent complete: 0.1%; Average loss: 9.0414
Iteration: 4; Percent complete: 0.1%; Average loss: 8.8843
Iteration: 5; Percent complete: 0.1%; Average loss: 8.5974
Iteration: 6; Percent complete: 0.1%; Average loss: 8.1305
Iteration: 7; Percent complete: 0.2%; Average loss: 7.5748
Iteration: 8; Percent complete: 0.2%; Average loss: 7.3061
Iteration: 9; Percent complete: 0.2%; Average loss: 7.6243
Iteration: 10; Percent complete: 0.2%; Average loss: 7.7215
Iteration: 11; Percent complete: 0.3%; Average loss: 7.3834
Iteration: 12; Percent complete: 0.3%; Average loss: 7.3972
Iteration: 13; Percent complete: 0.3%; Average loss: 6.7381
Iteration: 14; Percent complete: 0.4%; Average loss: 6.4962
Iteration: 15; Percent complete: 0.4%; Average loss: 6.2368
Iteration: 16; Percent complete: 0.4%

In [39]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [40]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to("cpu")
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [41]:
# Set dropout layers to ``eval`` mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)


In [None]:
 evaluateInput(encoder, decoder, searcher, voc)

>  where is my money?


Bot: i don t know


>  don't lie


Bot: i m not going to be


>  so where is it?


Bot: it s a blue


>  stop talking nonsens


Error: Encountered unknown word.


>  stop talking nonsense


Bot: i m going to the police


>  you will die before that happens


Bot: i m not sure
