In [37]:
import numpy as np
import pandas as pd

from six import iteritems

from io import open
import unicodedata
import string
import re
import random
import pickle
import spacy
nlp = spacy.load('en')

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

%matplotlib inline

In [2]:

def read_sentence_embeddings_jw( pathname ):
    '''Read John Wieting sentence embeddings'''
    with open(pathname , 'rb') as f:
        # [ numpy.ndarray(95283, 300), numpy.ndarray(74664, 300), (trigram_dict, word_dict)]
        x = pickle.load(f, encoding='latin1')
        word_vocab_size, embedding_size = x[1].shape
        trigram_embeddings, word_embeddings, _ = x
        
        trigram_to_id, word_to_id = x[2]
        word_to_id['<START>'] = word_vocab_size
        word_to_id['<END>'] = word_vocab_size + 1
        idx_to_word = { idx: word for word, idx in iteritems(word_to_id) }
        word_embeddings = np.vstack((word_embeddings, np.random.randn(2, embedding_size)))

        return ( word_to_id, 
                 idx_to_word, 
                 word_embeddings, 
                 word_to_id['<START>'], 
                 word_to_id['<END>'], 
                 word_to_id['UUUNKKK'], 
                 word_to_id['★']
               )
    

pathname='../rec/data/ngram-word-concat-40.pickle'
word_to_id, id_to_word, embeddings, start_id, end_id, unk_id, mask_id = read_sentence_embeddings_jw( pathname )

print( [ (word_to_id[id_to_word[i]], id_to_word[i]) for i in range(10) ] )
print( len(word_to_id) )
print( embeddings.shape )



[(0, ','), (1, '.'), (2, 'the'), (3, 'and'), (4, 'to'), (5, 'of'), (6, 'a'), (7, 'in'), (8, ':'), (9, 'is')]
74602
(74666, 300)


In [5]:


class Dictionary:
    def __init__(self ):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.n_words = 0
        self.embeddings = []
        
        self.PAD_token = 0  # Used for padding short sentences
        self.SOS_token = 1  # Start-of-sentence token
        self.EOS_token = 2  # End-of-sentence token
        self.UNK_token = 3 


    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def load_embeddings_jw( self, pathname ):
        '''Read John Wieting sentence embeddings'''
        with open(pathname , 'rb') as f:
            # [ numpy.ndarray(95283, 300), numpy.ndarray(74664, 300), (trigram_dict, word_dict)]
            x = pickle.load(f, encoding='latin1')
            
            word_vocab_size, embedding_size = x[1].shape
            trigram_embeddings, word_embeddings, _ = x

            trigram_to_id, word_to_id = x[2]
            
            word_to_id['<START>'] = word_vocab_size
            word_to_id['<END>']   = word_vocab_size + 1
                                   
            idx_to_word = { idx: word for word, idx in iteritems(word_to_id) }
            word_embeddings = np.vstack((word_embeddings, np.random.randn(2, embedding_size)))
            word_to_count = { word: 1 for word, idx in iteritems(word_to_id) }
            
            self.word2index = word_to_id
            self.index2word = idx_to_word
            self.word2count = word_to_count
            self.embeddings = word_embeddings
            self.n_words    = len(word_to_id)
            
            self.PAD_token = word_to_id['★']        # Used for padding short sentences
            self.SOS_token = word_to_id['<START>']  # Start-of-sentence token
            self.EOS_token = word_to_id['<END>']    # End-of-sentence token
            self.UNK_token = word_to_id['UUUNKKK'] 


pathname='../rec/data/ngram-word-concat-40.pickle'
dictionaty = Dictionary()
dictionaty.load_embeddings_jw( pathname )
print( len(dictionaty.word2index) )
print( len(dictionaty.index2word) )
print( len(dictionaty.word2count) )
print( dictionaty.embeddings.shape )



74602
74602
74602
(74666, 300)


In [32]:


def openmp_nlp_pipeline(lines, n_threads=12):
    ''' Execute spacy's openmp nlp pipeline '''
    return [ [ token.lower_ for token in doc ] for doc in nlp.pipe(lines, n_threads=n_threads, disable=['parser', 'tagger', 'ner']) ]


# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def read_paraphraser( pathname ):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(pathname, encoding='utf-8').\
        read().strip().split('\n')    
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]    # normalizeString, openmp_nlp_pipeline,
        
#     source_sentences = []
#     ref_sentences = []    
#     with open(pathname, 'r') as f:
#         for i, line in enumerate(f):
#             source, ref = line.split('\t')
#             source_sentences.append(source.strip())
#             ref_sentences.append(ref.strip())    
#     source_sentences = openmp_nlp_pipeline( source_sentences )
#     ref_sentences = openmp_nlp_pipeline( ref_sentences )
#     pairs = np.stack( ( source_sentences, ref_sentences), axis=1 )    
    
    return pairs


pathname = '../rec/data/para-nmt-50m-small.txt'
pairs = read_paraphraser( pathname )
print( pairs[1] )



Reading lines...
['of course you did .', 'of course it is .']


In [33]:
MAX_LENGTH = 10

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH 

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]


In [35]:

def prepare_data( pathname, pathdictionary ):
    pairs = read_paraphraser( pathname)
    dictionary = Dictionary()
    dictionary.load_embeddings_jw( pathdictionary )
    
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
        
    print("Counted words:")
    print(dictionary.n_words)
    return dictionary, pairs


pathname = '../rec/data/para-nmt-50m-small.txt'
pathdictionary = '../rec/data/ngram-word-concat-40.pickle'

dictionary, pairs = prepare_data( pathname, pathdictionary )
# print(random.choice(pairs))
for pair in pairs[:10]:
    print(pair)
    
    

Reading lines...
Read 99968 sentence pairs
Counted words:
74602
['of course you did .', 'of course it is .']
[' why not ?', ' why not ?']
['an old man s mistake ', 'an old man s fault . . . ']
['he loved that little man by the way .', 'he liked the little boy .']
['provide the following information', 'enter the following information']
['i ve been very very lucky .', 'i was just lucky .']
['why are you carrying around grass ?', 'why did you pull the grass ?']
['not as much as i d like', 'not as often as i d like .']
['i do n t like that crap .', 'i do n t like it .']
['from a marine in da nang ', 'from the sailor in da nang .']


In [38]:

import itertools

def indexesFromSentence(voc, sentence):
    return [voc.word2index.get(word, voc.UNK_token )  for word in sentence.split(' ')] + [ voc.EOS_token ]

def zeroPadding(l, fillvalue):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == value:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch, voc.EOS_token)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch, voc.EOS_token)
    mask = binaryMatrix(padList, voc.EOS_token)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData( dictionary , pair_batch):
    
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    
    inp, lengths = inputVar(input_batch, dictionary)
    output, mask, max_target_len = outputVar(output_batch, dictionary)
    return inp, lengths, output, mask, max_target_len


# Example for validation
small_batch_size = 5
batches = batch2TrainData( dictionary, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[    74,     57,  33443,    723,   1336],
        [    65,      2,   2620,     11,  31953],
        [  3682,  11568,      5,   4932,      1],
        [  7372,    234,      2,     24,  74665],
        [     4,      6,  74663,    155,  74665],
        [ 13730,    200,  50875,      7,  74665],
        [    30,   5434,      1,      1,  74665],
        [    74,     35,  74665,  74665,  74665],
        [    35,  74665,  74665,  74665,  74665],
        [ 74665,  74665,  74665,  74665,  74665]])
lengths: tensor([ 10,   9,   8,   8,   4])
target_variable: tensor([[    74,     57,     14,     11,    407],
        [    65,      2,    234,   4932,  31953],
        [  3682,   3861,  33443,     24,      1],
        [   926,     21,   2620,    155,  74665],
        [     4,    200,      5,     78,  74665],
        [ 13730,   5437,      2,      1,  74665],
        [    35,     35,  74663,  74665,  74665],
        [ 74665,  74665,   9021,  74665,  74665],
        [ 74665,  74665

In [39]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden


In [40]:
# Luong attention layer
class Attn(torch.nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = torch.nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [41]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

In [42]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [44]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [45]:
def trainIters(
    model_name, 
    dictionary, 
    pairs, 
    encoder, 
    decoder, 
    encoder_optimizer, 
    decoder_optimizer, 
    embedding, 
    encoder_n_layers, 
    decoder_n_layers, 
    save_dir, 
    n_iteration, 
    batch_size, 
    print_every, 
    save_every, 
    clip, 
    corpus_name, 
    loadFilename):

    # Load batches for each iteration
    training_batches = [batch2TrainData(dictionary, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    plot_losses = []
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            plot_losses.append(print_loss_avg)
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'dictionary': dictionary.__dict__,
                'embedding': embedding.state_dict()
                
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))
         
    return plot_losses


In [None]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [None]:
def evaluate(encoder, decoder, searcher, input_lang, output_lang, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(input_lang, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [output_lang.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, input_lang, output_lang):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, input_lang, output_lang, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")
            
            
def evaluateRandomly(encoder, decoder, searcher, input_lang, output_lang, pair, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        input_sentence = normalizeString( pair[0] )
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, searcher, input_lang, output_lang, input_sentence )
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')
        

In [1]:

import os

# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64
corpus_name='anki'
save_dir = os.path.join("../out", "save")

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
# checkpoint_iter = 4000
# loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    input_lang.__dict__ = checkpoint['input_lang_dict']
    output_lang.__dict__ = checkpoint['output_lang_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding( input_lang.n_words, hidden_size)

if loadFilename:
    embedding.load_state_dict(embedding_sd)
    
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, output_lang.n_words, decoder_n_layers, dropout)


#input_lang, output_lang

if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
    
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...


NameError: name 'nn' is not defined

In [None]:
print(encoder)
print(decoder)

In [None]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 10000
print_every = 400
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Run training iterations
print("Starting Training!")
plot_losses = trainIters(model_name, dictionary, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)



In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def showPlot(points):
    plt.figure( )
    #fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    #loc = ticker.MultipleLocator(base=0.2)
    #ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    plt.title('Training error')
    plt.xlabel('Iterations')
    plt.ylabel('Error')
    #plt.axis('off')
    plt.show()

    
showPlot(plot_losses)