# Dataset


In [1]:
from google.colab import drive
import os
drive.mount('/content/drive')

source_path = os.path.join(os.getcwd(), "drive", "My Drive", "P3-NeuralMachineTranslation", "data", "raw", "source.txt")
target_path = os.path.join(os.getcwd(), "drive", "My Drive", "P3-NeuralMachineTranslation", "data", "raw", "target.txt")
test_path = os.path.join(os.getcwd(), "drive", "My Drive", "P3-NeuralMachineTranslation", "data", "raw", "source_test.txt")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import libraries and connect to Google Drive

In [None]:
!pip install -U gensim

In [3]:
!pip3 install sentencepiece
from collections import Counter, namedtuple
from itertools import chain
import json
import math
import os
from pathlib import Path
import random
import time
import sys
from tqdm.notebook import tqdm, trange
from typing import List, Tuple, Dict, Set, Union

import shutil
import gensim
import nltk
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
import numpy as np
import sentencepiece as spm
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

from tqdm.notebook import tqdm, trange

In [None]:
nltk.download("punkt")

In [5]:
!pip install -qqq wandb

In [6]:
import wandb
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mbotrosmark200[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
!wandb online

W&B online. Running your script from this directory will now sync to the cloud.


# RNN Implementation







## Data loading

In [7]:
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

In [8]:
def pad_sents(sents, pad_token):
    """ Pad list of sentences according to the longest sentence in the batch.
        The paddings should be at the end of each sentence.
    :param sents: list of sentences, where each sentence
                                    is represented as a list of words
    :type sents: list[list[str]]
    :param pad_token: padding token
    :type pad_token: str
    :returns sents_padded: list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token, such that
        each sentence in the batch now has equal length.
    :rtype: list[list[str]]
    """
    sents_padded = []

    max_len = max([len(sent) for sent in sents])
    sents_padded = [(sent + ([pad_token] * (max_len - len(sent)))) for sent in sents]

    return sents_padded

In [9]:
def read_corpus(file_path, source):
    """ Read file, where each sentence is dilineated by a `\n`.
    :param file_path: path to file containing corpus
    :type file_path: str
    :param source: "tgt" or "src" indicating whether text
        is of the source language or target language
    :type source: str
    """
    data = []
    for line in open(file_path):
        sent = nltk.word_tokenize(line)
        # only append <s> and </s> to the target sentence
        if source == 'tgt':
            sent = ['<s>'] + sent + ['</s>']
        data.append(sent)

    return data

In [10]:
class Vocab(object):
    """ Vocabulary, i.e. structure containing either
    src or tgt language terms.
    """
    def __init__(self, word2id=None):
        """ Init Vocab Instance.
        
        :param word2id: dictionary mapping words 2 indices
        :type word2id: dict[str, int]
        """
        if word2id:
            self.word2id = word2id
        else:
            self.word2id = dict()
            self.word2id['<pad>'] = 0   # Pad Token
            self.word2id['<s>'] = 1     # Start Token
            self.word2id['</s>'] = 2    # End Token
            self.word2id['<unk>'] = 3   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        """ Retrieve word's index. Return the index for the unk
        token if the word is out of vocabulary.
        
        :param word: word to look up
        :type word: str
        :returns: index of word
        :rtype: int
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """ Check if word is captured by Vocab.
        
        :param word: word to look up
        :type word: str
        :returns: whether word is in vocab
        :rtype: bool
        """
        return word in self.word2id

    def __setitem__(self, key, value):
        """ Raise error, if one tries to edit the Vocab directly.
        """
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        """ Compute number of words in Vocab.
        
        :returns: number of words in Vocab
        :rtype: int
        """
        return len(self.word2id)

    def __repr__(self):
        """ Representation of Vocab to be used
        when printing the object.
        """
        return 'Vocabulary[size=%d]' % len(self)

    def id2word(self, wid):
        """ Return mapping of index to word.
        
        :param wid: word index
        :type wid: int
        :returns: word corresponding to index
        :rtype: str
        """
        return self.id2word[wid]

    def add(self, word):
        """ Add word to Vocab, if it is previously unseen.
        
        :param word: to add to Vocab
        :type word: str
        :returns: index that the word has been assigned
        :rtype: int
        """
        if word not in self:
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    def words2indices(self, sents):
        """ Convert list of words or list of sentences of words
        into list or list of list of indices.
        
        :param sents: sentence(s) in words
        :type sents: Union[List[str], List[List[str]]]
        :returns: sentence(s) in indices
        :rtype: Union[List[int], List[List[int]]]
        """
        if type(sents[0]) == list:
            return [[self[w] for w in s] for s in sents]
        else:
            return [self[w] for w in sents]

    def indices2words(self, word_ids):
        """ Convert list of indices into words.
        
        :param word_ids: list of word ids
        :type word_ids: List[int]
        :returns: list of words
        :rtype: List[Str]
        """
        return [self.id2word[w_id] for w_id in word_ids]

    def to_input_tensor(self, sents: List[List[str]], device: torch.device) -> torch.Tensor:
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.
        
        :param sents: list of sentences (words)
        :type sents: List[List[str]]
        :param device: Device on which to load the tensor, ie. CPU or GPU
        :type device: torch.device
        :returns: Sentence tensor of (max_sentence_length, batch_size)
        :rtype: torch.Tensor
        """

        word_ids = self.words2indices(sents)
        sents_t = pad_sents(word_ids, self['<pad>'])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        return torch.t(sents_var)

    @staticmethod
    def from_corpus(corpus, size, freq_cutoff=2):
        """ Given a corpus construct a Vocab.
        
        :param corpus: corpus of text produced by read_corpus function
        :type corpus: List[str]
        :param size: # of words in vocabulary
        :type size: int
        :param freq_cutoff: if word occurs n < freq_cutoff times, drop the word
        :type freq_cutoff: int
        :returns: Vocab instance produced from provided corpus
        :rtype: Vocab
        """
        vocab_entry = Vocab()
        word_freq = Counter(chain(*corpus))
        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
              .format(len(word_freq), freq_cutoff, len(valid_words)))
        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size]
        for word in top_k_words:
            vocab_entry.add(word)
        return vocab_entry
    
    @staticmethod
    def from_subword_list(subword_list):
        """Given a list of subwords, construct the Vocab.
        
        :param subword_list: list of subwords in corpus
        :type subword_list: List[str]
        :returns: Vocab instance produced from provided list
        :rtype: Vocab
        """
        vocab_entry = Vocab()
        for subword in subword_list:
            vocab_entry.add(subword)
        return vocab_entry

In [11]:
print('initialize source vocabulary ..')
src_sents = read_corpus(source_path, "src")
src = Vocab.from_corpus(src_sents, 20000, 2) # 7098, 9422

print('initialize target vocabulary ..')
tgt_sents = read_corpus(target_path, "tgt")
tgt = Vocab.from_corpus(tgt_sents, 20000, 2) # 6893, 10956

initialize source vocabulary ..
number of word types: 13251, number of word types w/ frequency >= 2: 9166
initialize target vocabulary ..
number of word types: 15213, number of word types w/ frequency >= 2: 10723


In [12]:
# Train embeddings or load embeddings
def create_embed_matrix(vocab, sents, vector_size, epochs): ## Mod A
  w2id = vocab.word2id
  emb = np.zeros(shape=(len(w2id), vector_size))
  emb[0] = np.zeros(vector_size)
  emb[1] = np.zeros(vector_size)
  emb[2] = np.zeros(vector_size)
  emb[3] = np.zeros(vector_size)

  model = gensim.models.Word2Vec(sents, vector_size=vector_size, min_count=2, window=5, epochs=epochs)

  for word,id in w2id.items():
    if word in ['<pad>', '<s>', '</s>', '<unk>']:
      continue
    emb[id] = model.wv[word]
  
  return torch.FloatTensor(emb)

In [13]:
# Split into training and validation data
train_data_src, val_data_src, train_data_tgt, val_data_tgt = train_test_split(src_sents, tgt_sents, test_size=0.045922, random_state=42)

In [14]:
train_data = list(zip(train_data_src, train_data_tgt))
val_data = list(zip(val_data_src, val_data_tgt))

In [15]:
def generate_sent_masks(enc_hiddens: torch.Tensor, source_lengths: List[int], device: torch.device) -> torch.Tensor:
    """ Generate sentence masks for encoder hidden states.

    :param enc_hiddens: encodings of shape (b, src_len, 2*h), where b = batch size,
        src_len = max source length, h = hidden size.
    :type enc_hiddens: torch.Tensor
    :param source_lengths: List of actual lengths for each of the sentences in the batch.   
    :type source_lengths: List[int]
    :param device: Device on which to load the tensor, ie. CPU or GPU
    :type device: torch.device
    :returns: Tensor of sentence masks of shape (b, src_len),
        where src_len = max source length, h = hidden size.
    :rtype: torch.Tensor
    """
    enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
    for e_id, src_len in enumerate(source_lengths):
        enc_masks[e_id, src_len:] = 1
    return enc_masks.to(device)

## Encoder

In [16]:
class Encoder(nn.Module):
    def __init__(self, embed_size, hidden_size, source_embeddings):
        """
        """
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.embedding = source_embeddings
        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional RNN with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} above.
        self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)
        self.h_projection = nn.Linear(hidden_size*2, hidden_size, bias=False)
        self.c_projection = nn.Linear(hidden_size*2, hidden_size, bias=False)
        
    def forward(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        """
        enc_hiddens, dec_init_state = None, None
        ### TODO:
        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
        ###         that there is no initial hidden state or cell for the encoder.
        ###     2. Compute `enc_hiddens`, `last_hidden` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        ###     3. Compute `dec_init_state` = init_decoder_hidden:
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forward and backwards.
        ###             Concatenate the forward and backward tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in above in the writeup. Here b = batch size, h = hidden size
        X = self.embedding(source_padded)

        X_packed = pack_padded_sequence(X, source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(X_packed)
        enc_hiddens,_ = pad_packed_sequence(enc_hiddens)

        init_decoder_hidden = self.h_projection(torch.cat((last_hidden[0], last_hidden[1]), dim=1))
        init_decoder_cell = self.c_projection(torch.cat((last_cell[0], last_cell[1]), dim=1))
        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        enc_hiddens = enc_hiddens.permute(1, 0, 2)

        return enc_hiddens, dec_init_state

## Decoder

In [26]:
class Decoder(nn.Module):
    # def __init__(self, embed_size, hidden_size, target_embedding, device):
    def __init__(self, embed_size, hidden_size, target_embedding, device, d_rate=None): ## Mod B
        """
        """
        super(Decoder, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.device = device
        self.embedding = target_embedding
        output_vocab_size = self.embedding.weight.size(0)
        self.softmax = nn.Softmax(dim=1)
        self.d_rate = d_rate ## Mod B

        ### TODO:
        ###     self.decoder (RNN Cell with bias)
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{v} above.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} above.

        self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size, bias=True)
        self.att_projection = nn.Linear(hidden_size*2, hidden_size, bias=False) ## Mod B
        self.combined_output_projection = nn.Linear(hidden_size*3, hidden_size, bias=False) ## Mod B
        # self.combined_output_projection = nn.Linear(hidden_size, hidden_size, bias=False)
        self.target_vocab_projection = nn.Linear(hidden_size, output_vocab_size, bias=False)
        self.dropout = nn.Dropout(self.d_rate) ## Mod B
    
    # def forward(self, enc_hiddens: torch.Tensor, dec_init_state: torch.Tensor, 
    #             target_padded: torch.Tensor) -> torch.Tensor:
    def forward(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
                dec_init_state: torch.Tensor, target_padded: torch.Tensor) -> torch.Tensor: ## Mod B
        """
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        ### TODO:
        ###     1. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
        ###         where tgt_len = maximum target sentence length, b = batch size, e = embedding size.
        ###     2. Use the torch.split function to iterate over the time dimension of Y.
        ###         Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size.
        ###             - Squeeze Y_t into a tensor of dimension (b, e). 
        ###             - Construct Ybar_t by concatenating Y_t with o_prev on their last dimension
        ###             - Use the step function to compute the the Decoder's next (cell, state) values
        ###               as well as the new combined output o_t.
        ###             - Append o_t to combined_outputs
        ###             - Update o_prev to the new o_t.
        ###     3. Use torch.stack to convert combined_outputs from a list length tgt_len of
        ###         tensors shape (b, h), to a single tensor shape (tgt_len, b, h)
        ###         where tgt_len = maximum target sentence length, b = batch size, h = hidden size.

        enc_hiddens_proj = self.att_projection(enc_hiddens) ## Mod B
        Y = self.embedding(target_padded)
        
        for Y_t in torch.split(Y, split_size_or_sections=1):
            Y_t = Y_t.squeeze(0)
            Ybar_t = torch.cat([Y_t, o_prev], dim=-1)
            # dec_state, o_t = self.step(Ybar_t, dec_state, enc_hiddens)
            dec_state, o_t = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) ## Mod B
            combined_outputs.append(o_t)
            o_prev = o_t

        combined_outputs = torch.stack(combined_outputs)

        return combined_outputs
    
    # def step(self, Ybar_t: torch.Tensor,
    #          dec_state: Tuple[torch.Tensor, torch.Tensor],
    #          enc_hiddens: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    def step(self, Ybar_t: torch.Tensor,
                 dec_state: Tuple[torch.Tensor, torch.Tensor],
                 enc_hiddens: torch.Tensor, 
                 enc_hiddens_proj: torch.Tensor,
                 enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: ## Mod B
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        :param Ybar_t: Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        :type Ybar_t: torch.Tensor
        :param dec_state: Tensors with shape (b, h), where b = batch size, h = hidden size.
                Tensor is decoder's prev hidden state
        :type dec_state: torch.Tensor
        :param enc_hiddens: Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        :type enc_hiddens: torch.Tensor

        :returns dec_state: Tensors with shape (b, h), where b = batch size, h = hidden size.
                Tensor is decoder's new hidden state. For an LSTM, this should be a tuple
                of the hidden state and cell state.
        returns combined_output: Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        """
        combined_output = None

        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
        ###     2. Rename dec_state to dec_hidden

        dec_state = self.decoder(Ybar_t, dec_state)
        (dec_hidden, dec_cell) = dec_state
        e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2) ## Mod B

        ### TODO:
        ###     1. Apply the combined output projection layer to h^dec_t to compute tensor V_t
        ###     2. Compute tensor O_t by applying the Tanh function.

        alpha_t = self.softmax(e_t) ## Mod B
        a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1) ## Mod B
        U_t = torch.cat([dec_hidden, a_t], 1) ## Mod B
        V_t = self.combined_output_projection(U_t) ## Mod B
        O_t = self.dropout(torch.tanh(V_t)) ## Mod B
        # V_t = self.combined_output_projection(dec_hidden)
        # O_t = torch.tanh(V_t)

        combined_output = O_t

        return dec_state, combined_output

## NMT

In [33]:
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidrectional RNN Encoder
        - Unidirection RNN Decoder
    """
    
    # def __init__(self, embed_size, hidden_size, src_vocab, tgt_vocab,
    #              device=torch.device("cpu"), pretrained_source=None,pretrained_target=None):
    def __init__(self, embed_size, hidden_size, src_vocab, tgt_vocab, d_rate,
                 device=torch.device("cpu"), pretrained_source=None,pretrained_target=None): ## Mod B
        """ Init NMT Model.

        :param embed_size: Embedding size (dimensionality)
        :type embed_size: int
        :param hidden_size: Hidden Size, the size of hidden states (dimensionality)
        :type hidden_size: int
        :param src_vocab: Vocabulary object containing src language
        :type src_vocab: Vocab
        :param tgt_vocab: Vocabulary object containing tgt language
        :type tgt_vocab: Vocab
        :param device: torch device to put all modules on
        :type device: torch.device
        :param pretrained_source: Matrix of pre-trained source word embeddings
        :type pretrained_source: Optional[torch.Tensor]
        :param pretrained_target: Matrix of pre-trained target word embeddings
        :type pretrained_target: Optional[torch.Tensor]
        """
        super(NMT, self).__init__()
        self.device=device
        self.embed_size = embed_size
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.d_rate = d_rate ## Mod B
        src_pad_token_idx = src_vocab['<pad>']
        tgt_pad_token_idx = tgt_vocab['<pad>']
        self.source_embedding = nn.Embedding(len(src_vocab), embed_size, padding_idx=src_pad_token_idx)
        self.target_embedding = nn.Embedding(len(tgt_vocab), embed_size, padding_idx=tgt_pad_token_idx)

        with torch.no_grad():
            if pretrained_source is not None:
                self.source_embedding.weight.data = pretrained_source
                # TODO: Decide if we want the embeddings to update as we train
                self.source_embedding.weight.requires_grad = True ## Mod A
        
            if pretrained_target is not None:
                self.target_embedding.weight.data = pretrained_target
                # TODO: Decide if we want the embeddings to update as we train
                self.target_embedding.weight.requires_grad = True ## Mod A
        
        self.hidden_size = hidden_size

        self.encoder = Encoder(
            embed_size=embed_size,
            hidden_size=hidden_size,
            source_embeddings=self.source_embedding,
        )
        self.decoder = Decoder(
            embed_size=embed_size,
            hidden_size=hidden_size,
            target_embedding=self.target_embedding,
            device=self.device,            
            d_rate = self.d_rate ## Mod B
        )


    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        :param source: list of source sentence tokens
        :type source: List[List[str]]
        :param target: list of target sentence tokens, wrapped by `<s>` and `</s>`
        :type target: List[List[str]]
        :returns scores: a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        :rtype: torch.Tensor
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.src_vocab.to_input_tensor(source, device=self.device)   # Tensor: (src_len, b)
        target_padded = self.tgt_vocab.to_input_tensor(target, device=self.device)   # Tensor: (tgt_len, b)
        
        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
        enc_masks = generate_sent_masks(enc_hiddens, source_lengths, self.device)
        # combined_outputs = self.decode(enc_hiddens, dec_init_state, target_padded)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) ## Mod B
        P = F.log_softmax(self.decoder.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out, probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.tgt_vocab['<pad>']).float()
        
        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores


    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        :param source_padded: Tensor of padded source sentences with shape (src_len, b), where
            b = batch_size, src_len = maximum source sentence length. Note that these have
            already been sorted in order of longest to shortest sentence.
        :type source_padded: torch.Tensor
        :param source_lengths: List of actual lengths for each of the source sentences in the batch
        :type source_lengths: List[int]
        :returns: Tuple of two items. The first is Tensor of hidden units with shape (b, src_len, h*2),
            where b = batch size, src_len = maximum source sentence length, h = hidden size. The second is
            Tuple of tensors representing the decoder's initial hidden state and cell.
        :rtype: Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
        """
        return self.encoder(source_padded, source_lengths)

    # def decode(self, enc_hiddens: torch.Tensor, dec_init_state: torch.Tensor,
    #            target_padded: torch.Tensor) -> torch.Tensor:
    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: torch.Tensor, target_padded: torch.Tensor) -> torch.Tensor: ## Mod B
        """Compute combined output vectors for a batch.

        :param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        :param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        :param target_padded: Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size. 

        :returns combined_outputs: combined output tensor  (tgt_len, b,  h), where
                                    tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        :rtype: torch.Tensor
        """
        # return self.decoder(enc_hiddens, dec_init_state, target_padded)
        return self.decoder(enc_hiddens, enc_masks, dec_init_state, target_padded) ## Mod B

    def beam_search(self, src_sent: List[str], beam_size: int=5, max_decoding_time_step: int=70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        :param src_sent: a single source sentence (words)
        :type src_sent: List[str]
        :param beam_size: beam size
        :type beam_size: int
        :param max_decoding_time_step: maximum number of time steps to unroll the decoding RNN
        :type max_decoding_time_step: int
        :returns hypotheses: a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        :rtype: List[Hypothesis]
        """
        src_sents_var = self.src_vocab.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.decoder.att_projection(src_encodings) ## Mod B

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.tgt_vocab['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))
            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num, 
                                                                           src_encodings_att_linear.size(1), 
                                                                           src_encodings_att_linear.size(2)) ## Mod B


            y_tm1 = torch.tensor([self.tgt_vocab[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device)
            y_t_embed = self.target_embedding(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            # h_t, att_t = self.decoder.step(x, h_tm1, exp_src_encodings)
            h_t, att_t = self.decoder.step(x, h_tm1,
                                exp_src_encodings,
                                exp_src_encodings_att_linear, enc_masks=None) ## Mod B
            
            ## TODO: Uncomment the line below if this is an LSTM
            h_t, c_t = h_t

            # log probabilities over target words
            log_p_t = F.log_softmax(self.decoder.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=live_hyp_num)

            prev_hyp_ids = torch.div(top_cand_hyp_pos, len(self.tgt_vocab), rounding_mode='floor')
            hyp_word_ids = top_cand_hyp_pos % len(self.tgt_vocab)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.tgt_vocab.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)

            # h_tm1 = h_t[live_hyp_ids]
            ### TODO: Uncomment the below if it is an LSTM and comment out line
            # above. Otherwise leave.
            h_tm1 = h_t[live_hyp_ids], c_t[live_hyp_ids]
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses


    def greedy(self, src_sent: List[str], max_decoding_time_step: int=70) -> List[Hypothesis]:
        return self.beam_search(src_sent, beam_size=1, max_decoding_time_step=max_decoding_time_step)


    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(
            src_vocab=params['vocab']['source'],
            tgt_vocab=params['vocab']['target'],
            d_rate=0.2, ## Mod B
            **args
        )
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(embed_size=self.embed_size, hidden_size=self.hidden_size),
            'vocab': dict(source=self.src_vocab, target=self.tgt_vocab),
            'state_dict': self.state_dict()
        }

        torch.save(params, path)

# Model training

In [19]:
def batch_iter(data, batch_size, shuffle=False):
    """ Yield batches of source and target sentences reverse sorted by length (largest to smallest).
    :param data: list of tuples containing source and target sentence. ie.
        (list of (src_sent, tgt_sent))
    :type data: List[Tuple[List[str], List[str]]]
    :param batch_size: batch size
    :type batch_size: int
    :param shuffle: whether to randomly shuffle the dataset
    :type shuffle: boolean
    """
    batch_num = math.ceil(len(data) / batch_size)
    index_array = list(range(len(data)))

    if shuffle:
        np.random.shuffle(index_array)

    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = [data[idx] for idx in indices]

        examples = sorted(examples, key=lambda e: len(e[0]), reverse=True)
        src_sents = [e[0] for e in examples]
        tgt_sents = [e[1] for e in examples]

        yield src_sents, tgt_sents

In [20]:
def evaluate_ppl(model, val_data, batch_size=32):
    """ Evaluate perplexity on dev sentences
    :param model: NMT Model
    :type model: NMT
    :param dev_data: list of tuples containing source and target sentence.
        i.e. (list of (src_sent, tgt_sent))
    :param val_data: List[Tuple[List[str], List[str]]]
    :param batch_size: size of batches to extract
    :type batch_size: int
    :returns ppl: perplexity on val sentences
    """
    was_training = model.training
    model.eval()

    cum_loss = 0.
    cum_tgt_words = 0.

    # no_grad() signals backend to throw away all gradients
    with torch.no_grad():
        for src_sents, tgt_sents in batch_iter(val_data, batch_size):
            loss = -model(src_sents, tgt_sents).sum()

            cum_loss += loss.item()
            tgt_word_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            cum_tgt_words += tgt_word_num_to_predict

        ppl = np.exp(cum_loss / cum_tgt_words)
        avg_val_loss = cum_loss / len(val_data)

        wandb.log({
            'avg. val loss' : avg_val_loss,
            'avg. val perplexity' : ppl
        })

    if was_training:
        model.train()

    return ppl


def compute_corpus_level_bleu_score(references: List[List[str]], hypotheses: List[Hypothesis]) -> float:
    """ Given decoding results and reference sentences, compute corpus-level BLEU score.
    :param references: a list of gold-standard reference target sentences
    :type references: List[List[str]]
    :param hypotheses: a list of hypotheses, one for each reference
    :type hypotheses: List[Hypothesis]
    :returns bleu_score: corpus-level BLEU score
    """
    if references[0][0] == '<s>':
        references = [ref[1:-1] for ref in references]
    bleu_score = corpus_bleu([[ref] for ref in references],
                             [hyp.value for hyp in hypotheses])
    return bleu_score


def evaluate_bleu(references, model, source):
    """Generate decoding results and compute BLEU score.
    :param model: NMT Model
    :type model: NMT
    :param references: a list of gold-standard reference target sentences
    :type references: List[List[str]]
    :param source: a list of source sentences
    :type source: List[List[str]]
    :returns bleu_score: corpus-level BLEU score
    """
    with torch.no_grad():
        top_hypotheses = []
        for s in tqdm(source, leave=False):
            hyps = model.beam_search(s, beam_size=16, max_decoding_time_step=(len(s)+10))
            top_hypotheses.append(hyps[0])
    
    s1 = compute_corpus_level_bleu_score(references, top_hypotheses)
    
    return s1

In [21]:
def train_and_evaluate(model, train_data, val_data, optimizer, epochs=10, train_batch_size=32, clip_grad=2, log_every = 100, valid_niter = 500, model_save_path="NMT_model.ckpt"):
    num_trail = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0

    print('Begin Maximum Likelihood training')
    train_time = begin_time = time.time()

    val_data_tgt = [tgt for _, tgt in val_data]
    val_data_src = [src for src, _ in val_data]

    for epoch in tqdm(range(epochs)):
        wandb.log({'epoch' : epoch})
        for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
            train_iter += 1
            
            optimizer.zero_grad()
            
            batch_size = len(src_sents)
            
            example_losses = -model(src_sents, tgt_sents)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size
            loss.backward()
            
            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
            
            optimizer.step()
            
            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val
            
            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                        'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                            report_loss / report_examples,
                                                                                            math.exp(report_loss / report_tgt_words),
                                                                                            cum_examples,
                                                                                            report_tgt_words / (time.time() - train_time),
                                                                                            time.time() - begin_time))
                
                wandb.log({
                    'avg. train loss' : (report_loss / report_examples),
                    'avg. train perplexity' : math.exp(report_loss / report_tgt_words)
                })
                
                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

                

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter,
                                                                                            cum_loss / cum_examples,
                                                                                            np.exp(cum_loss / cum_tgt_words),
                                                                                            cum_examples))
                
                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...')

                # compute dev. ppl and bleu
                dev_ppl = evaluate_ppl(model, val_data, batch_size=128)   # dev batch size can be a bit larger
                valid_metric = -dev_ppl
                
                bleu_score = evaluate_bleu(val_data_tgt, model, val_data_src)*100

                print('validation: iter %d, dev. ppl %f, bleu_score %f' % (train_iter, dev_ppl, bleu_score))

                wandb.log({
                    'bleu score' : bleu_score
                })

                is_better = len(hist_valid_scores) == 0 or bleu_score > max(hist_valid_scores)
                hist_valid_scores.append(bleu_score)

                if is_better:
                    print('save currently the best model to [%s]' % model_save_path)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')


In [22]:
wandb.config = {
    'embed_size' : 350,
    'hidden_size' : 512,
    'epochs' : 10,
    'train_batch_size' : 32, #16
    'clip_grad' : 2,
    'lr' : 1e-3,
    'dropout rate' : .2 #.4
}

# batch_size of 32, dropout rate of .2 yield better results

config = wandb.config

In [23]:
src_vocab = src
tgt_vocab = tgt
pretrained_src = create_embed_matrix(src_vocab, src_sents, config['embed_size'], 100)
pretrained_tgt = create_embed_matrix(tgt_vocab, tgt_sents, config['embed_size'], 100)

log_every = 100
valid_niter = 500
model_save_path="NMT_model.ckpt"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [28]:
# baseline = NMT(
#     config['embed_size'],
#     config['hidden_size'],
#     src_vocab,
#     tgt_vocab,
#     device=device,
#     pretrained_source=None,
#     pretrained_target=None,
# )

# modA = NMT(
#     config['embed_size'],
#     config['hidden_size'],
#     src_vocab,
#     tgt_vocab,
#     device=device,
#     pretrained_source=pretrained_src,
#     pretrained_target=pretrained_tgt,
# )

# modB = NMT(
#     config['embed_size'],
#     config['hidden_size'],
#     src_vocab,
#     tgt_vocab,
#     config['dropout rate'],
#     device=device,
#     pretrained_source=None,
#     pretrained_target=None,
# )

modAB = NMT(
    config['embed_size'],
    config['hidden_size'],
    src_vocab,
    tgt_vocab,
    config['dropout rate'],
    device=device,
    pretrained_source=pretrained_src,
    pretrained_target=pretrained_tgt,
)

model = modAB
model.to(device)
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

In [29]:
# Define each of the variables then you can run this command!
wandb.init(project='P3-NeuralMachineTranslation', entity='botrosmark200', reinit=True)
wandb.watch(model)
train_and_evaluate(
    model,
    train_data,
    val_data,
    optimizer,
    config['epochs'],
    config['train_batch_size'],
    config['clip_grad'],
    log_every,
    valid_niter,
    model_save_path
)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁

0,1
epoch,0


Begin Maximum Likelihood training


  0%|          | 0/10 [00:00<?, ?it/s]

epoch 0, iter 100, avg. loss 84.95, avg. ppl 506.47 cum. examples 3200, speed 4990.33 words/sec, time elapsed 8.75 sec
epoch 0, iter 200, avg. loss 80.91, avg. ppl 288.05 cum. examples 6400, speed 4563.02 words/sec, time elapsed 18.77 sec
epoch 0, iter 300, avg. loss 72.49, avg. ppl 193.70 cum. examples 9600, speed 5131.42 words/sec, time elapsed 27.35 sec
epoch 0, iter 400, avg. loss 67.32, avg. ppl 132.95 cum. examples 12800, speed 5003.64 words/sec, time elapsed 36.16 sec
epoch 0, iter 500, avg. loss 66.05, avg. ppl 109.69 cum. examples 16000, speed 4811.83 words/sec, time elapsed 45.52 sec
epoch 0, iter 500, cum. loss 74.34, cum. ppl 209.96 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 500, dev. ppl 92.499990, bleu_score 2.220536
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 0, iter 600, avg. loss 60.48, avg. ppl 85.17 cum. examples 3200, speed 689.31 words/sec, time elapsed 108.69 sec
epoch 0, iter 700, avg. loss 60.94, avg. ppl 76.12 cum. examples 6400, speed 4785.88 words/sec, time elapsed 118.09 sec
epoch 0, iter 800, avg. loss 57.70, avg. ppl 64.69 cum. examples 9600, speed 5002.36 words/sec, time elapsed 126.95 sec
epoch 0, iter 900, avg. loss 56.49, avg. ppl 55.40 cum. examples 12800, speed 4599.02 words/sec, time elapsed 136.74 sec
epoch 0, iter 1000, avg. loss 54.20, avg. ppl 49.72 cum. examples 16000, speed 4840.08 words/sec, time elapsed 145.92 sec
epoch 0, iter 1000, cum. loss 57.96, cum. ppl 64.88 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 1000, dev. ppl 45.661943, bleu_score 6.568591
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 0, iter 1100, avg. loss 52.84, avg. ppl 45.93 cum. examples 3200, speed 719.35 words/sec, time elapsed 207.33 sec
epoch 1, iter 1200, avg. loss 52.98, avg. ppl 41.54 cum. examples 6387, speed 4980.81 words/sec, time elapsed 216.43 sec
epoch 1, iter 1300, avg. loss 47.67, avg. ppl 30.63 cum. examples 9587, speed 4770.82 words/sec, time elapsed 225.78 sec
epoch 1, iter 1400, avg. loss 47.91, avg. ppl 30.13 cum. examples 12787, speed 5173.48 words/sec, time elapsed 234.48 sec
epoch 1, iter 1500, avg. loss 46.15, avg. ppl 27.52 cum. examples 15987, speed 4929.83 words/sec, time elapsed 243.52 sec
epoch 1, iter 1500, cum. loss 49.51, cum. ppl 34.44 cum. examples 15987
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 1500, dev. ppl 29.840159, bleu_score 14.276758
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 1, iter 1600, avg. loss 42.50, avg. ppl 24.27 cum. examples 3200, speed 668.96 words/sec, time elapsed 307.27 sec
epoch 1, iter 1700, avg. loss 45.38, avg. ppl 25.63 cum. examples 6400, speed 4807.04 words/sec, time elapsed 316.59 sec
epoch 1, iter 1800, avg. loss 45.06, avg. ppl 24.52 cum. examples 9600, speed 4627.33 words/sec, time elapsed 326.34 sec
epoch 1, iter 1900, avg. loss 43.18, avg. ppl 23.12 cum. examples 12800, speed 5022.98 words/sec, time elapsed 335.10 sec
epoch 1, iter 2000, avg. loss 43.64, avg. ppl 23.18 cum. examples 16000, speed 4916.59 words/sec, time elapsed 344.14 sec
epoch 1, iter 2000, cum. loss 43.95, cum. ppl 24.13 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 2000, dev. ppl 22.811098, bleu_score 16.838616
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 1, iter 2100, avg. loss 44.07, avg. ppl 22.88 cum. examples 3200, speed 680.80 words/sec, time elapsed 410.31 sec
epoch 1, iter 2200, avg. loss 43.24, avg. ppl 21.43 cum. examples 6400, speed 4963.73 words/sec, time elapsed 419.42 sec
epoch 1, iter 2300, avg. loss 42.64, avg. ppl 21.25 cum. examples 9600, speed 4706.46 words/sec, time elapsed 428.91 sec
epoch 2, iter 2400, avg. loss 40.24, avg. ppl 18.35 cum. examples 12787, speed 5046.84 words/sec, time elapsed 437.65 sec
epoch 2, iter 2500, avg. loss 36.25, avg. ppl 14.53 cum. examples 15987, speed 4557.47 words/sec, time elapsed 447.17 sec
epoch 2, iter 2500, cum. loss 41.29, cum. ppl 19.49 cum. examples 15987
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 2500, dev. ppl 19.640193, bleu_score 18.594361
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 2, iter 2600, avg. loss 36.45, avg. ppl 14.34 cum. examples 3200, speed 707.69 words/sec, time elapsed 509.05 sec
epoch 2, iter 2700, avg. loss 38.05, avg. ppl 15.04 cum. examples 6400, speed 4686.14 words/sec, time elapsed 518.64 sec
epoch 2, iter 2800, avg. loss 36.36, avg. ppl 14.00 cum. examples 9600, speed 5118.19 words/sec, time elapsed 527.25 sec
epoch 2, iter 2900, avg. loss 37.53, avg. ppl 14.84 cum. examples 12800, speed 4318.02 words/sec, time elapsed 537.57 sec
epoch 2, iter 3000, avg. loss 39.53, avg. ppl 15.55 cum. examples 16000, speed 4524.88 words/sec, time elapsed 547.76 sec
epoch 2, iter 3000, cum. loss 37.58, cum. ppl 14.75 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 3000, dev. ppl 17.755107, bleu_score 21.034564
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 2, iter 3100, avg. loss 37.15, avg. ppl 14.63 cum. examples 3200, speed 685.18 words/sec, time elapsed 612.45 sec
epoch 2, iter 3200, avg. loss 36.14, avg. ppl 13.91 cum. examples 6400, speed 4947.89 words/sec, time elapsed 621.32 sec
epoch 2, iter 3300, avg. loss 36.76, avg. ppl 14.08 cum. examples 9600, speed 5075.13 words/sec, time elapsed 630.09 sec
epoch 2, iter 3400, avg. loss 36.36, avg. ppl 13.73 cum. examples 12800, speed 4710.27 words/sec, time elapsed 639.52 sec
epoch 2, iter 3500, avg. loss 37.31, avg. ppl 13.94 cum. examples 16000, speed 4491.45 words/sec, time elapsed 649.61 sec
epoch 2, iter 3500, cum. loss 36.74, cum. ppl 14.06 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 3500, dev. ppl 15.786347, bleu_score 22.997659
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 3, iter 3600, avg. loss 36.76, avg. ppl 13.49 cum. examples 3187, speed 721.40 words/sec, time elapsed 712.03 sec
epoch 3, iter 3700, avg. loss 32.26, avg. ppl 10.11 cum. examples 6387, speed 4939.02 words/sec, time elapsed 721.07 sec
epoch 3, iter 3800, avg. loss 32.94, avg. ppl 10.58 cum. examples 9587, speed 4823.91 words/sec, time elapsed 730.34 sec
epoch 3, iter 3900, avg. loss 32.39, avg. ppl 10.41 cum. examples 12787, speed 4758.31 words/sec, time elapsed 739.64 sec
epoch 3, iter 4000, avg. loss 34.01, avg. ppl 11.23 cum. examples 15987, speed 4915.45 words/sec, time elapsed 748.80 sec
epoch 3, iter 4000, cum. loss 33.67, cum. ppl 11.11 cum. examples 15987
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 4000, dev. ppl 14.962503, bleu_score 22.155497
epoch 3, iter 4100, avg. loss 31.44, avg. ppl 10.27 cum. examples 3200, speed 730.17 words/sec, time elapsed 807.95 sec
epoch 3, iter 4200, avg. loss 32.58, avg. ppl 10.34 cum. examples 6400, speed 4702.58 words/sec, time elapsed 817.46 sec
epoch 3, iter 4300, avg. loss 33.21, avg. ppl 10.56 cum. examples 9600, speed 4421.55 words/sec, time elapsed 827.65 sec
epoch 3, iter 4400, avg. loss 33.15, avg. ppl 10.83 cum. examples 12800, speed 5409.34 words/sec, time elapsed 835.89 sec
epoch 3, iter 4500, avg. loss 32.25, avg. ppl 10.19 cum. examples 16000, speed 4691.00 words/sec, time elapsed 845.36 sec
epoch 3, iter 4500, cum. loss 32.53, cum. ppl 10.44 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 4500, dev. ppl 14.097452, bleu_score 25.528641
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 3, iter 4600, avg. loss 33.07, avg. ppl 10.79 cum. examples 3200, speed 685.97 words/sec, time elapsed 910.21 sec
epoch 3, iter 4700, avg. loss 32.78, avg. ppl 10.75 cum. examples 6400, speed 4939.00 words/sec, time elapsed 919.15 sec
epoch 4, iter 4800, avg. loss 32.57, avg. ppl 10.03 cum. examples 9587, speed 4681.10 words/sec, time elapsed 928.77 sec
epoch 4, iter 4900, avg. loss 29.04, avg. ppl 8.00 cum. examples 12787, speed 5013.80 words/sec, time elapsed 937.69 sec
epoch 4, iter 5000, avg. loss 30.12, avg. ppl 8.70 cum. examples 15987, speed 4861.76 words/sec, time elapsed 946.86 sec
epoch 4, iter 5000, cum. loss 31.52, cum. ppl 9.58 cum. examples 15987
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 5000, dev. ppl 13.301697, bleu_score 25.906059
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 4, iter 5100, avg. loss 30.13, avg. ppl 8.64 cum. examples 3200, speed 693.37 words/sec, time elapsed 1011.34 sec
epoch 4, iter 5200, avg. loss 29.02, avg. ppl 8.16 cum. examples 6400, speed 4807.22 words/sec, time elapsed 1020.54 sec
epoch 4, iter 5300, avg. loss 29.70, avg. ppl 8.68 cum. examples 9600, speed 5028.91 words/sec, time elapsed 1029.29 sec
epoch 4, iter 5400, avg. loss 30.06, avg. ppl 8.58 cum. examples 12800, speed 4745.22 words/sec, time elapsed 1038.74 sec
epoch 4, iter 5500, avg. loss 29.63, avg. ppl 8.46 cum. examples 16000, speed 4706.61 words/sec, time elapsed 1048.17 sec
epoch 4, iter 5500, cum. loss 29.71, cum. ppl 8.50 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 5500, dev. ppl 13.254245, bleu_score 27.686209
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 4, iter 5600, avg. loss 30.70, avg. ppl 8.99 cum. examples 3200, speed 691.22 words/sec, time elapsed 1112.87 sec
epoch 4, iter 5700, avg. loss 28.89, avg. ppl 8.29 cum. examples 6400, speed 4801.58 words/sec, time elapsed 1121.98 sec
epoch 4, iter 5800, avg. loss 30.82, avg. ppl 8.81 cum. examples 9600, speed 4958.97 words/sec, time elapsed 1131.12 sec
epoch 4, iter 5900, avg. loss 29.36, avg. ppl 8.52 cum. examples 12800, speed 4571.10 words/sec, time elapsed 1140.72 sec
epoch 5, iter 6000, avg. loss 29.35, avg. ppl 7.86 cum. examples 15987, speed 4955.23 words/sec, time elapsed 1149.87 sec
epoch 5, iter 6000, cum. loss 29.82, cum. ppl 8.49 cum. examples 15987
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 6000, dev. ppl 12.242672, bleu_score 26.835240
epoch 5, iter 6100, avg. loss 28.22, avg. ppl 7.21 cum. examples 3200, speed 735.86 words/sec, time elapsed 1211.99 sec
epoch 5, iter 6200, avg. loss 26.57, avg. ppl 6.78 cum. examples 6400, speed 4986.42 words/sec, time elapsed 1220.90 sec
epoch 5, iter 6300, avg. loss 27.67, avg. ppl 7.44 cum. examples 9600, speed 4797.35 words/sec, time elapsed 1230.11 sec
epoch 5, iter 6400, avg. loss 27.97, avg. ppl 7.30 cum. examples 12800, speed 4850.44 words/sec, time elapsed 1239.40 sec
epoch 5, iter 6500, avg. loss 26.75, avg. ppl 7.00 cum. examples 16000, speed 4759.85 words/sec, time elapsed 1248.64 sec
epoch 5, iter 6500, cum. loss 27.44, cum. ppl 7.14 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 6500, dev. ppl 12.566782, bleu_score 27.083182
epoch 5, iter 6600, avg. loss 28.16, avg. ppl 7.75 cum. examples 3200, speed 715.68 words/sec, time elapsed 1310.13 sec
epoch 5, iter 6700, avg. loss 27.43, avg. ppl 7.23 cum. examples 6400, speed 4962.63 words/sec, time elapsed 1319.09 sec
epoch 5, iter 6800, avg. loss 27.93, avg. ppl 7.50 cum. examples 9600, speed 4536.52 words/sec, time elapsed 1328.86 sec
epoch 5, iter 6900, avg. loss 28.26, avg. ppl 7.46 cum. examples 12800, speed 4901.10 words/sec, time elapsed 1338.05 sec
epoch 5, iter 7000, avg. loss 28.13, avg. ppl 7.33 cum. examples 16000, speed 4862.90 words/sec, time elapsed 1347.35 sec
epoch 5, iter 7000, cum. loss 27.98, cum. ppl 7.45 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 7000, dev. ppl 12.172065, bleu_score 28.389693
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 5, iter 7100, avg. loss 27.73, avg. ppl 7.84 cum. examples 3200, speed 689.42 words/sec, time elapsed 1409.86 sec
epoch 6, iter 7200, avg. loss 27.06, avg. ppl 6.98 cum. examples 6387, speed 4813.95 words/sec, time elapsed 1419.08 sec
epoch 6, iter 7300, avg. loss 25.09, avg. ppl 6.00 cum. examples 9587, speed 4868.40 words/sec, time elapsed 1428.28 sec
epoch 6, iter 7400, avg. loss 25.53, avg. ppl 6.14 cum. examples 12787, speed 4631.10 words/sec, time elapsed 1438.02 sec
epoch 6, iter 7500, avg. loss 24.62, avg. ppl 5.90 cum. examples 15987, speed 4897.69 words/sec, time elapsed 1447.08 sec
epoch 6, iter 7500, cum. loss 26.00, cum. ppl 6.52 cum. examples 15987
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 7500, dev. ppl 11.944434, bleu_score 27.508788
epoch 6, iter 7600, avg. loss 26.07, avg. ppl 6.38 cum. examples 3200, speed 741.46 words/sec, time elapsed 1507.79 sec
epoch 6, iter 7700, avg. loss 26.42, avg. ppl 6.57 cum. examples 6400, speed 4752.13 words/sec, time elapsed 1517.24 sec
epoch 6, iter 7800, avg. loss 26.27, avg. ppl 6.63 cum. examples 9600, speed 5173.54 words/sec, time elapsed 1525.83 sec
epoch 6, iter 7900, avg. loss 26.62, avg. ppl 6.78 cum. examples 12800, speed 4807.00 words/sec, time elapsed 1535.09 sec
epoch 6, iter 8000, avg. loss 25.60, avg. ppl 6.48 cum. examples 16000, speed 5078.83 words/sec, time elapsed 1543.72 sec
epoch 6, iter 8000, cum. loss 26.20, cum. ppl 6.57 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 8000, dev. ppl 11.892612, bleu_score 26.815679
epoch 6, iter 8100, avg. loss 27.37, avg. ppl 7.09 cum. examples 3200, speed 717.25 words/sec, time elapsed 1606.09 sec
epoch 6, iter 8200, avg. loss 27.15, avg. ppl 7.11 cum. examples 6400, speed 4825.40 words/sec, time elapsed 1615.28 sec
epoch 6, iter 8300, avg. loss 27.69, avg. ppl 7.35 cum. examples 9600, speed 4530.47 words/sec, time elapsed 1625.09 sec
epoch 7, iter 8400, avg. loss 23.49, avg. ppl 5.83 cum. examples 12787, speed 5208.58 words/sec, time elapsed 1633.25 sec
epoch 7, iter 8500, avg. loss 23.85, avg. ppl 5.44 cum. examples 15987, speed 4951.16 words/sec, time elapsed 1642.35 sec
epoch 7, iter 8500, cum. loss 25.91, cum. ppl 6.52 cum. examples 15987
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 8500, dev. ppl 11.515043, bleu_score 30.023968
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 7, iter 8600, avg. loss 24.44, avg. ppl 5.74 cum. examples 3200, speed 705.87 words/sec, time elapsed 1705.77 sec
epoch 7, iter 8700, avg. loss 25.00, avg. ppl 5.78 cum. examples 6400, speed 4723.98 words/sec, time elapsed 1715.42 sec
epoch 7, iter 8800, avg. loss 24.25, avg. ppl 5.94 cum. examples 9600, speed 4855.85 words/sec, time elapsed 1724.39 sec
epoch 7, iter 8900, avg. loss 24.78, avg. ppl 5.86 cum. examples 12800, speed 4871.02 words/sec, time elapsed 1733.61 sec
epoch 7, iter 9000, avg. loss 24.32, avg. ppl 5.96 cum. examples 16000, speed 4856.36 words/sec, time elapsed 1742.60 sec
epoch 7, iter 9000, cum. loss 24.56, cum. ppl 5.85 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 9000, dev. ppl 11.461352, bleu_score 29.461122
epoch 7, iter 9100, avg. loss 25.26, avg. ppl 5.98 cum. examples 3200, speed 716.11 words/sec, time elapsed 1805.71 sec
epoch 7, iter 9200, avg. loss 24.08, avg. ppl 5.72 cum. examples 6400, speed 4776.79 words/sec, time elapsed 1814.96 sec
epoch 7, iter 9300, avg. loss 25.37, avg. ppl 6.14 cum. examples 9600, speed 4446.61 words/sec, time elapsed 1825.02 sec
epoch 7, iter 9400, avg. loss 25.57, avg. ppl 6.22 cum. examples 12800, speed 4967.65 words/sec, time elapsed 1834.04 sec
epoch 7, iter 9500, avg. loss 26.03, avg. ppl 6.49 cum. examples 16000, speed 4860.75 words/sec, time elapsed 1843.20 sec
epoch 7, iter 9500, cum. loss 25.26, cum. ppl 6.11 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 9500, dev. ppl 11.249738, bleu_score 28.705507
epoch 8, iter 9600, avg. loss 23.67, avg. ppl 5.51 cum. examples 3187, speed 723.23 words/sec, time elapsed 1904.31 sec
epoch 8, iter 9700, avg. loss 22.10, avg. ppl 4.98 cum. examples 6387, speed 5216.90 words/sec, time elapsed 1912.75 sec
epoch 8, iter 9800, avg. loss 23.46, avg. ppl 5.20 cum. examples 9587, speed 4680.85 words/sec, time elapsed 1922.48 sec
epoch 8, iter 9900, avg. loss 24.00, avg. ppl 5.47 cum. examples 12787, speed 4732.55 words/sec, time elapsed 1932.04 sec
epoch 8, iter 10000, avg. loss 23.56, avg. ppl 5.39 cum. examples 15987, speed 4991.62 words/sec, time elapsed 1941.01 sec
epoch 8, iter 10000, cum. loss 23.36, cum. ppl 5.31 cum. examples 15987
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 10000, dev. ppl 11.388376, bleu_score 29.824903
epoch 8, iter 10100, avg. loss 23.57, avg. ppl 5.45 cum. examples 3200, speed 699.32 words/sec, time elapsed 2004.58 sec
epoch 8, iter 10200, avg. loss 23.30, avg. ppl 5.64 cum. examples 6400, speed 4994.18 words/sec, time elapsed 2013.22 sec
epoch 8, iter 10300, avg. loss 24.67, avg. ppl 5.73 cum. examples 9600, speed 4467.03 words/sec, time elapsed 2023.35 sec
epoch 8, iter 10400, avg. loss 23.81, avg. ppl 5.73 cum. examples 12800, speed 4897.61 words/sec, time elapsed 2032.27 sec
epoch 8, iter 10500, avg. loss 24.37, avg. ppl 5.90 cum. examples 16000, speed 4930.63 words/sec, time elapsed 2041.19 sec
epoch 8, iter 10500, cum. loss 23.94, cum. ppl 5.69 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 10500, dev. ppl 11.493174, bleu_score 29.774766
epoch 8, iter 10600, avg. loss 25.08, avg. ppl 5.94 cum. examples 3200, speed 729.23 words/sec, time elapsed 2102.95 sec
epoch 8, iter 10700, avg. loss 24.45, avg. ppl 5.74 cum. examples 6400, speed 4875.47 words/sec, time elapsed 2112.14 sec
epoch 9, iter 10800, avg. loss 22.17, avg. ppl 4.87 cum. examples 9587, speed 4978.53 words/sec, time elapsed 2121.10 sec
epoch 9, iter 10900, avg. loss 20.86, avg. ppl 4.59 cum. examples 12787, speed 4997.38 words/sec, time elapsed 2129.87 sec
epoch 9, iter 11000, avg. loss 21.10, avg. ppl 4.77 cum. examples 15987, speed 4548.76 words/sec, time elapsed 2139.37 sec
epoch 9, iter 11000, cum. loss 22.73, cum. ppl 5.16 cum. examples 15987
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 11000, dev. ppl 11.453068, bleu_score 30.408098
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 9, iter 11100, avg. loss 23.16, avg. ppl 5.03 cum. examples 3200, speed 724.44 words/sec, time elapsed 2202.71 sec
epoch 9, iter 11200, avg. loss 22.12, avg. ppl 4.96 cum. examples 6400, speed 4754.95 words/sec, time elapsed 2212.02 sec
epoch 9, iter 11300, avg. loss 22.87, avg. ppl 5.26 cum. examples 9600, speed 4923.30 words/sec, time elapsed 2220.97 sec
epoch 9, iter 11400, avg. loss 22.11, avg. ppl 4.87 cum. examples 12800, speed 4900.33 words/sec, time elapsed 2230.09 sec
epoch 9, iter 11500, avg. loss 23.30, avg. ppl 5.36 cum. examples 16000, speed 4744.04 words/sec, time elapsed 2239.46 sec
epoch 9, iter 11500, cum. loss 22.71, cum. ppl 5.09 cum. examples 16000
begin validation ...


  0%|          | 0/1837 [00:00<?, ?it/s]

validation: iter 11500, dev. ppl 11.237166, bleu_score 31.299686
save currently the best model to [NMT_model.ckpt]


save model parameters to [NMT_model.ckpt]


epoch 9, iter 11600, avg. loss 23.56, avg. ppl 5.35 cum. examples 3200, speed 683.50 words/sec, time elapsed 2305.29 sec
epoch 9, iter 11700, avg. loss 23.44, avg. ppl 5.35 cum. examples 6400, speed 5163.33 words/sec, time elapsed 2313.95 sec
epoch 9, iter 11800, avg. loss 24.70, avg. ppl 5.77 cum. examples 9600, speed 4453.17 words/sec, time elapsed 2324.08 sec
epoch 9, iter 11900, avg. loss 23.51, avg. ppl 5.43 cum. examples 12800, speed 4678.99 words/sec, time elapsed 2333.59 sec


In [30]:
shutil.move("NMT_model.ckpt","drive/MyDrive/P3-NeuralMachineTranslation/models/mod_ab.ckpt")

'drive/MyDrive/P3-NeuralMachineTranslation/models/mod_ab.ckpt'

# Model testing
The function below will be useful for analyzing translations by piecing back together the prediction into a cohesive sequence of tokens.

In [31]:
import re
def untokenize(words):
    """
    Untokenizing a text undoes the tokenizing operation, restoring
    punctuation and spaces to the places that people expect them to be.
    Ideally, `untokenize(tokenize(text))` should be identical to `text`,
    except for line breaks.
    """
    text = ' '.join(words)
    step1 = text.replace("`` ", '"').replace(" ''", '"').replace('. . .',  '...')
    step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
    step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
    step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3)
    step5 = step4.replace(" '", "'").replace(" n't", "n't").replace(
         "can not", "cannot")
    step6 = step5.replace(" ` ", " '")
    return step6.strip()

In [34]:
# baseline_nmt = NMT.load("drive/MyDrive/P3-NeuralMachineTranslation/models/baseline.ckpt")
# baseline_nmt.to(device)
# baseline_nmt.device = device
# baseline_nmt.decoder.device = device

# mod_a_nmt = NMT.load("drive/MyDrive/P3-NeuralMachineTranslation/models/mod_a.ckpt")
# mod_a_nmt.to(device)
# mod_a_nmt.device = device
# mod_a_nmt.decoder.device = device

# mod_b_nmt = NMT.load("drive/MyDrive/P3-NeuralMachineTranslation/models/mod_b.ckpt")
# mod_b_nmt.to(device)
# mod_b_nmt.device = device
# mod_b_nmt.decoder.device = device

mod_ab_nmt = NMT.load("drive/MyDrive/P3-NeuralMachineTranslation/models/mod_ab.ckpt")
mod_ab_nmt.to(device)
mod_ab_nmt.device = device
mod_ab_nmt.decoder.device = device

## Live running demo

In [36]:
test_outputs = {
    "baseline_nmt": [],
    "mod_a_nmt": [],
    "mod_b_nmt": [],
    "mod_ab_nmt": []
}

In [45]:
test_sents = read_corpus(test_path, "test")
test_sents = [untokenize(sent) for sent in test_sents]
test_sents[1700]

'it looks just like the dead king!'

In [46]:
nmt_document_preprocessor = lambda x: nltk.word_tokenize(x) # This is for your RNN

#@title Translation
#@markdown Enter a sentence to see the translation
input_string = "it looks just like the dead king!" #@param {type:"string"}
model_type = "mod_ab_nmt" #@param ["baseline_nmt", "mod_a_nmt", "mod_b_nmt", "mod_ab_nmt"]
from IPython.display import HTML

import re
def untokenize(words):
    """
    Untokenizing a text undoes the tokenizing operation, restoring
    punctuation and spaces to the places that people expect them to be.
    Ideally, `untokenize(tokenize(text))` should be identical to `text`,
    except for line breaks.
    """
    text = ' '.join(words)
    step1 = text.replace("`` ", '"').replace(" ''", '"').replace('. . .',  '...')
    step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
    step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
    step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3)
    step5 = step4.replace(" '", "'").replace(" n't", "n't").replace(
         "can not", "cannot")
    step6 = step5.replace(" ` ", " '")
    return step6.strip()

output = ""

# BAD THING TO DO BELOW!!
model_used = globals()[model_type]
model_used.to(device)

with torch.no_grad():
    # RUN MODEL
    translation = untokenize(model_used.beam_search(
        nmt_document_preprocessor(input_string),
        beam_size=64,
        max_decoding_time_step=len(input_string)+10
    )[0].value)

test_outputs[model_type].append(f"input: {input_string} ----> output: {translation}")

# Generate nice display
output += '<p style="font-family:verdana; font-size:110%;">'
output += " Input sequence: "+input_string+"</p>"
output += '<p style="font-family:verdana; font-size:110%;">'
output += f" Translation to Shakespeare: {translation}</p><hr>"
output = "<h3>Results:</h3>" + output

display(HTML(output))