In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from io import open
from collections import Counter
from functools import partial
import unicodedata
import re
from torch.autograd import Variable
from gensim.models import KeyedVectors
import sacrebleu
import random
import time
from datetime import datetime
import pickle as pkl
import string
import os
from os import listdir 
from ast import literal_eval
from sklearn.metrics import confusion_matrix
import matplotlib.style
import matplotlib as mpl

pd.set_option('max_colwidth',100)
mpl.style.use('bmh')
%matplotlib inline
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Process Data

In [2]:
RESERVED_TOKENS = {'<SOS>': 0, '<EOS>': 1, '<PAD>': 2, '<UNK>': 3}

In [3]:
def text2tokens(raw_text_fp, lang_type): 
    """ Takes filepath to raw text and outputs a list of lists, each representing a sentence of words (tokens) """
    with open(raw_text_fp) as f:
        tokens_data = [line.lower().split() for line in f.readlines()]
        if lang_type == 'source': 
            tokens_data = [datum + ['<EOS>'] for datum in tokens_data]
        elif lang_type == 'target': 
            tokens_data = [['<SOS>'] + datum + ['<EOS>'] for datum in tokens_data]
    return tokens_data 

def load_word2vec(lang): 
    """ Loads pretrained vectors for a given language """
    filepath = "data/pretrained_word2vec/wiki.zh.vec".format(lang)
    word2vec = KeyedVectors.load_word2vec_format(filepath)
    return word2vec

def build_vocab(token_lists, max_vocab_size, word2vec): 
    # UPDATE 11/28: take the most frequently occuring N words even if it doesn't exist in word2vec
    """ Takes lists of tokens (representing sentences of words), max_vocab_size, word2vec model and returns: 
        - id2token: list of tokens, where id2token[i] returns token that corresponds to i-th token 
        - token2id: dictionary where keys represent tokens and corresponding values represent their indices
        Note that the vocab will comprise N=max_vocab_size-len(RESERVED_TOKENS) tokens that are in word2vec model 
    """
    num_vocab = max_vocab_size - len(RESERVED_TOKENS)
    all_tokens = [token for sublist in token_lists for token in sublist]
    token_counter = Counter(all_tokens)
    vocab, count = zip(*token_counter.most_common(num_vocab))
    id2token = list(RESERVED_TOKENS.keys()) + list(vocab)
    token2id = dict(zip(id2token, range(max_vocab_size)))
    
    # check out how many words are in word2vec vs. not 
    not_in_word2vec = [1 for token in token2id if token not in word2vec]
    pct_of_corpus = 100 * sum([token_counter[token] for token in token_counter if token not in word2vec]) / len(all_tokens)
    
    print("A vocabulary of {} is generated from a set of {} unique tokens.".format(len(token2id), len(token_counter)))
    print("{} vocab tokens are not in word2vec, comprising {:.1f}% of entire corpus.".format(len(not_in_word2vec), pct_of_corpus))
    
    return token2id, id2token 

def tokens2indices(tokens_data, token2id): 
    """ Takes tokenized data and token2id dictionary and returns indexed data """
    indices_data = [] 
    for datum in tokens_data: 
        indices_datum = [token2id[token] if token in token2id else RESERVED_TOKENS['<UNK>'] for token in datum ]
        indices_data.append(indices_datum)    
    return indices_data

def get_filepath(split, src_lang, targ_lang, lang_type): 
    """ Locates data filepath given data split type (train/dev/test), translation pairs (src_lang -> targ_lang), 
        and the language type (source or target)
    """
    folder_name = "data/iwslt-{}-{}/".format(src_lang, targ_lang)
    if lang_type == 'source': 
        file_name = "{}.tok.{}".format(split, src_lang)
    elif lang_type == 'target': 
        file_name = "{}.tok.{}".format(split, targ_lang)
    return folder_name + file_name 

def get_filepaths(src_lang, targ_lang): 
    """ Takes language names to be translated from and to (in_lang and out_lang respectively) as inputs, 
        returns a nested dictionary containing the filepaths for input/output data for train/dev/test sets  
    """
    fps = {} 
    
    # store language names 
    fps['languages'] = {} 
    fps['languages']['source'] = src_lang
    fps['languages']['target'] = targ_lang 
    
    # store filepaths 
    for split in ['train', 'dev', 'test']: 
        fps[split] = {} 
        for lang_type in ['source', 'target']: 
            fps[split][lang_type] = {} 
            fps[split][lang_type]['filepath'] = get_filepath(split, src_lang, targ_lang, lang_type)
            
    return fps 

In [4]:
SRC_LANG = 'zh'
TARG_LANG = 'en'

### Define Vocab 

In [5]:
SRC_VOCAB_SIZE = 30000
TARG_VOCAB_SIZE = 30000
# ENC_EMBED_DIM = 300 
# DEC_EMBED_DIM = 300

In [6]:
# generate language dictionaries from train sets 

def generate_vocab(src_lang, targ_lang, src_vocab_size, targ_vocab_size):
    # UPDATE 11/28: take the most frequently occuring N words even if it doesn't exist in word2vec
    """ Outputs a nested dictionary containing token2id, id2token, and word embeddings 
    for source and target lang's vocab """
    
    vocab = {} 
    for lang, vocab_size in zip([src_lang, targ_lang], [src_vocab_size, targ_vocab_size]): 
        
        # load train data 
        train_data_fp = get_filepath(split='train', src_lang=SRC_LANG, targ_lang=TARG_LANG, 
                                     lang_type='target' if lang == 'en' else 'source')
        with open(train_data_fp) as f:
            train_tokens = [line.lower().split() for line in f.readlines()]        
        
        # load word embeddings, generate token2id and id2token 
        word2vec_full = load_word2vec(lang)
        token2id, id2token = build_vocab(train_tokens, vocab_size, word2vec_full) 
        word2vec_reduced = {word: word2vec_full[word] for word in token2id if word in word2vec_full} 
        
        # store token2id, id2token, and word embeddings as a dict in nested dict lang 
        vocab[lang] = {'token2id': token2id, 'id2token': id2token, 'word2vec': word2vec_reduced}
        
    return vocab 

In [7]:
vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)

A vocabulary of 30000 is generated from a set of 88421 unique tokens.
11854 vocab tokens are not in word2vec, comprising 11.0% of entire corpus.
A vocabulary of 30000 is generated from a set of 60694 unique tokens.
8015 vocab tokens are not in word2vec, comprising 6.2% of entire corpus.


### Create Data Loaders 

In [8]:
def process_data(src_lang, targ_lang, sample_limit=None): 
    # UPDATE 11/27: added sample_limit parameter to output only a subset of sentences 
    """ Takes source language and target language names and respective max vocab sizes as inputs 
        and returns as a nested dictionary containing: 
        - train_indices, val_indices, test_indices (as lists of source-target tuples)
        - train_tokens, val_tokens, test_tokens (as lists of source-target tuples)
        - source language's token2id and id2token 
        - target language's token2id and id2token
    """
    
    # get filepaths 
    data = get_filepaths(src_lang, targ_lang)
    
    # loop through each file, read in text, convert to tokens, then to indices 
    for split in ['train', 'dev', 'test']: 
        for lang_type in ['source', 'target']: 
            # read in tokens 
            tokens = text2tokens(data[split][lang_type]['filepath'], lang_type)
            if sample_limit is not None: 
                tokens = tokens[:sample_limit]
            # convert tokens to indices 
            indices = tokens2indices(tokens, vocab[data['languages'][lang_type]]['token2id'])
            # save to dictionary 
            data[split][lang_type]['tokens'] = tokens
            data[split][lang_type]['indices'] = indices
            
    return data

In [9]:
data = process_data(SRC_LANG, TARG_LANG)
limited_data = process_data(SRC_LANG, TARG_LANG, sample_limit=BATCH_SIZE) 

NameError: name 'BATCH_SIZE' is not defined

In [None]:
# print example for sanity check  
print("Example Source: {}".format(data['train']['source']['tokens'][5]))
print("Example Target: {}".format(data['train']['target']['tokens'][5]))

In [None]:
# check distribution of source sentence lengths 
pd.Series(np.array([len(l) for l in data['train']['source']['indices']])).hist(bins=100);

In [None]:
# check distribution of target sentence lengths 
pd.Series(np.array([len(l) for l in data['train']['target']['indices']])).hist(bins=100); 

# Create Data Loaders

In [None]:
class TranslationDataset(Dataset): 
    """ 
    Class that represents a train/validation/test/dataset that's readable for Pytorch. 
    Note that this class inherits torch.utils.data.Dataset
    """
    def __init__(self, src_indices, targ_indices, src_max_sentence_len, targ_max_sentence_len):
        """ 
        Initialize dataset by passing in a list of input indices and a list of output indices 
        """
        self.src_indices = src_indices
        self.targ_indices = targ_indices
        self.src_max_sentence_len = src_max_sentence_len
        self.targ_max_sentence_len = targ_max_sentence_len
        assert (len(self.src_indices) == len(self.targ_indices))
        
    def __len__(self): 
        return len(self.src_indices)
    
    def __getitem__(self, key): 
        """ 
        Triggered when dataset[i] is called, outputs lists of input and output indices, as well as their 
        respective lengths
        """
        src_idx = self.src_indices[key][:self.src_max_sentence_len]
        src_len = len(src_idx)
        targ_idx = self.targ_indices[key][:self.targ_max_sentence_len]
        targ_len = len(targ_idx)
        return [src_idx, targ_idx, src_len, targ_len]
    
def collate_func(src_max_sentence_len, targ_max_sentence_len, batch): 
    """ Customized function for DataLoader that dynamically pads the batch so that all data have the same length"""
    
    src_idxs = [] 
    targ_idxs = [] 
    src_lens = [] 
    targ_lens = [] 
    
    for datum in batch: 
        # append original lengths of sequences 
        src_lens.append(datum[2]) 
        targ_lens.append(datum[3])
        
        # pad sequences before appending 
        src_idx_padded = np.pad(array=np.array(datum[0]), pad_width = ((0, src_max_sentence_len - datum[2])), 
                                mode='constant', constant_values=RESERVED_TOKENS['<PAD>'])
        targ_idx_padded = np.pad(array=np.array(datum[1]), pad_width = ((0, targ_max_sentence_len - datum[3])),
                                 mode='constant', constant_values=RESERVED_TOKENS['<PAD>'])
        src_idxs.append(src_idx_padded)
        targ_idxs.append(targ_idx_padded)
    
    return [torch.from_numpy(np.array(src_idxs)), torch.from_numpy(np.array(targ_idxs)), 
            torch.LongTensor(src_lens), torch.LongTensor(targ_lens)]

def create_dataloaders(processed_data, src_max_sentence_len, targ_max_sentence_len, batch_size): 
    """ Takes processed_data as dictionary output from process_data func, maximum sentence lengths, 
        and outputs train_loader, dev_loader, and test_loaders 
    """
    loaders = {} 
    for split in ['train', 'dev', 'test']: 
        dataset = TranslationDataset(processed_data[split]['source']['indices'], processed_data[split]['target']['indices'], 
                                     src_max_sentence_len, targ_max_sentence_len)
        loaders[split] = DataLoader(dataset, batch_size=batch_size, shuffle=False, 
                                    collate_fn=partial(collate_func, src_max_sentence_len, targ_max_sentence_len))
    return loaders['train'], loaders['dev'], loaders['test']

In [None]:
BATCH_SIZE = 32
SRC_MAX_SENTENCE_LEN = 40 
TARG_MAX_SENTENCE_LEN = 40

In [None]:
train_loader, dev_loader, test_loader = create_dataloaders(
    data, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)
train_loader_limited, dev_loader_limited, test_loader_limited = create_dataloaders(
    limited_data, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)

In [None]:
# check that loader works 
for i, (src_idxs, targ_idxs, src_lens, targ_lens) in enumerate(train_loader):
    print(i)
    print(src_idxs.size())
    print(src_idxs)
    print(src_lens)
    print(targ_idxs.size())
    print(targ_idxs)
    print(targ_lens)
    break 

# Model Architecture

In [None]:
def get_pretrained_emb(word2vec, token2id): 
    """ Given word2vec model and the vocab's token2id, extract pretrained word embeddings """
    pretrained_emb = np.zeros((len(token2id), 300)) 
    for token in token2id: 
        try: 
            pretrained_emb[token2id[token]] = word2vec[token]
        except: 
            pretrained_emb[token2id[token]] = np.random.normal(size=(300,))
    return torch.from_numpy(pretrained_emb.astype(np.float32)).to(device)

In [None]:
class EncoderRNN(nn.Module):
    
    def __init__(self, enc_hidden_dim, num_layers, pretrained_word2vec): 
        super(EncoderRNN, self).__init__()
        self.enc_embed_dim = 300
        self.enc_hidden_dim = enc_hidden_dim 
        self.num_layers = num_layers
        self.embedding = nn.Embedding.from_pretrained(pretrained_word2vec, freeze=True)
        self.gru = nn.GRU(input_size=self.enc_embed_dim, hidden_size=self.enc_hidden_dim, num_layers=self.num_layers, 
                          batch_first=True, bidirectional=True)
    
    def forward(self, enc_input, enc_input_lens):
        batch_size = enc_input.size()[0]
        _, idx_sort = torch.sort(enc_input_lens, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        enc_input, enc_input_lens = enc_input.index_select(0, idx_sort), enc_input_lens.index_select(0, idx_sort)
        embedded = self.embedding(enc_input)
        embedded = torch.nn.utils.rnn.pack_padded_sequence(embedded, enc_input_lens, batch_first=True)
        hidden = self.initHidden(batch_size).to(device)
        output, hidden = self.gru(embedded, hidden)
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True, 
                                                           total_length=SRC_MAX_SENTENCE_LEN,
                                                           padding_value=RESERVED_TOKENS['<PAD>'])
        output = output.index_select(0, idx_unsort)
        hidden = hidden.index_select(1, idx_unsort).transpose(0, 1).contiguous().view(self.num_layers, batch_size, -1)
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(2*self.num_layers, batch_size, self.enc_hidden_dim, device=device)
    
class DecoderRNN(nn.Module):

    def __init__(self, dec_hidden_dim, enc_hidden_dim, num_layers, pretrained_word2vec):
        super(DecoderRNN, self).__init__()
        self.dec_embed_dim = 300
        self.dec_hidden_dim = dec_hidden_dim 
        self.enc_hidden_dim = enc_hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding.from_pretrained(pretrained_word2vec, freeze=True)
        self.gru = nn.GRU(self.dec_embed_dim + 2 * self.enc_hidden_dim, self.dec_hidden_dim, num_layers=self.num_layers)
        self.out = nn.Linear(dec_hidden_dim, TARG_VOCAB_SIZE)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, dec_input, dec_hidden, enc_outputs): 
        batch_size = dec_input.size()[0]
        embedded = self.embedding(dec_input).view(1, batch_size, -1)
        context = torch.cat([enc_outputs[:, -1, :self.enc_hidden_dim], 
                             enc_outputs[:, 0, self.enc_hidden_dim:]], dim=1).unsqueeze(0)
        concat = torch.cat([embedded, context], 2)
        output, hidden = self.gru(concat, dec_hidden)
        output = self.softmax(self.out(output[0]))    
        return output, hidden
        
class EncoderDecoder(nn.Module): 
    
    def __init__(self, encoder, decoder, decoder_token2id): 
        super(EncoderDecoder, self).__init__() 
        self.encoder = encoder 
        self.decoder = decoder 

    def forward(self, src_idx, targ_idx, src_lens, targ_lens, teacher_forcing_ratio): 
        batch_size = src_idx.size()[0]
        enc_outputs, enc_hidden = self.encoder(src_idx, src_lens)
        dec_hidden = enc_hidden 
        dec_outputs = Variable(torch.zeros(TARG_MAX_SENTENCE_LEN, batch_size, TARG_VOCAB_SIZE))
        hypotheses = Variable(torch.zeros(TARG_MAX_SENTENCE_LEN, batch_size))
        dec_output = targ_idx[:, 0] # initialize with <SOS>
        for di in range(1, TARG_MAX_SENTENCE_LEN): 
            dec_output, dec_hidden = self.decoder(dec_output, dec_hidden, enc_outputs)
            dec_outputs[di] = dec_output 
            teacher_labels = targ_idx[:, di-1] 
            greedy_labels = dec_output.data.max(1)[1]
            dec_output = teacher_labels if random.random() < teacher_forcing_ratio else greedy_labels 
            hypotheses[di] = greedy_labels

        return dec_outputs, hypotheses.transpose(0,1)
    
class Attention(nn.Module): 
    
    """ Implements the attention mechanism by Bahdanau et al. (2015) """
    
    def __init__(self, enc_hidden_dim, dec_hidden_dim, num_annotations, num_layers): 
        super(Attention, self).__init__() 
        self.num_annotations = num_annotations
        self.input_dim = enc_hidden_dim * 2 + dec_hidden_dim
        self.attn = nn.Linear(self.input_dim, self.num_annotations)
        self.v = nn.Parameter(torch.rand(self.num_annotations))
        self.num_layers = num_layers 
        nn.init.normal_(self.v)
        
    def forward(self, encoder_outputs, last_dec_hidden): 
        batch_size = encoder_outputs.size()[0]
        last_dec_hidden = last_dec_hidden.transpose(0, 1)[:, -1, :].unsqueeze(1) 
        hidden_broadcast = last_dec_hidden.repeat(1, self.num_annotations, 1)
        v_broadcast = self.v.repeat(batch_size, 1, 1)
        concat = torch.cat([encoder_outputs, hidden_broadcast], dim=2)
        energies = v_broadcast.bmm(torch.tanh(self.attn(concat)))
        attn_weights = F.softmax(energies, dim=2).squeeze(1)
        return attn_weights

class DecoderAttnRNN(nn.Module):
    
    def __init__(self, dec_hidden_dim, enc_hidden_dim, num_layers, pretrained_word2vec):
        super(DecoderAttnRNN, self).__init__()
        self.dec_embed_dim = 300
        self.dec_hidden_dim = dec_hidden_dim 
        self.enc_hidden_dim = enc_hidden_dim
        self.num_layers = num_layers 
        self.embedding = nn.Embedding.from_pretrained(pretrained_word2vec, freeze=True)
        self.attn = Attention(self.enc_hidden_dim, self.dec_hidden_dim, 
                              num_annotations=SRC_MAX_SENTENCE_LEN, num_layers=self.num_layers)
        self.gru = nn.GRU(self.dec_embed_dim + 2 * self.enc_hidden_dim, self.dec_hidden_dim, num_layers=self.num_layers)
        self.out = nn.Linear(self.dec_hidden_dim, TARG_VOCAB_SIZE)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, dec_input, dec_hidden, enc_outputs): 
        batch_size = dec_input.size()[0]
        embedded = self.embedding(dec_input).view(1, batch_size, -1)
        attn_weights = self.attn(encoder_outputs=enc_outputs, last_dec_hidden=dec_hidden).unsqueeze(1)
        context = attn_weights.bmm(enc_outputs).transpose(0, 1)
        concat = torch.cat([embedded, context], 2)
        output, hidden = self.gru(concat, dec_hidden)
        output = self.softmax(self.out(output[0]))    
        return output, hidden

# Train and Evaluate

In [None]:
def filter_output_indices(list_indices): 
    # NEW 11/28
    """ Filters out any tokens predicted after <EOS>, as well as <EOS>, <SOS>, and <PAD> themselves """
    
    # drops everything after <EOS> 
    try: 
        output = list_indices[:list_indices.index(RESERVED_TOKENS['<EOS>'])]
    except: 
        output = list_indices
    # drops <SOS>, <EOS>, <PAD>  
    ignored_idx = [RESERVED_TOKENS[token] for token in ['<SOS>', '<EOS>', '<PAD>']] 
    output = [idx for idx in output if idx not in ignored_idx]
    return output 

def tensor2corpus(tensor, id2token): 
    # UPDATED 11/28: Use filter_output_indices to filter out tokens predicted after <EOS> as described above 
    """ Takes a tensor (num_sentences x max_sentence_length) representing the corpus, 
        returns its string equivalent 
    """    
    
    # convert input tensor to a list of lists 
    list_of_lists = tensor.numpy().astype(int).tolist()
    
    # filter each list using above function 
    filtered = [filter_output_indices(l) for l in list_of_lists]
    
    # use dictionary to return string equivalent 
    corpus = ' '.join([id2token[idx] for l in filtered for idx in l])
    
    return corpus

In [None]:
def evaluate(model, loader, id2token, teacher_forcing_ratio=0.0): 
    """ 
    Helper function that tests the model's performance on a given dataset 
    @param: loader = data loader for the dataset to test against 
    """
    
    model.eval() 
    criterion = nn.NLLLoss()
    total_loss = 0 
    reference_corpus = []
    hypothesis_corpus = [] 
    
    for i, (src_idxs, targ_idxs, src_lens, targ_lens) in enumerate(loader): 
        batch_size = src_idxs.size()[0]        
        outputs, hypotheses = model(src_idxs, targ_idxs, src_lens, targ_lens, 
                                    teacher_forcing_ratio=teacher_forcing_ratio)
        outputs = outputs[1:].view(-1, TARG_VOCAB_SIZE)
        targets = targ_idxs[:,1:]
        hypothesis_corpus.append(hypotheses)
        reference_corpus.append(targets)
 
        loss = F.nll_loss(outputs.view(-1, TARG_VOCAB_SIZE), targets.contiguous().view(-1), 
                          ignore_index=RESERVED_TOKENS['<PAD>'])
        total_loss += loss.item()  

    # reconstruct corpus and compute bleu score 
    hypothesis_corpus = torch.cat(hypothesis_corpus, dim=0) 
    reference_corpus = torch.cat(reference_corpus, dim=0)
    hypothesis_corpus = tensor2corpus(hypothesis_corpus, id2token)
    reference_corpus = tensor2corpus(reference_corpus, id2token)
    bleu_score = sacrebleu.corpus_bleu(hypothesis_corpus, reference_corpus).score
    
    return total_loss / len(loader), bleu_score, hypothesis_corpus

In [None]:
# helper functions to save results to and load results from a pkl logfile 

RESULTS_LOG = 'experiment_results/experiment_results_log.pkl'

def check_dir_exists(filename): 
    """ Helper function to check that the directory of filename exists, otherwise creates it """
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    else: 
        pass 
        
def append_to_log(hyperparams, results, runtime, experiment_name, dt_created, filename=RESULTS_LOG): 
    """ Appends results and details of a single experiment to a log file """
    
    # check directory exists, else creates it 
    check_dir_exists(filename)
        
    # store experiment details in a dictionary 
    new_result = {'experiment_name': experiment_name, 'hyperparams': hyperparams, 'results': results, 
                  'runtime': runtime, 'dt_created': dt_created}
    
    # if log already exists, append to log 
    try: 
        results_log = pkl.load(open(filename, "rb"))
        results_log.append(new_result)

    # if log doesn't exists, initialize first result as the log 
    except (OSError, IOError) as e:
        results_log = [new_result]
    
    # save to pickle 
    pkl.dump(results_log, open(filename, "wb"))    

In [None]:
def load_experiment_log(experiment_name=None, filename=RESULTS_LOG): 
    """ Loads experiment log, with option to filter for a specific experiment_name """
    
    results_log = pkl.load(open(filename, "rb"))
    
    if experiment_name is not None: 
        results_log = [r for r in results_log if r['experiment_name'] == experiment_name]
        
    return results_log

In [None]:
def inspect_model(model, data_split, train_loader_, dev_loader_, batch=0, num_samples=5): 
    # NEW 11/27 
    """ Use the model and output translates for first num_samples in chosen batch in chosen loader """
    
    # set loader based on data_split choice 
    if data_split == 'train': 
        loader = train_loader_ 
    elif data_split == 'val': 
        loader = dev_loader_ 
        
    for i, (src_idxs, targ_idxs, src_lens, targ_lens) in enumerate(loader):
        if i == batch: 
            src_idxs = src_idxs[:num_samples, :]
            targ_idxs = targ_idxs[:num_samples, :]
            src_lens = src_lens[:num_samples]
            targ_lens = targ_lens[:num_samples]              
            output, hypotheses = model(src_idxs, targ_idxs, src_lens, targ_lens, teacher_forcing_ratio=0)
            
            if data_split == 'train': 
                print("Inspecting model on training data...")
            elif data_split == 'val': 
                print("Inspecting model on validation data...")
                
            print("REFERENCE TRANSLATION: {}".format(tensor2corpus(targ_idxs, vocab[TARG_LANG]['id2token'])))
            print("MODEL TRANSLATION: {}".format(tensor2corpus(torch.cat([hypotheses], dim=0), vocab[TARG_LANG]['id2token'])))
            break 
        else: 
            pass 

In [None]:
def train_and_eval(model, id2token, learning_rate, num_epochs, 
                   print_intermediate, save_checkpoint, model_name, lazy_eval, lazy_train, inspect): 
    
    # UPDATED 11/27: Added options to lazy_eval (skip eval on training data), lazy_train (overfit on 1 mini-batch), 
    # and inspect (print sentences)
    
    if lazy_train: 
        train_loader_ = train_loader_limited 
        dev_loader_ = dev_loader_limited 
    else: 
        train_loader_ = train_loader
        dev_loader_ = dev_loader      
    
    # initialize optimizer and criterion 
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss(ignore_index=RESERVED_TOKENS['<PAD>'])
    results = [] 
    
    # loop through train data in batches and train 
    for epoch in range(num_epochs): 
        train_loss = 0 
        for batch, (src_idxs, targ_idxs, src_lens, targ_lens) in enumerate(train_loader_):
            model.train()
            optimizer.zero_grad()
            final_outputs, hypotheses = model(src_idxs, targ_idxs, src_lens, targ_lens, teacher_forcing_ratio=0.5) 
            loss = criterion(final_outputs[1:].view(-1, TARG_VOCAB_SIZE), targ_idxs[:,1:].contiguous().view(-1))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
            optimizer.step()
            
            if batch % 100 == 0 or ((epoch==num_epochs-1) & (batch==len(train_loader_)-1)):
                result = {} 
                result['epoch'] = epoch + batch / len(train_loader_) 
                result['val_loss'], result['val_bleu'], val_hypotheses = evaluate(
                    model, dev_loader_, id2token, teacher_forcing_ratio=1)
                if lazy_eval: 
                    # eval on full train set is very expensive 
                    result['train_loss'], result['train_bleu'], train_hypotheses = 0, 0, None
                else: 
                    result['train_loss'], result['train_bleu'], train_hypotheses = evaluate(
                        model, train_loader_, id2token, teacher_forcing_ratio=1)
                
                results.append(result)
                
                if print_intermediate: 
                    print('Epoch: {:.2f}, Train Loss: {:.2f}, Val Loss: {:.2f}, Train BLEU: {:.2f}, Val BLEU: {:.2f}'\
                          .format(result['epoch'], result['train_loss'], result['val_loss'], 
                                  result['train_bleu'], result['val_bleu']))
                    
                if inspect: 
                    inspect_model(model, 'train', train_loader_, dev_loader_)
                    inspect_model(model, 'val', train_loader_, dev_loader_)
                    
                if save_checkpoint: 
                    if result['val_loss'] == pd.DataFrame.from_dict(results)['val_loss'].min(): 
                        checkpoint_fp = 'model_checkpoints/{}.pth.tar'.format(model_name)
                        check_dir_exists(filename=checkpoint_fp)
                        torch.save(model.state_dict(), checkpoint_fp)
                
    return results 

In [None]:
def run_experiment(model_type, num_epochs=10, learning_rate=0.0005, num_layers=2, enc_hidden_dim=300, 
                   dec_hidden_dim=2*300, experiment_name='NA', model_name='NA', inspect=True, lazy_eval=True, 
                   lazy_train=False, save_to_log=True, save_checkpoint=False, print_summary=True, print_intermediate=True):  
    
    # UPDATED 11/27: Added options to lazy_eval, lazy_train, and inspect
    
    """ Wraps all processing, training and evaluation steps in a function to facilitate hyperparam tuning. 
        Note that the function takes as input tokenized data rather than raw data since there's significant 
        lag time in generating tokens.  
    """
    
    start_time = time.time() 
    
    # TODO: try dropout and optimization algorithms. for now use as default: 
    optimizer = 'Adam' 
    enc_dropout = 0 
    dec_dropout = 0 
    
    # instantiate model and optimizer 
    if model_type == 'without_attention': 
        encoder = EncoderRNN(enc_hidden_dim=enc_hidden_dim, num_layers=num_layers, 
                             pretrained_word2vec=get_pretrained_emb(vocab[SRC_LANG]['word2vec'], vocab[SRC_LANG]['token2id']))
        decoder = DecoderRNN(dec_hidden_dim=dec_hidden_dim, enc_hidden_dim=enc_hidden_dim, num_layers=num_layers, 
                             pretrained_word2vec=get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id']))
        model = EncoderDecoder(encoder, decoder, vocab[TARG_LANG]['token2id']) 
        
    elif model_type == 'attention_bahdanau': 
        encoder = EncoderRNN(enc_hidden_dim=enc_hidden_dim, num_layers=num_layers, 
                             pretrained_word2vec=get_pretrained_emb(vocab[SRC_LANG]['word2vec'], vocab[SRC_LANG]['token2id']))
        decoder = DecoderAttnRNN(dec_hidden_dim=dec_hidden_dim, enc_hidden_dim=enc_hidden_dim, num_layers=num_layers,
                                 pretrained_word2vec=get_pretrained_emb(vocab[TARG_LANG]['word2vec'], vocab[TARG_LANG]['token2id']))
        model = EncoderDecoder(encoder, decoder, vocab[TARG_LANG]['token2id'])
        
    else: 
        raise ValueError("Invalid model_type. Must be either 'without_attention' or 'attention_bahdanau'")
        
    # train and evaluate 
    results = train_and_eval(model, id2token=vocab[TARG_LANG]['id2token'], 
                             learning_rate=learning_rate, num_epochs=num_epochs, 
                             print_intermediate=print_intermediate, save_checkpoint=save_checkpoint, 
                             model_name=model_name, lazy_eval=lazy_eval, lazy_train=lazy_train, inspect=inspect)
    
    # store, print, and save results 
    runtime = (time.time() - start_time) / 60 
    dt_created = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    hyperparams = {'model_type': model_type, 'num_epochs': num_epochs, 'learning_rate': learning_rate, 
                   'enc_hidden_dim': enc_hidden_dim, 'dec_hidden_dim': dec_hidden_dim, 'num_layers': num_layers, 
                   'optimizer': optimizer, 'enc_dropout': enc_dropout, 'dec_dropout': dec_dropout, 
                   'batch_size': BATCH_SIZE, 'src_lang': SRC_LANG, 'targ_lang': TARG_LANG, 
                   'src_vocab_size': SRC_VOCAB_SIZE, 'targ_vocab_size': TARG_VOCAB_SIZE, 
                   'src_max_sentence_len': SRC_MAX_SENTENCE_LEN, 'targ_max_sentence_len': TARG_MAX_SENTENCE_LEN}  
        
    if save_to_log: 
        append_to_log(hyperparams, results, runtime, experiment_name, dt_created)
    if print_summary: 
        print("Experiment completed in {} minutes with {:.2f} validation loss and {:.2f} validation BLEU.".format(
            int(runtime), pd.DataFrame.from_dict(results)['val_loss'].min(), 
            pd.DataFrame.from_dict(results)['val_bleu'].max()))
        
    return results, hyperparams, runtime, model

In [None]:
# helper methods to summarize, evaluate, and plot results 

def summarize_results(results_log): 
    """ Summarizes results_log (list) into a dataframe, splitting hyperparameters string into columns, and reducing 
        the val_acc dict into the best validation accuracy obtained amongst all the epochs logged """
    results_df = pd.DataFrame.from_dict(results_log)
    results_df = pd.concat([results_df, results_df['hyperparams'].apply(pd.Series)], axis=1)
    results_df['val_loss'] = results_df['results'].apply(lambda d: pd.DataFrame.from_dict(d)['val_loss'].min())
    return results_df.sort_values(by='val_loss', ascending=True) 

def plot_multiple_learning_curves(results_df, plot_variable, figsize=(8, 5), legend_loc='best'):
    """ Plots learning curves of MULTIPLE experiments, includes only validation accuracy """
    plt.figure(figsize=figsize)
    for index, row in results_df.iterrows():
        val_loss_hist = pd.DataFrame.from_dict(row['results']).set_index('epoch')['val_loss'] 
        plt.plot(val_loss_hist, label="{} ({}%)".format(row[plot_variable], val_loss_hist.max()))
    plt.legend(title=plot_variable, loc=legend_loc)    
    plt.ylabel('Validation Loss')
    plt.xlabel('Epoch')

def plot_single_learning_curve(results, figsize=(8, 5)): 
    """ Plots learning curve of a SINGLE experiment, includes both train and validation accuracy """
    results_df = pd.DataFrame.from_dict(results)
    results_df = results_df.set_index('epoch')
    results_df.plot(figsize=figsize)
    plt.ylabel('Validation Lossy')
    plt.xlabel('Epoch')

In [None]:
# helper function to count parameters 
def count_parameters(model): 
    all_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return all_params, trainable_params

In [None]:
results, hyperparams, runtime, model = \
    run_experiment(model_type='attention_bahdanau', num_epochs=2000, learning_rate=0.0005,
                   num_layers=2, enc_hidden_dim=300, dec_hidden_dim=2*300, experiment_name='test_run', 
                   model_name='test_run', inspect=False, lazy_eval=False, lazy_train=True, 
                   save_to_log=True, save_checkpoint=True, print_summary=True, print_intermediate=True)

In [None]:
inspect_model(model, 'train', train_loader_limited, dev_loader_limited)

In [None]:
count_parameters(model)

In [None]:
all_results = summarize_results(load_experiment_log(experiment_name='test_run', filename=RESULTS_LOG))
all_results

In [None]:
# TODO: Split BLEU and Loss 
plot_single_learning_curve(all_results.iloc[1]['results'])