In [None]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

In [None]:
# NOTE(review): the cells below call NMTModel(...) and NMTSampler(...) and use
# NMTDataset.load_dataset_and_* as if these names were classes. `import X as X`
# binds the *module*, which is not callable. This assumes each module defines a
# class with the same name as the module -- TODO confirm against the module files.
from NMTDataset import NMTDataset
import NMTEncoder as NMTEncoder
import NMTDecoder as NMTDecoder
from NMTModel import NMTModel
from NMTSampler import NMTSampler
import NMTVectorizer as NMTVectorizer
import SequenceVocabulary as SequenceVocabulary

In [None]:
from training_utils import *

In [None]:
# Generating minibatches for NMT

def generate_nmt_batches(dataset, batch_size, shuffle=True,
                         drop_last=True, device="cpu"):
    """Yield NMT minibatches ordered by descending source length.

    Wraps a PyTorch ``DataLoader`` built over ``dataset``. For every batch the
    rows of each tensor are reordered so that the entries with the largest
    ``'x_source_length'`` come first, and each tensor is moved to ``device``.

    Args:
        dataset: dataset whose items are dicts of tensors, including the key
            ``'x_source_length'``
        batch_size (int): number of items per batch
        shuffle (bool): forwarded to the ``DataLoader``
        drop_last (bool): forwarded to the ``DataLoader``
        device: target device for the yielded tensors
    Yields:
        dict: same keys as the DataLoader batch, rows sorted and on ``device``
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        source_lengths = batch['x_source_length'].numpy()
        # argsort is ascending; reversing it puts the longest source first
        row_order = source_lengths.argsort()[::-1].tolist()

        yield {key: values[row_order].to(device)
               for key, values in batch.items()}

In [None]:

def verbose_attention(encoder_state_vectors, query_vector):
    """A descriptive version of the neural attention mechanism

    Args:
        encoder_state_vectors (torch.Tensor): 3dim tensor from bi-GRU in encoder,
            unpacked below as (batch_size, num_vectors, vector_size)
        query_vector (torch.Tensor): hidden state in decoder GRU; must be
            viewable as (batch_size, 1, vector_size)
    Returns:
        tuple of
            context_vectors (torch.Tensor): (batch_size, vector_size) weighted
                sum of the encoder state vectors
            vector_probabilities (torch.Tensor): (batch_size, num_vectors)
                softmax of the scores over the encoder positions
            vector_scores (torch.Tensor): (batch_size, num_vectors) dot product
                of each encoder state vector with the query
    """
    batch_size, num_vectors, vector_size = encoder_state_vectors.size()

    # Dot product of every encoder state with the query: broadcast the query
    # over the num_vectors axis, then reduce over the feature axis.
    vector_scores = torch.sum(
        encoder_state_vectors * query_vector.view(batch_size, 1, vector_size),
        dim=2)

    vector_probabilities = F.softmax(vector_scores, dim=1)

    # Trailing singleton axis lets the weights broadcast over the feature axis.
    weighted_vectors = (encoder_state_vectors *
                        vector_probabilities.view(batch_size, num_vectors, 1))

    context_vectors = torch.sum(weighted_vectors, dim=1)

    return context_vectors, vector_probabilities, vector_scores

In [None]:

def terse_attention(encoder_state_vectors, query_vector):
    """A shorter and more optimized version of the neural attention mechanism

    Args:
        encoder_state_vectors (torch.Tensor): 3dim tensor from bi-GRU in encoder,
            used here as (batch_size, num_vectors, vector_size)
        query_vector (torch.Tensor): hidden state, used here as
            (batch_size, vector_size)
    Returns:
        tuple of
            context_vectors (torch.Tensor): (batch_size, vector_size)
            vector_probabilities (torch.Tensor): (batch_size, num_vectors)
    """
    # (B, N, D) @ (B, D, 1) -> (B, N, 1). Squeeze only the trailing axis so a
    # batch of size 1 keeps its batch dimension.
    vector_scores = torch.matmul(encoder_state_vectors,
                                 query_vector.unsqueeze(dim=2)).squeeze(dim=2)

    vector_probabilities = F.softmax(vector_scores, dim=-1)

    # (B, D, N) @ (B, N, 1) -> (B, D, 1) -> (B, D)
    context_vectors = torch.matmul(encoder_state_vectors.transpose(-2, -1),
                                   vector_probabilities.unsqueeze(dim=2)).squeeze(dim=2)

    return context_vectors, vector_probabilities

In [None]:
# All tunable settings for this notebook live in one Namespace.
args = Namespace(
    dataset_csv="data/nmt/simplest_eng_fra.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/ch8/nmt_luong_sampling",
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
    cuda=True,
    seed=1337,
    learning_rate=5e-4,
    batch_size=32,
    num_epochs=100,
    early_stopping_criteria=5,
    source_embedding_size=24,
    target_embedding_size=24,
    encoding_size=32,
    catch_keyboard_interrupt=True
)


# Optionally place the vectorizer and model files inside save_dir.
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)

    print("Expand filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

# The steps below must run regardless of expand_filepaths_to_save_dir:
# later cells read args.device unconditionally.

# check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")

print("Using CUDA: {}".format(args.cuda))

# set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

In [None]:
# Reuse a saved vectorizer only when reloading was requested and the file exists;
# otherwise build a fresh one from the CSV and save it to vectorizer_file.
resume_from_checkpoint = (args.reload_from_files
                          and os.path.exists(args.vectorizer_file))

if not resume_from_checkpoint:
    # create dataset and vectorizer
    dataset = NMTDataset.load_dataset_and_make_vectorizer(args.dataset_csv)
    dataset.save_vectorizer(args.vectorizer_file)
else:
    # training from a checkpoint
    dataset = NMTDataset.load_dataset_and_load_vectorizer(args.dataset_csv,
                                                          args.vectorizer_file)

vectorizer = dataset.get_vectorizer()

In [None]:
# Build the translation model; vocabulary sizes and the target BOS index come
# from the vectorizer, layer sizes from args.
model = NMTModel(source_vocab_size=len(vectorizer.source_vocab),
                 source_embedding_size=args.source_embedding_size,
                 target_vocab_size=len(vectorizer.target_vocab),
                 target_embedding_size=args.target_embedding_size,
                 encoding_size=args.encoding_size,
                 target_bos_index=vectorizer.target_vocab.begin_seq_index)


# Restore saved weights only when reloading was requested and the file exists.
if args.reload_from_files and os.path.exists(args.model_state_file):
    # map_location="cpu" lets a checkpoint written on a GPU machine load on a
    # CPU-only one; the model is moved to args.device in the next cell.
    model.load_state_dict(torch.load(args.model_state_file, map_location="cpu"))
    print("Reload model")

else:
    print("New Model")

In [None]:
model = model.to(args.device)

optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
# Halves the learning rate (factor=0.5) when the value passed to
# scheduler.step() -- the validation loss, see below -- stops decreasing.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)

mask_index = vectorizer.target_vocab.mask_index
train_state = make_train_state(args)

epoch_bar = tqdm_notebook(desc='training routine',
                          total=args.num_epochs,
                          position=0)

# The active split must be selected before asking for its batch count.
dataset.set_split('train')
train_bar = tqdm_notebook(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size),
                          position=1,
                          leave=True)

dataset.set_split('val')
val_bar = tqdm_notebook(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size),
                        position=1,
                        leave=True)
try:
    for epoch_index in range(args.num_epochs):
        # Grows linearly with the epoch, starting at 20 / num_epochs; it is
        # forwarded to the model as `sample_probability`.
        sample_probability = (20 + epoch_index) / args.num_epochs

        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # Setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_nmt_batches(dataset,
                                               batch_size=args.batch_size,
                                               device=args.device)

        running_loss = 0.0
        running_acc = 0.0
        model.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:
            # step 1: zero the gradients
            optimizer.zero_grad()

            # step 2: compute the outputs
            y_pred = model(batch_dict['x_source'],
                           batch_dict['x_source_length'],
                           batch_dict['x_target'],
                           sample_probability=sample_probability)

            # step 3: compute the loss
            loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

            # step 4: use the loss to produce gradients
            loss.backward()

            # step 5: use the optimizer to take a gradient step
            optimizer.step()

            # compute the running loss and running accuracy
            # (incremental mean over the batches seen so far)
            running_loss += (loss.item() - running_loss) / (batch_index + 1)

            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss,
                                  acc=running_acc,
                                  epoch=epoch_index)

            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # Setup: batch generator, set loss and acc to 0, set eval mode on
        dataset.set_split('val')
        batch_generator = generate_nmt_batches(dataset,
                                               batch_size=args.batch_size,
                                               device=args.device)

        running_loss = 0.0
        running_acc = 0.0
        model.eval()

        # No gradients are needed for validation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(batch_generator):
                # compute the outputs
                y_pred = model(batch_dict['x_source'],
                               batch_dict['x_source_length'],
                               batch_dict['x_target'],
                               sample_probability=sample_probability)

                # compute the loss
                loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

                # compute the running loss and accuracy
                running_loss += (loss.item() - running_loss) / (batch_index + 1)

                acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
                running_acc += (acc_t - running_acc) / (batch_index + 1)

                # update bar
                val_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
                val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args, model=model, train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        # Reset the per-split bars so they can be reused for the next epoch.
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.set_postfix(best_val=train_state['early_stopping_best_val'])
        epoch_bar.update()

except KeyboardInterrupt:
    # Allow the run to be interrupted by hand without a traceback.
    print("Exiting loop")


In [None]:
from nltk.translate import bleu_score
import seaborn as sns
import matplotlib.pyplot as plt

# Smoothing helper from nltk.translate.bleu_score (the class is SmoothingFunction).
chencherry = bleu_score.SmoothingFunction()

In [None]:
def sentence_from_indices(indices, vocab, strict=True, return_string=True):
    """Convert a sequence of vocabulary indices into tokens.

    Args:
        indices (iterable of int): vocabulary indices to decode
        vocab: object exposing ``begin_seq_index``, ``end_seq_index`` and
            ``lookup_index(index)``
        strict (bool): when True, begin-of-sequence indices are skipped and
            decoding stops at the first end-of-sequence index; when False,
            every index is looked up
        return_string (bool): when True, return the tokens joined by single
            spaces; otherwise return the list of tokens
    Returns:
        str or list: the decoded sentence
    """
    tokens = []
    for index in indices:
        if strict and index == vocab.begin_seq_index:
            continue
        if strict and index == vocab.end_seq_index:
            break
        tokens.append(vocab.lookup_index(index))

    return " ".join(tokens) if return_string else tokens
     

In [None]:
# Switch the model to eval mode on the target device and wrap it in a sampler.
model = model.eval().to(args.device)
sampler = NMTSampler(vectorizer, model)

# Run the sampler over every batch of the test split and collect one result
# per batch row.
dataset.set_split('test')
batch_generator = generate_nmt_batches(dataset,
                                       batch_size=args.batch_size,
                                       device=args.device)

test_results = []
for batch_dict in batch_generator:
    sampler.apply_to_batch(batch_dict)
    test_results += [sampler.get_ith_item(i, False)
                     for i in range(args.batch_size)]

In [None]:
# Extract the BLEU-4 scores once and reuse them for the histogram and the stats.
bleu4_scores = [r['bleu-4'] for r in test_results]
plt.hist(bleu4_scores, bins=100)
# Last expression of the cell: (mean, median) of the BLEU-4 scores.
np.mean(bleu4_scores), np.median(bleu4_scores)

In [None]:
# Take a single batch from the validation split for inspection.
dataset.set_split('val')
batch_generator = generate_nmt_batches(dataset,
                                       batch_size=args.batch_size,
                                       device=args.device)

batch_dict = next(batch_generator)

model = model.eval().to(args.device)
# The sampler takes the vectorizer and the model as two separate arguments,
# matching the NMTSampler(vectorizer, model) call in the test-split cell.
sampler = NMTSampler(vectorizer, model)
sampler.apply_to_batch(batch_dict)

In [None]:
# One sampler result per row of the batch last passed to apply_to_batch.
all_results = [sampler.get_ith_item(i, False) for i in range(args.batch_size)]
    

In [None]:
# Keep only the samples whose BLEU-4 score exceeds 0.5, then show how many remain.
top_results = list(filter(lambda result: result['bleu-4'] > 0.5, all_results))
len(top_results)

In [None]:
# Draw one attention heatmap per high-scoring sample: source tokens on the
# y-axis, sampled target tokens on the x-axis.
for sample in top_results:
    plt.figure()
    target_len = len(sample['sampled'])
    source_len = len(sample['source'])

    # +2 columns for the <BOS>/<EOS> labels added around the source tokens below;
    # transpose so rows correspond to source positions.
    attention_matrix = sample['attention'][:target_len, :source_len+2].transpose()
    ax = sns.heatmap(attention_matrix, center=0.0)
    ylabs = ["<BOS>"]+sample['source']+["<EOS>"]

    ax.set_yticklabels(ylabs, rotation=0)
    ax.set_xticklabels(sample['sampled'], rotation=90)
    ax.set_xlabel("Target Sentence")
    ax.set_ylabel("Source Sentence\n\n")


In [None]:
def get_source_sentence(vectorizer, batch_dict, index):
    """Decode row ``index`` of ``batch_dict['x_source']`` with the source vocabulary."""
    source_indices = batch_dict['x_source'][index].cpu().numpy()
    return sentence_from_indices(source_indices, vectorizer.source_vocab)

def get_true_sentence(vectorizer, batch_dict, index):
    """Decode row ``index`` of ``batch_dict['y_target']`` with the target vocabulary."""
    target_rows = batch_dict['y_target'].cpu().data.numpy()
    target_vocab = vectorizer.target_vocab
    return sentence_from_indices(target_rows[index], target_vocab)

def get_sampled_sentence(vectorizer, batch_dict, index):
    """Decode the model's prediction for row ``index`` of the batch.

    Runs the notebook-level ``model`` on the whole batch with
    ``sample_probability=1.0``, takes the highest-scoring index along
    dimension 2 of the output, and decodes row ``index`` with the target
    vocabulary.

    Args:
        vectorizer: object exposing ``target_vocab``
        batch_dict (dict): batch with keys ``'x_source'``,
            ``'x_source_length'`` and ``'x_target'``
        index (int): row of the batch to decode
    Returns:
        str: the decoded sentence
    """
    # Inference only: avoid building an autograd graph for the forward pass.
    with torch.no_grad():
        y_pred = model(x_source=batch_dict['x_source'],
                       x_source_lengths=batch_dict['x_source_length'],
                       target_sequence=batch_dict['x_target'],
                       sample_probability=1.0)
    # torch.max(..., dim=2) returns (values, indices); keep the indices.
    predicted_indices = torch.max(y_pred, dim=2)[1].cpu().numpy()
    return sentence_from_indices(predicted_indices[index], vectorizer.target_vocab)

def get_all_sentences(vectorizer, batch_dict, index):
    """Return the source, ground-truth and sampled sentences for one batch row.

    Returns:
        dict: keys ``"source"``, ``"truth"`` and ``"sampled"``, each mapped to
        the result of the corresponding ``get_*_sentence`` helper
    """
    decoders = (
        ("source", get_source_sentence),
        ("truth", get_true_sentence),
        ("sampled", get_sampled_sentence),
    )
    return {label: decode(vectorizer, batch_dict, index)
            for label, decode in decoders}

def sentence_from_indices(indices, vocab, strict=True, return_string=True):
    """Convert a sequence of vocabulary indices into tokens.

    This redefinition replaces the function of the same name defined earlier
    in the notebook, so it accepts the same ``return_string`` argument to keep
    calls written against that earlier signature working.

    Args:
        indices (iterable of int): vocabulary indices to decode
        vocab: object exposing ``begin_seq_index``, ``end_seq_index`` and
            ``lookup_index(index)``
        strict (bool): when True, begin-of-sequence indices are skipped and
            decoding stops at the first end-of-sequence index; when False,
            every index is looked up
        return_string (bool): when True, return the tokens joined by single
            spaces; otherwise return the list of tokens
    Returns:
        str or list: the decoded sentence
    """
    tokens = []
    for index in indices:
        if strict and index == vocab.begin_seq_index:
            continue
        if strict and index == vocab.end_seq_index:
            break
        tokens.append(vocab.lookup_index(index))

    return " ".join(tokens) if return_string else tokens

# Decode source / truth / sampled sentences for row 1 of the current batch
# and display them as the cell output.
results = get_all_sentences(vectorizer=vectorizer,
                            batch_dict=batch_dict,
                            index=1)
results
