In [1]:
import numpy as np
from collections import Counter
import string
import torch
from torch.utils.data import Dataset
import pandas as pd
import torch.nn as nn
from torch.utils.data import DataLoader
from argparse import Namespace
import torch.optim as optim
from tqdm.notebook import tqdm
import torch.nn.functional as F
import re

## Sequence Vocab

In [2]:
class SequenceVocabulary(object):
    """Two-way mapping between tokens and integer indices.

    Besides user tokens the vocabulary always holds a mask (padding) token,
    begin/end-of-sequence tokens and, optionally, an unknown-token.

    In a freshly created vocabulary the special tokens are registered in the
    order mask, unk, begin, end, so the mask token gets index 0.  The
    embedding layers in this notebook are built with ``padding_idx=0`` and
    ``NMTVectorizer._vectorize`` defaults ``mask_index`` to 0, so the mask
    token has to own index 0 for padding to be treated as padding.
    """

    def __init__(self, token_to_idx=None, mask_token="<MASK>", add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): optional pre-existing token -> index mapping;
                indices already present in it are kept as they are
            mask_token (str): token used for padding positions
            add_unk (bool): whether to register an unknown-token
            unk_token (str): the unknown-token itself
        """
        # Copy so that adding tokens never mutates the caller's dictionary.
        self._token_to_idx = dict(token_to_idx) if token_to_idx is not None else {}

        self._idx_to_token = {
            idx: token for token, idx in self._token_to_idx.items()
        }

        self._add_unk = add_unk
        self._unk_token = unk_token
        self._mask_token = mask_token
        self._begin_of_seq_token = "<BEGIN-OF-SEQUENCE>"
        self._end_of_seq_token = "<END-OF-SEQUENCE>"

        # The mask token is registered first so that it receives index 0 in a
        # fresh vocabulary (see class docstring).
        self.mask_index = self.add_token(mask_token)

        # -1 signals "no unknown-token"; lookup_token then raises on a miss.
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

        self.begin_seq_index = self.add_token(self._begin_of_seq_token)
        self.end_seq_index = self.add_token(self._end_of_seq_token)

    def to_serializeable(self):
        """Return a dictionary that can be passed back to `from_serializeable`."""
        return {
            'token_to_idx': dict(self._token_to_idx),
            'mask_token': self._mask_token,
            'add_unk': self._add_unk,
            'unk_token': self._unk_token
        }

    @classmethod
    def from_serializeable(cls, contents):
        """Create a vocabulary from a dictionary produced by `to_serializeable`."""
        return cls(**contents)

    def add_token(self, token):
        """Add a token (if it is new) and return its index."""
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Return the index of `token`.

        Unknown tokens map to `unk_index` when an unknown-token was
        registered; otherwise a KeyError is raised.
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token stored at `index`.

        Raises:
            KeyError: if `index` is not in the vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index %d is not in the vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

## Vectorizer

In [4]:
class NMTVectorizer(object):
    """Turns source/target sentences into fixed-length integer vectors."""

    def __init__(self, source_vocab, target_vocab,
                 max_source_length, max_target_length):
        """
        Args:
            source_vocab: vocabulary for the source language
            target_vocab: vocabulary for the target language
            max_source_length (int): longest source sentence, in tokens
            max_target_length (int): longest target sentence, in tokens
        """
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    @classmethod
    def from_dataframe(cls, lang_df):
        """Build both vocabularies and the maximum lengths from a dataframe.

        Args:
            lang_df: dataframe with `source_language` and `target_language`
                columns holding space-separated sentences
        Returns:
            an NMTVectorizer
        """
        source_vocab = SequenceVocabulary()
        target_vocab = SequenceVocabulary()

        max_source_length, max_target_length = 0, 0

        # Iterate over the two columns directly: DataFrame.iterrows() yields
        # (index, row) tuples, which cannot be indexed by column name.
        for source_text, target_text in zip(lang_df["source_language"],
                                            lang_df["target_language"]):
            source_tokens = source_text.split(" ")
            max_source_length = max(max_source_length, len(source_tokens))
            for token in source_tokens:
                source_vocab.add_token(token)

            target_tokens = target_text.split(" ")
            max_target_length = max(max_target_length, len(target_tokens))
            for token in target_tokens:
                target_vocab.add_token(token)

        return cls(source_vocab, target_vocab,
                   max_source_length, max_target_length)

    def _vectorize(self, indices, vector_length=-1, mask_index=0):
        """Copy `indices` into an int64 vector padded with `mask_index`.

        Args:
            indices (list): token indices
            vector_length (int): length of the result; a negative value means
                "exactly as long as `indices`"
            mask_index (int): value used for the padding positions
        Returns:
            numpy.ndarray of shape (vector_length,)
        """
        if vector_length < 0:
            vector_length = len(indices)

        # Sequences longer than the requested length are truncated instead of
        # raising; this happens when the maxima were measured on one split
        # and a longer sentence shows up in another.
        indices = indices[:vector_length]

        vector = np.full(vector_length, mask_index, dtype=np.int64)
        vector[:len(indices)] = indices
        return vector

    def _get_source_indices(self, source_text):
        """Return the source token indices wrapped in begin/end-of-sequence."""
        indices = [self.source_vocab.begin_seq_index]
        indices.extend(self.source_vocab.lookup_token(token) for token in
                       source_text.split(" "))
        indices.append(self.source_vocab.end_seq_index)

        return indices

    def _get_target_indices(self, target_text):
        """Return the decoder input and decoder output index lists.

        Returns:
            tuple (x_indices, y_indices): x starts with begin-of-sequence,
            y ends with end-of-sequence; both have len(tokens) + 1 entries
        """
        indices = [self.target_vocab.lookup_token(token)
                   for token in target_text.split(" ")]

        x_indices = [self.target_vocab.begin_seq_index] + indices
        y_indices = indices + [self.target_vocab.end_seq_index]

        return x_indices, y_indices

    def vectorize(self, source_text, target_text, use_dataset_max_length=True):
        """Vectorize one source/target sentence pair.

        Args:
            source_text (str): space-separated source sentence
            target_text (str): space-separated target sentence
            use_dataset_max_length (bool): pad (or truncate) to the lengths
                stored on the vectorizer instead of the sentence's own length
        Returns:
            dict with keys `source_vector`, `target_x_vector`,
            `target_y_vector` (numpy int64 vectors) and `source_length`
            (number of non-padding positions in `source_vector`)
        """
        source_length = -1
        target_length = -1

        if use_dataset_max_length:
            # +2 for begin/end tokens on the source, +1 for the single
            # begin (x) or end (y) token on the target.
            source_length = self.max_source_length + 2
            target_length = self.max_target_length + 1

        source_indices = self._get_source_indices(source_text)
        source_vector = self._vectorize(source_indices,
                                        source_length,
                                        mask_index=self.source_vocab.mask_index)

        target_x_indices, target_y_indices = self._get_target_indices(target_text)

        target_x_vector = self._vectorize(target_x_indices,
                                          target_length,
                                          self.target_vocab.mask_index)
        target_y_vector = self._vectorize(target_y_indices,
                                          target_length,
                                          self.target_vocab.mask_index)
        return {"source_vector": source_vector,
                "target_x_vector": target_x_vector,
                "target_y_vector": target_y_vector,
                # Clamp so the reported length never exceeds the vector when
                # the source had to be truncated.
                "source_length": min(len(source_indices), len(source_vector))}

## Dataset

In [10]:
class NMTDataset(Dataset):
    """PyTorch dataset over a dataframe of translation pairs.

    The dataframe must provide `source_language`, `target_language` and
    `split` columns; `split` holds one of 'train', 'val' or 'test'.
    """

    def __init__(self, text_df, vectorizer):
        """
        Args:
            text_df: dataframe with the translation pairs
            vectorizer: object whose `vectorize(source_text, target_text)`
                returns the dict consumed by `__getitem__`
        """
        self.text_df = text_df
        self._vectorizer = vectorizer

        # Longest source sentence in tokens, plus begin/end markers.
        self._max_seq_length = max(
            (len(text.split(" ")) for text in self.text_df["source_language"]),
            default=0) + 2

        self.train_df = self.text_df[self.text_df["split"] == 'train']
        self.train_size = len(self.train_df)

        self.val_df = self.text_df[self.text_df["split"] == 'val']
        self.val_size = len(self.val_df)

        self.test_df = self.text_df[self.text_df["split"] == 'test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.val_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, text_csv):
        """Load the csv and build a vectorizer from its training rows.

        Args:
            text_csv (str): path of the csv file
        Returns:
            an NMTDataset
        """
        text_df = pd.read_csv(text_csv)
        # Only the training split feeds the vocabulary so that val/test
        # tokens stay unseen.
        train_text_df = text_df[text_df["split"] == 'train']
        return cls(text_df,
                   NMTVectorizer.from_dataframe(train_text_df))

    def get_vectorizer(self):
        """Return the vectorizer used by this dataset."""
        return self._vectorizer

    def set_split(self, split='train'):
        """Select which split ('train', 'val' or 'test') the dataset serves."""
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """Return the vectorized data point at `index` of the active split.

        Args:
            index (int): position of the row inside the active split
        Returns:
            dict with `x_source`, `x_target`, `y_target` and
            `x_source_length`
        """
        row = self._target_df.iloc[index]

        # Vectorize the selected row only (not a whole split's columns).
        data_dict = self._vectorizer.vectorize(row['source_language'],
                                               row['target_language'])

        return {
            'x_source': data_dict["source_vector"],
            'x_target': data_dict["target_x_vector"],
            'y_target': data_dict["target_y_vector"],
            'x_source_length': data_dict["source_length"]
        }

    def get_num_batches(self, batch_size):
        """Return the number of full batches in the active split."""
        return len(self) // batch_size


def generate_nmt_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    """Yield batches whose rows are ordered by decreasing source length.

    A DataLoader is wrapped around `dataset`; every batch it produces is
    re-ordered using the values under 'x_source_length' (largest first) and
    each entry of the batch is moved to `device`.

    Args:
        dataset: dataset handed to the DataLoader
        batch_size (int): number of items per batch
        shuffle (bool): forwarded to the DataLoader
        drop_last (bool): forwarded to the DataLoader
        device: target device for every tensor in the batch
    Yields:
        dict mapping each batch key to its re-ordered tensor on `device`
    """
    loader = DataLoader(dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        # argsort is ascending, so reverse it to get longest-first order
        order = batch['x_source_length'].numpy().argsort()[::-1].tolist()
        yield {key: batch[key][order].to(device) for key in batch}

## Encoder

In [6]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class NMTEncoder(nn.Module):
    """Embeds the source sequence and encodes it with a bidirectional GRU."""

    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size):
        """
        Args:
            num_embeddings (int): size of the source vocabulary
            embedding_size (int): size of the embedding vectors
            rnn_hidden_size (int): hidden size of each GRU direction
        """
        super(NMTEncoder, self).__init__()

        # padding_idx=0: the embedding at index 0 is all zeros and receives
        # no gradient.  NOTE(review): this assumes the source vocabulary's
        # mask index is 0 - confirm against the vocabulary in use.
        self.source_embedding = nn.Embedding(num_embeddings, embedding_size, padding_idx=0)
        self.birnn = nn.GRU(embedding_size, rnn_hidden_size, bidirectional=True, batch_first=True)

    def forward(self, x_source, x_lengths):
        """Encode a batch of source sequences.

        Args:
            x_source (torch.Tensor): token indices, shape (batch, seq_len)
            x_lengths (torch.Tensor): length of every sequence in the batch;
                pack_padded_sequence is called with its default
                enforce_sorted=True, so the batch must be ordered by
                decreasing length
        Returns:
            tuple (x_unpacked, x_birnn_h):
                x_unpacked: (batch, longest_length, 2 * rnn_hidden_size)
                x_birnn_h: (batch, 2 * rnn_hidden_size), the final hidden
                    states of both directions concatenated
        """
        x_embedded = self.source_embedding(x_source)

        # pack_padded_sequence needs the lengths on the CPU
        x_lengths = x_lengths.detach().cpu()
        x_packed = pack_padded_sequence(x_embedded, x_lengths, batch_first=True)

        # x_birnn_h comes back as (num_directions, batch, hidden)
        x_birnn_out, x_birnn_h = self.birnn(x_packed)

        # -> (batch, num_directions, hidden) -> (batch, num_directions * hidden)
        x_birnn_h = x_birnn_h.permute(1, 0, 2)
        x_birnn_h = x_birnn_h.contiguous().view(x_birnn_h.size(0), -1)

        x_unpacked, _ = pad_packed_sequence(x_birnn_out, batch_first=True)
        return x_unpacked, x_birnn_h

## Decoder

In [7]:
def verbose_attention(encoder_state_vectors, query_vector):
    """Dot-product attention written out step by step.

    Args:
        encoder_state_vectors (torch.Tensor): 3-dimensional tensor of shape
            (batch_size, num_vectors, vector_size)
        query_vector (torch.Tensor): tensor that can be viewed as
            (batch_size, 1, vector_size)
    Returns:
        tuple (context_vectors, vector_probabilities):
            context_vectors: (batch_size, vector_size), the probability
                weighted sum of the encoder vectors
            vector_probabilities: (batch_size, num_vectors), softmax of the
                dot-product scores over the encoder positions
    """
    batch_size, num_vectors, vector_size = encoder_state_vectors.size()

    # score of each encoder vector = dot product with the query
    query = query_vector.view(batch_size, 1, vector_size)
    vector_scores = torch.sum(encoder_state_vectors * query, dim=2)

    vector_probabilities = torch.softmax(vector_scores, dim=1)

    # scale each encoder vector by its probability and add them up
    weights = vector_probabilities.view(batch_size, num_vectors, 1)
    context_vectors = torch.sum(encoder_state_vectors * weights, dim=1)

    return context_vectors, vector_probabilities

def terse_attention(encoder_state_vectors, query_vector):
    """Dot-product attention expressed with batched matrix products.

    Args:
        encoder_state_vectors (torch.Tensor): shape
            (batch_size, num_vectors, vector_size)
        query_vector (torch.Tensor): shape (batch_size, vector_size)
    Returns:
        tuple (context_vectors, vector_probabilities):
            context_vectors: (batch_size, vector_size)
            vector_probabilities: (batch_size, num_vectors)
    """
    # (B, N, D) @ (B, D, 1) -> (B, N, 1).  Only the trailing singleton is
    # squeezed so that a batch of size 1 keeps its batch dimension.
    vector_scores = torch.matmul(encoder_state_vectors,
                                 query_vector.unsqueeze(dim=2)).squeeze(dim=-1)

    # The probabilities come from the scores, one per encoder position.
    vector_probabilities = torch.softmax(vector_scores, dim=-1)

    # (B, D, N) @ (B, N, 1) -> (B, D, 1) -> (B, D)
    context_vectors = torch.matmul(encoder_state_vectors.transpose(-2, -1),
                                   vector_probabilities.unsqueeze(dim=2)).squeeze(dim=-1)

    return context_vectors, vector_probabilities

class NMTDecoder(nn.Module):
    """GRU-cell decoder that attends over the encoder states at every step."""

    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size, bos_index):
        """
        Args:
            num_embeddings (int): size of the target vocabulary; also the
                size of the prediction vector
            embedding_size (int): size of the target embedding vectors
            rnn_hidden_size (int): hidden size of the GRU cell
            bos_index (int): begin-of-sequence index of the target vocabulary
        """
        super(NMTDecoder, self).__init__()

        self._rnn_hidden_size = rnn_hidden_size
        # padding_idx=0: NOTE(review) assumes the target vocabulary's mask
        # index is 0 - confirm against the vocabulary in use.
        self.target_embedding = nn.Embedding(num_embeddings=num_embeddings,
                                             embedding_dim=embedding_size,
                                             padding_idx=0)

        # The cell input is the token embedding concatenated with the
        # previous context vector.
        self.gru_cell = nn.GRUCell(embedding_size + rnn_hidden_size, rnn_hidden_size)
        self.hidden_map = nn.Linear(rnn_hidden_size, rnn_hidden_size)

        # The classifier sees the context vector concatenated with h_t.
        self.classifier = nn.Linear(rnn_hidden_size*2, num_embeddings)
        self.bos_index = bos_index

    def _init_indices(self, batch_size):
        """returns the BOS index vector"""
        return torch.ones(batch_size, dtype=torch.int64) * self.bos_index

    def _init_context_vectors(self, batch_size):
        """returns a zeros vector for initializing the context"""
        return torch.zeros(batch_size, self._rnn_hidden_size)

    def forward(self, encoder_state, initial_hidden_state, target_sequence):
        """Decode `target_sequence` with teacher forcing.

        Args:
            encoder_state (torch.Tensor): encoder outputs,
                (batch, source_len, rnn_hidden_size)
            initial_hidden_state (torch.Tensor): (batch, rnn_hidden_size)
            target_sequence (torch.Tensor): decoder input token indices,
                (batch, target_len)
        Returns:
            torch.Tensor: prediction scores, (batch, target_len, num_embeddings)
        """
        # iterate over time steps: (batch, seq) -> (seq, batch)
        target_sequence = target_sequence.permute(1, 0)

        h_t = self.hidden_map(initial_hidden_state)

        batch_size = encoder_state.size(0)

        # the first step has no attention result yet, so start from zeros
        context_vectors = self._init_context_vectors(batch_size)

        h_t = h_t.to(encoder_state.device)
        context_vectors = context_vectors.to(encoder_state.device)

        output_vectors = []
        # caches kept on the module for later inspection / visualization
        self._cached_p_attn = []
        self._cached_ht = []
        self._cached_decoder_state = encoder_state.cpu().detach().numpy()

        output_sequence_size = target_sequence.size(0)
        for i in range(output_sequence_size):
            # 1. embed the token of THIS time step and concatenate it with
            #    the previous context
            y_t_index = target_sequence[i]
            y_input_vector = self.target_embedding(y_t_index)
            rnn_input = torch.cat([y_input_vector, context_vectors], dim=1)

            # 2. make a GRU step, getting a new hidden vector
            h_t = self.gru_cell(rnn_input, h_t)
            self._cached_ht.append(h_t.cpu().detach().numpy())

            # 3. use the current hidden vector to attend to the encoder
            #    state; only the first two results (context, probabilities)
            #    are needed
            attention_outputs = verbose_attention(encoder_state, h_t)
            context_vectors, p_attn = attention_outputs[0], attention_outputs[1]

            # cache the attention probabilities for visualization
            self._cached_p_attn.append(p_attn.cpu().detach().numpy())

            # 4. use the current hidden and context vectors to make a
            #    prediction for the next word
            prediction_vector = torch.cat((context_vectors, h_t), dim=1)
            score_for_y_t_index = self.classifier(prediction_vector)

            # collect the prediction scores
            output_vectors.append(score_for_y_t_index)

        # (seq, batch, vocab) -> (batch, seq, vocab)
        output_vectors = torch.stack(output_vectors).permute(1, 0, 2)

        return output_vectors

## NMT Model

In [8]:
class NMTModel(nn.Module):
    """Encoder-decoder translation model built from NMTEncoder and NMTDecoder."""

    def __init__(self, source_vocab_size, source_embedding_size, target_vocab_size,
                 target_embedding_size, encoding_size, target_bos_index):
        """
        Args:
            source_vocab_size (int): number of source-language tokens
            source_embedding_size (int): size of the source embeddings
            target_vocab_size (int): number of target-language tokens
            target_embedding_size (int): size of the target embeddings
            encoding_size (int): hidden size handed to the encoder
            target_bos_index (int): begin-of-sequence index handed to the decoder
        """
        super().__init__()

        self.encoder = NMTEncoder(num_embeddings=source_vocab_size,
                                  embedding_size=source_embedding_size,
                                  rnn_hidden_size=encoding_size)

        # the decoder is built with twice the encoder's hidden size
        self.decoder = NMTDecoder(num_embeddings=target_vocab_size,
                                  embedding_size=target_embedding_size,
                                  rnn_hidden_size=encoding_size * 2,
                                  bos_index=target_bos_index)

    def forward(self, x_source, x_source_lengths, target_sequence):
        """Run the encoder, then the decoder, and return the decoder output.

        Args:
            x_source: source tensor passed to the encoder
            x_source_lengths: source lengths passed to the encoder
            target_sequence: target tensor passed to the decoder
        Returns:
            whatever the decoder returns for this batch
        """
        encoder_outputs = self.encoder(x_source, x_source_lengths)
        encoder_state, final_hidden_states = encoder_outputs

        return self.decoder(encoder_state, final_hidden_states, target_sequence)

## Utility functions

In [11]:
def normalize_sizes(y_pred, y_true):
    """Flatten sequence predictions and targets for per-token comparison.

    Args:
        y_pred (torch.Tensor): predictions; a 3-dimensional tensor is
            reshaped to (-1, size of its last dimension)
        y_true (torch.Tensor): targets; a 2-dimensional tensor is flattened
            to one dimension
    Returns:
        tuple (y_pred, y_true) with the shapes described above; tensors of
        any other rank are returned unchanged
    """
    if y_pred.dim() == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if y_true.dim() == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

def compute_accuracy(y_pred, y_true, mask_index):
    """Return the percentage of non-masked targets that were predicted correctly.

    Args:
        y_pred (torch.Tensor): prediction scores
        y_true (torch.Tensor): target indices
        mask_index (int): target value to leave out of the computation
    Returns:
        float: accuracy in percent; 0.0 when every target is masked
    """
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    _, y_pred_indices = y_pred.max(dim=1)

    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()

    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    # guard against a batch that consists of padding only
    if n_valid == 0:
        return 0.0

    return n_correct / n_valid * 100

def sequence_loss(y_pred, y_true, mask_index):
    """Return the cross-entropy loss, ignoring targets equal to `mask_index`."""
    y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, ignore_index=mask_index)

## Training Routine

In [9]:
# All tunable settings of the notebook live in this single namespace.
args = Namespace(
    # ---- data ----
    frequency_cutoff=25,
    # NOTE(review): the file name refers to a surnames dataset, while
    # NMTDataset reads `source_language` / `target_language` columns -
    # confirm that this is the intended csv.
    text_csv='/content/drive/My Drive/Colab Notebooks/Data/surnames_with_splits.csv',
    # ---- model hyperparameters ----
    source_embedding_size=24,
    target_embedding_size=24,
    encoding_size=32,
    # ---- training hyperparameters ----
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    momentum=0.1,
    num_epochs=100,
    seed=1337,
    cuda=True,
    dropout=0.1,
)

def make_train_state(args):
    """Create the bookkeeping dictionary for one training run.

    Args:
        args: accepted for interface compatibility; not read here
    Returns:
        dict with `epoch_index` set to 0, an empty list for each of the
        train/val loss and accuracy histories, and -1 for the test metrics
    """
    state = {'epoch_index': 0}

    # one fresh history list per split and metric
    for split_name in ('train', 'val'):
        for metric_name in ('loss', 'acc'):
            state['%s_%s' % (split_name, metric_name)] = []

    # -1 marks "not evaluated yet"
    state['test_loss'] = -1
    state['test_acc'] = -1
    return state

In [None]:
train_state = make_train_state(args)

# Use CUDA only when it was requested and is actually available.
args.cuda = bool(args.cuda and torch.cuda.is_available())
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Device available ", args.device)

# Apply the configured seed so that weight initialisation and batch
# shuffling are repeatable across runs.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed_all(args.seed)

# dataset object
dataset = NMTDataset.load_dataset_and_make_vectorizer(args.text_csv)

# vectorizer
vectorizer = dataset.get_vectorizer()

# model
model = NMTModel(source_vocab_size=len(vectorizer.source_vocab),
                 target_vocab_size=len(vectorizer.target_vocab),
                 source_embedding_size=args.source_embedding_size,
                 target_embedding_size=args.target_embedding_size,
                 encoding_size=args.encoding_size,
                 target_bos_index=vectorizer.target_vocab.begin_seq_index)
model.to(args.device)

# optimizer
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

In [None]:
# Targets equal to the mask index are excluded from loss and accuracy.
mask_index = vectorizer.target_vocab.mask_index
epoch_bar = tqdm(desc='training routine',
                 total=args.num_epochs,
                 position=0)

# generate_nmt_batches drops the last partial batch, so every pass yields
# exactly get_num_batches(batch_size) batches; the bar totals match that.
dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                 total=dataset.get_num_batches(args.batch_size),
                 position=1,
                 leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
               total=dataset.get_num_batches(args.batch_size),
               position=1,
               leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_nmt_batches(dataset,
                                               batch_size=args.batch_size,
                                               device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        model.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = model(batch_dict['x_source'],
                           batch_dict['x_source_length'],
                           batch_dict['x_target'])

            # step 3. compute the loss
            loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the running loss and running accuracy as incremental
            # means over the batches seen so far
            running_loss += (loss.item() - running_loss) / (batch_index + 1)
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss,
                                  acc=running_acc,
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_nmt_batches(dataset,
                                               batch_size=args.batch_size,
                                               device=args.device)
        running_loss = 0.
        running_acc = 0.
        model.eval()

        # no gradients are needed while validating
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(batch_generator):
                # compute the output
                y_pred = model(batch_dict['x_source'],
                               batch_dict['x_source_length'],
                               batch_dict['x_target'])

                # compute the loss
                loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

                # compute the running loss and running accuracy
                running_loss += (loss.item() - running_loss) / (batch_index + 1)
                acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
                running_acc += (acc_t - running_acc) / (batch_index + 1)

                # Update bar
                val_bar.set_postfix(loss=running_loss, acc=running_acc,
                                    epoch=epoch_index)
                val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        # rewind the per-epoch bars for the next epoch
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

except KeyboardInterrupt:
    print("Exiting loop")