In [14]:
%config Completer.use_jedi = False

In [1]:
import random
import time
import math
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from dataset import nmtDataset
import helpers as utils

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
hyp_params = {
    "batch_size": 128,
    "num_epochs": 10,

    # Encoder parameters
    "encoder_embedding_size": 512,
    "encoder_dropout": 0.5,

    # Decoder parameters
    "decoder_dropout": 0.5,
    "decoder_embedding_size": 512,

    # Common parameters
    "hidden_size": 512,
    "num_layers": 2
}

In [4]:
log = utils.Logger('logs/emd512-enc2-dec2-bilstm.out')

In [5]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        
        self.dropout = nn.Dropout(dropout)
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.LSTM = nn.LSTM(embedding_dim, hidden_size, num_layers, dropout=dropout, bidirectional = True)
        
    def forward(self, x):
        # Shape (embedding) --> [Sequence_length , batch_size , embedding dims]
        embedding = self.dropout(self.embedding(x))
        
        # ************** Multiplied by 2 because of bi-directional LSTM
        # Shape --> (output) [Sequence_length , batch_size , hidden_size * 2]
        # Shape --> (hs, cs) [num_layers * 2, batch_size size, hidden_size]
        outputs, (hidden_state, cell_state) = self.LSTM(embedding)
        
        return hidden_state, cell_state
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout, output_size):
        super(Decoder, self).__init__()
        
        self.dropout = nn.Dropout(dropout)
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
         # ************** Multiplying 2 because of bi-directional LSTM
        self.LSTM = nn.LSTM(embedding_dim, hidden_size, num_layers * 2, dropout=dropout)
        
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden_state, cell_state):
        # As we are not feeding whole sentence we will each token a time
        # hence our sequence length would be just 1 however shape of x is batch_size
        # to add sequence length we will unsequeeze it
        # Shape (x) --> [batch_size] (see seq2seq model) so making it [1, batch_size]
        x = x.unsqueeze(0)
        
        # Shape (embedded) --> (1, batch_size, embedding dims)
        embedded = self.dropout(self.embedding(x))
        
        # Shape (outputs) --> (1, 32, 1024) [1, batch_size , hidden_size]
        # Shape (hs, cl) --> (2, 32, 1024)  [num_layers * 2, batch_size , hidden_size]
        outputs, (hidden_state, cell_state) = self.LSTM(embedded, (hidden_state, cell_state))

        # Shape (outputs) -->  (1, batch_size, hidden_size)
        # Shape (outputs.squeeze(0)) -->  (batch_size, hidden_size)
        # Shape (predictions) --> (batch_size, target_vocab_size)
        predictions = self.fc(outputs.squeeze(0))
        
        return predictions, hidden_state, cell_state

class SeqtoSeq(nn.Module):
    def __init__(self, gen_params, target_vocab, device):
        super(SeqtoSeq, self).__init__()

        self.Encoder = Encoder(gen_params["input_size_encoder"],
                          gen_params["encoder_embedding_size"],
                          gen_params["hidden_size"],
                          gen_params["num_layers"],
                          gen_params["encoder_dropout"]).to(device)

        self.Decoder = Decoder(gen_params["input_size_decoder"],
                          gen_params["decoder_embedding_size"],
                          gen_params["hidden_size"],
                          gen_params["num_layers"],
                          gen_params["decoder_dropout"],
                          gen_params["output_size"]).to(device)

        self.target_vocab = target_vocab
        self.device = device
    
    def forward(self, source, target, tfr=0.5):
        # Shape -> (Sentence length, Batch_size)
        batch_size = source.shape[1]

        target_len = target.shape[0]  # Length of target sentences
        target_vocab_size = len(self.target_vocab)
        
        # here we will store all the outputs
        # so outputs is arrange in a way that sentences are in column and batch size is row and every element
        # will consist of probability of each word from the vocab
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(self.device)

        # Shape --> (hs, cs) (num_layers * 2, batch_size size, hidden_size) (contains encoder's hs, cs - context vectors)
        hidden_state, cell_state = self.Encoder(source)

        # Shape (target) -> (Sentence length, Batch_size)
        # Shape (x) --> (batch_size)
        x = target[0]  # First token (Trigger)
        
        for i in range(1, target_len):
            # Shape (output) --> (batch_size, target_vocab_size)
            # Shape (hs, cl) --> (num_layers * 2, batch_size , hidden_size)
            output, hidden_state, cell_state = self.Decoder(x, hidden_state, cell_state)
            outputs[i] = output
            best_guess = output.argmax(1)  # 0th dimension is batch size, 1st dimension is word embedding
            # Schedule sampling
            x = target[
                i] if random.random() < tfr else best_guess  # Either pass the next word correctly from the dataset
            # or use the earlier predicted word

        # Shape --> (sentence length, batch size, vocab size)
        return outputs

In [6]:
nmtds_train = nmtDataset('datasets/Multi30k/', 'train')
nmtds_valid = nmtDataset('datasets/Multi30k/', 'val', nmtds_train)
nmtds_test = nmtDataset('datasets/Multi30k/', 'test', nmtds_train)

In [7]:
train_dataloader = DataLoader(nmtds_train, batch_size=hyp_params['batch_size'], shuffle=True,
                              collate_fn=lambda batch_size: utils.collate_fn(batch_size, device))

valid_dataloader = DataLoader(nmtds_valid, batch_size=hyp_params['batch_size'], shuffle=True,
                              collate_fn=lambda batch_size: utils.collate_fn(batch_size, device))

In [8]:
hyp_params["input_size_encoder"] = len(nmtds_train.src_vocab)
hyp_params["input_size_decoder"] = len(nmtds_train.trg_vocab)
hyp_params["output_size"] = len(nmtds_train.trg_vocab)

model = SeqtoSeq(hyp_params, target_vocab=nmtds_train.trg_vocab, device=device)
optimizer = optim.Adam(model.parameters())

pad_idx = nmtds_train.trg_vocab["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx).to(device)

In [9]:
min_el = math.inf
patience = 1
best_model = {}
best_epoch = 0

epoch_loss = 0
for epoch in range(hyp_params["num_epochs"]):
    start = time.time()
    
    epoch_loss = utils.train_model(model, train_dataloader, criterion, optimizer)
    eval_loss = utils.evaluate_model(model, valid_dataloader, criterion)
    
    log.log(f"Epoch: {epoch+1}, Train loss: {epoch_loss}, Eval loss: {eval_loss}, patience: {patience}. Time {time.time() - start}")

    
    if eval_loss < min_el:
        best_epoch = epoch+1
        min_el = eval_loss
        best_model = copy.deepcopy(model)
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'eval_loss': min_el
        }, 'model-bilstm.pt')
        patience = 1
    else:
        patience += 1
    
    if patience == 10:
        log.log("[STOPPING] Early stopping in action..")
        log.log(f"Best epoch was {best_epoch} with {min_el} eval loss")
        break

log.close()

100%|██████████| 227/227 [02:20<00:00,  1.62it/s]


Epoch: 1, Train loss: 5.216370603586609, Eval loss: 4.687207579612732, patience: 1. Time 142.16274309158325


100%|██████████| 227/227 [02:20<00:00,  1.62it/s]


Epoch: 2, Train loss: 4.598463703357176, Eval loss: 4.3513670563697815, patience: 1. Time 142.30233526229858


100%|██████████| 227/227 [02:22<00:00,  1.59it/s]


Epoch: 3, Train loss: 4.173404616931461, Eval loss: 4.210356831550598, patience: 1. Time 144.33386492729187


100%|██████████| 227/227 [02:21<00:00,  1.61it/s]


Epoch: 4, Train loss: 3.867253988324808, Eval loss: 4.019710153341293, patience: 1. Time 143.04434609413147


100%|██████████| 227/227 [02:21<00:00,  1.61it/s]


Epoch: 5, Train loss: 3.6668072545055774, Eval loss: 3.990602344274521, patience: 1. Time 143.1064372062683


100%|██████████| 227/227 [02:20<00:00,  1.61it/s]


Epoch: 6, Train loss: 3.5022870727572673, Eval loss: 3.9146927893161774, patience: 1. Time 142.4289939403534


100%|██████████| 227/227 [02:19<00:00,  1.62it/s]


Epoch: 7, Train loss: 3.3620631736805784, Eval loss: 3.8330230712890625, patience: 1. Time 141.4496030807495


100%|██████████| 227/227 [02:20<00:00,  1.62it/s]


Epoch: 8, Train loss: 3.241636825553121, Eval loss: 3.801970452070236, patience: 1. Time 141.77670741081238


100%|██████████| 227/227 [02:19<00:00,  1.63it/s]


Epoch: 9, Train loss: 3.132763042323915, Eval loss: 3.7934650778770447, patience: 1. Time 141.08256220817566


100%|██████████| 227/227 [02:19<00:00,  1.63it/s]


Epoch: 10, Train loss: 3.0410957599001307, Eval loss: 3.774424761533737, patience: 1. Time 141.120463848114


In [11]:
model_l = SeqtoSeq(hyp_params, target_vocab=nmtds_train.trg_vocab, device=device)
model_l.load_state_dict(torch.load('model-bilstm.pt', map_location=device)["model_state_dict"])
model_l.eval()

<All keys matched successfully>

In [12]:
utils.bleu(model_l, nmtds_test, False, device)

100%|██████████| 1000/1000 [00:25<00:00, 39.41it/s]


0.19580678641796112