In [1]:
import random
import time
import math
import copy
import gc

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from dataset import nmtDataset
import helpers as utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fix the random number generators used in this notebook so that repeated
# runs start from the same state.
SEED = 1234

# Apply the same seed, in order, to Python's `random` module, the PyTorch
# default generator and the CUDA generator.
for _seed_rng in (random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(SEED)

# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [3]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# Download the small English and German spaCy pipelines through the shell.
# NOTE(review): presumably these back the tokenizers used by `dataset.nmtDataset` — confirm.
# NOTE(review): `!python` may resolve to a different interpreter than the one
# running this kernel; verify the models end up in the kernel's environment.
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Collecting de-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.3.0/de_core_news_sm-3.3.0-py3-none-any.whl (14.6 MB)
[+] Download and installation successful
You can now load the package via spacy.load('de_core_news_sm')


In [5]:
# Training and architecture settings. Vocabulary-dependent sizes are not known
# yet at this point and are therefore not part of this literal.
hyp_params = dict(
    batch_size=128,
    num_epochs=10,

    # Encoder side
    encoder_embedding_size=512,
    encoder_dropout=0.5,

    # Decoder side
    decoder_dropout=0.5,
    decoder_embedding_size=512,

    # Shared by encoder and decoder
    hidden_size=512,
    num_layers=2,
)

In [6]:
# Open the run log; the training loop logs to it once per epoch and closes it afterwards.
# NOTE(review): the file name says "64b" while hyp_params["batch_size"] is 128 —
# confirm which one is intended before comparing runs by file name.
log = utils.Logger('logs/emd512-enc2-dec2-64b-vanilla.out')

In [7]:
class Encoder(nn.Module):
    """Embed a source sequence and summarise it with a stacked LSTM.

    Only the LSTM's final hidden and cell states are returned; the per-step
    outputs are discarded.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout):
        """Build the embedding table, dropout and LSTM.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding (the LSTM input size).
            hidden_size: size of the LSTM hidden and cell states.
            num_layers: number of stacked LSTM layers.
            dropout: probability used on the embeddings and between LSTM layers.
        """
        super().__init__()

        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.LSTM = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )

    def forward(self, x):
        """Encode a batch of token indices.

        Args:
            x: token indices laid out as [sequence_length, batch_size]
               (the LSTM is not batch-first).

        Returns:
            (hidden_state, cell_state), each [num_layers, batch_size, hidden_size].
        """
        # [sequence_length, batch_size, embedding_dim]
        embedded = self.dropout(self.embedding(x))

        # The per-step outputs are not needed; keep only the final (h, c) pair.
        _, final_states = self.LSTM(embedded)
        hidden_state, cell_state = final_states

        return hidden_state, cell_state
    
class Decoder(nn.Module):
    """Single-step LSTM decoder.

    Each call consumes one token per batch element together with the current
    LSTM state and produces scores over the output vocabulary plus the
    updated state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout, output_size):
        """Build the embedding table, dropout, LSTM and output projection.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding (the LSTM input size).
            hidden_size: size of the LSTM hidden and cell states.
            num_layers: number of stacked LSTM layers.
            dropout: probability used on the embeddings and between LSTM layers.
            output_size: number of scores produced per token by the linear layer.
        """
        super().__init__()

        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.LSTM = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden_state, cell_state):
        """Advance the decoder by one token.

        Args:
            x: current input tokens, shape [batch_size].
            hidden_state: LSTM hidden state, [num_layers, batch_size, hidden_size].
            cell_state: LSTM cell state, [num_layers, batch_size, hidden_size].

        Returns:
            (predictions, hidden_state, cell_state) where predictions is
            [batch_size, output_size] and the states keep their input shape.
        """
        # The decoder is fed one token at a time, so add a sequence axis of
        # length 1: [batch_size] -> [1, batch_size].
        step_input = x.unsqueeze(0)

        # [1, batch_size, embedding_dim]
        embedded = self.dropout(self.embedding(step_input))

        # outputs:       [1, batch_size, hidden_size]
        # hidden / cell: [num_layers, batch_size, hidden_size]
        outputs, (hidden_state, cell_state) = self.LSTM(embedded, (hidden_state, cell_state))

        # Output vs. hidden state:
        #   `outputs` holds the top layer's hidden state for every time step,
        #   while `hidden_state` holds the last time step's hidden state for
        #   every layer. Because a single token is fed per call, `outputs`
        #   coincides with the top layer's entry of `hidden_state`; the lower
        #   layers' states only appear in `hidden_state`. This can be checked with
        #       print(torch.all(torch.eq(outputs, hidden_state[-1,:,:])))
        #   The projection below therefore uses `outputs`.

        # [1, batch_size, hidden_size] -> [batch_size, hidden_size] -> [batch_size, output_size]
        predictions = self.fc(outputs.squeeze(0))

        return predictions, hidden_state, cell_state

class SeqtoSeq(nn.Module):
    """Encoder-decoder translation model with scheduled teacher forcing.

    The encoder's final LSTM states initialise the decoder, which is then
    unrolled one target position at a time.
    """

    def __init__(self, gen_params, target_vocab, device):
        """Build the encoder and decoder and move them to `device`.

        Args:
            gen_params: mapping providing "input_size_encoder",
                "encoder_embedding_size", "hidden_size", "num_layers",
                "encoder_dropout", "input_size_decoder",
                "decoder_embedding_size", "decoder_dropout" and "output_size".
            target_vocab: target vocabulary; only its length is used here, to
                size the output buffer in `forward`.
            device: device the sub-modules and the output buffer live on.
        """
        super().__init__()

        self.Encoder = Encoder(gen_params["input_size_encoder"],
                               gen_params["encoder_embedding_size"],
                               gen_params["hidden_size"],
                               gen_params["num_layers"],
                               gen_params["encoder_dropout"]).to(device)

        self.Decoder = Decoder(gen_params["input_size_decoder"],
                               gen_params["decoder_embedding_size"],
                               gen_params["hidden_size"],
                               gen_params["num_layers"],
                               gen_params["decoder_dropout"],
                               gen_params["output_size"]).to(device)

        self.target_vocab = target_vocab
        self.device = device

    def forward(self, source, src_lens, target, tfr=0.5):
        """Run the encoder once and unroll the decoder over the target length.

        Args:
            source: source token indices, [source_length, batch_size].
            src_lens: accepted for interface compatibility but currently
                unused (the source is not packed before encoding).
            target: target token indices, [target_length, batch_size];
                row 0 is the first token fed to the decoder.
            tfr: teacher-forcing ratio, i.e. the probability at each step of
                feeding the ground-truth token instead of the model's own
                best guess.

        Returns:
            Tensor [target_length, batch_size, len(target_vocab)] of decoder
            scores. Row 0 is never written and stays all zeros.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(self.target_vocab)

        # Allocate the result buffer directly on the target device instead of
        # creating it on the CPU and copying it over on every forward pass.
        outputs = torch.zeros(target_len, batch_size, target_vocab_size, device=self.device)

        # Encoder context: (hidden, cell), each [num_layers, batch_size, hidden_size].
        hidden_state, cell_state = self.Encoder(source)

        # First decoder input: the leading target token of every sequence.
        x = target[0]

        for i in range(1, target_len):
            # output: [batch_size, target_vocab_size]
            output, hidden_state, cell_state = self.Decoder(x, hidden_state, cell_state)
            outputs[i] = output

            # Scheduled sampling. One random draw per step (as before, so the
            # RNG stream is unchanged); the argmax is only computed when the
            # model's own prediction is actually fed back.
            if random.random() < tfr:
                x = target[i]
            else:
                x = output.argmax(1)

        # [target_length, batch_size, target_vocab_size]
        return outputs

In [8]:
# Build the three Multi30k splits: training first, then validation and test,
# each of which is handed the training dataset as a third argument.
# NOTE(review): presumably so they reuse the training vocabularies — confirm in dataset.py.
nmtds_train = nmtDataset('datasets/Multi30k/', 'train')
nmtds_valid, nmtds_test = (
    nmtDataset('datasets/Multi30k/', split_name, nmtds_train)
    for split_name in ('val', 'test')
)

# Index of the "<pad>" token in the target vocabulary.
pad_idx = nmtds_train.trg_vocab["<pad>"]

In [9]:
def _collate_batch(batch):
    """Collate one list of dataset samples.

    DataLoader passes the list of samples that make up a batch (not a batch
    size); this forwards it to `utils.collate_fn` together with the padding
    index and the target device.
    """
    return utils.collate_fn(batch, pad_idx, device)


# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(nmtds_train, batch_size=hyp_params['batch_size'], shuffle=True,
                              collate_fn=_collate_batch)

# Validation only measures the loss, so keep a fixed sample order: every epoch
# is then evaluated on identical batches.
valid_dataloader = DataLoader(nmtds_valid, batch_size=hyp_params['batch_size'], shuffle=False,
                              collate_fn=_collate_batch)

In [10]:
# The vocabulary-dependent sizes are read from the training dataset and added
# to the shared hyper-parameter dict before the model is built.
hyp_params.update({
    "input_size_encoder": len(nmtds_train.src_vocab),
    "input_size_decoder": len(nmtds_train.trg_vocab),
    "output_size": len(nmtds_train.trg_vocab),
})

model = SeqtoSeq(hyp_params, target_vocab=nmtds_train.trg_vocab, device=device)

# Adam with its default settings.
optimizer = optim.Adam(model.parameters())

# Targets equal to the padding index are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx).to(device)

In [11]:
# Early-stopping bookkeeping.
min_el = math.inf   # lowest validation loss seen so far
patience = 1        # 1 + number of consecutive epochs without improvement
best_model = {}     # replaced by a deep copy of the model at its best epoch
best_epoch = 0

epoch_loss = 0
try:
    for epoch in range(hyp_params["num_epochs"]):
        start = time.time()

        # Release unreferenced Python objects and cached CUDA blocks before each epoch.
        gc.collect()
        torch.cuda.empty_cache()

        epoch_loss = utils.train_model(model, train_dataloader, criterion, optimizer)
        eval_loss = utils.evaluate_model(model, valid_dataloader, criterion)

        # `patience` is logged before it is updated for this epoch.
        log.log(f"Epoch: {epoch+1}, Train loss: {epoch_loss}, Eval loss: {eval_loss}, patience: {patience}. Time {time.time() - start}")

        if eval_loss < min_el:
            # New best validation loss: remember it and checkpoint to disk.
            best_epoch = epoch+1
            min_el = eval_loss
            # NOTE(review): this keeps a second full copy of the model in memory and
            # is not read by any visible cell — confirm it is still needed.
            best_model = copy.deepcopy(model)
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'eval_loss': min_el
            }, 'model-vanilla.pt')
            patience = 1
        else:
            patience += 1

        # Stop after 9 consecutive epochs without improvement (the counter starts at 1).
        if patience == 10:
            log.log("[STOPPING] Early stopping in action..")
            log.log(f"Best epoch was {best_epoch} with {min_el} eval loss")
            break
finally:
    # Close the log even when training raises or is interrupted, so the
    # handle is not leaked.
    log.close()

100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:28<00:00,  7.99it/s]


Epoch: 1, Train loss: 5.393388748168945, Eval loss: 5.274656772613525, patience: 1. Time 29.082746267318726


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:27<00:00,  8.23it/s]


Epoch: 2, Train loss: 4.770442008972168, Eval loss: 4.943822860717773, patience: 1. Time 28.058428049087524


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:27<00:00,  8.15it/s]


Epoch: 3, Train loss: 4.427362442016602, Eval loss: 4.698703765869141, patience: 1. Time 28.360026836395264


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:27<00:00,  8.33it/s]


Epoch: 4, Train loss: 4.198544502258301, Eval loss: 4.5788960456848145, patience: 1. Time 27.756704330444336


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:27<00:00,  8.12it/s]


Epoch: 5, Train loss: 4.015783786773682, Eval loss: 4.453860759735107, patience: 1. Time 28.481960773468018


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:27<00:00,  8.19it/s]


Epoch: 6, Train loss: 3.8494179248809814, Eval loss: 4.36923360824585, patience: 1. Time 28.24337077140808


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:28<00:00,  8.03it/s]


Epoch: 7, Train loss: 3.7414731979370117, Eval loss: 4.263300895690918, patience: 1. Time 28.813265800476074


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:28<00:00,  8.10it/s]


Epoch: 8, Train loss: 3.612299680709839, Eval loss: 4.220077991485596, patience: 1. Time 28.561676263809204


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:27<00:00,  8.18it/s]


Epoch: 9, Train loss: 3.5119380950927734, Eval loss: 4.1298089027404785, patience: 1. Time 28.274757146835327


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:27<00:00,  8.17it/s]


Epoch: 10, Train loss: 3.408858299255371, Eval loss: 4.111256122589111, patience: 1. Time 28.288347244262695


In [12]:
# Rebuild the architecture, then restore the weights from the checkpoint file
# written by the training loop.
model_l = SeqtoSeq(hyp_params, target_vocab=nmtds_train.trg_vocab, device=device)

saved_weights = torch.load('model-vanilla.pt', map_location=device)["model_state_dict"]
model_l.load_state_dict(saved_weights)

# Switch to evaluation mode; as the cell's last expression this also displays the module.
model_l.eval()

SeqtoSeq(
  (Encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(5893, 512)
    (LSTM): LSTM(512, 512, num_layers=2, dropout=0.5)
  )
  (Decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(7853, 512)
    (LSTM): LSTM(512, 512, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=512, out_features=7853, bias=True)
  )
  (target_vocab): Vocab()
)

In [14]:
# Score the restored model on the test split with the project's BLEU helper.
# NOTE(review): the meaning of the positional `False` argument is defined in helpers.bleu — confirm.
utils.bleu(model_l, nmtds_test, False, device)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 133.62it/s]


0.1590950352389295