In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import Transformer
from tqdm import tqdm

import torchtext

from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


from dataset import nmtDataset
import helpers as utils

import spacy
import numpy as np

import random
import math
import time

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Logger object from the project helpers module, created with the path below.
# NOTE(review): presumably this is the file training output is written to -- confirm in helpers.py.
log = utils.Logger("logs/transformers.out")

In [3]:
# Hyper-parameters shared by the data loaders and the model.
# The vocabulary sizes are added to this dict once the datasets are loaded.
hyp_params = dict(
    batch_size=1,
    num_epochs=10,
    d_model=512,             # embedding / model width
    n_head=8,                # attention heads
    num_encoder_layers=3,
    num_decoder_layers=3,
    feedforward_dim=128,     # hidden size of the position-wise feed-forward layers
    dropout=0.1,
)

In [11]:
# Load the three Multi30k splits. The validation and test datasets receive the
# training dataset as an extra argument.
# NOTE(review): presumably so they reuse the training vocabularies -- confirm in dataset.py.
nmtds_train = nmtDataset('datasets/Multi30k/', 'train')
nmtds_valid = nmtDataset('datasets/Multi30k/', 'val', nmtds_train)
nmtds_test = nmtDataset('datasets/Multi30k/', 'test', nmtds_train)

SRC_PAD_IDX = nmtds_train.src_vocab["<pad>"]
TRG_PAD_IDX = nmtds_train.trg_vocab["<pad>"]


def collate_batch(batch):
    """Collate one list of samples into a batch via the project helper.

    NOTE(review): only SRC_PAD_IDX is handed to ``utils.collate_fn``; if the
    helper pads the target side with it as well, this is only correct when
    SRC_PAD_IDX == TRG_PAD_IDX -- verify against helpers.py.
    """
    return utils.collate_fn(batch, SRC_PAD_IDX, device)


# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(nmtds_train, batch_size=hyp_params["batch_size"], shuffle=True,
                              collate_fn=collate_batch)

# Validation data is not shuffled so every evaluation pass sees the samples in
# the same order, which keeps runs comparable and reproducible.
valid_dataloader = DataLoader(nmtds_valid, batch_size=hyp_params["batch_size"], shuffle=False,
                              collate_fn=collate_batch)

# Vocabulary sizes are needed to size the embedding tables and the output layer.
hyp_params["src_vocab_size"] = len(nmtds_train.src_vocab)
hyp_params["trg_vocab_size"] = len(nmtds_train.trg_vocab)

In [5]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to token embeddings, then apply dropout.

    The encodings follow "Attention Is All You Need":

        PE(pos, 2i)     = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model))

    i.e. each (even, odd) pair of dimensions shares one frequency.

    Args:
        d_model: size of the embedding dimension.
        dropout: dropout probability applied to the sum of embedding and encoding.
        maxlen: number of positions precomputed and stored in the ``pos_encoding``
            buffer. Longer inputs are still accepted; their encodings are computed
            on the fly in ``forward``.
    """

    def __init__(self, d_model, dropout, maxlen = 12):
        super(PositionalEncoding, self).__init__()

        self.dropout = nn.Dropout(dropout)
        # Shape (pos_encoding) --> [max len, 1, d_model]
        self.register_buffer('pos_encoding', self._build_encoding(maxlen, d_model))

    @staticmethod
    def _build_encoding(length, d_model):
        """Return the sinusoidal table for positions 0..length-1, shape [length, 1, d_model]."""
        # Shape (pos) --> [length, 1]
        pos = torch.arange(0, length, dtype=torch.float).unsqueeze(1)

        # One denominator per (sin, cos) pair: 10000^(2i / d_model)
        den = 10000 ** (torch.arange(0, d_model, 2, dtype=torch.float) / d_model)

        # Shape (angles) --> [length, ceil(d_model / 2)]
        angles = pos / den

        encoding = torch.zeros((length, d_model))
        encoding[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns.
        encoding[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Extra axis so the table broadcasts over the batch dimension.
        return encoding.unsqueeze(-2)

    def forward(self, token_embedding):
        """Return ``dropout(token_embedding + positional encoding)``.

        Args:
            token_embedding: tensor of shape [sentence len, batch size, d_model].

        Returns:
            Tensor with the same shape as ``token_embedding``.
        """
        seq_len = token_embedding.size(0)

        if seq_len <= self.pos_encoding.size(0):
            # Common case: slice the precomputed table down to the sentence length.
            encoding = self.pos_encoding[:seq_len]
        else:
            # The input is longer than the stored table; slicing would yield too few
            # rows and the addition would fail, so build a table of the right length
            # with the same device and dtype as the buffer.
            encoding = self._build_encoding(seq_len, self.pos_encoding.size(-1)).to(self.pos_encoding)

        return self.dropout(token_embedding + encoding)
    
class InputEmbedding(nn.Module):
    """Look up token embeddings and scale them by sqrt(d_model).

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: size of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()

        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, tokens):
        """Map token ids [sentence len, batch size] to [sentence len, batch size, d_model]."""
        scale = math.sqrt(self.d_model)
        # The embedding lookup needs integer (long) indices.
        token_ids = tokens.long()
        return scale * self.embedding(token_ids)

In [6]:
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Source and target tokens are embedded, combined with positional encodings,
    passed through ``torch.nn.Transformer`` and projected onto the target
    vocabulary. All tensors are sequence-first: [sentence len, batch size].

    Args:
        src_vocab_size: size of the source vocabulary.
        trg_vocab_size: size of the target vocabulary (also the output width).
        d_model: embedding / model width.
        dropout: dropout probability used by the positional encoding and the transformer.
        nhead: number of attention heads.
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: number of decoder layers.
        dim_feedforward: hidden size of the feed-forward sub-layers.
        src_pad_idx: optional padding index of the source vocabulary. When given,
            padded source positions are excluded from attention.
        trg_pad_idx: optional padding index of the target vocabulary. When given,
            padded target positions are excluded from attention.
    """

    def __init__(self, 
                 src_vocab_size, 
                 trg_vocab_size, 
                 d_model, 
                 dropout,
                 nhead,
                 num_encoder_layers,
                 num_decoder_layers,
                 dim_feedforward,
                 src_pad_idx=None,
                 trg_pad_idx=None
                ):
        super(Seq2SeqTransformer, self).__init__()

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

        self.src_inp_emb = InputEmbedding(src_vocab_size, d_model)
        self.trg_inp_emb = InputEmbedding(trg_vocab_size, d_model)

        # One positional-encoding module is shared by the source and target sides.
        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)

        self.transformer = Transformer(d_model=d_model,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)

        # Projects decoder states onto target-vocabulary logits.
        self.generator = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg):
        """Compute target-vocabulary logits for every position of ``trg``.

        Args:
            src: source token ids, shape [src len, batch size].
            trg: decoder input token ids, shape [trg len, batch size]. For
                teacher forcing the caller passes the target without its last token.

        Returns:
            Logits of shape [trg len, batch size, trg_vocab_size].
        """
        src_mask, trg_mask, src_padding_mask, trg_padding_mask = self.create_mask(src, trg)

        src_emb = self.positional_encoding(self.src_inp_emb(src))
        trg_emb = self.positional_encoding(self.trg_inp_emb(trg))

        # The encoder memory has the same padded positions as the source, so the
        # source padding mask doubles as the memory padding mask.
        decoded = self.transformer(src_emb, trg_emb,
                                   src_mask=src_mask,
                                   tgt_mask=trg_mask,
                                   src_key_padding_mask=src_padding_mask,
                                   tgt_key_padding_mask=trg_padding_mask,
                                   memory_key_padding_mask=src_padding_mask)

        return self.generator(decoded)

    def create_mask(self, src, tgt):
        """Build the attention and padding masks for one batch.

        All masks are boolean, with True meaning "do not attend to this position".

        Args:
            src: source token ids, shape [src len, batch size].
            tgt: target token ids, shape [tgt len, batch size].

        Returns:
            Tuple ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``:
            src_mask is [src len, src len] and all False (nothing hidden);
            tgt_mask is [tgt len, tgt len] and hides future positions;
            the padding masks are [batch size, len], or None when the matching
            pad index was not supplied to the constructor.
        """
        src_seq_len = src.shape[0]
        tgt_seq_len = tgt.shape[0]

        # Causal mask: position i may only attend to positions <= i.
        tgt_mask = torch.triu(torch.ones((tgt_seq_len, tgt_seq_len), device=tgt.device),
                              diagonal=1).bool()
        # All False hence unchanged
        src_mask = torch.zeros((src_seq_len, src_seq_len), device=src.device).bool()

        src_padding_mask = None
        if self.src_pad_idx is not None:
            src_padding_mask = (src == self.src_pad_idx).transpose(0, 1)

        tgt_padding_mask = None
        if self.trg_pad_idx is not None:
            tgt_padding_mask = (tgt == self.trg_pad_idx).transpose(0, 1)

        return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [7]:
# Build the model from the hyper-parameter dict and move it to the chosen device.
transformer = Seq2SeqTransformer(
    src_vocab_size=hyp_params["src_vocab_size"],
    trg_vocab_size=hyp_params["trg_vocab_size"],
    d_model=hyp_params["d_model"],
    dropout=hyp_params["dropout"],
    nhead=hyp_params["n_head"],
    num_encoder_layers=hyp_params["num_encoder_layers"],
    num_decoder_layers=hyp_params["num_decoder_layers"],
    dim_feedforward=hyp_params["feedforward_dim"],
)
transformer = transformer.to(device)

# Xavier-uniform initialisation for every parameter with more than one
# dimension; one-dimensional parameters keep their default initialisation.
for p in transformer.parameters():
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

# Positions whose target equals the target pad index do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

optimizer = optim.Adam(
    params=transformer.parameters(),
    lr=1e-4,
    betas=(0.9, 0.98),
    eps=1e-9,
)

In [16]:
# Sanity check: pull a single batch from the training loader, print the source
# tensor's shape, then stop.
for batch_idx, batch in enumerate(tqdm(train_dataloader)):
    src, trg = batch["src"], batch["trg"]

    print(src.shape)

    # Target with its last entry along dim 0 removed
    # (presumably the decoder input for teacher forcing -- confirm).
    trg_inp = trg[:-1, :]

    break

  0%|                                                                                                                                                                                                                                                                                                                                             | 0/29001 [00:00<?, ?it/s]

torch.Size([14, 1])





In [18]:
# Scratch check: a 7x7 boolean tensor of zeros is all False.
torch.zeros(7, 7, dtype=torch.bool)

tensor([[False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False]])