# Training model for Machine Translation

Library Imports for the jupyter notebook. We are refering to this [blog](https://medium.com/@hunter-j-phillips/putting-it-all-together-the-implemented-transformer-bfb11ac1ddfehttps://medium.com/@hunter-j-phillips/putting-it-all-together-the-implemented-transformer-bfb11ac1ddfe) to understand attention network in depth

In [1]:
!pip install -q portalocker

# importing required libraries
import math
import copy
import time
import random
import spacy
import numpy as np
import os 

# torch packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import torch.optim as optim

# load and build datasets
import torchtext
from torchtext.data.functional import to_map_style_dataset
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import portalocker

# visualization packages
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from torchinfo import summary

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device



device(type='cuda')

# Reading the dataframe and converting it into iterable for consuming in pytorch dataset

In [2]:
import pandas as pd
from tqdm import tqdm

In [3]:
df = pd.read_csv("/kaggle/input/machine-translation-dataset-de-en/translation_train.csv")
df.head()

Unnamed: 0,english,german
0,"Two young, White males are outside near many b...",Zwei junge weiße Männer sind im Freien in der ...
1,Several men in hard hats are operating a giant...,Mehrere Männer mit Schutzhelmen bedienen ein A...
2,A little girl climbing into a wooden playhouse.,Ein kleines Mädchen klettert in ein Spielhaus ...
3,A man in a blue shirt is standing on a ladder ...,Ein Mann in einem blauen Hemd steht auf einer ...
4,Two men are at the stove preparing food.,Zwei Männer stehen am Herd und bereiten Essen zu.


In [4]:
df.shape

(29000, 2)

In [5]:
# Perform train - val split
train_df=df.sample(frac=0.95,random_state=200)
val_df=df.drop(train_df.index)

In [6]:
def create_iterable(sample_df):
    """
    This converts pandas dataframe into list of tuples
    consisting of (german sentences, english sentences).
    
    This iterable in used in our data preparation
    """
    sample_iter = sample_df.to_dict(orient='records')
    out_iter = [(dict1["german"], dict1["english"]) for dict1 in sample_iter]
    print(f"length of iterable: {len(out_iter)}")
    return out_iter

In [7]:
train_iter = create_iterable(train_df)
val_iter = create_iterable(val_df)

length of iterable: 27550
length of iterable: 1450


In [8]:
test_df = pd.read_csv("/kaggle/input/machine-translation-dataset-de-en/translation_test.csv")
test_df.head()

Unnamed: 0,english,german
0,A man in an orange hat starring at something.,"Ein Mann mit einem orangefarbenen Hut, der etw..."
1,A Boston Terrier is running on lush green gras...,Ein Boston Terrier läuft über saftig-grünes Gr...
2,A girl in karate uniform breaking a stick with...,Ein Mädchen in einem Karateanzug bricht einen ...
3,Five people wearing winter jackets and helmets...,Fünf Leute in Winterjacken und mit Helmen steh...
4,People are fixing the roof of a house.,Leute Reparieren das Dach eines Hauses.


In [9]:
test_iter = create_iterable(test_df)

length of iterable: 1000


# Create Vocab from data sources

In [10]:
def load_tokenizers():
    """
    Load the German and English tokenizers provided by spaCy.

    Returns:
        spacy_de:     German tokenizer
        spacy_en:     English tokenizer
    """
    try:
        spacy_de = spacy.load("de_core_news_sm")
    except OSError:
        os.system("python -m spacy download de_core_news_sm")
        spacy_de = spacy.load("de_core_news_sm")

    try:
        spacy_en = spacy.load("en_core_web_sm")
    except OSError:
        os.system("python -m spacy download en_core_web_sm")
        spacy_en = spacy.load("en_core_web_sm")

    print("Loaded English and German tokenizers.")
    return spacy_de, spacy_en

In [11]:
def tokenize(text: str, tokenizer):
  """
    Split a string into its tokens using the provided tokenizer.

    Args:
        text:         string 
        tokenizer:    tokenizer for the language
        
    Returns:
        tokenized list of strings       
  """
  return [tok.text.lower() for tok in tokenizer.tokenizer(text)]

In [12]:
def yield_tokens(data_iter, tokenizer, index: int):
  """
    Return the tokens for the appropriate language.

    Args:
        data_iter:    text here 
        tokenizer:    tokenizer for the language
        index:        index of the language in the tuple | (de=0, en=1)
        
    Yields:
        sequences based on index       
  """
  for from_tuple in data_iter:
    yield tokenizer(from_tuple[index])

In [13]:
def build_vocabulary(
                    spacy_de, 
                    spacy_en, 
                    train_iter, 
                    val_iter, 
                    test_iter, 
                    min_freq: int = 2):
  
    def tokenize_de(text: str):
        """
          Call the German tokenizer.

          Args:
              text:         string 
              min_freq:     minimum frequency needed to include a word in the vocabulary

          Returns:
              tokenized list of strings       
        """
        return tokenize(text, spacy_de)

    def tokenize_en(text: str):
        """
          Call the English tokenizer.

          Args:
              text:         string 

          Returns:
              tokenized list of strings       
        """
        return tokenize(text, spacy_en)

    print("Building German Vocabulary...")

#     # load train, val, and test data pipelines
#     train, val, test = datasets.IWSLT2016(language_pair=("de", "en"))
    train = train_iter
    val = val_iter
    test = test_iter

    # generate source vocabulary
    vocab_src = build_vocab_from_iterator(
        yield_tokens(train + val + test, tokenize_de, index=0), # tokens for each German sentence (index 0)
        min_freq=min_freq, 
        specials=["<bos>", "<eos>", "<pad>", "<unk>"],
    )

    print("Building English Vocabulary...")

    # generate target vocabulary
    vocab_trg = build_vocab_from_iterator(
        yield_tokens(train + val + test, tokenize_en, index=1), # tokens for each English sentence (index 1)
        min_freq=2, # 
        specials=["<bos>", "<eos>", "<pad>", "<unk>"],
    )

    # set default token for out-of-vocabulary words (OOV)
    vocab_src.set_default_index(vocab_src["<unk>"])
    vocab_trg.set_default_index(vocab_trg["<unk>"])

    return vocab_src, vocab_trg

In [14]:
def load_vocab(spacy_de, spacy_en, train_iter, val_iter, test_iter, min_freq: int = 2):
    """
    Args:
        spacy_de:     German tokenizer
        spacy_en:     English tokenizer
        min_freq:     minimum frequency needed to include a word in the vocabulary

    Returns:
        vocab_src:    German vocabulary
        vocab_trg:     English vocabulary       
    """
    if not os.path.exists("vocab.pt"):
        # build the German/English vocabulary if it does not exist
        vocab_src, vocab_trg = build_vocabulary(spacy_de, 
                                                spacy_en, 
                                                train_iter, 
                                                val_iter, 
                                                test_iter, 
                                                min_freq)
        # save it to a file
        torch.save((vocab_src, vocab_trg), "vocab.pt")
    else:
        # load the vocab if it exists
        vocab_src, vocab_trg = torch.load("vocab.pt")

    print("Finished.\nVocabulary sizes:")
    print("\tSource:", len(vocab_src))
    print("\tTarget:", len(vocab_trg))
    return vocab_src, vocab_trg

In [15]:
# global variables used later in the script
spacy_de, spacy_en = load_tokenizers()



Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
Loaded English and German tokenizers.


In [16]:
vocab_src, vocab_trg = load_vocab(
                            spacy_de, 
                            spacy_en, 
                            train_iter, 
                            val_iter, 
                            test_iter)

Building German Vocabulary...
Building English Vocabulary...
Finished.
Vocabulary sizes:
	Source: 7983
	Target: 5979


In [17]:
BOS_IDX = vocab_trg['<bos>']
EOS_IDX = vocab_trg['<eos>']
PAD_IDX = vocab_trg['<pad>']

# Perform data preprocess

In [18]:
def data_process(raw_data):
    """
    Process raw sentences by tokenizing and converting to integers based on 
    the vocabulary.

    Args:
        raw_data:     German-English sentence pairs 
    Returns:
        data:         tokenized data converted to index based on vocabulary   
    """
    data = []
    # loop through each sentence pair
    for (raw_de, raw_en) in tqdm(raw_data):
        de_tensor_ = []
        # tokenize the sentence and convert each word to an integers
        for token in spacy_de.tokenizer(raw_de):
            de_tensor_.append(vocab_src[token.text.lower()])
            
        en_tensor_ = []
        # tokenize the sentence and convert each word to an integers
        for token in spacy_en.tokenizer(raw_en):
            en_tensor_.append(vocab_trg[token.text.lower()])
            
        de_tensor_ = torch.tensor(de_tensor_, dtype=torch.long)
        en_tensor_ = torch.tensor(en_tensor_, dtype=torch.long)
        # append tensor representations
        data.append((de_tensor_, en_tensor_))
    return data

In [19]:
# processed data
train_data = data_process(train_iter)
print(f"Train data shape: {len(train_data)}")
val_data = data_process(val_iter)
print(f"Val data shape: {len(val_data)}")
test_data = data_process(test_iter)
print(f"Test data shape: {len(test_data)}")

100%|██████████| 27550/27550 [00:03<00:00, 7680.73it/s]


Train data shape: 27550


100%|██████████| 1450/1450 [00:00<00:00, 7557.78it/s]


Val data shape: 1450


100%|██████████| 1000/1000 [00:00<00:00, 7376.39it/s]

Test data shape: 1000





# Create collate function

In [20]:
def generate_batch(data_batch):
    """
    Process indexed-sequences by adding <bos>, <eos>, and <pad> tokens.

    Args:
        data_batch:     German-English indexed-sentence pairs

    Returns:
        two batches:    one for German and one for English
    """
    de_batch, en_batch = [], []

    # for each sentence
    for (de_item, en_item) in data_batch:
        # add <bos> and <eos> indices before and after the sentence
        de_temp = torch.cat([torch.tensor([BOS_IDX]), 
                             de_item, 
                             torch.tensor([EOS_IDX])], dim=0).to(device)
        en_temp = torch.cat([torch.tensor([BOS_IDX]), 
                             en_item, 
                             torch.tensor([EOS_IDX])], dim=0).to(device)

        # add padding
        de_batch.append(pad(de_temp,(0, # dimension to pad
                                MAX_PADDING - len(de_temp), # amount of padding to add
                              ),value=PAD_IDX,))

        # add padding
        en_batch.append(pad(en_temp,(0, # dimension to pad
                                MAX_PADDING - len(en_temp), # amount of padding to add
                              ),
                              value=PAD_IDX,))

    return torch.stack(de_batch), torch.stack(en_batch)

# Creating DataLoaders for training and testing

In [21]:
MAX_PADDING = 20
BATCH_SIZE = 128

train_iter = DataLoader(
                to_map_style_dataset(train_data), 
                batch_size=BATCH_SIZE,
                shuffle=True, 
                drop_last=True, 
                collate_fn=generate_batch)

valid_iter = DataLoader(
                to_map_style_dataset(val_data),
                batch_size=BATCH_SIZE,
                shuffle=True, 
                drop_last=True, 
                collate_fn=generate_batch)

test_iter = DataLoader(
                to_map_style_dataset(test_data), 
                batch_size=BATCH_SIZE,
                shuffle=True, 
                drop_last=True, 
                collate_fn=generate_batch)


# Creating Attention Network Model

In [22]:
import math

## Create submodules for network

In [23]:
class Embedding(nn.Module):
    """
    Embedding lookup table which is used by the positional embedding block.
    Embedding lookup table is shared across input and output
    """
    def __init__(self, vocab_size, dmodel):
        """
        Embedding lookup needs a vocab size and model dimension size matrix for 
        creating lookups
        """
        super().__init__()
        self.embedding_lookup = nn.Embedding(vocab_size, dmodel)
        self.vocab_size = vocab_size
        self.dmodel = dmodel
        
    def forward(self, token_ids):
        """
        For a given token lookup the embedding vector
        
        As per the paper, we also multiply the embedding vector with sqrt of dmodel 
        """
        assert token_ids.ndim == 2, \
        f'Expected: (batch size, max token sequence length), got {token_ids.shape}'
        
        embedding_vector = self.embedding_lookup(token_ids)
        
        return embedding_vector * math.sqrt(self.dmodel)


In [24]:
class PositionalEncoding(nn.Module):
    def __init__(self, dmodel, max_seq_length = 5000, pdropout = 0.1,):
        """
        dmodel(int): model dimensions
        max_seq_length(int): Maximum input sequence length
        pdropout(float): Dropout probability
        """
        super().__init__()
        self.dropout = nn.Dropout(p = pdropout)
        
        # Calculate frequencies
        position_ids = torch.arange(0, max_seq_length).unsqueeze(1)
        # -ve sign is added because the exponents are inverted when you multiply position and frequencies
        frequencies = torch.pow(10000, -torch.arange(0, dmodel, 2, dtype = torch.float)/ dmodel) 
        
        # Create positional encoding table
        positional_encoding_table = torch.zeros(max_seq_length, dmodel)
        # Fill the table with even entries with sin and odd entries with cosine
        positional_encoding_table[:, 0::2] = torch.sin(position_ids * frequencies)
        positional_encoding_table[:, 1::2] = torch.cos(position_ids * frequencies)
    
        # Registering the position enconding in state_dict but the its not included 
        # in named parameter as it is not trainable
        self.register_buffer("positional_encoding_table", positional_encoding_table)
    
    def forward(self, embeddings_batch):
        """
        embeddings_batch shape = (batch size, seq_length, dmodel)
        positional_encoding_table shape = (max_seq_length, dmodel)
        """
        assert embeddings_batch.ndim == 3, \
        f"Embeddings batch should have dimension of 3 but got {embeddings_batch.ndim}"
        assert embeddings_batch.size()[-1] == self.positional_encoding_table.size()[-1], \
        f"Embedding batch shape and positional_encoding_table shape should match, expected Embedding batch shape : {embeddings_batch.shape[-1]} while positional_encoding_table shape : {positional_encoding_table[-1]}"
        
        # Get encodings for the given input sequence length
        pos_encodings = self.positional_encoding_table[:embeddings_batch.shape[1]] # Choose only seq_length out of max_seq_length
        
        # Final output 
        out = embeddings_batch + pos_encodings
        out = self.dropout(out)
        return out

In [25]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, dmodel, dff, pdropout = 0.1):
        super().__init__()
        
        self.dropout = nn.Dropout(p = pdropout)
        
        self.W1 = nn.Linear(dmodel, dff)      # Intermediate layer
        self.W2 = nn.Linear(dff, dmodel)    # Output layer
        
        self.relu = nn.ReLU()
        
    def forward(self, x):
        """
        Perform Feedforward calculation
        
        x shape = (B - batch size, S/T - max token sequence length, D- model dimension).
        """
        out = self.W2(self.relu(self.dropout(self.W1(x))))
        return out

In [26]:
class MultiHeadAttention(nn.Module):
    """
    We can refer to the following blog to understand in depth about the transformer and MHA
    https://medium.com/@hunter-j-phillips/multi-head-attention-7924371d477a
    
    Here we are clubbing all the linear layers together and duplicating the inputs and 
    then performing matrix multiplications
    """
    def __init__(self, dk, dv, h, pdropout=0.1):
        """
        Input Args:
        
        dk(int): Key dimensions used for generating Key weight matrix
        dv(int): Val dimensions used for generating val weight matrix
        h(int) : Number of heads in MHA
        """
        super().__init__()
        assert dk == dv
        self.dk = dk
        self.dv = dv
        self.h = h
        self.dmodel = self.dk * self.h  # model dimension
        
        # Add the params in modulelist as the params in the conv list needs to be tracked
        # wq, wk, wv -> multiple linear weights for the number of heads
        self.WQ = nn.Linear(self.dmodel, self.dmodel) # shape -> (dmodel, dmodel)
        self.WK = nn.Linear(self.dmodel, self.dmodel) # shape -> (dmodel, dmodel)
        self.WV = nn.Linear(self.dmodel, self.dmodel) # shape -> (dmodel, dmodel)
        # Output Weights
        self.WO = nn.Linear(self.h*self.dv, self.dmodel)  # shape -> (dmodel, dmodel)
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(p = pdropout)
        
    def forward(self, query, key, val, mask=None):
        """
        Forward pass for MHA
        
        X has a size of (batch_size, seq_length, d_model)
        Wq, Wk, and Wv have a size of (d_model, d_model)
        
        Perform Scaled Dot Product Attention on multi head attention. 
        
        Notation: B - batch size, S/T - max src/trg token-sequence length
        query shape = (B, S, dmodel)
        key shape = (B, S, dmodel)
        val shape = (B, S, dmodel)
        """
        # Weight the queries
        Q = self.WQ(query)     # shape -> (B, S, dmodel)
        K = self.WK(key)       # shape -> (B, S, dmodel)
        V = self.WV(val)       # shape -> (B, S, dmodel)
        
        # Separate last dimension to number of head and dk
        batch_size = Q.size(0)   
        Q = Q.view(batch_size, -1, self.h, self.dk)   # shape -> (B, S, h, dk)
        K = K.view(batch_size, -1, self.h, self.dk)   # shape -> (B, S, h, dk)
        V = V.view(batch_size, -1, self.h, self.dk)   # shape -> (B, S, h, dk)
        
        # each sequence is split across n_heads, with each head receiving seq_length tokens 
        # with d_key elements in each token instead of d_model.
        Q = Q.permute(0, 2, 1, 3) # shape -> (B, h, S, dk)
        K = K.permute(0, 2, 1, 3) # shape -> (B, h, S, dk)
        V = V.permute(0, 2, 1, 3) # shape -> (B, h, S, dk)
        
        # dot product of Q and K
        scaled_dot_product = torch.matmul(Q, K.permute(0, 1, 3, 2)) / math.sqrt(self.dk)
        
        # fill those positions of product as (-1e10) where mask positions are 0
        if mask is not None:
            scaled_dot_product = scaled_dot_product.masked_fill(mask == 0, -1e10)
            
        attn_probs = self.softmax(scaled_dot_product)
        
        # Create head 
        head = torch.matmul(self.dropout(attn_probs), V)  # shape -> (B, h, S, S) * (B, h, S, dk) = (B, h, S, dk)
        # Prepare the head to pass it through output linear layer
        head = head.permute(0, 2, 1, 3).contiguous()  # shape -> (B, S, h, dk)
        # Concatenate the head together
        head = head.view(batch_size, -1, self.h* self.dk)  # shape -> (B, S, (h*dk = dmodel))
        # Pass through output layer
        token_representation = self.WO(head)
        return token_representation, attn_probs
    

## Create Encoder for the Network

In [27]:
class EncoderLayer(nn.Module):
    """
    This building block in the encoder layer consists of the following
    1. MultiHead Attention
    2. Sublayer Logic
    3. Positional FeedForward Network
    """
    def __init__(self, dk, dv, h, dim_multiplier = 4, pdropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(dk, dv, h, pdropout)
        # Reference page 5 chapter 3.2.2 Multi-head attention
        dmodel = dk*h
        # Reference page 5 chapter 3.3 positionwise FeedForward
        dff = dmodel * dim_multiplier
        self.attn_norm = nn.LayerNorm(dmodel)
        self.ff = PositionwiseFeedForward(dmodel, dff, pdropout=pdropout)
        self.ff_norm = nn.LayerNorm(dmodel)
        
        self.dropout = nn.Dropout(p = pdropout)
        
    def forward(self, src_inputs, src_mask=None):
        """
        Forward pass as per page 3 chapter 3.1
        """
        mha_out, attention_wts = self.attention(
                                query = src_inputs, 
                                key = src_inputs, 
                                val = src_inputs, 
                                mask = src_mask)
        
        # Residual connection between input and sublayer output, details: Page 7, Chapter 5.4 "Regularization",
        # Actual paper design is the following
        intermediate_out = self.attn_norm(src_inputs + self.dropout(mha_out))
        
        pff_out = self.ff(intermediate_out)
        
        # Perform Add Norm again
        out = self.ff_norm(intermediate_out + self.dropout(pff_out))
        return out, attention_wts

In [28]:
class Encoder(nn.Module):
    def __init__(self, dk, dv, h, num_encoders, dim_multiplier = 4, pdropout=0.1):
        super().__init__()
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(dk, 
                         dv, 
                         h, 
                         dim_multiplier, 
                         pdropout) for _ in range(num_encoders)
        ])
        
    def forward(self, src_inputs, src_mask = None):
        """
        Input from the Embedding layer
        src_inputs = (B - batch size, S/T - max token sequence length, D- model dimension)
        """
        src_representation = src_inputs
        
        # Forward pass through encoder stack
        for enc in self.encoder_layers:
            src_representation, attn_probs = enc(src_representation, src_mask)
            
        self.attn_probs = attn_probs
        return src_representation

## Create Decoder Layer

In [29]:
class DecoderLayer(nn.Module):
    def __init__(
                self, 
                dk, 
                dv, 
                h,
                dim_multiplier = 4, 
                pdropout = 0.1):
        super().__init__()
        
        # Reference page 5 chapter 3.2.2 Multi-head attention
        dmodel = dk*h
        # Reference page 5 chapter 3.3 positionwise FeedForward
        dff = dmodel * dim_multiplier
        
        # Masked Multi Head Attention
        self.masked_attention = MultiHeadAttention(dk, dv, h, pdropout)
        self.masked_attn_norm = nn.LayerNorm(dmodel)
        
        # Multi head attention
        self.attention = MultiHeadAttention(dk, dv, h, pdropout)
        self.attn_norm = nn.LayerNorm(dmodel)
        
        # Add position FeedForward Network
        self.ff = PositionwiseFeedForward(dmodel, dff, pdropout=pdropout)
        self.ff_norm = nn.LayerNorm(dmodel)
        
        self.dropout = nn.Dropout(p = pdropout)
        
    def forward(self, trg: Tensor, src: Tensor, trg_mask: Tensor, src_mask: Tensor):
        """
        Args:
            trg:          embedded sequences                (batch_size, trg_seq_length, d_model)
            src:          embedded sequences                (batch_size, src_seq_length, d_model)
            trg_mask:     mask for the sequences            (batch_size, 1, trg_seq_length, trg_seq_length)
            src_mask:     mask for the sequences            (batch_size, 1, 1, src_seq_length)

        Returns:
            trg:          sequences after self-attention    (batch_size, trg_seq_length, d_model)
            attn_probs:   self-attention softmax scores     (batch_size, n_heads, trg_seq_length, src_seq_length)
        """
        _trg, attn_probs = self.masked_attention(
                                query = trg, 
                                key = trg, 
                                val = trg, 
                                mask = trg_mask)
        
        # Residual connection between input and sublayer output, details: Page 7, Chapter 5.4 "Regularization",
        # Actual paper design is the following
        trg = self.masked_attn_norm(trg + self.dropout(_trg))
        
        # Inputs to the decoder attention is given as follows
        # query = previous decoder layer
        # key and val = output of encoder
        # mask = src_mask
        # Reference : page 5 chapter 3.2.3 point 1
        _trg, attn_probs = self.attention(
                                query = trg, 
                                key = src, 
                                val = src, 
                                mask = src_mask)
        trg = self.attn_norm(trg + self.dropout(_trg))
        
        # position-wise feed-forward network
        _trg = self.ff(trg)
        # Perform Add Norm again
        trg = self.ff_norm(trg + self.dropout(_trg))
        return trg, attn_probs

In [30]:
class Decoder(nn.Module):
    def __init__(
                self, 
                dk, 
                dv, 
                h, 
                num_decoders, 
                dim_multiplier = 4, 
                pdropout=0.1):
        super().__init__()
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(dk, 
                         dv, 
                         h, 
                         dim_multiplier, 
                         pdropout) for _ in range(num_decoders)
        ])
        
    def forward(self, target_inputs, src_inputs, target_mask, src_mask):
        """
        Input from the Embedding layer
        target_inputs = embedded sequences    (batch_size, trg_seq_length, d_model)
        src_inputs = embedded sequences       (batch_size, src_seq_length, d_model)
        target_mask = mask for the sequences  (batch_size, 1, trg_seq_length, trg_seq_length)
        src_mask = mask for the sequences     (batch_size, 1, 1, src_seq_length)
        """
        target_representation = target_inputs
        
        # Forward pass through decoder stack
        for layer in self.decoder_layers:
            target_representation, attn_probs = layer(
                                    target_representation,
                                    src_inputs, 
                                    target_mask,
                                    src_mask)
        self.attn_probs = attn_probs
        return target_representation

## Adding all up to construct the complete model for language translation

In [31]:
class Transformer(nn.Module):
    def __init__(self,
                dk, 
                dv, 
                h,
                src_vocab_size,
                target_vocab_size,
                num_encoders,
                num_decoders,
                src_pad_idx,
                target_pad_idx,
                dim_multiplier = 4, 
                pdropout=0.1,
                device = "cpu"
                ):
        super().__init__()
        
        # Reference page 5 chapter 3.2.2 Multi-head attention
        dmodel = dk*h
        # Modules required to build Encoder
        self.src_embeddings = Embedding(src_vocab_size, dmodel)
        self.src_positional_encoding = PositionalEncoding(
                                        dmodel,
                                        max_seq_length = src_vocab_size,
                                        pdropout = pdropout
                                        )
        self.encoder = Encoder(
                                dk, 
                                dv, 
                                h, 
                                num_encoders, 
                                dim_multiplier=dim_multiplier, 
                                pdropout=pdropout)
        
        # Modules required to build Decoder
        self.target_embeddings = Embedding(target_vocab_size, dmodel)
        self.target_positional_encoding = PositionalEncoding(
                                        dmodel,
                                        max_seq_length = target_vocab_size,
                                        pdropout = pdropout
                                        )
        self.decoder = Decoder(
                                dk, 
                                dv, 
                                h, 
                                num_decoders,  
                                dim_multiplier=4, 
                                pdropout=0.1)
        
        # Final output 
        self.linear = nn.Linear(dmodel, target_vocab_size)
#         self.softmax = nn.Softmax(dim=-1)
        self.device = device
        self.src_pad_idx = src_pad_idx
        self.target_pad_idx = target_pad_idx
        self.init_params()  
        
    # This part wasn't mentioned in the paper, but it's super important!
    def init_params(self):
        """
        xavier has tremendous impact! I didn't expect
        that the model's perf, with normalization layers, 
        is so dependent on the choice of weight initialization.
        """
        for name, p in self.named_parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
                
    def make_src_mask(self, src):
        """
        Args:
            src: raw sequences with padding        (batch_size, seq_length) 
            src_pad_idx(int): index where the token need not be attended

        Returns:
            src_mask: mask for each sequence            (batch_size, 1, 1, seq_length)
        """
        batch_size = src.shape[0]
        # assign 1 to tokens that need attended to and 0 to padding tokens, 
        # then add 2 dimensions
        src_mask = (src != self.src_pad_idx).view(batch_size, 1, 1, -1)
        return src_mask
    
    def make_target_mask(self, target):
        """
        Args:
            target:  raw sequences with padding        (batch_size, seq_length)     
            target_pad_idx(int): index where the token need not be attended

        Returns:
            target_mask: mask for each sequence   (batch_size, 1, seq_length, seq_length)
        """

        seq_length = target.shape[1]
        batch_size = target.shape[0]
        
        # assign True to tokens that need attended to and 
        # False to padding tokens, then add 2 dimensions
        target_mask = (target != self.target_pad_idx).view(batch_size, 1, 1, -1) # (batch_size, 1, 1, seq_length)

        # generate subsequent mask
        trg_sub_mask = torch.tril(torch.ones((seq_length, seq_length), device=self.device)).bool() # (batch_size, 1, seq_length, seq_length)

        # bitwise "and" operator | 0 & 0 = 0, 1 & 1 = 1, 1 & 0 = 0
        target_mask = target_mask & trg_sub_mask

        return target_mask
    
    def forward(
        self, 
        src_token_ids_batch, 
        target_token_ids_batch):
        
        # create source and target masks     
        src_mask = self.make_src_mask(
                        src_token_ids_batch) # (batch_size, 1, 1, src_seq_length)
        target_mask = self.make_target_mask(
                        target_token_ids_batch) # (batch_size, 1, trg_seq_length, trg_seq_length)

        # Create embeddings
        src_representations = self.src_embeddings(src_token_ids_batch)
        src_representations = self.src_positional_encoding(src_representations)
        
        target_representations = self.target_embeddings(target_token_ids_batch)
        target_representations = self.target_positional_encoding(target_representations)

        # Encode 
        encoded_src = self.encoder(src_representations, src_mask)
        
        # Decode
        decoded_output = self.decoder(
                                target_representations, 
                                encoded_src, 
                                target_mask, 
                                src_mask)
        
        # Post processing
        out = self.linear(decoded_output)
        # Don't use softmax as we are not comparing against softmaxed output while 
        # computing loss. We are comparing against linear outputs
#         # Output 
#         out = self.softmax(out)
        return out

## Create Model Instance

In [32]:
src_pad_idx = vocab_src.get_stoi()["<pad>"]
target_pad_idx = vocab_trg.get_stoi()["<pad>"]
src_pad_idx, target_pad_idx

(2, 2)

In [33]:
len(vocab_src), len(vocab_trg)

(7983, 5979)

In [34]:
config = {
    'dk' : 32,
    'dv' : 32,
    'h' : 8,
    'src_vocab_size' : len(vocab_src),
    'target_vocab_size' : len(vocab_trg),
    'src_pad_idx' : vocab_src.get_stoi()["<pad>"],
    'target_pad_idx' : vocab_trg.get_stoi()["<pad>"],
    'num_encoders' : 3,
    'num_decoders' : 3,
    'dim_multiplier' : 4,
    'pdropout': 0.1,
    "lr": 0.0003,
    "N_EPOCHS": 50,
    "CLIP": 1,
    "patience": 5,
}
config

{'dk': 32,
 'dv': 32,
 'h': 8,
 'src_vocab_size': 7983,
 'target_vocab_size': 5979,
 'src_pad_idx': 2,
 'target_pad_idx': 2,
 'num_encoders': 3,
 'num_decoders': 3,
 'dim_multiplier': 4,
 'pdropout': 0.1,
 'lr': 0.0003,
 'N_EPOCHS': 50,
 'CLIP': 1,
 'patience': 5}

In [35]:
dk = config["dk"]
dv = config["dv"]
h = config["h"]
src_vocab_size = config["src_vocab_size"]
target_vocab_size = config["target_vocab_size"]
src_pad_idx = config["src_pad_idx"]
target_pad_idx = config["target_pad_idx"]
num_encoders = config["num_encoders"]
num_decoders = config["num_decoders"]
dim_multiplier = config["dim_multiplier"]
pdropout = config["pdropout"]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Transformer(
                dk, 
                dv, 
                h,
                src_vocab_size,
                target_vocab_size,
                num_encoders,
                num_decoders,
                dim_multiplier, 
                pdropout,
                device = device)
model.cuda()
print(model)



Transformer(
  (src_embeddings): Embedding(
    (embedding_lookup): Embedding(7983, 256)
  )
  (src_positional_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (encoder_layers): ModuleList(
      (0-2): 3 x EncoderLayer(
        (attention): MultiHeadAttention(
          (WQ): Linear(in_features=256, out_features=256, bias=True)
          (WK): Linear(in_features=256, out_features=256, bias=True)
          (WV): Linear(in_features=256, out_features=256, bias=True)
          (WO): Linear(in_features=256, out_features=256, bias=True)
          (softmax): Softmax(dim=-1)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (attn_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff): PositionwiseFeedForward(
          (dropout): Dropout(p=0.1, inplace=False)
          (W1): Linear(in_features=256, out_features=1024, bias=True)
          (W2): Linear(in_features=1024, out_features=256, bias=True)
  

In [36]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 10,640,475 trainable parameters


# Training part

In [37]:
LEARNING_RATE = config["lr"]

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [38]:
def train(model, iterator, optimizer, criterion, clip):
    """
    Train the model on the given data.

    Args:
        model:        Transformer model to be trained
        iterator:     data to be trained on
        optimizer:    optimizer for updating parameters
        criterion:    loss function for updating parameters
        clip:         value to help prevent exploding gradients

    Returns:
        loss for the epoch
    """

    # set the model to training mode
    model.train()

    epoch_loss = 0
    
    show_summary = True
    
    # loop through each batch in the iterator
    for i, batch in enumerate(iterator):

        # set the source and target batches    
        src,trg = batch
        
#         print("src", src, src.size())
#         print("trg", trg, trg.size())
        
#         print("trg[:,:-1]", trg[:,:-1], trg[:,:-1].size())
        
        
        # expected output
        expected_output = trg[:,1:]
#         print("expected_output", expected_output, expected_output.size())

        # zero the gradients
        optimizer.zero_grad()
        
        # Show model summary
        if show_summary:
            show_summary = False
            summary(model, input_data=[src, trg[:,:-1]])

        # logits for each output
        logits = model(src, trg[:,:-1])
    
        # calculate the loss
        loss = criterion(logits.contiguous().view(-1, logits.shape[-1]), 
                        expected_output.contiguous().view(-1))

        # backpropagation
        loss.backward()

        # clip the weights
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # update the weights
        optimizer.step()

        # update the loss
        epoch_loss += loss.item()

    # return the average loss for the epoch
    return epoch_loss / len(iterator)

In [39]:
def evaluate(model, iterator, criterion):
    """
    Evaluate the model on the given data.

    Args:
        model:        Transformer model to be trained
        iterator:     data to be evaluated
        criterion:    loss function for assessing outputs

    Returns:
        loss for the data
    """

    # set the model to evaluation mode
    model.eval()

    epoch_loss = 0
    
    # evaluate without updating gradients
    with torch.no_grad():

        # loop through each batch in the iterator
        for i, batch in enumerate(iterator):

            # set the source and target batches  
            src, trg = batch
            
            # expected output
            expected_output = trg[:,1:]


            # logits for each output
            logits = model(src, trg[:,:-1])


            # calculate the loss
            loss = criterion(logits.contiguous().view(-1, logits.shape[-1]), 
                          expected_output.contiguous().view(-1))

            # update the loss
            epoch_loss += loss.item()
        
    # return the average loss for the epoch
    return epoch_loss / len(iterator)

In [40]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Perform Training

In [41]:
N_EPOCHS = config["N_EPOCHS"]
CLIP = config["CLIP"]
patience = config["patience"]

best_valid_loss = float('inf')

patience_count = 0

# loop through each epoch
for epoch in tqdm(range(N_EPOCHS)):
    
    start_time = time.time()

    # calculate the train loss and update the parameters
    train_loss = train(model, train_iter, optimizer, criterion, CLIP)

    # calculate the loss on the validation set
    valid_loss = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()

    # calculate how long the epoch took
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

    # save the model when it performs better than the previous run
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer-model.pt')
        patience_count = 0
    else:
        patience_count += 1
        print(f"Loss did not improve")
    
    if patience is not None and patience_count == patience:
        print(f"Early Stopping since validation count did not decrease after {patience_count} patience!")
        break

  2%|▏         | 1/50 [00:12<10:32, 12.90s/it]

Epoch: 01 | Time: 0m 12s
	Train Loss: 4.851 | Train PPL: 127.835
	 Val. Loss: 3.767 |  Val. PPL:  43.255


  4%|▍         | 2/50 [00:25<10:03, 12.57s/it]

Epoch: 02 | Time: 0m 12s
	Train Loss: 3.469 | Train PPL:  32.094
	 Val. Loss: 3.046 |  Val. PPL:  21.038


  6%|▌         | 3/50 [00:37<09:46, 12.49s/it]

Epoch: 03 | Time: 0m 12s
	Train Loss: 2.824 | Train PPL:  16.837
	 Val. Loss: 2.546 |  Val. PPL:  12.756


  8%|▊         | 4/50 [00:50<09:32, 12.45s/it]

Epoch: 04 | Time: 0m 12s
	Train Loss: 2.381 | Train PPL:  10.820
	 Val. Loss: 2.249 |  Val. PPL:   9.479


 10%|█         | 5/50 [01:02<09:18, 12.41s/it]

Epoch: 05 | Time: 0m 12s
	Train Loss: 2.078 | Train PPL:   7.992
	 Val. Loss: 2.085 |  Val. PPL:   8.045


 12%|█▏        | 6/50 [01:14<09:05, 12.39s/it]

Epoch: 06 | Time: 0m 12s
	Train Loss: 1.856 | Train PPL:   6.398
	 Val. Loss: 1.967 |  Val. PPL:   7.148


 14%|█▍        | 7/50 [01:27<08:53, 12.40s/it]

Epoch: 07 | Time: 0m 12s
	Train Loss: 1.680 | Train PPL:   5.365
	 Val. Loss: 1.886 |  Val. PPL:   6.591


 16%|█▌        | 8/50 [01:39<08:40, 12.40s/it]

Epoch: 08 | Time: 0m 12s
	Train Loss: 1.534 | Train PPL:   4.639
	 Val. Loss: 1.825 |  Val. PPL:   6.201


 18%|█▊        | 9/50 [01:52<08:29, 12.42s/it]

Epoch: 09 | Time: 0m 12s
	Train Loss: 1.412 | Train PPL:   4.105
	 Val. Loss: 1.768 |  Val. PPL:   5.860


 20%|██        | 10/50 [02:04<08:16, 12.40s/it]

Epoch: 10 | Time: 0m 12s
	Train Loss: 1.310 | Train PPL:   3.706
	 Val. Loss: 1.751 |  Val. PPL:   5.760


 22%|██▏       | 11/50 [02:16<08:03, 12.40s/it]

Epoch: 11 | Time: 0m 12s
	Train Loss: 1.218 | Train PPL:   3.381
	 Val. Loss: 1.731 |  Val. PPL:   5.648


 24%|██▍       | 12/50 [02:29<07:49, 12.35s/it]

Epoch: 12 | Time: 0m 12s
	Train Loss: 1.136 | Train PPL:   3.114
	 Val. Loss: 1.741 |  Val. PPL:   5.703
Loss did not improve


 26%|██▌       | 13/50 [02:41<07:36, 12.33s/it]

Epoch: 13 | Time: 0m 12s
	Train Loss: 1.062 | Train PPL:   2.891
	 Val. Loss: 1.743 |  Val. PPL:   5.717
Loss did not improve


 28%|██▊       | 14/50 [02:53<07:24, 12.34s/it]

Epoch: 14 | Time: 0m 12s
	Train Loss: 0.995 | Train PPL:   2.704
	 Val. Loss: 1.739 |  Val. PPL:   5.692
Loss did not improve


 30%|███       | 15/50 [03:06<07:12, 12.35s/it]

Epoch: 15 | Time: 0m 12s
	Train Loss: 0.936 | Train PPL:   2.550
	 Val. Loss: 1.751 |  Val. PPL:   5.762
Loss did not improve


 30%|███       | 15/50 [03:18<07:42, 13.23s/it]

Epoch: 16 | Time: 0m 12s
	Train Loss: 0.875 | Train PPL:   2.398
	 Val. Loss: 1.764 |  Val. PPL:   5.834
Loss did not improve
Early Stopping since validation count did not decrease after 5 patience!





In [42]:
model.load_state_dict(torch.load('transformer-model.pt'))

# calculate the loss on the test set
test_loss = evaluate(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f}')

Test Loss: 1.691 | Test PPL:   5.422


In [43]:
def translate_sentence(sentence, model, device, max_length = 50):
    """
    Translate a German sentence to its English equivalent.

    Args:
        sentence:     German sentence to be translated to English; list or str
        model:        Transformer model used for translation
        device:       device to perform translation on
        max_length:   maximum token length for translation

    Returns:
        src:                  return the tokenized input
        trg_input:            return the input to the decoder before the final output 
        trg_output:           return the final translation, shifted right
        attn_probs:           return the attention scores for the decoder heads
        masked_attn_probs:    return the masked attention scores for the decoder heads
    """
    
    model.eval()

    # tokenize and index the provided string
    if isinstance(sentence, str):
        src = ['<bos>'] + [token.text.lower() for token in spacy_de(sentence)] +  ['<eos>']
    else:
        src = ['<bos>'] + sentence + ['<eos>']

    # convert to integers
    src_indexes = [vocab_src[token] for token in src]

    # convert list to tensor
    src_tensor = torch.tensor(src_indexes).int().unsqueeze(0).to(device)

    # set <bos> token for target generation
    trg_indexes = [vocab_trg.get_stoi()['<bos>']]

    # generate new tokens
    for i in range(max_length):

        # convert the list to a tensor
        trg_tensor = torch.tensor(trg_indexes).int().unsqueeze(0).to(device)

        # generate the next token 
        with torch.no_grad():

            # generate the logits
            logits = model.forward(src_tensor, trg_tensor)

            # select the newly predicted token
            pred_token = logits.argmax(2)[:,-1].item()

            # if <eos> token or max length, stop generating
            if pred_token == vocab_trg.get_stoi()['<eos>'] or i == (max_length-1):

                # decoder input
                trg_input = vocab_trg.lookup_tokens(trg_indexes)

                # decoder output
                trg_output = vocab_trg.lookup_tokens(logits.argmax(2).squeeze(0).tolist())

                return src, trg_input, trg_output
      
            # else, continue generating
            else:
                # add the token
                trg_indexes.append(pred_token)

In [44]:
# 'a woman with a large purse is walking by a gate'
src = ['eine', 'frau', 'mit', 'einer', 'großen', 'geldbörse', 'geht', 'an', 'einem', 'tor', 'vorbei', '.']
trg = ["a", "woman", "with", "a", "large", "purse", "is", "walking", "by", "a", "gate"]

src, trg_input, trg_output = translate_sentence(src, model, device)

print(f'source = {src}')
print(f'target input = {trg_input}')
print(f'target output = {trg_output}')

source = ['<bos>', 'eine', 'frau', 'mit', 'einer', 'großen', 'geldbörse', 'geht', 'an', 'einem', 'tor', 'vorbei', '.', '<eos>']
target input = ['<bos>', 'a', 'woman', 'with', 'a', 'large', 'purse', 'walks', 'by', 'a', 'gate', 'with', 'a', 'gate', '.']
target output = ['a', 'woman', 'with', 'a', 'large', 'purse', 'walks', 'by', 'a', 'gate', 'with', 'a', 'gate', '.', '<eos>']


# Calculate Bleu Score

In [45]:
def compute_metrics(model, iterator):
    """
    Generate predictions for the provided iterator.

    Args:
        model:        Transformer model to be trained
        iterator:     data to be evaluated

    Returns:
        predictions:  list of predictions, which are tokenized strings
        labels:       list of expected output, which are tokenized strings
    """

    # set the model to evaluation mode
    model.eval()

    predictions = []
    labels = []  

    # evaluate without updating gradients
    with torch.no_grad():
    
        # loop through each batch in the iterator
        for i, batch in tqdm(enumerate(iterator)):

            # set the source and target batches  
            src, trg = batch

            # predict the output
            src_out, trg_input, trg_output = translate_sentence(
                            vocab_src.lookup_tokens(src.tolist()), 
                            model, 
                            device)

            # prediction | remove <eos> token
            predictions.append(trg_output[:-1])

            # expected output | add extra dim for calculation
            labels.append([vocab_trg.lookup_tokens(trg.tolist())]) 

    # return the average loss for the epoch
    return predictions, labels

In [46]:
predictions, labels = compute_metrics(model, test_data)

1000it [02:14,  7.42it/s]


In [47]:
from torchtext.data.metrics import bleu_score
score = bleu_score(predictions, labels)
score = score*100
print(f"Bleu score on test data is {score}")

Bleu score on test data is 30.030295252799988


In [48]:
str_score = f"{score:.1f}"
str_score

'30.0'

## Save model and input param

In [49]:
torch.save(model.state_dict(), f'transformer-model-{str_score}.pt')

In [50]:
import json 
# Convert and write JSON object to file
with open("input_params.json", "w") as outfile: 
    json.dump(config, outfile)
config

{'dk': 32,
 'dv': 32,
 'h': 8,
 'src_vocab_size': 7983,
 'target_vocab_size': 5979,
 'src_pad_idx': 2,
 'target_pad_idx': 2,
 'num_encoders': 3,
 'num_decoders': 3,
 'dim_multiplier': 4,
 'pdropout': 0.1,
 'lr': 0.0003,
 'N_EPOCHS': 50,
 'CLIP': 1,
 'patience': 5}

# Store model on HuggingFace Hub

In [51]:
# ! pip install --upgrade huggingface_hub -q

In [52]:
# from huggingface_hub import notebook_login
# notebook_login()

In [53]:
# from huggingface_hub import HfApi
# api = HfApi()

In [54]:
# api.upload_folder(
#     folder_path="/kaggle/working/.",
#     repo_id="Rzoro/Transformer_de_en_multi30K",
#     repo_type="space",
# )