In [118]:
import math
import gc
import time
import copy
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset

from torchtext.vocab import build_vocab_from_iterator

from tqdm import tqdm

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Seeding for consistency in reproducibility
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [119]:
# Courtesy & more details about dataset https://www.kaggle.com/basilb2s/language-detection
!wget -q https://raw.githubusercontent.com/maqboolkhan/Project-NLP/master/Classification/Language%20Detection.csv

In [120]:
ds = pd.read_csv('Language Detection.csv')

# Printing all available languages in the dataset
ds.Language.unique().tolist()

['English',
 'Malayalam',
 'Hindi',
 'Tamil',
 'Portugeese',
 'French',
 'Dutch',
 'Spanish',
 'Greek',
 'Russian',
 'Danish',
 'Italian',
 'Turkish',
 'Sweedish',
 'Arabic',
 'German',
 'Kannada']

In [121]:
ds = ds.loc[  (ds.Language == 'German') | (ds.Language == 'English') | (ds.Language == 'Arabic')]
train_set, test_set = train_test_split(ds, test_size=0.3, random_state=2022)

In [122]:
trg_langs = ds.Language.unique().tolist()
trg_langs

['English', 'Arabic', 'German']

In [123]:
class LangDataset(Dataset):
    def __init__(self, ds, trg_langs, train_vocab=None):
        self.corpus = ds

        if not train_vocab:
            self.src_vocab, self.trg_vocab = self._build_vocab()
        else:
            self.src_vocab, self.trg_vocab = train_vocab

    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, item):
        text = self.corpus.iloc[item].Text
        lang = self.corpus.iloc[item].Language

        return {
            'src': self.src_vocab.lookup_indices(text.lower().split()),
            'trg': self.trg_vocab.lookup_indices([lang])
        }

    def _build_vocab(self):
        # Here one could remove stopwords and use word lemmatisation.
        # Both techniques will reduce the vocab size and hence model size
        # and could also enhance the model's performance 
        src_tokens = self.corpus.Text.str.cat().lower().split()

        src_vocab = build_vocab_from_iterator([src_tokens], specials=["<unk>", "<pad>"])
        src_vocab.set_default_index(src_vocab['<unk>'])

        trg_vocab = build_vocab_from_iterator([trg_langs])

        return src_vocab, trg_vocab

In [124]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, maxlen = 5000):
        super(PositionalEncoding, self).__init__()

        # A tensor consists of all the possible positions (index) e.g 0, 1, 2, ... max length of input
        # Shape (pos) --> [max len, 1]
        pos = torch.arange(0, maxlen).unsqueeze(1)
        pos_encoding = torch.zeros((maxlen, d_model))

        sin_den = 10000 ** (torch.arange(0, d_model, 2)/d_model) # sin for even item of position's dimension
        cos_den = 10000 ** (torch.arange(1, d_model, 2)/d_model) # cos for odd 

        pos_encoding[:, 0::2] = torch.sin(pos / sin_den) 
        pos_encoding[:, 1::2] = torch.cos(pos / cos_den)

        # Shape (pos_embedding) --> [max len, d_model]
        # Adding one more dimension in-between
        pos_encoding = pos_encoding.unsqueeze(-2)
        # Shape (pos_embedding) --> [max len, 1, d_model]

        self.dropout = nn.Dropout(dropout)

        # We want pos_encoding be saved and restored in the `state_dict`, but not trained by the optimizer
        # hence registering it!
        # Source & credits: https://discuss.pytorch.org/t/what-is-the-difference-between-register-buffer-and-register-parameter-of-nn-module/32723/2
        self.register_buffer('pos_encoding', pos_encoding)

    def forward(self, token_embedding):
        # shape (token_embedding) --> [sentence len, batch size, d_model]

        # Concatenating embeddings with positional encodings
        # Note: As we made positional encoding with the size max length of sentence in our dataset 
        #       hence here we are picking till the sentence length in a batch
        #       Another thing to notice is in the Transformer's paper they used FIXED positional encoding, 
        #       there are methods where we can also learn them
        return self.dropout(token_embedding + self.pos_encoding[:token_embedding.size(0), :])


class InputEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super(InputEmbedding, self).__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model

    def forward(self, tokens):
        # shape (tokens) --> [sentence len, batch size]
        # shape (inp_emb) --> [sentence len, batch size, d_model]
        # Multiplying with square root of d_model as they mentioned in the Transformer's paper
        inp_emb = self.embedding(tokens.long()) * math.sqrt(self.d_model)
        return inp_emb

In [125]:
class TransformerClassifier(nn.Module):
    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 d_model,
                 dropout,
                 n_head,
                 dim_feedforward,
                 n_layers
                ):
        super().__init__()

        self.src_inp_emb = InputEmbedding(src_vocab_size, d_model)
        self.trg_inp_emb = InputEmbedding(trg_vocab_size, d_model)

        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)

        # Only using Encoder of Transformer model
        encoder_layers = nn.TransformerEncoderLayer(d_model, n_head, dim_feedforward, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)

        self.d_model = d_model
        self.decoder = nn.Linear(d_model, trg_vocab_size)

    def forward(self, x):
        x_emb = self.positional_encoding(self.src_inp_emb(x))
        # Shape (output) -> (Sequence length, batch size, d_model)
        output = self.transformer_encoder(x_emb)
        # We want our output to be in the shape of (batch size, d_model) so that
        # we can use it with CrossEntropyLoss hence averaging using first (Sequence length) dimension 
        # Shape (mean) -> (batch size, d_model)
        # Shape (decoder) -> (batch size, d_model)
        return self.decoder(output.mean(0))

In [126]:
hyp_params = {
    "batch_size": 64,
    "lr": 0.0005,
    "num_epochs": 10,
    "d_model": 512, # Input embedding dimension
    "n_head": 8, # No. of multi-head attention block (aka paralle self-attention layers)
    "n_layers": 3,
    "feedforward_dim": 128,
    "dropout": 0.1
}

In [127]:
def collate_fn(batch, pad_value, device):
    trgs = []
    srcs = []
    for row in batch:
        srcs.append(torch.tensor(row["src"], dtype=torch.long).to(device))
        trgs.append(torch.tensor(row["trg"]).to(device))

    padded_srcs = pad_sequence(srcs, padding_value=pad_value)
    return {"src": padded_srcs, "trg": torch.tensor([trgs]).to(device)}

train_langds = LangDataset(train_set, trg_langs)
test_langds = LangDataset(test_set, trg_langs, (train_langds.src_vocab, train_langds.trg_vocab))

SRC_PAD_IDX = train_langds.src_vocab["<pad>"]

train_dt = DataLoader(train_langds, batch_size=hyp_params["batch_size"], shuffle=
                   True, collate_fn=lambda batch_size: collate_fn(batch_size, SRC_PAD_IDX, device))

test_dt = DataLoader(test_langds, batch_size=hyp_params["batch_size"], shuffle=
                   True, collate_fn=lambda batch_size: collate_fn(batch_size, SRC_PAD_IDX, device))

hyp_params["src_vocab_size"] = len(train_langds.src_vocab)
hyp_params["trg_vocab_size"] = len(trg_langs)

In [128]:
def train_model(model, train_dataloader, criterion, optimizer):
    model.train()
    epoch_loss = 0
    for batch_idx, batch in enumerate(tqdm(train_dataloader)):
        # Clear the accumulating gradients
        optimizer.zero_grad()

        src = batch["src"]  # shape --> [seq len, batch size]
        trg = batch["trg"]  # shape --> [1, batch size]

        # shape (out) --> [batch size, trg size]
        out = model(src)
        loss = criterion(out, trg.squeeze(0))

        loss.backward()

        optimizer.step()
        epoch_loss += loss.detach().cpu()

    return epoch_loss/len(train_dataloader)


def evaluate_model(model, valid_dataloader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(valid_dataloader):
            src = batch["src"]  # shape --> [seq len, batch size]
            trg = batch["trg"]  # shape --> [1, batch size]

            # shape (out) --> [batch size, trg size]
            out = model(src)
            loss = criterion(out, trg.squeeze(0))

            epoch_loss += loss.detach().cpu()

    return epoch_loss/len(valid_dataloader)

In [129]:
model = TransformerClassifier(hyp_params["src_vocab_size"],
                                hyp_params["trg_vocab_size"],
                                hyp_params["d_model"],
                                hyp_params["dropout"],
                                hyp_params["n_head"],
                                hyp_params["feedforward_dim"],
                                hyp_params["n_layers"]
                                ).to(device)

criterion = nn.CrossEntropyLoss().to(device)

optimizer = optim.Adam(model.parameters(), lr=hyp_params["lr"])

In [130]:
min_el = math.inf
patience = 1
best_model = {}
best_epoch = 0

epoch_loss = 0
for epoch in range(hyp_params["num_epochs"]):
  start = time.time()
  gc.collect()
  torch.cuda.empty_cache()

  epoch_loss = train_model(model, train_dt, criterion, optimizer)
  eval_loss = evaluate_model(model, test_dt, criterion)
  
  
  print(f"Epoch: {epoch+1}, Train loss: {epoch_loss:.5f}, Eval loss: {eval_loss:.5f}. Time {time.time() - start:.2f} secs")

  if eval_loss < min_el:
      best_epoch = epoch+1
      min_el = eval_loss
      best_model = copy.deepcopy(model)
      # torch.save({
      #     'model_state_dict': model.state_dict(),
      #     'optimizer_state_dict': optimizer.state_dict(),
      #     'eval_loss': min_el
      # }, 'model-transformer.pt')

100%|██████████| 27/27 [00:03<00:00,  8.12it/s]


Epoch: 1, Train loss: 0.52770, Eval loss: 0.12146. Time 4.08 secs


100%|██████████| 27/27 [00:03<00:00,  8.78it/s]


Epoch: 2, Train loss: 0.08631, Eval loss: 0.10422. Time 3.85 secs


100%|██████████| 27/27 [00:03<00:00,  8.68it/s]


Epoch: 3, Train loss: 0.03560, Eval loss: 0.10591. Time 3.86 secs


100%|██████████| 27/27 [00:03<00:00,  8.25it/s]


Epoch: 4, Train loss: 0.02582, Eval loss: 0.09553. Time 4.01 secs


100%|██████████| 27/27 [00:03<00:00,  8.20it/s]


Epoch: 5, Train loss: 0.03062, Eval loss: 0.13160. Time 4.04 secs


100%|██████████| 27/27 [00:03<00:00,  8.49it/s]


Epoch: 6, Train loss: 0.02317, Eval loss: 0.13430. Time 3.90 secs


100%|██████████| 27/27 [00:03<00:00,  8.54it/s]


Epoch: 7, Train loss: 0.02874, Eval loss: 0.10321. Time 3.89 secs


100%|██████████| 27/27 [00:03<00:00,  7.01it/s]


Epoch: 8, Train loss: 0.02292, Eval loss: 0.13102. Time 4.87 secs


100%|██████████| 27/27 [00:03<00:00,  7.90it/s]


Epoch: 9, Train loss: 0.02401, Eval loss: 0.13358. Time 4.22 secs


100%|██████████| 27/27 [00:03<00:00,  8.49it/s]


Epoch: 10, Train loss: 0.03079, Eval loss: 0.16248. Time 3.93 secs


In [131]:
f"Best epoch was {best_epoch} with {min_el} eval loss"

'Best epoch was 4 with 0.09552959352731705 eval loss'

In [132]:
true_labels =[]
pred_labels =[]

for i in test_langds:
    inp = torch.tensor(i['src']).unsqueeze(1).to(device)
    trg = i['trg'][0]

    with torch.no_grad():
        pred = best_model(inp).view(-1).argmax().item()

    true_labels.append(trg)
    pred_labels.append(pred)

In [133]:
print(classification_report(true_labels, pred_labels))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92       167
           1       0.99      0.97      0.98       417
           2       0.86      0.97      0.91       134

    accuracy                           0.95       718
   macro avg       0.93      0.94      0.94       718
weighted avg       0.95      0.95      0.95       718



### Inference

In [141]:
text = "hallo, wie gehts ihnen?"
txt_to_ind = train_langds.src_vocab.lookup_indices(text.split())
inp_tensor = torch.tensor(txt_to_ind).to(device).unsqueeze(1)

with torch.no_grad():
    res = best_model(inp_tensor).view(-1).argmax().item()
train_langds.trg_vocab.lookup_token(res)

'German'