# CONFIGURATIONS

In [75]:
# Hyper-parameters for a single-GPU training run, plus a registry (`configs`)
# that maps configuration names to their dicts.
unofficial_single_gpu_config=dict(
    # RUN CONFIG:
    RUN_NAME='unofficial_single_gpu_run',
    RUN_DESCRIPTION='Default run on GPU, 10GB of VRAM needed for this.',
    # NOTE(review): the callbacks visible in this notebook hard-code
    # '/kaggle/working/' rather than reading this value - confirm where it is used.
    RUNS_FOLDER_PTH='/kaggle/working/',
    # DATA CONFIG:
    DATASET_SIZE=80000,
    TEST_PROPORTION=0.001,
    MAX_SEQ_LEN=40,
    VOCAB_SIZE=60000,
    TOKENIZER_TYPE='wordlevel', # 'wordlevel' or 'bpe'
    # TRAINING CONFIG:
    BATCH_SIZE=48,
    # 2048//48 == 42 mini-batches are accumulated per optimizer step
    # (read by Learner.one_batch through wandb.config).
    GRAD_ACCUMULATION_STEPS=2048//48,
    # NOTE(review): not referenced by any code visible in this chunk.
    WORKER_COUNT=10,
    EPOCHS=100,
    # OPTIMIZER CONFIG:
    BETAS=(0.9, 0.98),
    EPS=1e-9,
    # SCHEDULER CONFIG:
    N_WARMUP_STEPS=4000,
    # MODEL CONFIG:
    D_MODEL=512,
    N_BLOCKS=6,
    N_HEADS=8,
    D_FF=2048,
    DROPOUT_PROBA=0.1,
    # OTHER:
    MODEL_SAVE_EPOCH_CNT=10,
    # NOTE(review): MoveToDeviceCallback compares the learner's device against
    # 'cuda', not 'gpu' - presumably this value is translated before it reaches
    # Learner; confirm against the code that builds the Learner.
    DEVICE='gpu',
    LABEL_SMOOTHING=0.1,
)

# Registry of available configurations, keyed by name.
configs={
    'unofficial_single_gpu_config': unofficial_single_gpu_config,

}

# SCHEDULER

In [29]:
class CustomScheduler():
    """Learning-rate schedule with linear warm-up followed by inverse-sqrt decay.

    The rate at step ``s`` is
    ``d_model**-0.5 * min(s**-0.5, s * n_warmup_steps**-1.5)`` and is written
    into every parameter group of the wrapped optimizer on each ``step()``.

    Args:
        optimizer: object exposing ``param_groups`` (list of dicts) and
            ``zero_grad()``.
        d_model: model width used to scale the rate.
        n_warmup_steps: number of steps over which the rate grows linearly.
    """

    def __init__(self, optimizer, d_model, n_warmup_steps=4000):
        self.optimizer = optimizer
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.cur_step = 0
        self.cur_lr = None

        # Advance to step 1 immediately so the optimizer starts with a
        # schedule-defined learning rate.
        self.step()

    def _get_lr(self):
        """Return the learning rate for the current step counter."""
        decay_term = self.cur_step**(-0.5)
        warmup_term = self.cur_step*self.n_warmup_steps**(-1.5)
        return self.d_model**(-0.5) * min(decay_term, warmup_term)

    def step(self):
        """Increment the step counter and push the new rate to the optimizer."""
        self.cur_step += 1
        self.cur_lr = self._get_lr()

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.cur_lr

    def get_last_lr(self):
        """Return the current learning rate of every parameter group."""
        rates = []
        for param_group in self.optimizer.param_groups:
            rates.append(param_group['lr'])
        return rates

    def zero_grad(self):
        """Forward ``zero_grad()`` to the wrapped optimizer."""
        self.optimizer.zero_grad()



# CUSTOM ENUMERATOR

In [30]:
import datetime
import time
import logging

# Configure Logging
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

# Custom enumerator which predicts for loop finish time
def enumerateWithEstimate(iter, desc_str, start_ndx=0):
    """Yield the items of ``iter`` unchanged while logging an estimated finish time.

    After each item has been consumed (i.e. when the caller asks for the next
    one) the elapsed time is extrapolated linearly over ``len(iter)`` and the
    projected completion timestamp is logged at INFO level.

    The progress counter is tracked independently of the yielded values, so
    the estimate is correct for any sized iterable, not only ``range(n)``.

    Args:
        iter: sized iterable (must support ``len``).
        desc_str: prefix used in every log line.
        start_ndx: accepted for interface compatibility; currently unused.

    Yields:
        Each item of ``iter`` in order.
    """
    iter_len = len(iter)

    log.warning("{} ----/{}, starting".format(
        desc_str,
        iter_len,
    ))

    start_ts = time.time()
    # done_cnt counts completed items (1-based); it replaces the former
    # "item + 1", which was only valid when the items were 0, 1, 2, ...
    for done_cnt, item in enumerate(iter, start=1):
        yield item

        # Linear extrapolation: average time per finished item * total items.
        duration_sec = ((time.time() - start_ts)
                        / done_cnt
                        * iter_len
                        )

        done_dt = datetime.datetime.fromtimestamp(start_ts + duration_sec)

        log.info("{} {:-4}/{}, done at {}".format(
            desc_str,
            done_cnt,
            iter_len,
            str(done_dt).rsplit('.', 1)[0],
        ))

# TOKENIZER

In [31]:
from tokenizers import Tokenizer
from tokenizers.processors import TemplateProcessing
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.trainers import BpeTrainer, WordLevelTrainer
from tokenizers.models import WordLevel, BPE
from tokenizers.pre_tokenizers import Whitespace,WhitespaceSplit


def get_tokenizer_bpe(data, vocab_size):
    """Train a shared source/target BPE tokenizer on ``data``.

    Args:
        data: dataset supporting ``len`` and slice indexing that returns a
            mapping with 'translation_src' and 'translation_trg' text lists.
        vocab_size: maximum vocabulary size passed to the trainer.

    Returns:
        A trained ``Tokenizer`` whose post-processor wraps every encoded
        sequence as ``[BOS] ... [EOS]``.
    """
    # "[PAD]" is listed first so that it receives id 0, which is the padding
    # value used by pad_collate_fn and the default pad_idx of the model.
    special_tokens = ["[PAD]", "[UNK]", "[BOS]", "[EOS]"]

    # Configure tokenizer
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.normalizer=normalizers.Sequence([NFD(),StripAccents(), Lowercase()])
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=special_tokens)

    # Batch iterator to train the tokenizer from memory: first every source
    # sentence, then every target sentence.
    def batch_iterator(batch_size=10000):
        for column in ('translation_src', 'translation_trg'):
            for i in range(0, len(data), batch_size):
                yield data[i : i + batch_size][column]

    # Train tokenizer. Both columns are yielded, so the iterator produces
    # 2 * len(data) sequences (``length`` is only used for progress reporting).
    tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=2 * len(data))

    # Configure postprocessing to add [BOS] and [EOS] tokens to sequences.
    # The ids are read from the trained vocabulary instead of being hard-coded.
    tokenizer.post_processor = TemplateProcessing(
        single="[BOS] $A [EOS]",
        special_tokens=[
            ("[BOS]", tokenizer.token_to_id("[BOS]")),
            ("[EOS]", tokenizer.token_to_id("[EOS]")),
        ],
    )
    return tokenizer

def get_tokenizer_wordlevel(data, vocab_size):
    """Train a shared source/target word-level tokenizer on ``data``.

    Args:
        data: dataset supporting ``len`` and slice indexing that returns a
            mapping with 'translation_src' and 'translation_trg' text lists.
        vocab_size: maximum vocabulary size passed to the trainer.

    Returns:
        A trained ``Tokenizer`` whose post-processor wraps every encoded
        sequence as ``[BOS] ... [EOS]``.
    """
    # "[PAD]" is listed first so that it receives id 0, which is the padding
    # value used by pad_collate_fn and the default pad_idx of the model.
    special_tokens = ["[PAD]", "[UNK]", "[BOS]", "[EOS]"]

    # Configure tokenizer
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.normalizer=normalizers.Sequence([NFD(),StripAccents(), Lowercase()])
    tokenizer.pre_tokenizer = WhitespaceSplit()
    trainer = WordLevelTrainer(vocab_size=vocab_size, special_tokens=special_tokens)

    # Batch iterator to train the tokenizer from memory: first every source
    # sentence, then every target sentence.
    def batch_iterator(batch_size=10000):
        for column in ('translation_src', 'translation_trg'):
            for i in range(0, len(data), batch_size):
                yield data[i : i + batch_size][column]

    # Train tokenizer. Both columns are yielded, so the iterator produces
    # 2 * len(data) sequences (``length`` is only used for progress reporting).
    tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=2 * len(data))

    # Configure postprocessing to add [BOS] and [EOS] tokens to every sequence.
    # The ids are read from the trained vocabulary instead of being hard-coded.
    tokenizer.post_processor = TemplateProcessing(
        single="[BOS] $A [EOS]",
        special_tokens=[
            ("[BOS]", tokenizer.token_to_id("[BOS]")),
            ("[EOS]", tokenizer.token_to_id("[EOS]")),
        ],
    )

    return tokenizer

# LEARNER

In [32]:
import torch
import wandb
import copy
import logging

# Configure Logging
# Module-level logger for this cell. setLevel(INFO) only lowers this logger's
# own threshold. NOTE(review): no handler is configured in the code visible
# here, so whether INFO records are actually displayed depends on logging
# setup done elsewhere - confirm.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

def noop(*a, **k):
    """Accept any arguments, do nothing and return None.

    Used by ``Learner.__call__`` as the fallback for callbacks that do not
    implement a given hook.
    """
    return None

class Learner:
    """Minimal training loop that dispatches named hooks to callbacks.

    Hooks fired (in order): 'before_fit', then per epoch and phase
    'before_epoch', per batch 'before_batch' / 'before_loss' / 'after_loss' /
    'after_batch', then 'after_epoch', and finally 'after_fit'.

    Args:
        model: callable ``model(xb, yb)`` exposing ``state_dict()``,
            ``train()``, ``eval()`` and a ``training`` flag.
        train_dl: iterable of ``(xb, yb)`` training batches.
        val_dl: iterable of ``(xb, yb)`` validation batches.
        loss_func: callable ``loss_func(flat_preds, flat_targets)``.
        cbs: list of callback objects (``None`` is treated as no callbacks);
            each receives a ``learner`` attribute pointing back to this object.
        opt: optimizer exposing ``step()`` and ``zero_grad()``.
        sched: optional scheduler exposing ``step()``.
        device: device label read by callbacks.
    """

    def __init__(self, model, train_dl, val_dl, loss_func, cbs, opt, sched=None, device='cuda'):
        self.model=model
        self.train_dl=train_dl
        self.val_dl=val_dl
        self.loss_func=loss_func
        # A caller-supplied list is kept as-is (same object); None means "none".
        self.cbs=cbs if cbs is not None else []
        self.opt=opt
        self.sched=sched
        self.device=device

        # Counts training batches; starts at 1 and is used as the wandb step.
        self.cur_step=1

        self.best_val_loss=float('inf')
        self.best_model_state_dict=copy.deepcopy(self.model.state_dict())

        for cb in self.cbs:
            cb.learner=self

    def one_batch(self):
        """Run the forward pass (and, when training, backward/optimizer) for ``self.batch``."""
        self('before_batch')
        self.xb,self.yb=self.batch
        self.preds=self.model(self.xb,self.yb)
        self('before_loss')
        self.loss=self.loss_func(
            self.preds.reshape(-1, self.preds.size(-1)), # Flatten to (batch*seq, vocab)
            self.yb[:,1:].contiguous().view(-1) # Labels are the targets without the first ([BOS]) column
        )
        self('after_loss')
        if self.model.training:
            # NOTE(review): the loss is not divided by GRAD_ACCUMULATION_STEPS,
            # so accumulated gradients are a sum over mini-batches, not a mean.
            self.loss.backward()
            if self.cur_step % wandb.config.GRAD_ACCUMULATION_STEPS == 0:
                self.opt.step()
                if self.sched is not None:
                    self.sched.step()
                self.opt.zero_grad()
            self.cur_step+=1
        self('after_batch')

    def one_epoch(self, is_train):
        """Iterate once over the training (``is_train=True``) or validation loader."""
        self('before_epoch')

        # train()/eval() set the ``training`` flag themselves, so it is not
        # written directly.
        if is_train:
            self.model.train()
        else:
            self.model.eval()

        dl=self.train_dl if is_train else self.val_dl
        for self.batch_idx,self.batch in enumerate(dl):
            self.one_batch()
        self('after_epoch')

    def fit(self, n_epochs):
        """Train for ``n_epochs`` epochs, validating (without gradients) after each."""
        self('before_fit')
        self.n_epochs=n_epochs

        for self.epoch_idx in enumerateWithEstimate(range(n_epochs), desc_str="Training status"):
            self.one_epoch(is_train=True)
            with torch.no_grad():
                self.one_epoch(is_train=False)
        self('after_fit')

    def __call__(self, cb_method_name):
        """Invoke ``cb_method_name`` on every callback that defines it."""
        for cb in self.cbs:
            getattr(cb, cb_method_name, noop)()

# CALLBACKS

In [33]:
import os
import copy

import torch
import torch.nn as nn
import numpy as np
from tokenizers import Tokenizer
from nltk.translate.bleu_score import corpus_bleu

# Configure Logging
import wandb
import logging
# Module-level logger used by the callbacks below. setLevel(INFO) only lowers
# this logger's own threshold. NOTE(review): no handler is configured in the
# code visible here, so whether INFO records are displayed depends on logging
# setup done elsewhere - confirm.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

class CheckpointSaver():
    """Callback that writes model checkpoints to disk.

    Args:
        epoch_cnt: a periodic checkpoint is written after every validation
            phase whose epoch index is a multiple of this value.
        save_dir: directory receiving the checkpoint files. Defaults to the
            previously hard-coded '/kaggle/working/'.
    """

    def __init__(self, epoch_cnt, save_dir='/kaggle/working/'):
        self.epoch_cnt = epoch_cnt
        self.save_dir = save_dir

    def after_epoch(self):
        """Save the periodic checkpoint (validation phase only) and the best state."""
        learner = self.learner

        # Save model every 'epoch_cnt' epochs
        if not learner.model.training and learner.epoch_idx % self.epoch_cnt == 0:
            epoch_ckpt_pth=os.path.join(self.save_dir,f'model_ckpt_epoch{learner.epoch_idx}.pt')
            torch.save(learner.model.state_dict(), epoch_ckpt_pth)

        # Save best model. This runs after both the training and the validation
        # phase, so the file always reflects learner.best_model_state_dict
        # regardless of the order in which callbacks are registered.
        best_model_ckpt_pth=os.path.join(self.save_dir,'model_ckpt_best.pt')
        torch.save(learner.best_model_state_dict, best_model_ckpt_pth)


class TrackExample():
    """Callback that logs the model's translation of one fixed training and one
    fixed validation example after every validation phase."""

    def _decode_without_markers(self, token_ids):
        """Decode ``token_ids`` after removing [PAD], [BOS] and [EOS] ids.

        The resulting text can be encoded again by the same tokenizer (whose
        post-processor adds [BOS]/[EOS]) without duplicating those markers or
        feeding padding to the model. [UNK] is kept.
        """
        marker_ids = {self.tokenizer.token_to_id(tok) for tok in ("[PAD]", "[BOS]", "[EOS]")}
        kept_ids = [idx for idx in token_ids if idx not in marker_ids]
        return self.tokenizer.decode(kept_ids, skip_special_tokens=False)

    def before_fit(self):
        """Load the tokenizer and pick the first example of the first batch of each loader."""
        tokenizer_pth=os.path.join('/kaggle/working/','tokenizer.json')
        self.tokenizer = Tokenizer.from_file(tokenizer_pth)
        self.table=wandb.Table(columns=['train_x','train_y','train_y_pred','val_x','val_y','val_y_pred'])

        # Extract a training set example
        x_train,y_train=next(iter(self.learner.train_dl))
        train_example_x=x_train[0].tolist()
        train_example_y=y_train[0].tolist()

        # Extract a validation set example
        x_val,y_val=next(iter(self.learner.val_dl))
        val_example_x=x_val[0].tolist()
        val_example_y=y_val[0].tolist()

        # Convert to text (special tokens kept; these strings are for display)
        self.train_example_x_text=self.tokenizer.decode(train_example_x, skip_special_tokens=False)
        self.train_example_y_text=self.tokenizer.decode(train_example_y, skip_special_tokens=False)

        self.val_example_x_text=self.tokenizer.decode(val_example_x, skip_special_tokens=False)
        self.val_example_y_text=self.tokenizer.decode(val_example_y, skip_special_tokens=False)

        # Source sentences handed to model.translate(). The model's preprocess
        # step encodes the sentence with the tokenizer again, so the display
        # strings above (which contain "[BOS]", "[EOS]" and "[PAD]") would be
        # encoded with duplicated markers and padding inside the sentence.
        self.train_example_x_src=self._decode_without_markers(train_example_x)
        self.val_example_x_src=self._decode_without_markers(val_example_x)

    def after_epoch(self):
        """After a validation phase, translate both examples and log the result."""
        if not self.learner.model.training:
            train_example_y_pred_text=self.learner.model.translate(self.train_example_x_src, self.tokenizer)
            val_example_y_pred_text=self.learner.model.translate(self.val_example_x_src, self.tokenizer)

            log.info(f"""Tracking Example progress:
            Train Example x:     \t{ self.train_example_x_text}
            Train Example y:     \t{ self.train_example_y_text}
            Train Example y_pred:\t{ train_example_y_pred_text}
            ---------------------
            Val Example x:       \t{ self.val_example_x_text}
            Val Example y:       \t{ self.val_example_y_text}
            Val Example y_pred:  \t{ val_example_y_pred_text}
            """
            )


class TrackBleu():
    """Callback that computes corpus BLEU over the validation set each epoch.

    Predictions are the per-position argmax of the model output produced
    during the validation pass (teacher-forced), not free-running decoding.
    """

    def before_fit(self):
        """Load the tokenizer saved alongside the run."""
        tokenizer_pth=os.path.join('/kaggle/working/','tokenizer.json')
        self.tokenizer = Tokenizer.from_file(tokenizer_pth)

    def before_epoch(self):
        """Reset the per-epoch accumulators."""
        self.preds_text_tokens=[]
        self.yb_text_tokens=[]
        self.xb_text_tokens=[]

    def _to_word_tokens(self, batch_ids):
        """Convert a batch of id sequences into lists of text tokens.

        Each sequence is cut at its first [EOS], [BOS]/[PAD] ids are removed,
        the remainder is decoded and split on whitespace. corpus_bleu expects
        token lists; passing the decoded strings directly would make it score
        individual characters.

        Args:
            batch_ids: list of lists of int token ids.

        Returns:
            list of lists of str.
        """
        pad_id = self.tokenizer.token_to_id("[PAD]")
        bos_id = self.tokenizer.token_to_id("[BOS]")
        eos_id = self.tokenizer.token_to_id("[EOS]")

        cleaned = []
        for ids in batch_ids:
            if eos_id in ids:
                # Everything from the first [EOS] on is padding or, for
                # predictions, output at positions past the sentence end.
                ids = ids[:ids.index(eos_id)]
            cleaned.append([idx for idx in ids if idx != pad_id and idx != bos_id])

        texts = self.tokenizer.decode_batch(cleaned, skip_special_tokens=False)
        return [text.split() for text in texts]

    def after_batch(self):
        """During validation, accumulate tokenized predictions, sources and references."""
        if not self.learner.model.training:

            # argmax over the vocabulary axis; softmax is monotonic, so it is
            # not needed, and no squeeze is applied so that length-1 sequences
            # keep their sequence axis.
            pred_ids=self.learner.preds.detach().argmax(dim=-1).cpu().tolist()
            xb_ids=self.learner.xb.detach().cpu().tolist()
            yb_ids=self.learner.yb.detach().cpu().tolist()

            self.preds_text_tokens+=self._to_word_tokens(pred_ids)
            self.xb_text_tokens+=self._to_word_tokens(xb_ids)
            self.yb_text_tokens+=self._to_word_tokens(yb_ids)

    def after_epoch(self):
        """After a validation phase, compute corpus BLEU and log it to wandb."""
        if not self.learner.model.training:
            # corpus_bleu takes a list of reference *lists* per hypothesis.
            yb_text_tokens_for_bleu=[[item] for item in self.yb_text_tokens]
            bleu=corpus_bleu(yb_text_tokens_for_bleu,self.preds_text_tokens)
            wandb.log({'bleu': bleu}, step=self.learner.cur_step)


class MoveToDeviceCallback():
    """Callback that moves the model (once) and every batch to CUDA.

    It only acts when ``learner.device`` equals 'cuda'. Failures are logged
    with a traceback and otherwise ignored, leaving the model/batch untouched.
    """

    def before_fit(self):
        """Move the model to the GPU before training starts."""
        if self.learner.device != 'cuda':
            return
        try:
            self.learner.model = self.learner.model.to('cuda')
        except Exception:
            log.error(
                "Exception occurred: Can't move the model to GPU", exc_info=True)

    def before_batch(self):
        """Move the (source, target) pair of the current batch to the GPU."""
        if self.learner.device != 'cuda':
            return
        try:
            current = self.learner.batch
            self.learner.batch = (current[0].to('cuda'), current[1].to('cuda'))
        except Exception:
            log.error(
                "Exception occurred: Can't move the batch to GPU", exc_info=True)


class TrackLoss():
    """Callback that logs batch/epoch losses to wandb and remembers the best model.

    After a validation phase whose average loss beats ``learner.best_val_loss``
    a deep copy of the model's state dict is stored on the learner.
    """

    def before_epoch(self):
        """Reset the running sum and batch counter."""
        self.batch_cnt = 0
        self.loss_sum = 0

    def after_batch(self):
        """Accumulate the batch loss; during training also log per-batch metrics."""
        self.batch_cnt += 1
        loss = self.learner.loss.detach().cpu()
        self.loss_sum += loss

        # Tracking train loss by batch
        if self.learner.model.training:
            # All metrics of this step are sent in a single call.
            metrics = {
                'batch': self.learner.batch_idx,
                'epoch': self.learner.epoch_idx,
                'Loss/Train': loss.item(),
            }
            if self.learner.sched is not None:
                metrics['Lr'] = self.learner.sched.get_last_lr()[0]
            wandb.log(metrics, step=self.learner.cur_step)

    def after_epoch(self):
        """Log the average loss of the finished phase and track the best validation loss."""
        if self.batch_cnt == 0:
            # An empty data loader would otherwise raise ZeroDivisionError.
            log.warning(f"Epoch: {self.learner.epoch_idx} | No batches were processed, skipping loss logging.")
            return

        # Calculate avg epoch loss
        avg_loss = self.loss_sum/self.batch_cnt
        avg_loss=avg_loss.item()

        # Log
        if self.learner.model.training:
            log.info(f"Epoch: {self.learner.epoch_idx} | Training | Loss: {avg_loss:.5f}")
            wandb.log({'Loss_Avg/Train': avg_loss}, step=self.learner.cur_step)
        else:
            log.info(f"Epoch: {self.learner.epoch_idx} | Validation | Loss: {avg_loss:.5f}")
            wandb.log({'Loss_Avg/Val': avg_loss}, step=self.learner.cur_step)

            if avg_loss<self.learner.best_val_loss:
                log.info("Loss/Val high score, remembering state_dict.")
                self.learner.best_val_loss = avg_loss
                self.learner.best_model_state_dict=copy.deepcopy(self.learner.model.state_dict())

# DATASET 

In [34]:
import os
import random

import wandb
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset


def chunk(indices, chunk_size):
    """Split ``indices`` into consecutive 1-D tensors of at most ``chunk_size`` items.

    Returns:
        tuple of tensors; the last one may be shorter than ``chunk_size``.
    """
    index_tensor = torch.tensor(indices)
    return index_tensor.split(chunk_size)

def pad_collate_fn(batch):
    """Collate ``(src, trg)`` tensor pairs into two right-padded batch tensors.

    Args:
        batch: sequence of samples; element 0 is the source id tensor and
            element 1 the target id tensor.

    Returns:
        ``(src_batch, trg_batch)``, each of shape (batch, longest_len) and
        padded with 0.
    """
    src_list = [sample[0] for sample in batch]
    trg_list = [sample[1] for sample in batch]

    padded_src = pad_sequence(src_list, batch_first=True, padding_value=0)
    padded_trg = pad_sequence(trg_list, batch_first=True, padding_value=0)

    return padded_src, padded_trg

class TranslationDataset(Dataset):
    """Map-style dataset yielding ``(src_ids, trg_ids)`` tensor pairs.

    Args:
        dataset: indexable collection whose rows are mappings with
            'translation_src' and 'translation_trg' id sequences.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Fetch the row once; indexing the underlying dataset twice would
        # load the same row two times.
        row = self.dataset[idx]

        return (
            torch.tensor(row['translation_src']),
            torch.tensor(row['translation_trg']),
        )

class CustomBatchSampler(Sampler):
    """Batch sampler that groups neighbouring indices and shuffles the groups.

    The dataset is expected to be sorted by length already, so consecutive
    indices have similar lengths. Batches are fixed at construction time;
    only their order changes between iterations.

    Args:
        dataset: sized collection (only ``len`` is used).
        batch_size: maximum number of indices per batch.
    """

    def __init__(self, dataset, batch_size):

        # Dataset is already sorted so just chunk indices
        # into batches of indices for sampling
        self.batch_size=batch_size
        self.indices=range(len(dataset))
        # Plain slicing yields lists of ints directly and gives zero batches
        # (rather than one empty batch) for an empty dataset.
        self.batch_of_indices=[
            list(self.indices[start:start + batch_size])
            for start in range(0, len(self.indices), batch_size)
        ]

    def __iter__(self):
        # Shuffle the order of the batches in place, not their contents.
        random.shuffle(self.batch_of_indices)
        return iter(self.batch_of_indices)

    def __len__(self):
        return len(self.batch_of_indices)


def get_data(example_cnt):
    """Load the WMT14 de-en training split and keep ``example_cnt`` shuffled rows.

    The split is shuffled with a fixed seed (42), truncated, flattened, and its
    columns renamed so that English is 'translation_src' and German is
    'translation_trg'.
    """
    shuffled = load_dataset('wmt14','de-en',split='train').shuffle(seed=42)

    return (
        shuffled
        .select(range(example_cnt))
        .flatten()
        .rename_column('translation.de','translation_trg')
        .rename_column('translation.en','translation_src')
    )

def preprocess_data(data, tokenizer, max_seq_len, test_proportion, seed=None):
    """Tokenize, length-filter, split and sort the translation dataset.

    Args:
        data: dataset with text columns 'translation_src' / 'translation_trg'
            supporting ``map``, ``filter``, ``train_test_split`` and ``sort``.
        tokenizer: object whose ``encode(text).ids`` gives the token ids.
        max_seq_len: pairs where either side has more ids than this are dropped.
        test_proportion: ``test_size`` passed to ``train_test_split``.
        seed: optional seed for the train/test split, making it reproducible.
            ``None`` (default) keeps the library's default behaviour.

    Returns:
        Mapping with 'train' and 'test' splits, each sorted by descending
        source length.
    """

    # Tokenize (text columns are replaced by id lists)
    def tokenize(example):
        return {
            'translation_src': tokenizer.encode(example['translation_src']).ids,
            'translation_trg': tokenizer.encode(example['translation_trg']).ids,
        }
    data=data.map(tokenize)

    # Compute sequence lengths
    def sequence_length(example):
        return {
            'length_src': [len(item) for item in example['translation_src']],
            'length_trg': [len(item) for item in example['translation_trg']],
        }
    data=data.map(sequence_length, batched=True, batch_size=10000)

    # Filter by sequence lengths
    def filter_long(example):
        return example['length_src']<= max_seq_len and example['length_trg']<=max_seq_len
    data=data.filter(filter_long)

    # Split
    data=data.train_test_split(test_size=test_proportion, seed=seed)

    # Sort each split by length for dynamic batching (see CustomBatchSampler)
    data['train']=data['train'].sort('length_src', reverse=True)
    data['test']=data['test'].sort('length_src', reverse=True)

    return data


def get_translation_dataloaders(
    dataset_size,
    vocab_size,
    tokenizer_type,
    tokenizer_save_pth,
    test_proportion,
    batch_size,
    max_seq_len,
    report_summary,
    ):
    """Build the training and validation dataloaders for the translation task.

    Loads the data, trains and saves the tokenizer, preprocesses the data and
    wraps both splits in dataloaders that batch examples of similar length.

    Args:
        dataset_size: number of sentence pairs to load.
        vocab_size: tokenizer vocabulary size.
        tokenizer_type: 'wordlevel' or 'bpe'.
        tokenizer_save_pth: file path the trained tokenizer is saved to.
        test_proportion: share of the data used for validation.
        batch_size: maximum number of examples per batch.
        max_seq_len: longer sequences are filtered out.
        report_summary: when true, split sizes are written to the wandb run summary.

    Returns:
        ``(train_dl, val_dl)``.

    Raises:
        ValueError: if ``tokenizer_type`` is not a supported value.
    """

    # Validate first, so a typo fails immediately instead of after the dataset
    # has been loaded (and then only with an unbound-variable error).
    if tokenizer_type not in ('wordlevel', 'bpe'):
        raise ValueError(
            f"Unsupported tokenizer_type {tokenizer_type!r}; expected 'wordlevel' or 'bpe'."
        )

    data=get_data(dataset_size)

    if tokenizer_type == 'wordlevel':
        tokenizer=get_tokenizer_wordlevel(data, vocab_size)
    else:
        tokenizer=get_tokenizer_bpe(data, vocab_size)

    # Save tokenizers
    tokenizer.save(tokenizer_save_pth)

    data=preprocess_data(data, tokenizer, max_seq_len, test_proportion)

    if report_summary:
        wandb.run.summary['train_len']=len(data['train'])
        wandb.run.summary['val_len']=len(data['test'])

    # Create pytorch datasets
    train_ds=TranslationDataset(data['train'])
    val_ds=TranslationDataset(data['test'])

    # Create a custom batch sampler
    custom_batcher_train = CustomBatchSampler(train_ds, batch_size)
    custom_batcher_val= CustomBatchSampler(val_ds, batch_size)

    # Create pytorch dataloaders
    train_dl=DataLoader(train_ds, collate_fn=pad_collate_fn, batch_sampler=custom_batcher_train, pin_memory=True)
    val_dl=DataLoader(val_ds, collate_fn=pad_collate_fn, batch_sampler=custom_batcher_val, pin_memory=True)

    return train_dl, val_dl

# Architecture - Transformer

# POSITION WISE FEED FORWARD NET

In [35]:
##PositionWiseFeedForwardNet

import torch.nn as nn
import torch

class PositionWiseFeedForwardNet(nn.Module):
    """Two-layer feed-forward network applied to the last dimension.

    Computes ``w_2(dropout(relu(w_1(x))))``.

    Args:
        d_model: input and output feature size.
        d_ff: hidden feature size.
        dropout_proba: dropout probability applied after the ReLU. Defaults to
            0.1, the value that was previously hard-coded.
    """

    def __init__(self, d_model, d_ff, dropout_proba=0.1):
        super(PositionWiseFeedForwardNet, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)

        # Optional Dropout (not mentioned in the paper)
        self.dropout = nn.Dropout(dropout_proba)

    def forward(self, x):
        # x: (..., d_model) -> (..., d_model)
        return self.w_2(self.dropout(torch.relu(self.w_1(x))))

# MULTI HEAD ATTENTION

In [36]:


import torch
import torch.nn as nn
import math

class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(q k^T / sqrt(d_head)) v``.

    Args:
        d_head: per-head feature size used for scaling.
        dropout_proba: dropout probability applied to the attention weights.
            Defaults to 0.1, the value that was previously hard-coded.
    """

    def __init__(self, d_head, dropout_proba=0.1):
        super(ScaledDotProductAttention, self).__init__()

        self.d_head = d_head

        # Optional dropout (not mentioned in the paper)
        self.attention_dropout = nn.Dropout(p=dropout_proba)

    def forward(self, q, k, v, mask=None):
        """Attend over ``k``/``v`` with queries ``q``.

        Args:
            q, k, v: (batch_size, n_heads, seq_len, d_head)
            mask: optional tensor broadcastable to the score shape; positions
                where it equals 0 are excluded from attention.

        Returns:
            (batch_size, n_heads, seq_len, d_head)
        """
        attention_weights = torch.matmul(q, k.transpose(-2, -1))  # (batch_size, n_heads, seq_len, seq_len)
        scaled_attention_weights = attention_weights / math.sqrt(self.d_head)  # (batch_size, n_heads, seq_len, seq_len)

        if mask is not None:
            # Fill with the most negative finite value instead of -inf: masked
            # positions still get zero weight after softmax, but a row in
            # which every key is masked yields finite values rather than NaN.
            fill_value = torch.finfo(scaled_attention_weights.dtype).min
            scaled_attention_weights = scaled_attention_weights.masked_fill(mask == 0, fill_value) # (batch_size, n_heads, seq_len, seq_len)

        # Apply softmax over the last dimension which corresponds to attention weights for a key
        scaled_attention_weights = nn.functional.softmax(scaled_attention_weights, dim=-1) # (batch_size, n_heads, seq_len, seq_len)

        # Optional dropout (not mentioned in the paper)
        scaled_attention_weights = self.attention_dropout(scaled_attention_weights) # (batch_size, n_heads, seq_len, seq_len)

        weighted_v = torch.matmul(scaled_attention_weights, v) # (batch_size, n_heads, seq_len, d_head)

        return weighted_v


class MultiHeadAttention(nn.Module):
    """Multi-head attention over already-projected queries, keys and values.

    The inputs are split into ``n_heads`` heads, attended independently,
    concatenated again and passed through the output projection ``W_0``.

    Args:
        d_model: model width; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()

        self.n_heads = n_heads

        assert d_model % n_heads == 0
        self.d_head = d_model // n_heads

        self.dot_product_attention_layer = ScaledDotProductAttention(self.d_head)

        self.W_0 = nn.Linear(d_model, d_model)

    def _split_into_heads(self, q,k,v):
        """Reshape each (batch, seq_len, d_model) tensor to (batch, n_heads, seq_len, d_head)."""
        def to_heads(tensor):
            per_head = tensor.view(tensor.size(0), tensor.size(1), self.n_heads, self.d_head)  # (batch, seq_len, n_heads, d_head)
            return per_head.transpose(1, 2)  # (batch, n_heads, seq_len, d_head)

        return to_heads(q), to_heads(k), to_heads(v)

    def _concatenate_heads(self,attention_output):
        """Merge (batch, n_heads, seq_len, d_head) back into (batch, seq_len, n_heads * d_head)."""
        seq_major = attention_output.transpose(1, 2).contiguous()  # (batch, seq_len, n_heads, d_head)
        return seq_major.view(seq_major.size(0), seq_major.size(1), -1)

    def forward(self, q, k, v, mask=None):
        """Return the attended values, shape (batch_size, seq_len, d_model)."""
        q_heads, k_heads, v_heads = self._split_into_heads(q, k, v)
        per_head_output = self.dot_product_attention_layer(q_heads, k_heads, v_heads, mask)  # (batch, n_heads, seq_len, d_head)
        merged = self._concatenate_heads(per_head_output)  # (batch, seq_len, n_heads * d_head)

        return self.W_0(merged)

# ADD AND NORM

In [37]:
##add_and_norm
import torch.nn as nn

class AddAndNorm(nn.Module):
    """Residual connection followed by layer normalization over ``d_model``."""

    def __init__(self, d_model):
        super().__init__()

        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x, residual):
        """Return ``LayerNorm(x + residual)``."""
        summed = x + residual
        return self.layer_norm(summed)


# ENCODER

In [38]:


import torch.nn as nn


class TransformerEncoderBlock(nn.Module):
    """One encoder layer: self-attention and feed-forward sub-layers, each
    followed by dropout, a residual connection and layer normalization.

    Args:
        d_model: model width.
        n_heads: number of attention heads.
        d_ff: hidden size of the feed-forward network.
        dropout_proba: dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout_proba):
        super().__init__()

        # Query / key / value projections for self-attention.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Self-attention sub-layer.
        self.mha_layer = MultiHeadAttention(d_model, n_heads)
        self.dropout_layer_1 = nn.Dropout(dropout_proba)
        self.add_and_norm_layer_1 = AddAndNorm(d_model)

        # Feed-forward sub-layer.
        self.ffn_layer = PositionWiseFeedForwardNet(d_model, d_ff)
        self.dropout_layer_2 = nn.Dropout(dropout_proba)
        self.add_and_norm_layer_2 = AddAndNorm(d_model)

    def forward(self, x, mask):
        """Encode ``x``.

        Args:
            x: (batch_size, src_seq_len, d_model)
            mask: (batch_size, 1, 1, src_seq_len)

        Returns:
            (batch_size, src_seq_len, d_model)
        """
        # Self-attention: queries, keys and values are all projections of x.
        attended = self.mha_layer(self.W_q(x), self.W_k(x), self.W_v(x), mask)
        attended = self.dropout_layer_1(attended)
        attended = self.add_and_norm_layer_1(x, attended)

        # Position-wise feed forward on the attention output.
        transformed = self.ffn_layer(attended)
        transformed = self.dropout_layer_2(transformed)

        return self.add_and_norm_layer_2(attended, transformed)


class TransformerEncoder(nn.Module):
    """Stack of ``n_blocks`` encoder blocks applied one after another."""

    def __init__(self, n_blocks, n_heads, d_model, d_ff, dropout_proba=0.1):
        super().__init__()

        blocks = [
            TransformerEncoderBlock(d_model, n_heads, d_ff, dropout_proba)
            for _ in range(n_blocks)
        ]
        self.encoder_blocks = nn.ModuleList(blocks)

    def forward(self, x, mask):
        """Pass ``x`` through every block, giving each the same ``mask``."""
        hidden = x
        for block in self.encoder_blocks:
            hidden = block(hidden, mask)
        return hidden

# DECODER

In [39]:
##transformer_decoder

import torch.nn as nn
import os

class TransformerDecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, encoder-decoder attention and
    a feed-forward network, each followed by dropout, a residual connection
    and layer normalization.

    Args:
        d_model: model width.
        n_heads: number of attention heads.
        d_ff: hidden size of the feed-forward network.
        dropout_proba: dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout_proba):
        super().__init__()

        # Sub-layer 1: self-attention over the target sequence.
        self.W_q_1 = nn.Linear(d_model, d_model)
        self.W_k_1 = nn.Linear(d_model, d_model)
        self.W_v_1 = nn.Linear(d_model, d_model)

        self.mha_layer_1 = MultiHeadAttention(d_model, n_heads)
        self.dropout_layer_1 = nn.Dropout(dropout_proba)
        self.add_and_norm_1 = AddAndNorm(d_model)

        # Sub-layer 2: attention from the target over the encoder output.
        self.W_q_2 = nn.Linear(d_model, d_model)
        self.W_k_2 = nn.Linear(d_model, d_model)
        self.W_v_2 = nn.Linear(d_model, d_model)

        self.mha_layer_2 = MultiHeadAttention(d_model, n_heads)
        self.dropout_layer_2 = nn.Dropout(dropout_proba)
        self.add_and_norm_2 = AddAndNorm(d_model)

        # Sub-layer 3: position-wise feed forward.
        self.ffn_layer = PositionWiseFeedForwardNet(d_model, d_ff)
        self.dropout_layer_3 = nn.Dropout(dropout_proba)
        self.add_and_norm_3 = AddAndNorm(d_model)

    def forward(self, x, encoder_output, src_mask, trg_mask):
        """Decode ``x`` given the encoder output.

        Args:
            x: (batch_size, trg_seq_len, d_model)
            encoder_output: (batch_size, src_seq_len, d_model)
            src_mask: (batch_size, 1, 1, src_seq_len)
            trg_mask: (batch_size, 1, trg_seq_len, trg_seq_len)

        Returns:
            (batch_size, trg_seq_len, d_model)
        """
        # Self-attention over the target; trg_mask is applied here.
        self_attended = self.mha_layer_1(self.W_q_1(x), self.W_k_1(x), self.W_v_1(x), trg_mask)
        self_attended = self.dropout_layer_1(self_attended)
        self_attended = self.add_and_norm_1(self_attended, x)

        # Encoder-decoder attention: queries come from the target side, keys
        # and values from the encoder output; src_mask is applied here.
        queries = self.W_q_2(self_attended)
        keys = self.W_k_2(encoder_output)
        values = self.W_v_2(encoder_output)

        cross_attended = self.mha_layer_2(queries, keys, values, src_mask)
        cross_attended = self.dropout_layer_2(cross_attended)
        cross_attended = self.add_and_norm_2(cross_attended, self_attended)

        # Position-wise feed forward.
        transformed = self.ffn_layer(cross_attended)
        transformed = self.dropout_layer_3(transformed)

        return self.add_and_norm_3(transformed, cross_attended)


class TransformerDecoder(nn.Module):
    """Stack of ``n_blocks`` decoder blocks applied one after another."""

    def __init__(self, n_blocks, n_heads, d_model, d_ff, dropout_proba):
        super().__init__()

        blocks = [
            TransformerDecoderBlock(d_model, n_heads, d_ff, dropout_proba)
            for _ in range(n_blocks)
        ]
        self.decoder_blocks = nn.ModuleList(blocks)

    def forward(self, x, encoder_output, src_mask, trg_mask):
        """Pass ``x`` through every block with the same encoder output and masks."""
        hidden = x
        for block in self.decoder_blocks:
            hidden = block(hidden, encoder_output, src_mask, trg_mask)
        return hidden

# POSITIONAL ENCODING

In [40]:
## positional_encoding

import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to embeddings, then dropout.

    The table holds ``sin`` values in even feature columns and ``cos`` values
    in odd ones and is registered as the buffer 'pe_table' with shape
    (1, max_seq_len, d_model).

    Args:
        d_model: embedding width.
        max_seq_len: number of positions precomputed in the table.
        dropout_proba: dropout probability applied to the sum.
    """

    def __init__(self, d_model, max_seq_len=500, dropout_proba=0.1):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.d_model = d_model

        self.register_buffer('pe_table', self.get_pe_table())

        self.dropout = nn.Dropout(dropout_proba)

    def get_pe_table(self):
        """Build the (1, max_seq_len, d_model) sinusoid table."""
        positions = torch.arange(self.max_seq_len).unsqueeze(1)  # (max_seq_len, 1)
        columns = torch.arange(self.d_model).unsqueeze(0)        # (1, d_model)

        # Columns 2i and 2i+1 share the frequency 1 / 10000^(2i / d_model).
        exponents = (2*(columns//2))/self.d_model
        table = positions / torch.pow(10000, exponents)          # (max_seq_len, d_model)

        table[:, 0::2] = torch.sin(table[:, 0::2])
        table[:, 1::2] = torch.cos(table[:, 1::2])

        # Leading axis of size 1 so the table broadcasts over a batch.
        return table.unsqueeze(0)

    def forward(self, embeddings_batch):
        """Return ``dropout(embeddings_batch + positional_encoding)``.

        ``embeddings_batch`` is indexed as (batch, seq_len, ...); the first
        ``seq_len`` rows of the table are used.
        """
        n_positions = embeddings_batch.size(1)
        positional = self.pe_table[:, :n_positions].clone().detach()
        summed = embeddings_batch + positional
        return self.dropout(summed)

# ENCODER DECODER

In [41]:
## TransformerEncoderDecoder

import torch.nn as nn

import math


class TransformerEncoderDecoder(nn.Module):
    """Encoder-decoder Transformer producing target-vocabulary logits.

    The source embedding, target embedding and output projection share one
    weight matrix (the target embedding's).

    Args:
        d_model: model width.
        n_blocks: number of encoder blocks and of decoder blocks.
        src_vocab_size: source vocabulary size.
        trg_vocab_size: target vocabulary size.
        n_heads: number of attention heads.
        d_ff: hidden size of the feed-forward networks.
        dropout_proba: dropout probability, used by the encoder/decoder blocks
            and by the positional encodings.
    """

    def __init__(self,d_model, n_blocks, src_vocab_size, trg_vocab_size, n_heads, d_ff, dropout_proba):
        super(TransformerEncoderDecoder, self).__init__()
        self.dropout_proba = dropout_proba
        self.d_model=d_model

        # Encoder part
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        # Forward dropout_proba so the configured value is honoured here too
        # (previously the PositionalEncoding default of 0.1 was always used).
        self.src_pos_embedding= PositionalEncoding(d_model, dropout_proba=dropout_proba)
        self.encoder= TransformerEncoder(n_blocks, n_heads, d_model, d_ff, dropout_proba)

        # Decoder part
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model)
        self.trg_pos_embedding= PositionalEncoding(d_model, dropout_proba=dropout_proba)
        self.decoder= TransformerDecoder(n_blocks, n_heads, d_model, d_ff, dropout_proba)

        # Linear mapping to vocab size
        self.linear = nn.Linear(d_model, trg_vocab_size)

        # Switch to xavier initialization (shown to be beneficial)
        self.init_with_xavier()

        # Sharing weights between two embedding layers and the pre-softmax linear layer.
        # After this, the source embedding table has trg_vocab_size rows, so
        # source token ids must be valid indices into the target vocabulary.
        self.src_embedding.weight = self.trg_embedding.weight
        self.linear.weight = self.trg_embedding.weight

    def encode(self, src_token_ids, src_mask):
        """Embed and encode the source tokens -> (batch_size, src_seq_len, d_model)."""
        # Embeddings are scaled by sqrt(d_model) before positions are added.
        src_embeddings = self.src_embedding(src_token_ids) * math.sqrt(self.d_model) # (batch_size, src_seq_len, d_model)
        src_embeddings = self.src_pos_embedding(src_embeddings) # (batch_size, src_seq_len, d_model)
        encoder_outputs = self.encoder(src_embeddings, src_mask) # (batch_size, src_seq_len, d_model)

        return encoder_outputs

    def decode(self, trg_token_ids, encoder_outputs, src_mask, trg_mask):
        """Decode the target tokens -> logits (batch_size, trg_seq_len, trg_vocab_size)."""
        trg_embeddings = self.trg_embedding(trg_token_ids) * math.sqrt(self.d_model) # (batch_size, trg_seq_len, d_model)
        trg_embeddings = self.trg_pos_embedding(trg_embeddings) # (batch_size, trg_seq_len, d_model)
        decoder_outputs = self.decoder(trg_embeddings, encoder_outputs, src_mask, trg_mask) # (batch_size, trg_seq_len, d_model)

        # Linear mapping to vocab size
        linear_out = self.linear(decoder_outputs) # (batch_size, trg_seq_len, trg_vocab_size)

        return linear_out

    def forward(self, src_token_ids, trg_token_ids, src_mask, trg_mask):
        """Encode the source, decode the target and return the logits."""
        encoder_outputs= self.encode(src_token_ids, src_mask) # (batch_size, src_seq_len, d_model)
        decoder_outputs= self.decode(trg_token_ids, encoder_outputs, src_mask, trg_mask) # (batch_size, trg_seq_len, trg_vocab_size)

        return decoder_outputs

    def init_with_xavier(self):
        """Apply Xavier-uniform initialization to every parameter with more than one dimension."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

# Machine Translation Transformer

In [42]:


import torch
import torch.nn as nn


class MachineTranslationTransformer(nn.Module):
    """Translation model wrapping a ``TransformerEncoderDecoder``.

    ``forward`` builds the padding and look-ahead masks from raw token ids and
    is used for training; ``translate`` greedily decodes a single sentence.
    """

    def __init__(self, d_model,n_blocks,src_vocab_size,trg_vocab_size,n_heads,d_ff, dropout_proba):
        """Create the underlying encoder-decoder; all arguments are forwarded to it unchanged."""
        super().__init__()

        self.transformer_encoder_decoder=TransformerEncoderDecoder(
            d_model,
            n_blocks,
            src_vocab_size,
            trg_vocab_size,
            n_heads,
            d_ff,
            dropout_proba
        )

    def _get_pad_mask(self, token_ids, pad_idx=0):
        """Return a boolean mask that is False wherever ``token_ids`` equals ``pad_idx``.

        For ``token_ids`` of shape (batch_size, seq_len) the result has shape
        (batch_size, 1, 1, seq_len).
        """
        pad_mask = (token_ids != pad_idx).unsqueeze(-2)  # (batch_size, 1, seq_len)
        return pad_mask.unsqueeze(1)  # (batch_size, 1, 1, seq_len)

    def _get_lookahead_mask(self, token_ids):
        """Return a lower-triangular boolean mask of shape (1, 1, seq_len, seq_len).

        Entry [i, j] is True when j <= i, i.e. a position may look at itself
        and at earlier positions only. ``token_ids`` must be 2-D.
        """
        _, seq_len = token_ids.size()
        allowed = torch.tril(torch.ones((seq_len, seq_len), device=token_ids.device)).bool()
        return allowed.unsqueeze(0).unsqueeze(1)  # (1, 1, seq_len, seq_len)

    def forward(self, src_token_ids, trg_token_ids):
        """Training pass.

        Args:
            src_token_ids: source token ids, (batch_size, src_seq_len).
            trg_token_ids: target token ids including both [BOS] and [EOS],
                (batch_size, trg_seq_len).

        Returns:
            The output of the wrapped encoder-decoder for the target sequence
            with its last position removed.
        """
        # trg_token_ids contains both [BOS] and [EOS] tokens. The decoder input
        # drops the last position ([EOS]); the caller drops [BOS] from the
        # labels so that y and y_pred shapes match.
        trg_token_ids = trg_token_ids[:, :-1]

        src_mask = self._get_pad_mask(src_token_ids)  # (batch_size, 1, 1, src_seq_len)
        # Padding mask broadcast against the causal mask.
        trg_mask = self._get_pad_mask(trg_token_ids) & self._get_lookahead_mask(trg_token_ids)  # (batch_size, 1, trg_seq_len, trg_seq_len)

        return self.transformer_encoder_decoder(src_token_ids, trg_token_ids, src_mask, trg_mask)

    def preprocess(self, sentence, tokenizer):
        """Tokenize ``sentence`` and return its ids as a (1, seq_len) long tensor on the model's device."""
        device = next(self.parameters()).device

        src_token_ids = tokenizer.encode(sentence).ids
        src_token_ids = torch.tensor(src_token_ids, dtype=torch.long, device=device)

        return src_token_ids.unsqueeze(0)  # To batch format

    def translate(self, sentence, tokenizer, max_tokens=100, skip_special_tokens=False):
        """Greedily translate a single sentence.

        Decoding runs under ``torch.no_grad()`` so no autograd graph is built
        while tokens are generated one at a time.

        Args:
            sentence: source sentence (string).
            tokenizer: object providing ``token_to_id``, ``encode(...).ids`` and
                ``decode(ids, skip_special_tokens=...)``.
            max_tokens: maximum number of tokens generated after [BOS]. At
                least one token is always generated.
            skip_special_tokens: forwarded to ``tokenizer.decode``.

        Returns:
            The decoded target sentence as returned by ``tokenizer.decode``.
        """
        # Infer the device of the model
        device = next(self.parameters()).device

        # Get tokenizer special tokens.
        eos_id = tokenizer.token_to_id('[EOS]')
        bos_id = tokenizer.token_to_id('[BOS]')

        # Tokenize sentence.
        src_token_ids = self.preprocess(sentence, tokenizer)  # (1, src_seq_len)

        # Initialize target sequence with the [BOS] token.
        trg_token_ids = torch.tensor([[bos_id]], dtype=torch.long, device=device)  # (1, 1)

        # Obtain src mask
        src_mask = self._get_pad_mask(src_token_ids)  # (1, 1, 1, src_seq_len)

        with torch.no_grad():
            encoder_output = self.transformer_encoder_decoder.encode(src_token_ids, src_mask)  # (1, src_seq_len, d_model)

            while True:
                # The look-ahead mask could also be None at inference time; the
                # original author found that keeping it works better.
                trg_mask = self._get_lookahead_mask(trg_token_ids)
                logits = self.transformer_encoder_decoder.decode(trg_token_ids, encoder_output, src_mask, trg_mask)

                # Greedy choice for the last position. log-softmax is monotonic,
                # so the argmax of the raw scores selects the same token.
                next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # (1, 1)

                # Add token to target sequence.
                trg_token_ids = torch.cat([trg_token_ids, next_token], dim=-1)  # (1, trg_seq_len+1)

                # Stop on [EOS] or once max_tokens tokens follow [BOS]. Using
                # ">" (not "==") guarantees termination for max_tokens < 1 too.
                if next_token.item() == eos_id or trg_token_ids.size(1) > max_tokens:
                    break

        # Detokenize sentence.
        return tokenizer.decode(trg_token_ids.squeeze(0).cpu().tolist(), skip_special_tokens=skip_special_tokens)




# LOG CONFIGURER

In [16]:
import logging
import logging.handlers
import os
import wandb


# Root logger: every module-level logger propagates its records here.
root_logger = logging.getLogger()

# Some libraries install their own handlers on the root logger; drop them all
# so that only the two handlers configured below remain.
while root_logger.handlers:
    root_logger.removeHandler(root_logger.handlers[0])

# Shared record layout: timestamp, level, pid, logger name, line, function, message.
logfmt_str = "%(asctime)s %(levelname)-8s pid:%(process)d %(name)s:%(lineno)03d:%(funcName)s    %(message)s"

# Console handler
c_format=logging.Formatter(logfmt_str)
c_handler = logging.StreamHandler()
c_handler.setFormatter(c_format)
c_handler.setLevel(logging.DEBUG)

# File handler (the log directory is created on demand)
log_dir = '/kaggle/working/'
os.makedirs(log_dir, exist_ok=True)
log_pth=os.path.join(log_dir,'log.txt')
f_format=logging.Formatter(logfmt_str)
f_handler = logging.FileHandler(log_pth)
f_handler.setFormatter(f_format)
f_handler.setLevel(logging.DEBUG)

# Attach both handlers to the root logger, console first.
for handler in (c_handler, f_handler):
    root_logger.addHandler(handler)

# TRAINING

In [17]:
import os
from logging import log

import torch
import torch.nn as nn
import torch.optim as optim

import wandb

# Imports used by the run initialisation below.
import logging
import random

import numpy as np
import torch
import wandb

# Start the wandb run with the chosen configuration.
config_name='unofficial_single_gpu_config' # MODIFY THIS TO CHANGE CONFIGURATION
wandb.init(
    config=configs[config_name],
    project="ECEN_740_Project3",
    entity="phanenderchalasani",
)

# Logger for this cell; INFO and above are emitted.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)

# Seed torch, Python's `random` and numpy (in that order) with the same value.
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(0)


class TrainingApp:
    """Training entry point; every hyperparameter is read from ``wandb.config``."""

    def __init__(self):
        """Pick the device requested by ``wandb.config.DEVICE``.

        Raises:
            ValueError: if DEVICE is 'gpu' but CUDA is not available.
        """
        log.info('----- Training Started -----')

        # Device handling: anything other than 'gpu' falls back to the CPU.
        if wandb.config.DEVICE=='gpu':
            if not torch.cuda.is_available():
                raise ValueError('GPU is not available.')
            self.device = 'cuda'
            log.info(f'Device name is {torch.cuda.get_device_name()}')
        else:
            log.info(f'Device name is CPU')
            self.device='cpu'

    def main(self):
        """Build the dataloaders, model, loss, optimizer, scheduler and callbacks, then fit."""

        train_dl, val_dl = get_translation_dataloaders(
            dataset_size=wandb.config.DATASET_SIZE,
            vocab_size=wandb.config.VOCAB_SIZE,
            # Use the configured runs folder instead of a hard-coded path.
            tokenizer_save_pth=os.path.join(wandb.config.RUNS_FOLDER_PTH,'tokenizer.json'),
            tokenizer_type=wandb.config.TOKENIZER_TYPE,
            batch_size=wandb.config.BATCH_SIZE,
            report_summary=True,
            max_seq_len=wandb.config.MAX_SEQ_LEN,
            test_proportion=wandb.config.TEST_PROPORTION,
        )

        # Source and target use the same vocabulary size.
        model = MachineTranslationTransformer(
            d_model=wandb.config.D_MODEL,
            n_blocks=wandb.config.N_BLOCKS,
            src_vocab_size=wandb.config.VOCAB_SIZE,
            trg_vocab_size=wandb.config.VOCAB_SIZE,
            n_heads=wandb.config.N_HEADS,
            d_ff=wandb.config.D_FF,
            dropout_proba=wandb.config.DROPOUT_PROBA
        )

        # Targets equal to 0 are ignored by the loss. Label smoothing comes
        # from the config so that changing LABEL_SMOOTHING actually takes effect.
        loss_func = nn.CrossEntropyLoss(
            ignore_index=0,
            label_smoothing=wandb.config.LABEL_SMOOTHING,
            reduction='mean',
        )

        # No lr is passed to Adam: CustomScheduler writes the learning rate
        # into the optimizer's param groups.
        optimizer = optim.Adam(model.parameters(), betas=wandb.config.BETAS, eps=wandb.config.EPS)
        scheduler=CustomScheduler(optimizer, wandb.config.D_MODEL, wandb.config.N_WARMUP_STEPS)

        # # The above scheduler's efficiency is highly influenced by dataset and batch size,
        # # alternatively you can use the below configuration, which also works much better for overfit configs.
        # optimizer = optim.Adam(model.parameters(), lr=0.00001, betas=wandb.config.BETAS, eps=wandb.config.EPS)
        # scheduler=optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.0005, epochs=wandb.config.EPOCHS, steps_per_epoch=len(train_dl), pct_start=0.3)

        cbs = [
            MoveToDeviceCallback(),
            TrackLoss(),
            TrackExample(),
            TrackBleu(),
            CheckpointSaver(epoch_cnt=wandb.config.MODEL_SAVE_EPOCH_CNT,),
            ]

        wandb.watch(model, log_freq=1000)
        learner = Learner(model,
                          train_dl,
                          val_dl,
                          loss_func,
                          cbs,
                          optimizer,
                          scheduler,
                          self.device)

        learner.fit(wandb.config.EPOCHS)


if __name__ == "__main__":
    TrainingApp().main()


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


2024-04-29 03:43:32,006 INFO     pid:34 __main__:032:__init__    ----- Training Started -----
2024-04-29 03:43:32,050 INFO     pid:34 __main__:039:__init__    Device name is Tesla P100-PCIE-16GB


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 280M/280M [00:01<00:00, 159MB/s]  
Downloading data: 100%|██████████| 265M/265M [00:01<00:00, 172MB/s]  
Downloading data: 100%|██████████| 273M/273M [00:01<00:00, 212MB/s]  
Downloading data: 100%|██████████| 474k/474k [00:00<00:00, 2.15MB/s]
Downloading data: 100%|██████████| 509k/509k [00:00<00:00, 2.51MB/s]


Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Map:   0%|          | 0/80000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/80000 [00:00<?, ? examples/s]

2024-04-29 03:49:11,159 INFO     pid:34 __main__:161:after_epoch    Epoch: 0 | Training | Loss: 10.91056
2024-04-29 03:49:12,464 INFO     pid:34 __main__:164:after_epoch    Epoch: 0 | Validation | Loss: 10.76890
2024-04-29 03:49:12,466 INFO     pid:34 __main__:168:after_epoch    Loss/Val high score, remembering state_dict.
2024-04-29 03:49:14,183 INFO     pid:34 __main__:061:after_epoch    Tracking Example progress:
            Train Example x:     	[BOS] set a short distance from the train station and close to the pedestrian area, this fully renovated hotel is the ideal setting for your leisure or business stay. [EOS]
            Train Example y:     	[BOS] das hotel liegt im herzen der stadt nahe der beliebten place [UNK] hier wohnen sie am fuß der [UNK] in idealer lage, um stadt und berge zu entdecken. [EOS] [PAD] [PAD] [PAD] [PAD] [PAD]
            Train Example y_pred:	[BOS] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]

# TESTING ON VALIDATION.PY

## Loading the validation.py file

In [53]:
%run /kaggle/input/data-validation/validation.py

                                     German  \
0                             Guten Morgen!   
1                          Wie geht es dir?   
2                          Ich bin hungrig.   
3      Entschuldigung, wo ist die Toilette?   
4                      Wie viel kostet das?   
5                 Ich spreche kein Deutsch.   
6                        Was ist dein Name?   
7                          Es tut mir leid.   
8                          Woher kommst du?   
9                           Ich liebe dich.   
10                         Wie spät ist es?   
11                    Kannst du mir helfen?   
12                      Ich verstehe nicht.   
13                         Auf Wiedersehen!   
14                      Wo ist der Bahnhof?   
15                     Ich habe eine Frage.   
16                         Wie alt bist du?   
17                            Ich bin müde.   
18  Was machst du gerne in deiner Freizeit?   
19                             Was ist das?   
20           

## dataset1

In [54]:
df_dataset1

Unnamed: 0,German,English
0,Guten Morgen!,Good morning!
1,Wie geht es dir?,How are you?
2,Ich bin hungrig.,I am hungry.
3,"Entschuldigung, wo ist die Toilette?","Excuse me, where is the restroom?"
4,Wie viel kostet das?,How much does that cost?
5,Ich spreche kein Deutsch.,I don't speak German.
6,Was ist dein Name?,What is your name?
7,Es tut mir leid.,I'm sorry.
8,Woher kommst du?,Where are you from?
9,Ich liebe dich.,I love you.


## dataset_2

In [56]:
df_dataset2

Unnamed: 0,German,English
0,Die Sonne scheint am blauen Himmel über den Be...,The sun shines in the blue sky over the mounta...
1,Ich gehe mit meinem Hund im Park spazieren.,I walk with my dog in the park.
2,Wir essen gerne Pizza und trinken kühles Bier.,We enjoy eating pizza and drinking cold beer.
3,Meine Schwester liest ein Buch in ihrem Zimmer.,My sister is reading a book in her room.
4,Der Zug fährt pünktlich zum Bahnhof und bringt...,The train departs on time to the station and b...
5,Im Garten blühen bunte Blumen und grüne Sträuc...,"In the garden, colorful flowers and green bush..."
6,Er spielt Gitarre und singt schöne Lieder auf ...,He plays the guitar and sings beautiful songs ...
7,Die Kinder spielen fröhlich im Park und lachen...,The children play happily in the park and laug...
8,Der Kellner serviert köstliches Essen und erfr...,The waiter serves delicious food and refreshin...
9,Ich höre gerne klassische Musik und entspanne ...,I enjoy listening to classical music and relax...


In [57]:
device = "GPU"
def load_models(model_pth):
    """Rebuild the translation model and load trained weights from ``model_pth``.

    The architecture is hard-coded (d_model=512, 6 blocks, vocabulary of 60000,
    8 heads, d_ff=2048) and dropout is disabled. The weights are always
    deserialised onto the CPU; the model is then moved to CUDA when the
    module-level ``device`` is 'GPU' and CUDA is available, otherwise it stays
    on the CPU.

    Args:
        model_pth: path to a checkpoint holding a state dict for
            ``MachineTranslationTransformer``.

    Returns:
        The model in eval mode.
    """
    # Load the model from checkpoint
    model = MachineTranslationTransformer(
        d_model=512,
        n_blocks=6,
        src_vocab_size=60000,
        trg_vocab_size=60000,
        n_heads=8,
        d_ff=2048,
        dropout_proba=0)

    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    # Mapping to CPU first works on machines without CUDA as well.
    state_dict = torch.load(model_pth, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)

    # load_state_dict copies values into the existing parameters and does not
    # move the module, so the model has to be moved explicitly.
    if device=='GPU' and torch.cuda.is_available():
        model = model.to(torch.device('cuda'))
    model.eval()

    return model


In [58]:
from tokenizers import Tokenizer

# Paths of the checkpoint and tokenizer used for evaluation.
# Alternatives kept for reference:
#   model_path = "/kaggle/input/ckpt-99-epochs/model_ckpt_epoch99.pt"
#   tokenizer_pth = "/kaggle/input/740-tokeniser/tokenizer.json"
model_path = "/kaggle/input/project3/model_ckpt_best.pt"
tokenizer_pth = "/kaggle/input/project3/tokenizer.json"

# Load the model first, then the tokenizer.
model = load_models(model_path)
tokenizer=Tokenizer.from_file(tokenizer_pth)




In [65]:
import pandas as pd

# Tokens removed from the "clean" version of each model output.
special_tokens = [ '[BOS]','[UNK]','[EOS]']

translated_data = []
translated_data_with_EOS_BOS_UNK = []

# Translate every English sentence of dataset 1 and keep the raw model output
# as well as a copy without the special tokens.
for german_sentence, english_sentence in zip(df_dataset1['German'], df_dataset1['English']):
    out = model.translate(english_sentence, tokenizer)

    kept_tokens = [tok for tok in out.split() if tok not in special_tokens]
    out_clean = ' '.join(kept_tokens)

    translated_data.append([german_sentence, english_sentence, out_clean])
    translated_data_with_EOS_BOS_UNK.append([german_sentence, english_sentence, out])

# One frame without special tokens, one with the raw output.
translated_dataset1_no_enk = pd.DataFrame(
    translated_data,
    columns=['GERMAN(Given)', 'ENGLISH', 'German Translated by model'],
)
translated_dataset1 = pd.DataFrame(
    translated_data_with_EOS_BOS_UNK,
    columns=['ORGINAL GERMAN', 'ENGLISH', 'German Translated by model'],
)



In [66]:
translated_dataset1_no_enk

Unnamed: 0,GERMAN(Given),ENGLISH,German Translated by model
0,Guten Morgen!,Good morning!,sehr
1,Wie geht es dir?,How are you?,wie ist sie nicht!
2,Ich bin hungrig.,I am hungry.,ich habe fur den bericht
3,"Entschuldigung, wo ist die Toilette?","Excuse me, where is the restroom?",was ist die
4,Wie viel kostet das?,How much does that cost?,wie ist die
5,Ich spreche kein Deutsch.,I don't speak German.,ich habe nicht mehr als
6,Was ist dein Name?,What is your name?,was ist die
7,Es tut mir leid.,I'm sorry.,ich habe
8,Woher kommst du?,Where are you from?,was sind sie
9,Ich liebe dich.,I love you.,ich kann mich


## Translated Dataframe 1 with special tokens UNK EOS BOS

In [67]:
translated_dataset1

Unnamed: 0,ORGINAL GERMAN,ENGLISH,German Translated by model
0,Guten Morgen!,Good morning!,[BOS] sehr [UNK] [EOS]
1,Wie geht es dir?,How are you?,[BOS] wie ist sie nicht! [EOS]
2,Ich bin hungrig.,I am hungry.,[BOS] ich habe fur den bericht [UNK] [EOS]
3,"Entschuldigung, wo ist die Toilette?","Excuse me, where is the restroom?",[BOS] was ist die [UNK] [EOS]
4,Wie viel kostet das?,How much does that cost?,[BOS] wie ist die [UNK] [EOS]
5,Ich spreche kein Deutsch.,I don't speak German.,[BOS] ich habe nicht mehr als [UNK] [EOS]
6,Was ist dein Name?,What is your name?,[BOS] was ist die [UNK] [EOS]
7,Es tut mir leid.,I'm sorry.,[BOS] ich habe [UNK] [EOS]
8,Woher kommst du?,Where are you from?,[BOS] was sind sie [UNK] [EOS]
9,Ich liebe dich.,I love you.,[BOS] ich kann mich [UNK] [EOS]


## Testing on df2

In [70]:
import pandas as pd

# Tokens removed from the "clean" version of each model output.
special_tokens = [ '[BOS]','[UNK]','[EOS]']

translated_data = []
translated_data_with_EOS_BOS_UNK = []

# Translate every English sentence of dataset 2 and keep the raw model output
# as well as a copy without the special tokens.
for german_sentence, english_sentence in zip(df_dataset2['German'], df_dataset2['English']):
    out = model.translate(english_sentence, tokenizer)

    kept_tokens = [tok for tok in out.split() if tok not in special_tokens]
    out_clean = ' '.join(kept_tokens)

    translated_data.append([german_sentence, english_sentence, out_clean])
    translated_data_with_EOS_BOS_UNK.append([german_sentence, english_sentence, out])

# One frame without special tokens, one with the raw output. Both use the same
# label for the prediction column (the stray leading space in the second
# frame's label has been removed).
translated_dataset2_no_enk = pd.DataFrame(
    translated_data,
    columns=['GERMAN(Given)', 'ENGLISH', 'PREDICTED TRANSLATED GERMAN'],
)
translated_dataset2 = pd.DataFrame(
    translated_data_with_EOS_BOS_UNK,
    columns=['ORGINAL GERMAN', 'ENGLISH', 'PREDICTED TRANSLATED GERMAN'],
)




## Translated Dataframe 2: first without, then with the special tokens UNK EOS BOS

In [72]:
translated_dataset2_no_enk

Unnamed: 0,GERMAN(Given),ENGLISH,PREDICTED TRANSLATED GERMAN
0,Die Sonne scheint am blauen Himmel über den Be...,The sun shines in the blue sky over the mounta...,die in den letzten jahren hat die stadt
1,Ich gehe mit meinem Hund im Park spazieren.,I walk with my dog in the park.,ich bin in der nahe von
2,Wir essen gerne Pizza und trinken kühles Bier.,We enjoy eating pizza and drinking cold beer.,in der nahe und
3,Meine Schwester liest ein Buch in ihrem Zimmer.,My sister is reading a book in her room.,das ist sehr sehr sehr sehr gutes uhr fur die ...
4,Der Zug fährt pünktlich zum Bahnhof und bringt...,The train departs on time to the station and b...,der und der war der der
5,Im Garten blühen bunte Blumen und grüne Sträuc...,"In the garden, colorful flowers and green bush...",in der und sind die
6,Er spielt Gitarre und singt schöne Lieder auf ...,He plays the guitar and sings beautiful songs ...,er wurde am jahr 2009 und er wurde am geboren.
7,Die Kinder spielen fröhlich im Park und lachen...,The children play happily in the park and laug...,die finden in der karte und sehen die karte von .
8,Der Kellner serviert köstliches Essen und erfr...,The waiter serves delicious food and refreshin...,das hat sich in den und den in den ganzen zeit...
9,Ich höre gerne klassische Musik und entspanne ...,I enjoy listening to classical music and relax...,ich habe die und auf


In [73]:
translated_dataset2

Unnamed: 0,ORGINAL GERMAN,ENGLISH,PREDICTED TRANSLATED GERMAN
0,Die Sonne scheint am blauen Himmel über den Be...,The sun shines in the blue sky over the mounta...,[BOS] die [UNK] [UNK] in den letzten jahren ha...
1,Ich gehe mit meinem Hund im Park spazieren.,I walk with my dog in the park.,[BOS] ich bin in der nahe von [UNK] [EOS]
2,Wir essen gerne Pizza und trinken kühles Bier.,We enjoy eating pizza and drinking cold beer.,[BOS] in der nahe [UNK] [UNK] [UNK] und [UNK] ...
3,Meine Schwester liest ein Buch in ihrem Zimmer.,My sister is reading a book in her room.,[BOS] das [UNK] ist sehr sehr sehr sehr gutes ...
4,Der Zug fährt pünktlich zum Bahnhof und bringt...,The train departs on time to the station and b...,[BOS] der [UNK] und der [UNK] war der [UNK] de...
5,Im Garten blühen bunte Blumen und grüne Sträuc...,"In the garden, colorful flowers and green bush...",[BOS] in der [UNK] und [UNK] sind die [UNK] [U...
6,Er spielt Gitarre und singt schöne Lieder auf ...,He plays the guitar and sings beautiful songs ...,[BOS] er wurde am jahr 2009 und er wurde am [U...
7,Die Kinder spielen fröhlich im Park und lachen...,The children play happily in the park and laug...,[BOS] die [UNK] finden in der karte und sehen ...
8,Der Kellner serviert köstliches Essen und erfr...,The waiter serves delicious food and refreshin...,[BOS] das [UNK] [UNK] hat sich in den [UNK] un...
9,Ich höre gerne klassische Musik und entspanne ...,I enjoy listening to classical music and relax...,[BOS] ich habe die [UNK] und [UNK] auf [UNK] [...


## We took the first 80,000 sentences of the dataset and reserved 80 of them (0.1%) for testing. The model was trained with the same parameters as described in the paper, and validation.py was used to validate the final model.

## Since the model was trained on only 80,000 sentences, it does not translate accurately. With more computational power, the model could be trained on more data.

# References

### 1. Attention is all you need (https://arxiv.org/abs/1706.03762).
### 2. Dataset (https://huggingface.co/datasets/wmt/wmt14/viewer/de-en).
### 3. Source code (https://github.com/brandokoch/attention-is-all-you-need-paper/tree/master).
### 4. PyTorch (https://pytorch.org/docs/stable/index.html).
### 5. PyTorch transformer tutorial (https://pytorch.org/tutorials/beginner/transformer_tutorial.html).