# **EN-VI Machine Translation using Transformer Model**

## **Dataset**

In [1]:
!pip install -q datasets sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from datasets import load_dataset

data = load_dataset(
    "mt_eng_vietnamese",
    "iwslt2015-en-vi"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133318 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1269 [00:00<?, ? examples/s]

In [3]:
data['train']

Dataset({
    features: ['translation'],
    num_rows: 133318
})

In [4]:
data['train']['translation'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## **Tokenization**

In [5]:


from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [6]:
SRC_LANGUAGE = 'en'
TGT_LANGUAGE = 'vi'

token_transform = {}
vocab_transform = {}

token_transform[SRC_LANGUAGE] = get_tokenizer('basic_english')
token_transform[TGT_LANGUAGE] = get_tokenizer('basic_english')

UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

In [7]:
def yield_tokens(data_iter, lang):
    for data_sample in data_iter['translation']:
        yield token_transform[lang](data_sample[lang])


for lang in [SRC_LANGUAGE, TGT_LANGUAGE]:
    train_iter = data['train']

    # Create torchtext's Vocab object
    vocab_transform[lang] = build_vocab_from_iterator(
        yield_tokens(train_iter, lang),
        min_freq=1,
        specials=special_symbols,
        special_first=True
    )

    vocab_transform[lang].set_default_index(UNK_IDX)

In [8]:
vocab_transform[SRC_LANGUAGE].get_itos()[:10]

['<unk>', '<pad>', '<bos>', '<eos>', ',', '.', 'the', 'and', 'to', '&apos']

In [9]:
vocab_transform[TGT_LANGUAGE].get_itos()[:10]

['<unk>', '<pad>', '<bos>', '<eos>', ',', '.', 'và', 'tôi', 'là', 'một']

In [10]:
len(vocab_transform[SRC_LANGUAGE]), len(vocab_transform[TGT_LANGUAGE])

(47271, 21114)

## **Dataloader**

In [11]:
import torch
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for lang in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[lang] = sequential_transforms(
        token_transform[lang], # Tokenization
        vocab_transform[lang], # Numericalization
        tensor_transform # Add BOS/EOS and create tensor
    )

# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for sample in batch:
        src_sample, tgt_sample = sample[SRC_LANGUAGE], sample[TGT_LANGUAGE]
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample).to(dtype=torch.int64))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample).to(dtype=torch.int64))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX, batch_first=True)
    return src_batch, tgt_batch

In [12]:
from torch.utils.data import DataLoader

BATCH_SIZE = 8

train_dataloader = DataLoader(
    data['train']['translation'],
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn
)

valid_dataloader = DataLoader(
    data['validation']['translation'],
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn
)

test_dataloader = DataLoader(
    data['test']['translation'],
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn
)

## **Model**

In [None]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout,
                                       batch_first=True)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [None]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX)
    tgt_padding_mask = (tgt == PAD_IDX)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [None]:
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [None]:
src_ids, tgt_ids = next(iter(train_dataloader))
src_ids = src_ids.to(DEVICE)
tgt_ids = tgt_ids.to(DEVICE)
tgt_input = tgt_ids[:, :-1] # input for decoder
tgt_output = tgt_ids[:, 1:] # output for decoder
src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src_ids, tgt_input)
logits = transformer(
    src_ids, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask
)
loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_output.reshape(-1))

In [None]:
logits.shape

torch.Size([8, 77, 21114])

In [None]:
loss

tensor(10.0798, device='cuda:0', grad_fn=<NllLossBackward0>)

## **Trainer**

In [None]:
import time

def train_epoch(model, optimizer, criterion, train_dataloader, device):
    model.train()
    losses = []

    for src_ids, tgt_ids in train_dataloader:
        src_ids = src_ids.to(device)
        tgt_ids = tgt_ids.to(device)

        tgt_input = tgt_ids[:, :-1]
        tgt_output = tgt_ids[:, 1:]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src_ids, tgt_input)
        try:
            output = model(
                src_ids, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask
            )
        except:
            print(src_ids.shape, tgt_input.shape)

        optimizer.zero_grad()

        loss = criterion(
            output.reshape(-1, output.shape[-1]),
            tgt_output.reshape(-1))
        loss.backward()

        optimizer.step()
        losses.append(loss.item())

    return sum(losses) / len(losses)

def evaluate(model, data_loader, criterion, device):
    model.eval()
    losses = []
    with torch.no_grad():
        for src_ids, tgt_ids in data_loader:
            src_ids = src_ids.to(device)
            tgt_ids = tgt_ids.to(device)

            tgt_input = tgt_ids[:, :-1]
            tgt_output = tgt_ids[:, 1:]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src_ids, tgt_input)
            output = model(
                src_ids, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask
            )
            loss = criterion(
                output.reshape(-1, output.shape[-1]),
                tgt_output.reshape(-1)
            )
            losses.append(loss.item())
    return sum(losses) / len(losses)

def train(model, train_dataloader, valid_dataloader, optimizer, criterion, device, epochs):
    for epoch in range(1, epochs+1):
        start_time = time.time()
        train_loss = train_epoch(model, optimizer, criterion, train_dataloader, device)
        valid_loss = evaluate(model, valid_dataloader, criterion, device)
        end_time = time.time()
        print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {valid_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

## **Training**

In [None]:
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
transformer = transformer.to(DEVICE)

criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

epochs = 5
train(transformer, train_dataloader, valid_dataloader, optimizer, criterion, DEVICE, epochs)



Epoch: 1, Train loss: 4.619, Val loss: 4.193, Epoch time = 463.990s
Epoch: 2, Train loss: 3.981, Val loss: 3.880, Epoch time = 467.306s
Epoch: 3, Train loss: 3.731, Val loss: 3.703, Epoch time = 465.825s
Epoch: 4, Train loss: 3.570, Val loss: 3.605, Epoch time = 455.484s
Epoch: 5, Train loss: 3.457, Val loss: 3.524, Epoch time = 455.488s


## **Inference**

In [None]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(1))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word[-1].item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(1, -1)
    num_tokens = src.shape[1]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(
        vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

In [None]:
translate(transformer, "i go to school")

In [None]:
from tqdm import tqdm
import sacrebleu

pred_sentences, tgt_sentences = [], []
for sample in tqdm(data['test']['translation']):
    src_sentence = sample[SRC_LANGUAGE]
    tgt_sentence = sample[TGT_LANGUAGE]

    pred_sentence = translate(transformer, src_sentence)
    pred_sentences.append(pred_sentence)

    tgt_sentences.append(tgt_sentence)

bleu_score = sacrebleu.corpus_bleu(pred_sentences, [tgt_sentences], force=True)
bleu_score

100%|██████████| 1269/1269 [01:53<00:00, 11.14it/s]


BLEU = 6.91 46.8/16.8/6.4/2.4 (BP = 0.659 ratio = 0.706 hyp_len = 23819 ref_len = 33738)