# **En-Vi Neural Machine Translation using RNN and Transformer Models**

## **Dataset**

In [None]:
!pip install -q datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/484.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m307.2/484.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import load_dataset

ds = load_dataset("thainq107/iwslt2015-en-vi")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/522 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133317 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [None]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [None]:
ds['train'][0]

{'en': 'Rachel Pike : The science behind a climate headline',
 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}

## **Tokenizer**

In [None]:
import os
from tokenizers import Tokenizer, pre_tokenizers, trainers, models

# Word-level tokenizers, one per language. A single trainer configuration
# (and therefore the same special-token list) is used for both vocabularies.
trainer = trainers.WordLevelTrainer(
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"],
    min_frequency=2,
    vocab_size=15000,
)


def _fit_word_tokenizer(sentences, word_trainer, save_path):
    """Train a whitespace-split word-level tokenizer and save it to `save_path`."""
    tok = Tokenizer(models.WordLevel(unk_token="<unk>"))
    tok.pre_tokenizer = pre_tokenizers.Whitespace()
    tok.train_from_iterator(sentences, word_trainer)
    tok.save(save_path)
    return tok


# Train on the training split only, then persist each tokenizer as JSON.
tokenizer_en = _fit_word_tokenizer(ds["train"]["en"], trainer, "tokenizer_en.json")
tokenizer_vi = _fit_word_tokenizer(ds["train"]["vi"], trainer, "tokenizer_vi.json")

In [None]:
len(tokenizer_en.get_vocab()), len(tokenizer_vi.get_vocab())

(15000, 13684)

In [None]:
tokenizer_en.encode("how are you")

Encoding(num_tokens=3, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [None]:
tokenizer_en.encode("how are you").ids

[81, 27, 18]

In [None]:
tokenizer_vi.encode("bạn có khoẻ không").ids

[18, 9, 596, 14]

## **Encoding**

In [None]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained tokenizer files in PreTrainedTokenizerFast so they can be
# called on batches with padding / truncation. Both languages declare the same
# special tokens.
_SPECIAL_TOKENS = {
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "bos_token": "<bos>",
    "eos_token": "<eos>",
}
tokenizer_en = PreTrainedTokenizerFast(tokenizer_file="tokenizer_en.json", **_SPECIAL_TOKENS)
tokenizer_vi = PreTrainedTokenizerFast(tokenizer_file="tokenizer_vi.json", **_SPECIAL_TOKENS)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
len(tokenizer_en), len(tokenizer_vi)

(15000, 13684)

In [None]:
MAX_LEN = 75

def preprocess_function(examples):
    """Tokenize a batch of En-Vi pairs into fixed-length id sequences.

    Args:
        examples: batch dict with parallel lists under the keys "en" and "vi".

    Returns:
        dict with
          * "input_ids": source ids, padded / truncated to MAX_LEN;
          * "labels": target ids laid out as <bos> tokens... <eos> <pad>...,
            always exactly MAX_LEN long.
    """
    src_encodings = tokenizer_en(
        examples["en"], padding="max_length", truncation=True, max_length=MAX_LEN
    )

    # Truncate the raw target to MAX_LEN - 2 so that <bos> and <eos> always fit.
    # Appending "<eos>" as text before truncation drops the end marker from
    # every target longer than MAX_LEN.
    tgt_encodings = tokenizer_vi(
        examples["vi"],
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_LEN - 2,
    )
    bos_id = tokenizer_vi.bos_token_id
    eos_id = tokenizer_vi.eos_token_id
    pad_id = tokenizer_vi.pad_token_id

    labels = []
    for token_ids in tgt_encodings["input_ids"]:
        framed = [bos_id] + list(token_ids) + [eos_id]
        labels.append(framed + [pad_id] * (MAX_LEN - len(framed)))

    return {
        "input_ids": src_encodings["input_ids"],
        "labels": labels,
    }

preprocessed_ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

Map:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [None]:
tokenizer_vi.unk_token_id

1

In [None]:
tokenizer_vi.pad_token_id, tokenizer_vi.bos_token_id, tokenizer_vi.eos_token_id

(0, 2, 3)

In [None]:
preprocessed_ds['train']

Dataset({
    features: ['en', 'vi', 'input_ids', 'labels'],
    num_rows: 133317
})

In [None]:
print(preprocessed_ds['train'][0])

{'en': 'Rachel Pike : The science behind a climate headline', 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu', 'input_ids': [6675, 1, 57, 60, 339, 604, 13, 744, 5643, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [2, 1960, 66, 1157, 131, 8, 376, 113, 38, 417, 735, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


## **Model**

### **GRU**

In [None]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig

class Seq2SeqRNNConfig(PretrainedConfig):
    """Hyper-parameters of the GRU encoder-decoder translation model.

    Attributes:
        vocab_size_src: number of entries in the source embedding table.
        vocab_size_tgt: number of entries in the target embedding table and
            output projection.
        embedding_dim: width of the token embeddings.
        hidden_size: width of the GRU hidden state.
        dropout: dropout probability handed to the encoder.
    """

    def __init__(
        self,
        vocab_size_src=10000,
        vocab_size_tgt=10000,
        embedding_dim=128,
        hidden_size=128,
        dropout=0.1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size_src, self.vocab_size_tgt = vocab_size_src, vocab_size_tgt
        self.embedding_dim, self.hidden_size = embedding_dim, hidden_size
        self.dropout = dropout

class EncoderRNN(nn.Module):
    """Single-layer GRU encoder over embedded source tokens.

    Args:
        input_size: source vocabulary size (rows of the embedding table).
        embedding_dim: width of the token embeddings.
        hidden_size: width of the GRU hidden state.
        dropout_p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, embedding_dim, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_dim)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Encode a batch of token ids.

        Args:
            input: integer ids, batch x seq.

        Returns:
            (output, hidden): per-step GRU states (batch x seq x hidden_size)
            and the final state (1 x batch x hidden_size).
        """
        token_vectors = self.embedding(input)  # batch x seq x embedding_dim
        return self.gru(self.dropout(token_vectors))

class DecoderRNN(nn.Module):
    """Single-layer GRU decoder with a linear projection onto the target vocabulary.

    Args:
        hidden_size: width of the GRU hidden state.
        embedding_dim: width of the target token embeddings.
        output_size: target vocabulary size.
    """

    def __init__(self, hidden_size, embedding_dim, output_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=embedding_dim)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        # Language-model head: hidden state -> vocabulary logits.
        self.out = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input, hidden):
        """Advance the decoder over `input`, starting from `hidden`.

        Args:
            input: target token ids, batch x steps.
            hidden: initial GRU state, 1 x batch x hidden_size.

        Returns:
            (logits, hidden): vocabulary logits (batch x steps x output_size)
            and the GRU state after the last step.
        """
        embedded = self.embedding(input)
        states, hidden = self.gru(embedded, hidden)
        return self.out(states), hidden

class Seq2SeqRNNModel(PreTrainedModel):
    """GRU encoder-decoder trained with teacher forcing.

    Args:
        config: a Seq2SeqRNNConfig.
        tokenizer_en: tokenizer whose `bos_token_id` seeds the decoder.
            NOTE(review): the decoder consumes target-side ids; this works
            because both tokenizers were trained with the same special-token
            list, but passing the target tokenizer would be less fragile.
    """

    config_class = Seq2SeqRNNConfig

    def __init__(self, config, tokenizer_en):
        super().__init__(config)
        self.encoder = EncoderRNN(
            config.vocab_size_src, config.embedding_dim,
            config.hidden_size, config.dropout)
        self.decoder = DecoderRNN(
            config.hidden_size, config.embedding_dim, config.vocab_size_tgt)
        self.BOS_IDX = tokenizer_en.bos_token_id
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # id 0 is <pad>

    def forward(self, input_ids, labels):
        """Compute teacher-forced logits and the cross-entropy loss.

        Args:
            input_ids: source token ids, batch x src_len.
            labels: target token ids, batch x tgt_len (padded with 0).

        Returns:
            dict with "loss" (scalar) and "logits" (batch x tgt_len x vocab).
            Position i of the logits is scored against labels[:, i]; the
            decoder input at position i is <bos> for i == 0 and
            labels[:, i - 1] otherwise.
        """
        batch_size, _ = labels.shape
        # Only the final encoder state is used; it initialises the decoder.
        _, decoder_hidden = self.encoder(input_ids)

        # With teacher forcing every decoder input is known up front, so the
        # whole shifted sequence goes through the GRU in one call instead of
        # one single-step call per position.
        bos = torch.full(
            (batch_size, 1), self.BOS_IDX, dtype=torch.long, device=input_ids.device)
        decoder_input = torch.cat([bos, labels[:, :-1]], dim=1)
        logits, _ = self.decoder(decoder_input, decoder_hidden)  # B x S x Vocab

        loss = self.loss_fn(logits.reshape(-1, logits.shape[-1]), labels.reshape(-1))
        return {"loss": loss, "logits": logits}

In [None]:
# Build the GRU seq2seq model; vocabulary sizes come from the two tokenizers.
# NOTE(review): `config` and `model` are rebound by the Transformer section
# further down, so whichever of the two cells ran last decides what the
# Trainer cell trains.
config = Seq2SeqRNNConfig(
    vocab_size_src=len(tokenizer_en), vocab_size_tgt=len(tokenizer_vi)
)
model = Seq2SeqRNNModel(config, tokenizer_en)

In [None]:
model

Seq2SeqRNNModel(
  (encoder): EncoderRNN(
    (embedding): Embedding(15000, 128)
    (gru): GRU(128, 128, batch_first=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (decoder): DecoderRNN(
    (embedding): Embedding(13684, 128)
    (gru): GRU(128, 128, batch_first=True)
    (out): Linear(in_features=128, out_features=13684, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
)

### **Transformer**

In [None]:
import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig

def generate_square_subsequent_mask(sz, device):
    """Return an sz x sz float causal mask.

    Entries strictly above the diagonal are -inf; the diagonal and everything
    below it are 0.0.
    """
    blocked = torch.full((sz, sz), float('-inf'), device=device)
    return torch.triu(blocked, diagonal=1)

def create_mask(src, tgt, pad_idx=0):
    """Build the attention and padding masks for a source/target batch.

    Args:
        src: source token ids, batch x src_len.
        tgt: target token ids, batch x tgt_len.
        pad_idx: id treated as padding (defaults to 0).

    Returns:
        (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask), all boolean
        with True meaning "masked":
          * src_mask: src_len x src_len, all False (no restriction);
          * tgt_mask: tgt_len x tgt_len, True strictly above the diagonal so a
            position cannot attend to later positions;
          * src_padding_mask / tgt_padding_mask: True where the id is `pad_idx`.
    """
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]
    device = src.device

    # Causal mask built directly as booleans: column index > row index.
    steps = torch.arange(tgt_seq_len, device=device)
    tgt_mask = steps.unsqueeze(0) > steps.unsqueeze(1)
    src_mask = torch.zeros((src_seq_len, src_seq_len), dtype=torch.bool, device=device)

    src_padding_mask = (src == pad_idx)
    tgt_padding_mask = (tgt == pad_idx)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

class Seq2SeqTransformerConfig(PretrainedConfig):
    """Hyper-parameters of the Transformer translation model.

    Attributes:
        vocab_size_src: number of entries in the source embedding table.
        vocab_size_tgt: number of entries in the target embedding table and
            output projection.
        max_seq_length: number of rows in each learned position-embedding table.
        d_model: model width.
        num_heads: attention heads per layer.
        num_layers: number of encoder layers and of decoder layers.
        dropout: dropout probability passed to nn.Transformer.
    """

    def __init__(
        self,
        vocab_size_src=10000,
        vocab_size_tgt=10000,
        max_seq_length=50,
        d_model=256,
        num_heads=8,
        num_layers=6,
        dropout=0.1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size_src, self.vocab_size_tgt = vocab_size_src, vocab_size_tgt
        self.max_seq_length = max_seq_length
        self.d_model, self.num_heads, self.num_layers = d_model, num_heads, num_layers
        self.dropout = dropout

class Seq2SeqTransformerModel(PreTrainedModel):
    """Encoder-decoder Transformer with learned position embeddings.

    Token id 0 is treated as padding, both by the loss (`ignore_index=0`) and
    by the padding masks produced by `create_mask`.
    """

    config_class = Seq2SeqTransformerConfig

    def __init__(self, config):
        super().__init__(config)

        self.embedding_src = nn.Embedding(
            config.vocab_size_src, config.d_model)
        self.embedding_tgt = nn.Embedding(
            config.vocab_size_tgt, config.d_model)

        # Learned absolute positions; sequences longer than
        # config.max_seq_length cannot be embedded.
        self.position_embedding_src = nn.Embedding(
            config.max_seq_length, config.d_model)
        self.position_embedding_tgt = nn.Embedding(
            config.max_seq_length, config.d_model)

        self.transformer = nn.Transformer(
            d_model=config.d_model,
            nhead=config.num_heads,
            num_encoder_layers=config.num_layers,
            num_decoder_layers=config.num_layers,
            dropout=config.dropout,
            batch_first=True
        )

        # LM head: decoder states -> target-vocabulary logits.
        self.generator = nn.Linear(
            config.d_model, config.vocab_size_tgt
            )
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # Ignore PAD token

    @staticmethod
    def _embed(token_ids, token_embedding, position_embedding):
        """Sum token embeddings and position embeddings for a batch x seq id tensor."""
        positions = torch.arange(
            token_ids.shape[1], device=token_ids.device).unsqueeze(0)
        return token_embedding(token_ids) + position_embedding(positions)

    def forward(self, input_ids, labels):
        """Teacher-forced forward pass.

        Args:
            input_ids: source token ids, batch x src_len.
            labels: target token ids, batch x tgt_len; the decoder reads
                labels[:, :-1] and is scored against labels[:, 1:].

        Returns:
            dict with "loss" (scalar) and "logits"
            (batch x (tgt_len - 1) x vocab_size_tgt).
        """
        tgt_input = labels[:, :-1]
        tgt_output = labels[:, 1:]

        src_embedded = self._embed(
            input_ids, self.embedding_src, self.position_embedding_src)
        tgt_embedded = self._embed(
            tgt_input, self.embedding_tgt, self.position_embedding_tgt)

        src_mask, tgt_mask, src_key_padding_mask, tgt_key_padding_mask = create_mask(input_ids, tgt_input)

        outs = self.transformer(
            src_embedded, tgt_embedded,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            # The encoder output keeps the source padding positions; without
            # this mask the decoder's cross-attention attends to them.
            memory_key_padding_mask=src_key_padding_mask,
        )

        logits = self.generator(outs)
        # CrossEntropyLoss expects the class dimension second: B x Vocab x S.
        loss = self.loss_fn(logits.permute(0, 2, 1), tgt_output)

        return {"loss": loss, "logits": logits}

    def encode(self, src, src_mask):
        """Run only the encoder on `src` (batch x src_len) with attention mask `src_mask`."""
        src_embedded = self._embed(
            src, self.embedding_src, self.position_embedding_src)
        return self.transformer.encoder(src_embedded, src_mask)

    def decode(self, tgt, encoder_output, tgt_mask):
        """Run only the decoder on `tgt` (batch x tgt_len) against `encoder_output`.

        Returns decoder states; apply `self.generator` to obtain logits.
        """
        tgt_embedded = self._embed(
            tgt, self.embedding_tgt, self.position_embedding_tgt)
        return self.transformer.decoder(
            tgt_embedded, encoder_output, tgt_mask
        )

In [None]:
# Build the config; max_seq_length matches the MAX_LEN used when tokenizing.
config = Seq2SeqTransformerConfig(
    vocab_size_src=len(tokenizer_en), vocab_size_tgt=len(tokenizer_vi), max_seq_length=75
)

# Create the model.
# NOTE(review): this rebinds `config` / `model`, replacing the GRU model
# created in the section above.
model = Seq2SeqTransformerModel(config)

In [None]:
model

Seq2SeqTransformerModel(
  (embedding_src): Embedding(15000, 256)
  (embedding_tgt): Embedding(13684, 256)
  (position_embedding_src): Embedding(75, 256)
  (position_embedding_tgt): Embedding(75, 256)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((256,), eps=

### **Test Model**

In [None]:
input_ids = torch.tensor([preprocessed_ds['train'][0]['input_ids']])
labels = torch.tensor([preprocessed_ds['train'][0]['labels']])

In [None]:
input_ids

tensor([[6675,    1,   57,   60,  339,  604,   13,  744, 5643,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0]])

In [None]:
labels

tensor([[   2, 1960,   66, 1157,  131,    8,  376,  113,   38,  417,  735,    3,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0]])

In [None]:
pred = model(input_ids, labels)

In [None]:
pred

{'loss': tensor(9.5132, grad_fn=<NllLossBackward0>),
 'logits': tensor([[[ 0.3837,  0.0276,  0.3859,  ..., -0.2222, -0.3631, -0.1750],
          [ 0.3909,  0.0769,  0.3010,  ..., -0.1100, -0.3537, -0.2031],
          [-0.1219, -0.2498,  0.1721,  ..., -0.0878, -0.0597,  0.0384],
          ...,
          [ 0.2402,  0.4268,  0.3161,  ..., -0.2446,  0.3921,  0.1282],
          [ 0.2402,  0.4268,  0.3161,  ..., -0.2446,  0.3921,  0.1282],
          [ 0.2402,  0.4268,  0.3161,  ..., -0.2446,  0.3921,  0.1282]]],
        grad_fn=<CatBackward0>)}

## **Trainer**

In [None]:
# Disable wandb
import os
os.environ['WANDB_DISABLED'] = 'true'

# # Use wandb
# import wandb
# wandb.init(
#     project="en-vi-machine-translation",
#     name="transformer" # "gru"
# )

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration: evaluate, checkpoint and log once per epoch.
training_args = TrainingArguments(
    output_dir="./en-vi-machine-translation",
    logging_dir="logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=512,
    per_device_eval_batch_size=512,
    num_train_epochs=25,
    learning_rate=2e-5,
    save_total_limit=1,
    # report_to="wandb",
)

# The Trainer is built around whichever `model` is currently bound (GRU or
# Transformer, depending on which model cell ran last).
# NOTE(review): this rebinds `trainer`, which earlier held the tokenizer's
# WordLevelTrainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_ds["train"],
    eval_dataset=preprocessed_ds["validation"]
)

### **GRU**

In [None]:
trainer.train()

### **Transformer**

In [None]:
trainer.train()

## **Inference**

In [None]:
def greedy_decode(model, src, src_mask, max_len, start_symbol, device="cpu", end_symbol=3):
    """Greedily decode one target sequence from a model exposing encode/decode/generator.

    Args:
        model: object with `encode(src, src_mask)`, `decode(tgt, memory,
            tgt_mask)` and `generator(hidden)` (the LM head).
        src: source token ids, 1 x src_len.
        src_mask: encoder attention mask for `src`.
        max_len: maximum length of the returned sequence, start symbol included.
        start_symbol: id that seeds the output sequence.
        device: device on which decoding runs.
        end_symbol: id that stops decoding once generated (default 3, <eos>).

    Returns:
        LongTensor of shape 1 x L (L <= max_len) starting with `start_symbol`;
        the end symbol is included when it was generated.
    """
    src = src.to(device)
    src_mask = src_mask.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        memory = model.encode(src, src_mask).to(device)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=device)

        for _ in range(max_len - 1):
            # Boolean causal mask: True (blocked) where column index > row index.
            steps = torch.arange(ys.size(1), device=device)
            tgt_mask = steps.unsqueeze(0) > steps.unsqueeze(1)

            out = model.decode(ys, memory, tgt_mask)
            scores = model.generator(out[:, -1, :])  # LM head on the last position
            _, next_word = torch.max(scores, dim=1)
            next_word = next_word[-1].item()

            next_token = torch.full((1, 1), next_word, dtype=ys.dtype, device=device)
            ys = torch.cat([ys, next_token], dim=1)
            if next_word == end_symbol:
                break
    return ys

def translate(model, src_sentence, device=None):
    """Translate one English sentence to Vietnamese with greedy decoding.

    Args:
        model: translation model exposing `encode` / `decode` / `generator`.
        src_sentence: English input text.
        device: device to decode on; defaults to the device of the model's
            parameters when omitted.

    Returns:
        The decoded Vietnamese string with <bos>, <eos> and <pad> removed.
    """
    if device is None:
        device = next(model.parameters()).device
    model.eval()

    # The model embeds positions from a fixed-size table, so longer inputs or
    # outputs would index past it; clip both when the limit is known.
    max_positions = getattr(getattr(model, "config", None), "max_seq_length", None)
    tokenizer_kwargs = {"return_tensors": "pt"}
    if max_positions is not None:
        tokenizer_kwargs.update(truncation=True, max_length=max_positions)
    input_ids = tokenizer_en([src_sentence], **tokenizer_kwargs)['input_ids'].to(device)

    num_tokens = input_ids.shape[1]
    src_mask = torch.zeros((num_tokens, num_tokens), dtype=torch.bool, device=device)
    max_len = num_tokens + 5  # allow the translation to run a few tokens longer
    if max_positions is not None:
        max_len = min(max_len, max_positions)

    tgt_tokens = greedy_decode(
        model, input_ids, src_mask, max_len=max_len,
        start_symbol=tokenizer_vi.bos_token_id, device=device)

    # Strip the framing tokens so they do not end up in the output text;
    # <unk> is kept so unknown words stay visible.
    framing_ids = {
        tokenizer_vi.bos_token_id,
        tokenizer_vi.eos_token_id,
        tokenizer_vi.pad_token_id,
    }
    kept_ids = [t for t in tgt_tokens[0].detach().cpu().tolist() if t not in framing_ids]
    return tokenizer_vi.decode(kept_ids)

In [None]:
translate(model, "i go to school", model.device)

## **Evaluate**

In [None]:
!pip install -q sacrebleu==2.5.1

In [None]:
from tqdm import tqdm
import sacrebleu

# Translate every test sentence and score the corpus with BLEU.
pred_sentences, tgt_sentences = [], []
for sample in tqdm(ds['test']):
    # translate() takes the device explicitly; decode where the model lives.
    pred_sentences.append(translate(model, sample['en'], model.device))
    tgt_sentences.append(sample['vi'])

bleu_score = sacrebleu.corpus_bleu(pred_sentences, [tgt_sentences], force=True)
bleu_score

In [None]:
import sacrebleu

# Toy BLEU example on a single sentence pair. It uses its own variable names
# so it does not overwrite the test-set predictions and score computed above.
demo_predictions = ['tôi đang đi học']
demo_references = ['tôi đang đi tới trường']
demo_bleu = sacrebleu.corpus_bleu(
    demo_predictions, [demo_references], force=True
)
demo_bleu

BLEU = 46.31 75.0/66.7/50.0/50.0 (BP = 0.779 ratio = 0.800 hyp_len = 4 ref_len = 5)