In [None]:
# Environment setup (Colab): install the pinned torchtext release that provides
# the legacy `data.Field` / `data.Iterator` API used below, plus the
# tokenization and evaluation packages.
! pip -q install torchtext==0.6.0
! pip -q install pyvi
! pip install https://gitlab.com/trungtv/vi_spacy/-/raw/master/packages/vi_core_news_lg-3.6.0/dist/vi_core_news_lg-3.6.0.tar.gz
# NOTE(review): the install above pulls spaCy 3.6.x, and the `spacy link`
# command no longer exists in spaCy v3 — confirm whether this line still does
# anything. The visible cells tokenize with BPE and do not reference pyvi,
# spaCy or WordNet; verify they are needed later in the notebook.
! python -m spacy link vi_spacy_model vi_spacy_model
!pip install sacrebleu

import nltk
nltk.download('wordnet')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting https://gitlab.com/trungtv/vi_spacy/-/raw/master/packages/vi_core_news_lg-3.6.0/dist/vi_core_news_lg-3.6.0.tar.gz
  Downloading https://gitlab.com/trungtv/vi_spacy/-/raw/master/packages/vi_core_news_lg-3.6.0/dist/vi_core_news_lg-3.6.0.tar.gz (233.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.3/233.3 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spacy<3.7.0,>=3.6.0 (from vi_core_news_lg==3.6.0)
  Downloading spacy-3.6.1.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m32.6 MB/s[

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Mount Google Drive so the checkpoint under /content/drive can be read later.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import os
import math

#### **Training helpers**

In [None]:
class Embedder(nn.Module):
    """Learned token-embedding table.

    Maps integer token ids to dense vectors of width ``d_model``.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.vocab_size, self.d_model = vocab_size, d_model

        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Return the embedding vectors for the id tensor ``x`` (shape ``x.shape + (d_model,)``)."""
        token_vectors = self.embed(x)
        return token_vectors

# Embedder(100, 512)(torch.LongTensor([1,2,3,4])).shape

#### **Innovation: RoPE implementation**

In [None]:
class RoPE(nn.Module):
    """Rotary positional embedding (RoPE) using the "rotate-half" layout.

    The cos/sin tables are precomputed for ``max_len`` positions. Feature ``i``
    is paired with feature ``i + d_model // 2`` and each pair is rotated by a
    position-dependent angle, so the vector norm is preserved and position 0
    is left unchanged.

    Args:
        d_model: size of the last dimension of the tensors to rotate (the
            per-head dimension when used inside multi-head attention). Must
            be even.
        max_len: number of positions for which tables are cached.
        base: base of the geometric frequency progression.

    Raises:
        ValueError: if ``d_model`` is odd.
    """
    def __init__(self, d_model, max_len=512, base=10000):
        super().__init__()
        if d_model % 2 != 0:
            # An odd width would give tables of width d_model + 1 and only
            # fail later as an opaque broadcasting error in forward().
            raise ValueError(f"RoPE requires an even d_model, got {d_model}")
        self.d_model = d_model

        # One inverse frequency per rotated pair: base^(-2i / d_model).
        inv_freq = 1.0 / (base ** (torch.arange(0, d_model, 2).float() / d_model))
        self.register_buffer("inv_freq", inv_freq)

        # Outer product position x frequency -> rotation angles (max_len, d_model/2).
        t = torch.arange(max_len, dtype=torch.float)
        freqs = torch.einsum("i,j->ij", t, inv_freq)

        # Duplicate the angles so both halves of a pair share the same angle.
        emb = torch.cat((freqs, freqs), dim=-1)

        # Stored as (max_len, 1, 1, d_model); excluded from the state dict.
        self.register_buffer("cos_cached", emb.cos()[:, None, None, :], persistent=False)
        self.register_buffer("sin_cached", emb.sin()[:, None, None, :], persistent=False)

    def forward(self, x, seq_len):
        """Rotate ``x`` according to the position of each row along dim 2.

        Args:
            x: tensor of shape (batch, heads, seq_len, d_model).
            seq_len: number of positions to use; must not exceed the cached
                ``max_len``.

        Returns:
            Tensor with the same shape and dtype as ``x``.

        Raises:
            ValueError: if ``seq_len`` is larger than the cached table.
        """
        cached_len = self.cos_cached.size(0)
        if seq_len > cached_len:
            raise ValueError(
                f"RoPE was built for at most {cached_len} positions but got seq_len={seq_len}; "
                "increase max_len"
            )

        # (seq_len, 1, 1, D) -> (1, 1, seq_len, D) so it broadcasts over batch and heads.
        cos = self.cos_cached[:seq_len].to(x.device, dtype=x.dtype).permute(2, 1, 0, 3)
        sin = self.sin_cached[:seq_len].to(x.device, dtype=x.dtype).permute(2, 1, 0, 3)

        half = self.d_model // 2
        first_half = x[..., :half]
        second_half = x[..., half:]

        # rotate_half(x) = [-x2, x1]
        rotated_x = torch.cat((-second_half, first_half), dim=-1)

        return (x * cos) + (rotated_x * sin)

#### **Attention Mechanism**

In [None]:
def attention(q, k, v, mask=None, dropout=None):
    """Scaled dot-product attention.

    q: batch_size x head x q_length x d_k
    k: batch_size x head x k_length x d_k
    v: batch_size x head x k_length x d_k
    mask: optional tensor without the head axis; a head axis is inserted at
          dim 1 here, and positions where the mask equals 0 are suppressed.
    dropout: optional callable applied to the attention weights.

    Returns a tuple ``(output, weights)`` where ``output`` is
    batch_size x head x q_length x d_k and ``weights`` is the (possibly
    dropped-out) softmax-normalised attention matrix.
    """
    # Similarity of every query with every key, scaled by sqrt(d_k).
    scale = math.sqrt(q.size(-1))
    logits = torch.matmul(q, k.transpose(-2, -1)) / scale

    if mask is not None:
        # Masked positions get a large negative logit so softmax sends them to ~0.
        logits = logits.masked_fill(mask.unsqueeze(1) == 0, -1e9)

    # Normalise over the key axis.
    weights = F.softmax(logits, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, v), weights

# attention(torch.rand(32, 8, 30, 512), torch.rand(32, 8, 30, 512), torch.rand(32, 8, 30, 512)).shape

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class MultiHeadAttention(nn.Module):
    """Multi-head attention with rotary position embedding on queries and keys.

    Inputs are projected, split into ``heads`` heads of width
    ``d_model // heads``, rotated with RoPE (queries and keys only), attended,
    merged back and projected once more. The most recent attention weights
    are kept in ``self.attn``.
    """

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        assert d_model % heads == 0

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.attn = None

        # The rotation acts on the per-head dimension, so RoPE is sized by d_k.
        self.rope = RoPE(d_model=self.d_k, max_len=512)

        # Input projections for queries, keys and values.
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        q, k, v: batch_size x seq_length x d_model (q may have a different
                 length than k and v).
        mask: optional mask forwarded unchanged to ``attention``.
        Returns: batch_size x q_length x d_model.
        """
        batch = q.size(0)

        def split_heads(t):
            # (B, N, D) -> (B, H, N, D_k); -1 lets the length be inferred.
            return t.view(batch, -1, self.h, self.d_k).transpose(1, 2)

        q = split_heads(self.q_linear(q))
        k = split_heads(self.k_linear(k))
        v = split_heads(self.v_linear(v))

        # Rotate queries and keys by their own positions; values are untouched.
        q = self.rope(q, seq_len=q.size(2))
        k = self.rope(k, seq_len=k.size(2))

        context, self.attn = attention(q, k, v, mask, self.dropout)

        # (B, H, N, D_k) -> (B, N, D)
        merged = context.transpose(1, 2).contiguous().view(batch, -1, self.d_model)

        return self.out(merged)

In [None]:
class Norm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift.

    Uses ``Tensor.std`` with its default settings and adds ``eps`` to the
    standard deviation (not to the variance).
    """

    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model

        # Learnable gain (starts at 1) and offset (starts at 0).
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))

        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last axis and apply the gain and offset."""
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

#### **Innovation: Using SwiGLU as activation unit**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedForward(nn.Module):
    """Position-wise feed-forward block with a SwiGLU gate.

    Computes ``w2(dropout(silu(w(x)) * v(x)))``.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()

        # Parallel input projections: `w` feeds the SiLU gate, `v` stays linear.
        self.w = nn.Linear(d_model, d_ff)
        self.v = nn.Linear(d_model, d_ff)

        # Projection back to the model width.
        self.w2 = nn.Linear(d_ff, d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the gated feed-forward transform to the last dimension of ``x``."""
        gate = F.silu(self.w(x))
        gated = gate * self.v(x)
        return self.w2(self.dropout(gated))

In [None]:
# Prelayer Normalization
class EncoderLayer(nn.Module):
    """One pre-norm encoder layer: self-attention and feed-forward sub-layers.

    Each sub-layer is applied to a normalised copy of its input and added
    back to the un-normalised input through dropout (residual connection).
    """

    def __init__(self, d_model, heads, d_ff, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, d_ff, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the layer on ``x`` using ``mask`` for the self-attention."""
        # Sub-layer 1: self-attention over the normalised input.
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn(normed, normed, normed, mask))

        # Sub-layer 2: feed-forward over the normalised result.
        return x + self.dropout_2(self.ff(self.norm_2(x)))

In [None]:
import torch
import torch.nn as nn

# Prelayer Norm

class DecoderLayer(nn.Module):
    """One pre-norm decoder layer.

    Three residual sub-layers, each fed a normalised copy of its input:
    masked self-attention, attention over the encoder output, and the
    feed-forward network.
    """

    def __init__(self, d_model, heads, d_ff, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)  # before masked self-attention
        self.norm_2 = Norm(d_model)  # before encoder-decoder attention
        self.norm_3 = Norm(d_model)  # before the feed-forward network

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)  # self-attention
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)  # cross-attention
        self.ff = FeedForward(d_model, d_ff, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        """Run the layer.

        x: decoder hidden states.
        e_outputs: encoder output, used as keys and values in cross-attention.
        src_mask: mask applied in the cross-attention.
        trg_mask: mask applied in the self-attention.
        """
        # Masked self-attention.
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(normed, normed, normed, trg_mask))

        # Cross-attention: queries from the decoder, keys/values from the encoder.
        normed = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(normed, e_outputs, e_outputs, src_mask))

        # Feed-forward.
        return x + self.dropout_3(self.ff(self.norm_3(x)))

In [None]:
import copy

def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    return nn.ModuleList(copy.deepcopy(module) for _ in range(N))

class Encoder(nn.Module):
    """Stack of ``N`` encoder layers on top of a token embedding.

    No positional encoding is added to the embeddings here; position is
    injected by RoPE inside each attention module.
    """

    def __init__(self, vocab_size, d_model, N, heads, d_ff, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads, d_ff, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        """
        src: batch_size x seq_length (token ids)
        mask: batch_size x 1 x seq_length
        output: batch_size x seq_length x d_model
        """
        hidden = self.embed(src)
        for layer in self.layers:
            hidden = layer(hidden, mask)
        # Pre-norm layers leave the residual stream un-normalised, so normalise once at the end.
        return self.norm(hidden)

# Encoder(232, 512,6,8,0.1)(torch.LongTensor(32, 30).random_(0, 10), torch.rand(32, 1, 30)).shape

In [None]:
class Decoder(nn.Module):
    """Stack of ``N`` decoder layers on top of a token embedding.

    No positional encoding is added to the embeddings here; position is
    injected by RoPE inside each attention module.
    """

    def __init__(self, vocab_size, d_model, N, heads, d_ff, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.layers = get_clones(DecoderLayer(d_model, heads, d_ff, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        """
        trg: batch_size x trg_length (token ids)
        e_outputs: batch_size x src_length x d_model
        src_mask: batch_size x 1 x src_length
        trg_mask: broadcastable to batch_size x trg_length x trg_length
        output: batch_size x trg_length x d_model
        """
        hidden = self.embed(trg)
        for layer in self.layers:
            hidden = layer(hidden, e_outputs, src_mask, trg_mask)
        return self.norm(hidden)

# Decoder(232, 512, 6, 8, 0.1)(torch.LongTensor(32, 30).random_(0, 10), torch.rand(32, 30, 512), torch.rand(32, 1, 30), torch.rand(32, 1, 30)).shape

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder translation model with a linear vocabulary projection."""

    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, d_ff, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, d_ff, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, d_ff, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        """
        src: batch_size x src_length
        trg: batch_size x trg_length
        src_mask: batch_size x 1 x src_length
        trg_mask: broadcastable to batch_size x trg_length x trg_length
        output: batch_size x trg_length x vocab_size (unnormalised logits)
        """
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(trg, memory, src_mask, trg_mask)
        return self.out(decoded)

# Transformer(232, 232, 512, 6, 8, 0.1)(torch.LongTensor(32, 30).random_(0, 10), torch.LongTensor(32, 30).random_(0, 10),torch.rand(32, 1, 30),torch.rand(32, 1, 30)).shape

#### **Preparing the dataset**

In [None]:
# Interactive Kaggle authentication; must succeed before the dataset download below.
import kagglehub

kagglehub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [None]:
# Run to use VLSP dataset
import kagglehub

# Download latest version
# `path` is the local directory (inside the kagglehub cache) holding the files.
path = kagglehub.dataset_download("nguynvitcng21020173/vlsp-2025-data")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/nguynvitcng21020173/vlsp-2025-data?dataset_version_number=1...


100%|██████████| 49.1M/49.1M [00:00<00:00, 162MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/nguynvitcng21020173/vlsp-2025-data/versions/1


In [None]:
!cp -r /root/.cache/kagglehub/datasets/nguynvitcng21020173/vlsp-2025-data/versions/1 /content/VLSP_data

In [None]:
# Hold out the first 10,000 parallel lines of the training corpus as the
# validation set, and keep the remainder (line 10,001 onward) for training.
# The same line ranges are taken from both files so the pairs stay aligned.
!head -n 10000 '/content/VLSP_data/train.en.txt' > '/content/VLSP_data/valid.en'
!head -n 10000 '/content/VLSP_data/train.vi.txt' > '/content/VLSP_data/valid.vi'
!tail -n +10001 '/content/VLSP_data/train.en.txt' > '/content/VLSP_data/train_new.en'
!tail -n +10001 '/content/VLSP_data/train.vi.txt' > '/content/VLSP_data/train_new.vi'

#### **Innovation: Using BPE as tokenizer**

In [None]:
from tokenizers import ByteLevelBPETokenizer
import os

# Initialize
tokenizer = ByteLevelBPETokenizer()

# Train on both source and target training files
# A single shared vocabulary is learned from the English and Vietnamese
# training text (the validation split is not included).
tokenizer.train(
    files=["/content/VLSP_data/train_new.en", "/content/VLSP_data/train_new.vi"],  # both languages
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>"]
)

os.makedirs("bpe_tokenizer", exist_ok=True)

# Save tokenizer
# Writes vocab.json and merges.txt into the directory (see the cell output).
tokenizer.save_model("bpe_tokenizer")

['bpe_tokenizer/vocab.json', 'bpe_tokenizer/merges.txt']

In [None]:
from tokenizers import ByteLevelBPETokenizer

# Reload the tokenizer from the saved vocabulary and merge files; this is the
# instance used for all encoding and decoding below.
bpe_tokenizer = ByteLevelBPETokenizer(
    "bpe_tokenizer/vocab.json",
    "bpe_tokenizer/merges.txt"
)

# Integer ids of the special tokens, used for padding, masks and decoding.
PAD_ID = bpe_tokenizer.token_to_id("<pad>")
SOS_ID = bpe_tokenizer.token_to_id("<s>")
EOS_ID = bpe_tokenizer.token_to_id("</s>")

In [None]:
# Register the special tokens on the *reloaded* tokenizer. `bpe_tokenizer` is
# rebuilt from vocab.json/merges.txt only, and it is the instance used by
# encode()/decode(..., skip_special_tokens=True) below; the training-time
# `tokenizer` object already had these tokens declared when it was trained.
bpe_tokenizer.add_special_tokens(["<s>", "<pad>", "</s>", "<unk>"])

0

In [None]:
from torchtext import data

class MyIterator(data.Iterator):
    """torchtext iterator that groups examples of similar length into batches.

    Only ``create_batches`` is overridden; it fills ``self.batches`` using the
    iterator's ``batch_size``, ``batch_size_fn`` and ``sort_key``.
    """
    def create_batches(self):
        """Build ``self.batches`` for one pass over the data."""
        if self.train:
            # Training: take chunks of 100 * batch_size, sort each chunk by
            # sort_key, cut it into batches, and yield those batches in the
            # order produced by random_shuffler.
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)

        else:
            # Evaluation: batch the data in its given order and sort the
            # examples inside each batch by sort_key.
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                          self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))

global max_src_in_batch, max_tgt_in_batch

def batch_size_fn(new, count, sofar):
    """Return the padded token count of the batch after adding ``new``.

    Tracks, in module-level state, the longest source and target seen in the
    batch being built (target length counts two extra tokens). The state is
    reset whenever ``count == 1``, i.e. when a new batch starts. The result is
    ``count`` times the larger of the two running maxima; ``sofar`` is unused.
    """
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = max_tgt_in_batch = 0

    src_len = len(new.src)
    tgt_len = len(new.trg) + 2
    if src_len > max_src_in_batch:
        max_src_in_batch = src_len
    if tgt_len > max_tgt_in_batch:
        max_tgt_in_batch = tgt_len

    return max(count * max_src_in_batch, count * max_tgt_in_batch)

In [None]:
def nopeak_mask(size, device):
    """Build the causal ("no peek") mask used by the decoder.

    Position ``i`` may attend to positions ``<= i`` only, so that during
    training the model cannot see future target tokens.

    Args:
        size: target sequence length.
        device: device on which the mask is created.

    Returns:
        Boolean tensor of shape (1, size, size); True on and below the diagonal.
    """
    # Built directly on the target device: no NumPy round trip and no
    # deprecated autograd.Variable wrapper.
    future = torch.triu(torch.ones((1, size, size), device=device), diagonal=1)
    return future == 0

def create_masks(src, trg, src_pad, trg_pad, device):
    """Create the attention masks for the encoder and the decoder.

    Args:
        src: source token ids, batch_size x src_length.
        trg: decoder input token ids, batch_size x trg_length, or None.
        src_pad: padding id in ``src``.
        trg_pad: padding id in ``trg``.
        device: device on which the causal mask is created.

    Returns:
        ``(src_mask, trg_mask)``. ``src_mask`` is batch_size x 1 x src_length
        and is False at padding positions so attention skips them.
        ``trg_mask`` is batch_size x trg_length x trg_length, combining the
        padding mask with the causal mask; it is None when ``trg`` is None.
    """
    src_mask = (src != src_pad).unsqueeze(-2)

    if trg is None:
        return src_mask, None

    trg_pad_mask = (trg != trg_pad).unsqueeze(-2)
    # nopeak_mask already places the mask on `device`; the former
    # `np_mask.cuda()` call discarded its result and had no effect.
    causal_mask = nopeak_mask(trg.size(1), device)
    trg_mask = trg_pad_mask & causal_mask

    return src_mask, trg_mask

In [None]:
def init_vars(src, model, device, k, max_len):
    """Run the encoder and the first decoding step to seed beam search.

    Returns ``(outputs, e_outputs, log_scores)``:
      outputs: k x max_len LongTensor; column 0 holds the start token and
               column 1 the k most probable first tokens.
      e_outputs: encoder output repeated k times along the batch axis.
      log_scores: 1 x k log-probabilities of those first tokens.
    """
    # Encode the source once, ignoring padding positions.
    pad_mask = (src != PAD_ID).unsqueeze(-2)
    memory = model.encoder(src, pad_mask)

    # Decode a single step from the start-of-sentence token.
    start = torch.LongTensor([[SOS_ID]]).to(device)
    first_mask = nopeak_mask(1, device)
    logits = model.out(model.decoder(start, memory, pad_mask, first_mask))
    probs = F.softmax(logits, dim=-1)

    # k best first tokens and their log-probabilities.
    top_probs, top_ids = probs[:, -1].topk(k)
    log_scores = torch.log(top_probs)

    # One row per beam: <s> followed by that beam's first token.
    outputs = torch.zeros(k, max_len, dtype=torch.long).to(device)
    outputs[:, 0] = SOS_ID
    outputs[:, 1] = top_ids[0]

    # Every beam attends over the same encoder output.
    e_outputs = memory.repeat(k, 1, 1)

    return outputs, e_outputs, log_scores

def k_best_outputs(outputs, out, log_scores, i, k):
    """Advance every beam by one token and keep the k best continuations.

    outputs: k x max_len token buffer (updated in place and returned).
    out: k x i x vocab_size next-token probabilities; only the last step is used.
    log_scores: 1 x k cumulative log-probabilities of the current beams.
    i: column of ``outputs`` to fill.
    Returns ``(outputs, log_scores)`` with ``log_scores`` again 1 x k.
    """
    # Best k continuations of each beam: both tensors are k x k.
    step_probs, step_tokens = out[:, -1].topk(k)
    candidate_scores = torch.log(step_probs) + log_scores.transpose(0, 1)

    # Best k of the k*k candidates overall.
    best_scores, flat_idx = candidate_scores.view(-1).topk(k)
    parent_beam = flat_idx // k
    choice = flat_idx % k

    # Copy the surviving prefixes, then append the chosen token to each.
    outputs[:, :i] = outputs[parent_beam, :i]
    outputs[:, i] = step_tokens[parent_beam, choice]

    return outputs, best_scores.unsqueeze(0)

def beam_search(src, model, device, k, max_len):
    """
    Beam search decoding (BPE version)

    Decodes ``src`` (a 1 x src_length id tensor) with beam width ``k`` for at
    most ``max_len`` target positions and returns the decoded text of the
    selected hypothesis.
    """

    # Encoder pass plus first decoding step; columns 0 and 1 of `outputs` are filled.
    outputs, e_outputs, log_scores = init_vars(
        src, model, device, k, max_len
    )

    eos_tok = EOS_ID
    src_mask = (src != PAD_ID).unsqueeze(-2)

    # Index of the chosen beam; stays None unless every beam produced EOS.
    ind = None

    for i in range(2, max_len):

        trg_mask = nopeak_mask(i, device)

        # Re-decode the full prefix of every beam to get next-token probabilities.
        out = model.out(
            model.decoder(outputs[:, :i], e_outputs, src_mask, trg_mask)
        )

        out = F.softmax(out, dim=-1)

        outputs, log_scores = k_best_outputs(
            outputs, out, log_scores, i, k
        )

        # Check EOS positions
        # NOTE(review): beams that already emitted EOS are still extended and
        # re-scored on later steps — confirm this is intended.
        eos_positions = (outputs == eos_tok).nonzero(as_tuple=False)
        sentence_lengths = torch.zeros(k, dtype=torch.long, device=device)

        # Record the column of the first EOS in each beam (0 means "no EOS yet").
        for row, col in eos_positions:
            if sentence_lengths[row] == 0:
                sentence_lengths[row] = col

        num_finished = (sentence_lengths > 0).sum().item()

        # Stop once all k beams contain an EOS and pick the best
        # length-normalised score (log score / length ** alpha).
        if num_finished == k:
            alpha = 0.7
            scores = log_scores / (sentence_lengths.float() ** alpha)
            _, ind = scores.max(dim=1)
            ind = ind.item()
            break

    # Decode best hypothesis
    # Falls back to beam 0 when the loop ended without all beams finishing.
    if ind is None:
        best = outputs[0]
    else:
        best = outputs[ind]

    # Cut at EOS if exists
    eos_pos = (best == eos_tok).nonzero(as_tuple=False)
    length = eos_pos[0].item() if len(eos_pos) > 0 else max_len

    # Skip the start token at column 0 and everything from EOS onward.
    return bpe_tokenizer.decode(
        best[1:length].tolist(),
        skip_special_tokens=True
    )

In [None]:
def translate_sentence(sentence, model, device, k, max_len):
    """
    Translate one sentence using beam search (BPE version)

    Args:
        sentence: raw source text.
        model: the translation model; it is switched to eval mode.
        device: device on which decoding runs.
        k: beam width.
        max_len: maximum number of target positions to decode.

    Returns:
        The decoded translation as a string.
    """
    model.eval()

    # 1. BPE tokenize -> IDs
    src_ids = bpe_tokenizer.encode(sentence).ids

    # 2. Convert to a 1 x src_length tensor
    src_tensor = torch.LongTensor([src_ids]).to(device)

    # 3. Beam search. Inference needs no gradients; disabling autograd here
    #    keeps memory flat even when the caller did not do so itself.
    with torch.no_grad():
        output = beam_search(
            src_tensor,
            model,
            device=device,
            k=k,
            max_len=max_len
        )

    return output

In [None]:
def bpe_tokenize(text):
    """Encode ``text`` with the shared BPE tokenizer and return its token ids."""
    encoding = bpe_tokenizer.encode(text)
    return encoding.ids

In [None]:
from torchtext import data

def create_fields():
    """Create the torchtext fields for the source and target sides.

    Both fields tokenize straight to BPE ids (no torchtext vocabulary) and pad
    with ``PAD_ID``; the target field additionally wraps every sequence in the
    start and end tokens.
    """
    shared = dict(
        tokenize=bpe_tokenize,
        use_vocab=False,
        pad_token=PAD_ID,
    )

    SRC = data.Field(**shared)
    TRG = data.Field(init_token=SOS_ID, eos_token=EOS_ID, **shared)

    return SRC, TRG

In [None]:
# Size of the shared BPE vocabulary; used for both the source and the target
# side of the model and for the loss.
vocab_size = bpe_tokenizer.get_vocab_size()
pad_idx = PAD_ID
print(vocab_size)
pad_idx

30000


1

In [None]:
import os
import dill as pickle
import pandas as pd

def read_data(src_file, trg_file):
    """Read a parallel corpus from two line-aligned text files.

    Args:
        src_file: path to the source-language file (one sentence per line).
        trg_file: path to the target-language file (one sentence per line).

    Returns:
        ``(src_data, trg_data)``: two lists of lines. Leading and trailing
        whitespace of each file as a whole is stripped before splitting.
    """
    def _read_lines(path):
        # Explicit UTF-8 (the corpus contains Vietnamese text) and a context
        # manager so the handle is closed deterministically.
        with open(path, encoding='utf-8') as handle:
            return handle.read().strip().split('\n')

    src_data = _read_lines(src_file)
    trg_data = _read_lines(trg_file)

    return src_data, trg_data

def create_dataset(src_data, trg_data, max_len, batchsize, device, SRC, TRG, istrain=True):
    """Build a batching iterator over a parallel corpus.

    Args:
        src_data: list of source sentences.
        trg_data: list of target sentences, aligned with ``src_data``.
        max_len: pairs are kept only if both sides have fewer than this many
            BPE tokens.
        batchsize: batch size handed to the iterator; it is measured with
            ``batch_size_fn`` rather than as a number of sentences.
        device: device on which batches are created.
        SRC, TRG: torchtext fields for the two sides.
        istrain: enables training mode and shuffling in the iterator.

    Returns:
        A ``MyIterator`` over the filtered sentence pairs.
    """
    df = pd.DataFrame({'src': src_data, 'trg': trg_data})

    # Drop pairs where either side is too long.
    short_enough = (
        df['src'].apply(lambda x: len(bpe_tokenize(x)) < max_len) &
        df['trg'].apply(lambda x: len(bpe_tokenize(x)) < max_len)
    )
    df = df[short_enough]

    # torchtext loads from disk, so round-trip through a temporary CSV.
    tmp_path = "translate_transformer_temp.csv"
    df.to_csv(tmp_path, index=False)

    try:
        data_fields = [('src', SRC), ('trg', TRG)]
        dataset = data.TabularDataset(
            path=tmp_path,
            format="csv",
            fields=data_fields,
            # to_csv writes a "src,trg" header row; without this flag the
            # header would be loaded as a sentence pair.
            skip_header=True
        )
    finally:
        # The dataset is fully read by its constructor; always clean up.
        os.remove(tmp_path)

    iterator = MyIterator(
        dataset,
        batch_size=batchsize,
        device=device,
        repeat=False,
        sort_key=lambda x: (len(x.src), len(x.trg)),
        batch_size_fn=batch_size_fn,
        train=istrain,
        shuffle=istrain
    )

    return iterator

In [None]:
# Run configuration.
# - 'batchsize' is passed to the iterator together with batch_size_fn, so it
#   is a budget in padded tokens rather than a number of sentences.
# - 'max_strlen' is the BPE-token length limit used when filtering sentence
#   pairs and the maximum decoding length passed to BLEU evaluation.
# - NOTE(review): 'lr', 'src_lang' and 'trg_lang' are not read by any visible
#   cell (the learning rate comes from ScheduledOptim) — confirm before relying
#   on them.
opt = {
    'train_src_data':'/content/VLSP_data/train_new.en',
    'train_trg_data':'/content/VLSP_data/train_new.vi',
    'valid_src_data':'/content/VLSP_data/valid.en',
    'valid_trg_data':'/content/VLSP_data/valid.vi',
    'src_lang':'en',
    'trg_lang':'en',#'vi_spacy_model',
    'max_strlen':160,
    'batchsize':1500,
    'device':'cuda',
    'd_model': 512,
    'n_layers': 6,
    'heads': 8,
    'dropout': 0.1,
    'lr':0.0001,
    'epochs':30,
    'printevery': 200,
    'k':5,
    'd_ff': 2048
}

In [None]:
# Load the raw sentence lists for the training and validation splits.
train_src_data, train_trg_data = read_data(opt['train_src_data'], opt['train_trg_data'])
valid_src_data, valid_trg_data = read_data(opt['valid_src_data'], opt['valid_trg_data'])

# SRC, TRG = create_fields(opt['src_lang'], opt['trg_lang'])
# Both fields use the shared BPE tokenizer, so no language arguments are needed.
SRC, TRG = create_fields()
# Training iterator shuffles; validation iterator keeps a fixed order.
train_iter = create_dataset(train_src_data, train_trg_data, opt['max_strlen'], opt['batchsize'], opt['device'], SRC, TRG, istrain=True)
valid_iter = create_dataset(valid_src_data, valid_trg_data, opt['max_strlen'], opt['batchsize'], opt['device'], SRC, TRG, istrain=False)

In [None]:
# Raw sentence lists of the public test split (not used by the visible training cells).
test_src_data, test_trg_data = read_data('/content/VLSP_data/public_test.en.txt', '/content/VLSP_data/public_test.vi.txt')

In [None]:
# Padding ids for source and target; identical because the vocabulary is shared.
src_pad = bpe_tokenizer.token_to_id("<pad>")
trg_pad = bpe_tokenizer.token_to_id("<pad>")

In [None]:
def step(model, optimizer, batch, criterion, step_num, device):
    """Run one optimisation step on ``batch``.

    Returns a dict with the scalar training ``"loss"`` and the incremented
    ``"step"`` counter (``step_num + 1``).
    """
    model.train()

    # torchtext batches are (seq_len, batch); the model expects (batch, seq_len).
    src = batch.src.transpose(0, 1).to(device)
    trg = batch.trg.transpose(0, 1).to(device)

    # Teacher forcing: feed all but the last token, predict all but the first.
    decoder_in = trg[:, :-1]
    gold = trg[:, 1:].contiguous().view(-1)

    # Source and target share the same padding id.
    src_mask, trg_mask = create_masks(src, decoder_in, PAD_ID, PAD_ID, device)

    logits = model(src, decoder_in, src_mask, trg_mask)

    optimizer.zero_grad()
    loss = criterion(logits.view(-1, logits.size(-1)), gold)
    loss.backward()
    optimizer.step_and_update_lr()

    return {"loss": loss.item(), "step": step_num + 1}

In [None]:
def validate(model, valid_iter, criterion, device):
    """Compute validation loss and perplexity.

    The loss is the unweighted mean of the per-batch criterion values and the
    perplexity is ``exp`` of that mean. Returns ``(avg_loss, perplexity)``.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in valid_iter:
            # (seq_len, batch) -> (batch, seq_len)
            src = batch.src.transpose(0, 1).to(device)
            trg = batch.trg.transpose(0, 1).to(device)

            decoder_in = trg[:, :-1]
            gold = trg[:, 1:].contiguous().view(-1)

            src_mask, trg_mask = create_masks(src, decoder_in, PAD_ID, PAD_ID, device)
            logits = model(src, decoder_in, src_mask, trg_mask)

            batch_loss = criterion(logits.view(-1, logits.size(-1)), gold)
            batch_losses.append(batch_loss.item())

    avg_loss = float(np.mean(batch_losses))
    return avg_loss, math.exp(avg_loss)


In [None]:
class ScheduledOptim():
    '''Optimizer wrapper that sets the learning rate before every step.

    The rate is ``init_lr * d_model**-0.5 * min(n**-0.5, n * warmup**-1.5)``
    where ``n`` is the number of steps taken so far (counted from 1).
    '''

    def __init__(self, optimizer, init_lr, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.init_lr = init_lr
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def zero_grad(self):
        "Clear the gradients held by the wrapped optimizer"
        self._optimizer.zero_grad()

    def step_and_update_lr(self):
        "Update the learning rate, then step the wrapped optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def _get_lr_scale(self):
        "Schedule factor for the current step count"
        warmup_term = self.n_steps * self.n_warmup_steps ** (-1.5)
        decay_term = self.n_steps ** (-0.5)
        return (self.d_model ** -0.5) * min(decay_term, warmup_term)

    def _update_learning_rate(self):
        ''' Advance the step counter and write the new rate into every param group '''
        self.n_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()

        for group in self._optimizer.param_groups:
            group['lr'] = new_lr

    def state_dict(self):
        "Schedule settings plus the wrapped optimizer's state"
        return {
            'init_lr': self.init_lr,
            'd_model': self.d_model,
            'n_warmup_steps': self.n_warmup_steps,
            'n_steps': self.n_steps,
            '_optimizer': self._optimizer.state_dict(),
        }

    def load_state_dict(self, state_dict):
        "Restore the schedule settings and the wrapped optimizer's state"
        for field in ('init_lr', 'd_model', 'n_warmup_steps', 'n_steps'):
            setattr(self, field, state_dict[field])

        self._optimizer.load_state_dict(state_dict['_optimizer'])

In [None]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a label-smoothed target distribution.

    The gold class receives ``1 - smoothing``; the remaining mass is spread
    as ``smoothing / (classes - 2)`` per class, the padding class always gets
    zero, and rows whose target is the padding id contribute zero loss. The
    result is the mean over all rows, padding rows included.
    """

    def __init__(self, classes, padding_idx, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim
        self.padding_idx = padding_idx

    def forward(self, pred, target):
        """``pred``: (rows, classes) logits; ``target``: (rows,) class ids."""
        log_probs = pred.log_softmax(dim=self.dim)

        # The target distribution is a constant: build it outside autograd.
        with torch.no_grad():
            true_dist = torch.full_like(log_probs, self.smoothing / (self.cls - 2))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
            # Never assign probability to the padding class ...
            true_dist[:, self.padding_idx] = 0
            # ... and blank out rows whose gold label is padding.
            pad_rows = target.data == self.padding_idx
            true_dist[pad_rows] = 0.0

        per_row = torch.sum(-true_dist * log_probs, dim=self.dim)
        return torch.mean(per_row)

In [None]:
# Build the model; source and target use the same BPE vocabulary size.
model = Transformer(
    vocab_size,          # src vocab size
    vocab_size,          # trg vocab size (shared BPE)
    opt['d_model'],
    opt['n_layers'],
    opt['heads'],
    opt['d_ff'],
    opt['dropout']
)

# Xavier-uniform initialisation for every parameter with more than one
# dimension (weight matrices, including the embedding tables); vectors such as
# biases and the Norm gain/offset keep their default initialisation.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

model = model.to(opt['device'])

In [None]:
# Adam wrapped in the warmup schedule: the wrapper overwrites Adam's learning
# rate before every step (init_lr multiplier 1.0, 4000 warmup steps).
optimizer = ScheduledOptim(
        torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        1.0, opt['d_model'], 4000) #previously 0.2

# Label-smoothed loss over the shared vocabulary, ignoring padding targets.
criterion = LabelSmoothingLoss(vocab_size, padding_idx=PAD_ID, smoothing=0.1)

In [None]:
def save_checkpoint(model, optimizer, epoch, step, best_bleu, path="checkpoint.pth"):
    """Save the training state to ``path``.

    The checkpoint holds the model and optimizer state dicts, the epoch and
    step counters, the best BLEU score so far and the CPU RNG state. The CUDA
    RNG state is added only when CUDA is available, matching the optional
    handling of ``cuda_rng_state`` when a checkpoint is loaded.

    Args:
        model: module exposing ``state_dict()``.
        optimizer: optimizer (or wrapper) exposing ``state_dict()``.
        epoch: epoch index to record.
        step: global step count to record.
        best_bleu: best validation BLEU to record.
        path: destination file.
    """
    checkpoint = {
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "epoch": epoch,
        "step": step,
        "best_bleu": best_bleu,
        "rng_state": torch.get_rng_state(),
    }
    # torch.cuda.get_rng_state() fails on CPU-only runtimes, so query it only
    # when a GPU is actually present.
    if torch.cuda.is_available():
        checkpoint["cuda_rng_state"] = torch.cuda.get_rng_state()

    torch.save(checkpoint, path)
    print(f"Checkpoint saved to {path}")

In [None]:
import torch

def load_checkpoint(path, model, optimizer, device):
    """Restore training state from ``path``.

    Loads the model and optimizer state dicts in place and tries to restore
    the CPU (and, when present and available, CUDA) RNG state; a failure to
    restore an RNG state only prints a warning.

    Returns ``(epoch, step, best_bleu)`` from the checkpoint, or
    ``(0, 0, 0.0)`` when the file does not exist.
    """
    print(f"Loading checkpoint from {path}...")
    try:
        # Tensors are mapped onto `device` while loading.
        checkpoint = torch.load(path, map_location=device)
    except FileNotFoundError:
        print(f"Error: Checkpoint file not found at {path}")
        return 0, 0, 0.0

    model.load_state_dict(checkpoint["model_state"])
    optimizer.load_state_dict(checkpoint["optimizer_state"])
    epoch, step, best_bleu = (
        checkpoint[key] for key in ("epoch", "step", "best_bleu")
    )

    # The RNG setters expect a CPU ByteTensor, but map_location may have moved
    # the saved state elsewhere, so convert it back first.
    try:
        torch.set_rng_state(checkpoint["rng_state"].type(torch.ByteTensor))
    except Exception as e:
        print(f"Warning: Could not restore CPU RNG state. Error: {e}")

    if 'cuda_rng_state' in checkpoint and torch.cuda.is_available():
        try:
            torch.cuda.set_rng_state(checkpoint["cuda_rng_state"].type(torch.ByteTensor))
        except Exception as e:
             print(f"Warning: Could not restore CUDA RNG state. Error: {e}")

    print(f"Checkpoint successfully loaded.")
    print(f"  - Resuming from Epoch: {epoch}, Step: {step}")
    print(f"  - Best BLEU Score recorded: {best_bleu:.4f}")

    return epoch, step, best_bleu

In [None]:
# Checkpoint to resume from, stored on the mounted Google Drive.
checkpoint_path = "/content/drive/My Drive/VLSP_best_model.pth"

In [None]:
# Restore model/optimizer state in place and recover the saved counters;
# load_checkpoint returns (0, 0, 0.0) when the file is missing.
start_epoch, start_step, best_bleu = load_checkpoint(
    path=checkpoint_path,
    model=model,
    optimizer=optimizer,
    device=opt['device']
)

Loading checkpoint from /content/drive/My Drive/VLSP_best_model.pth...
Checkpoint successfully loaded.
  - Resuming from Epoch: 2, Step: 37813
  - Best BLEU Score recorded: 46.6980


In [None]:
patience = 3          # stop after 3 epochs without BLEU improvement
min_delta = 1e-4      # minimum BLEU improvement to count
wait = 0              # how many epochs we have waited

#### **Training steps and tools**

#### **Preparing for BLEU score calculation**

In [None]:
import re

def detokenize(text: str) -> str:
    """Normalise spacing in ``text`` before BLEU scoring.

    Removes ``@@`` subword markers, deletes whitespace before ``? . ! , ; :``,
    deletes whitespace on either side of an apostrophe, collapses runs of
    whitespace into a single space, and trims the ends.
    """
    # Subword continuation markers (with or without the trailing space).
    text = text.replace('@@ ', '').replace('@@', '')

    # Regex rewrites, applied in order.
    rewrites = (
        (r'\s+([?.!,;:])', r'\1'),   # no space before punctuation
        (r"\s+'", "'"),              # no space before an apostrophe
        (r"'\s+", "'"),              # no space after an apostrophe
        (r'\s+', ' '),               # collapse remaining whitespace
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [None]:
import sacrebleu

def bleu_sacre(valid_src_data, valid_trg_data, model, device, k=5, max_len=80):
    """Corpus-level sacreBLEU score of the model's translations.

    Every source sentence is translated with ``translate_sentence`` and passed
    through ``detokenize``; the references get the same ``detokenize``
    treatment before scoring.

    Args:
        valid_src_data: Iterable of source sentences.
        valid_trg_data: Iterable of reference translations.
        model: Translation model; it is switched to eval mode and left there.
        device: Device forwarded to ``translate_sentence``.
        k: Forwarded to ``translate_sentence``.
        max_len: Forwarded to ``translate_sentence``.

    Returns:
        The ``score`` attribute of the ``sacrebleu.corpus_bleu`` result.
    """
    model.eval()

    with torch.no_grad():
        hypotheses = [
            detokenize(translate_sentence(src_sentence, model, device, k, max_len))
            for src_sentence in valid_src_data
        ]

    references = [detokenize(target) for target in valid_trg_data]

    return sacrebleu.corpus_bleu(hypotheses, [references]).score

In [None]:
import time

# Resume the global step counter from the checkpoint instead of restarting at 0.
# step() receives this value, so anything keyed on the step count
# (presumably the learning-rate schedule — confirm) continues where it left off.
step_num = start_step

# best_bleu is intentionally NOT reset here: it keeps the value returned by
# load_checkpoint, so a resumed epoch that scores lower than the restored best
# cannot overwrite the saved best model.

# Training for RoPE + PreLayer Norm
for epoch in range(start_epoch, opt['epochs']):
    total_loss = 0

    for i, batch in enumerate(train_iter):
        s = time.time()

        # step() returns a dict holding the batch loss and the updated step counter
        info = step(model, optimizer, batch, criterion, step_num, opt['device'])
        loss = info['loss']
        step_num = info['step']

        total_loss += loss

        # Report the mean loss over the last `printevery` batches; the printed
        # time is the duration of the most recent batch only.
        if (i + 1) % opt['printevery'] == 0:
            avg_loss = total_loss / opt['printevery']
            print('epoch: {:03d} - iter: {:05d} - train loss: {:.4f} - time: {:.4f}'.format(
                epoch, i, avg_loss, time.time() - s
            ))
            total_loss = 0

    # Validation
    s = time.time()
    valid_loss, perplexity = validate(model, valid_iter, criterion, opt['device'])
    # BLEU is measured on the first 500 validation pairs with k=1.
    bleuscore = bleu_sacre(
        valid_src_data[:500], valid_trg_data[:500],
        model, opt['device'], 1, opt['max_strlen']
    )

    print('epoch: {:03d} - iter: {:05d} - valid loss: {:.4f} - bleu score: {:.4f} - perplexity: {:.4f} - time: {:.4f}'.format(
        epoch, i, valid_loss, bleuscore, perplexity, time.time() - s
    ))

    # ---- EARLY STOPPING LOGIC ----
    # An epoch counts as an improvement only if BLEU beats the best by more than min_delta.
    if bleuscore > best_bleu + min_delta:
        best_bleu = bleuscore
        wait = 0

        # save ONLY the best model
        save_checkpoint(
            path="VLSP_best_model.pth",
            model=model,
            optimizer=optimizer,
            epoch=epoch,
            step=step_num,
            best_bleu=best_bleu
        )
    else:
        wait += 1
        print(f"No BLEU improvement for {wait}/{patience} epochs")

    if wait >= patience:
        print(f"Early stopping triggered at epoch {epoch}")
        break

epoch: 002 - iter: 00199 - train loss: 2.4238 - time: 0.2503
epoch: 002 - iter: 00399 - train loss: 2.4355 - time: 0.2404
epoch: 002 - iter: 00599 - train loss: 2.4247 - time: 0.2571
epoch: 002 - iter: 00799 - train loss: 2.4316 - time: 0.2715
epoch: 002 - iter: 00999 - train loss: 2.4491 - time: 0.2426
epoch: 002 - iter: 01199 - train loss: 2.4343 - time: 0.2643
epoch: 002 - iter: 01399 - train loss: 2.4289 - time: 0.2596
epoch: 002 - iter: 01599 - train loss: 2.4162 - time: 0.2493
epoch: 002 - iter: 01799 - train loss: 2.4582 - time: 0.2615
epoch: 002 - iter: 01999 - train loss: 2.4274 - time: 0.2554
epoch: 002 - iter: 02199 - train loss: 2.4320 - time: 0.2729
epoch: 002 - iter: 02399 - train loss: 2.4588 - time: 0.2493
epoch: 002 - iter: 02599 - train loss: 2.4565 - time: 0.2767
epoch: 002 - iter: 02799 - train loss: 2.4622 - time: 0.2646
epoch: 002 - iter: 02999 - train loss: 2.4649 - time: 0.2589
epoch: 002 - iter: 03199 - train loss: 2.4523 - time: 0.2609
epoch: 002 - iter: 03399

KeyboardInterrupt: 

In [None]:
# Mount Google Drive at /content/drive (same mount point as at the top of the
# notebook) so the files written in the following cells land on Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy a checkpoint from the Colab VM to Google Drive.
# NOTE(review): the training loop above saves "VLSP_best_model.pth"; confirm that
# /content/VLSP_best_model_final.pth is actually produced before relying on this copy.
!cp /content/VLSP_best_model_final.pth /content/drive/MyDrive/


In [None]:
# Save only the model's state_dict to Google Drive (no optimizer or training state).
torch.save(
    model.state_dict(),
    "/content/drive/MyDrive/VLSP_model_dict.pt"
)

In [None]:
# Spot-check translation of one English sentence.
# The literal is double-quoted so the apostrophe in "card's" is kept: the previous
# 'card''s' form was two adjacent string literals, which Python concatenates into
# "cards" (the apostrophe was silently dropped).
sentence = "The determinants of knowledge and practices in public health service utilization among health insurance card's holders were distance and time taken to health services, time of health insurance and health insurance information provided."
trans_sent = translate_sentence(sentence, model, opt['device'], opt['k'], opt['max_strlen'])
trans_sent

'Các yếu tố liên quan đến kiến thức, thực hành sử dụng dịch vụ y tế công cộng của người dân bảo hiểm y tế là khoảng cách và thời gian tham gia dịch vụ y tế, thời gian bảo hiểm y tế và thông tin bảo hiểm y tế được cung cấp.'

In [None]:
# Spot-check translation of one English sentence, decoding with opt['k'] and opt['max_strlen'].
sentence='The purpose of this study was to evaluate the effects of a mixture extract of C chrysantha and G pentaphyllum on weight loss and lowering lipid blood levels in obese Swiss mice.'
trans_sent = translate_sentence(sentence, model, opt['device'], opt['k'], opt['max_strlen'])
trans_sent

'Mục đích của nghiên cứu này là đánh giá tác động của hỗn hợp cao phối hợp C. chrysantha và G. tuần hoàn trong việc giảm cân và hạ lipid máu trên chuột nhắt trắng bị béo phì.'

In [None]:
# Spot-check translation of a longer sentence containing abbreviations and parentheses.
sentence='Mice in each group was assessed for weight weekly and the levels of Total Cholesterol (CT), HDLCholesterol (HDL-C), LDL-Cholesterol (LDL-C) and Triglyceride (TC) was recorded at initial time (after obesity was induced for 8 weeks) and 1 hour after taking the extracted mixtures on the last day.'
trans_sent = translate_sentence(sentence, model, opt['device'], opt['k'], opt['max_strlen'])
trans_sent

'Mỗi nhóm được đánh giá cân nặng hàng tuần và nồng độ Cholesterol toàn phần (CT), HDL cholesterol (HDL-C), LDL-C (LDL-C) và triglyceride (TC) được ghi nhận vào thời điểm ban đầu (sau khi gây béo phì trong 8 tuần) và 1 giờ sau khi uống hỗn hợp dịch chiết vào ngày cuối.'

In [None]:
# Spot-check translation of a two-sentence input passed as a single string.
sentence='Conclusion: The proportion of proton pump inhibitors was not safe and reasonable and the proportion of prescription drugs with no instructions on how long to use proton pump inhibitors were low. The proportion of prescription interacting drugs accounted for a high proportion, clopidogrel was the most interactive drug commonly used with PPIs.'
trans_sent = translate_sentence(sentence, model, opt['device'], opt['k'], opt['max_strlen'])
trans_sent

'Kết luận: Tỷ lệ sử dụng thuốc ức chế bơm proton không an toàn và hợp lý và tỷ lệ thuốc theo toa không có hướng dẫn sử dụng thuốc ức chế bơm proton còn thấp, tỷ lệ các thuốc có tác dụng tương tác thuốc chiếm tỷ lệ cao, clopidogrel là thuốc tương tác được sử dụng phổ biến nhất với PPI.'

In [None]:
# Read the test set; presumably the two arguments are the source and target file paths.
# NOTE(review): the first path is an empty string and the second points at the
# IWSLT tst2013 file — confirm both are the intended test data for this model.
test_src_data, test_trg_data = read_data('', '/content/KC4.0_MultilingualNMT/data/iwslt_en_vi/tst2013.vi')

In [None]:
# Corpus BLEU on at most the first 10000 test pairs, decoding with opt['k'] and opt['max_strlen'].
score = bleu_sacre(test_src_data[:10000], test_trg_data[:10000], model, opt['device'], opt['k'], opt['max_strlen'])
print(score)

45.276843593577155


In [None]:
import sacrebleu
import torch

def ter_sacre(valid_src_data, valid_trg_data, model, device, k=5, max_len=80):
    """Corpus-level TER (translation edit rate) of the model's translations.

    Hypotheses come from ``translate_sentence`` followed by ``detokenize``;
    references are passed through ``detokenize`` as well.

    Args:
        valid_src_data: Iterable of source sentences.
        valid_trg_data: Iterable of reference translations.
        model: Translation model; it is switched to eval mode and left there.
        device: Device forwarded to ``translate_sentence``.
        k: Forwarded to ``translate_sentence``.
        max_len: Forwarded to ``translate_sentence``.

    Returns:
        The ``score`` attribute of the sacrebleu TER corpus score (lower is better).
    """
    model.eval()

    with torch.no_grad():
        hypotheses = [
            detokenize(translate_sentence(src_sentence, model, device, k, max_len))
            for src_sentence in valid_src_data
        ]

    references = [detokenize(target) for target in valid_trg_data]

    ter_metric = sacrebleu.metrics.TER()
    return ter_metric.corpus_score(hypotheses, [references]).score


In [None]:
# TER on at most the first 1000 test pairs.
# NOTE(review): k and max_len are left at ter_sacre's defaults (5, 80) here, whereas the
# BLEU cell passes opt['k'] / opt['max_strlen'] and scores up to 10000 pairs — confirm
# the two metrics are meant to use different decoding settings and subsets.
ter_score = ter_sacre(test_src_data[:1000], test_trg_data[:1000], model, opt['device'])
print("TER:", ter_score)

TER: 46.93931837073982


In [None]:
!pip install nltk



In [None]:
import nltk
# Download the NLTK WordNet and Open Multilingual WordNet data
# (presumably required by nltk's meteor_score used further down — confirm).
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
def tokenize_for_meteor(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [None]:
def detokenize_pred(text: str) -> str:
    """Strip BPE continuation markers from a model prediction.

    Only the ``@@`` markers are removed and the ends are trimmed; punctuation
    and interior spacing are deliberately left untouched.

    Args:
        text: Raw prediction string.

    Returns:
        The prediction without BPE markers and without leading/trailing whitespace.
    """
    merged = text
    # Remove the "@@ " form first, then any marker that has no trailing space.
    for marker in ('@@ ', '@@'):
        merged = merged.replace(marker, '')
    return merged.strip()


In [None]:
from nltk.translate.meteor_score import meteor_score
import torch

def meteor_corpus(src_data, trg_data, model, device, k=5, max_len=80):
    """Mean sentence-level METEOR of the model's translations.

    Each source sentence is translated with ``translate_sentence``, stripped of
    BPE markers with ``detokenize_pred`` and whitespace-tokenized; the
    reference is whitespace-tokenized as-is. The per-sentence scores from
    ``meteor_score`` are averaged.

    Args:
        src_data: Iterable of source sentences.
        trg_data: Iterable of reference translations, aligned with ``src_data``.
        model: Translation model; it is switched to eval mode and left there.
        device: Device forwarded to ``translate_sentence``.
        k: Forwarded to ``translate_sentence``.
        max_len: Forwarded to ``translate_sentence``.

    Returns:
        The average METEOR score, or ``0.0`` when there are no sentence pairs
        (previously this case raised ``ZeroDivisionError``).
    """
    model.eval()
    scores = []

    with torch.no_grad():
        # zip() stops at the shorter input, so unpaired trailing items are ignored.
        for src, ref in zip(src_data, trg_data):
            pred = translate_sentence(src, model, device, k, max_len)

            pred_tokens = detokenize_pred(pred).split()
            ref_tokens = ref.split()  # original reference text, not detokenized

            scores.append(meteor_score([ref_tokens], pred_tokens))

    # Nothing was scored: avoid dividing by zero.
    if not scores:
        return 0.0

    return sum(scores) / len(scores)



In [None]:
# Mean METEOR on at most the first 1000 test pairs, using meteor_corpus's default k and max_len.
meteor = meteor_corpus(test_src_data[:1000], test_trg_data[:1000], model, opt['device'])
print("METEOR:", meteor)

METEOR: 0.6612023580010491
