In [37]:
import os
import re
import torch
import torch.nn as nn
import torch.optim as optim
import math
import numpy as np
from torch.utils.data import Dataset, DataLoader, Subset
from collections import Counter
from torch.utils.data import random_split, DataLoader
from torch.optim.lr_scheduler import LambdaLR
from torchmetrics.text import BLEUScore
import spacy
from typing import List

# Load Train data

In [2]:
import sentencepiece as spm
import os

# 1. ƒê∆∞·ªùng d·∫´n file model
model_file = "/kaggle/input/fist-model-10-ep/pytorch/default/1/spm_en_vi.model"

sp = spm.SentencePieceProcessor()
sp.load(model_file)
print(">>> ƒê√£ load Tokenizer th√†nh c√¥ng!")

print(f"K√≠ch th∆∞·ªõc Vocab: {sp.get_piece_size()}")

# 5. Test th·ª≠
text_test = "Hello Vietnam"
print(f"Test encode '{text_test}': {sp.encode_as_ids(text_test)}")
print(f"Test decode: {sp.decode(sp.encode_as_ids(text_test))}")

>>> ƒê√£ load Tokenizer th√†nh c√¥ng!
K√≠ch th∆∞·ªõc Vocab: 16000
Test encode 'Hello Vietnam': [11183, 9082]
Test decode: Hello Vietnam


In [41]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, Subset

# ƒê·ªãnh nghƒ©a c√°c ID ƒë·∫∑c bi·ªát kh·ªõp v·ªõi l√∫c train SentencePiece
PAD_ID = 0
UNK_ID = 1
BOS_ID = 2
EOS_ID = 3

class IWSLTDataset(Dataset):
    def __init__(self, src_file, tgt_file):
        self.src_lines = []
        self.tgt_lines = []
        
        print("ƒêang ƒë·ªçc v√† l·ªçc d·ªØ li·ªáu...")
        with open(src_file, 'r', encoding='utf-8') as fs, \
             open(tgt_file, 'r', encoding='utf-8') as ft:
            
            # D√πng zip ƒë·ªÉ ƒë·ªçc song song 2 file c√πng l√∫c
            # N·∫øu 2 file l·ªách d√≤ng g·ªëc, zip s·∫Ω t·ª± d·ª´ng ·ªü file ng·∫Øn h∆°n -> Tr√°nh crash
            for s_line, t_line in zip(fs, ft):
                s_clean = s_line.strip()
                t_clean = t_line.strip()
                
                # Logic quan tr·ªçng: Ch·ªâ l·∫•y khi C·∫¢ 2 ƒë·ªÅu c√≥ d·ªØ li·ªáu
                if s_clean and t_clean:
                    self.src_lines.append(s_clean)
                    self.tgt_lines.append(t_clean)
        
        print(f"Ho√†n t·∫•t load data. S·ªë c·∫∑p c√¢u h·ª£p l·ªá: {len(self.src_lines)}")

    def __len__(self):
        return len(self.src_lines)

    def __getitem__(self, idx):
        return self.src_lines[idx], self.tgt_lines[idx]


def collate_batch(batch):
    """
    H√†m n√†y x·ª≠ l√Ω m·ªôt batch d·ªØ li·ªáu:
    1. Tokenize text th√†nh list of IDs.
    2. Th√™m BOS (Start) v√† EOS (End) tokens.
    3. Pad (ƒëi·ªÅn s·ªë 0) ƒë·ªÉ c√°c c√¢u c√≥ ƒë·ªô d√†i b·∫±ng nhau.
    """
    src_batch, tgt_batch = [], []
    
    for src_text, tgt_text in batch:
        # Tokenize v√† th√™m BOS/EOS
        src_ids = [BOS_ID] + sp.encode_as_ids(src_text) + [EOS_ID]
        tgt_ids = [BOS_ID] + sp.encode_as_ids(tgt_text) + [EOS_ID]
        
        src_batch.append(torch.tensor(src_ids, dtype=torch.long))
        tgt_batch.append(torch.tensor(tgt_ids, dtype=torch.long))
    
    # Pad sequence: t·∫°o tensor h√¨nh ch·ªØ nh·∫≠t (Batch_Size, Max_Len)
    src_padded = pad_sequence(src_batch, batch_first=True, padding_value=PAD_ID)
    tgt_padded = pad_sequence(tgt_batch, batch_first=True, padding_value=PAD_ID)
    
    return src_padded, tgt_padded

# --- C·∫§U H√åNH V√Ä T·∫†O DATALOADER ---

# ƒê∆∞·ªùng d·∫´n file (s·ª≠ d·ª•ng file b·∫°n ƒë√£ t·∫°o ra ·ªü cell ƒë·∫ßu ti√™n)
train_src_file = "/kaggle/input/medicaldataset-vlsp/MedicalDataset_VLSP/train.en.txt"
train_tgt_file = "/kaggle/input/medicaldataset-vlsp/MedicalDataset_VLSP/train.vi.txt"

# T·∫°o Dataset
full_train_dataset = Subset(IWSLTDataset(train_src_file, train_tgt_file),range(5000))

# T·∫°o DataLoader
BATCH_SIZE = 16 # B·∫°n c√≥ th·ªÉ gi·∫£m xu·ªëng 16 n·∫øu b·ªã tr√†n RAM (OOM)

ƒêang ƒë·ªçc v√† l·ªçc d·ªØ li·ªáu...
Ho√†n t·∫•t load data. S·ªë c·∫∑p c√¢u h·ª£p l·ªá: 500000


In [42]:
# 1. T√≠nh to√°n k√≠ch th∆∞·ªõc 90% train, 10% validation
val_size = int(len(full_train_dataset) * 0.1)
train_size = len(full_train_dataset) - val_size

# 2. T√°ch dataset
# D√πng manual_seed ƒë·ªÉ ƒë·∫£m b·∫£o l·∫ßn n√†o ch·∫°y c≈©ng chia y h·ªát nhau
train_subset, val_subset = random_split(
    full_train_dataset, 
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42) 
)

# 3. T·∫°o DataLoader t·ª´ c√°c t·∫≠p con
train_loader = DataLoader(train_subset,num_workers=2,
                          batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_batch)
val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE,num_workers=2,   
                        shuffle=False, collate_fn=collate_batch)

print(f"Chia xong: {len(train_subset)} c√¢u train, {len(val_subset)} c√¢u val.")
print(f"-> S·∫µn s√†ng d√πng `train_loader` v√† `val_loader` ƒë·ªÉ hu·∫•n luy·ªán.")

Chia xong: 4500 c√¢u train, 500 c√¢u val.
-> S·∫µn s√†ng d√πng `train_loader` v√† `val_loader` ƒë·ªÉ hu·∫•n luy·ªán.


# X√¢y d·ª±ng Ki·∫øn tr√∫c Transformer

In [5]:
import copy
# clone helper
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [6]:
def attention(query, key, value, mask= None, dropout= None):
    """
    query, key, value: (batch_size x heads x seq_len x d_k )
    mask: ?
    dropout: nn.Dropout module from whichever module using this function
    returns attention output (batch_size x heads x seq_len x d_k)
      and weights
    """

    d_k = query.size(-1) # get head size
    # score = query x key_transpose/ head_size
    score = torch.matmul(query, key.transpose(-2, -1) / math.sqrt(d_k))

    if mask is not None:
        score = score.masked_fill(mask == 0, float('-inf')) # big negative

    # softmax to probabilities
    score = nn.functional.softmax(score, dim = -1)

    if dropout is not None:
        score = dropout(score)
    output = torch.matmul(score, value) # (seq_len x seq_len) @ (seq_len x d_model), no transpose :)
    return output, score # don't really need score

In [7]:
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [8]:
class RotaryPositionalEmbedding(nn.Module):
    def __init__(self, dim, base=10000):
        super().__init__()
        self.dim = dim
        self.base = base
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, seq_len, *, device=None, dtype=None):
        if device is None:
            device = self.inv_freq.device
        t = torch.arange(seq_len, device=device, dtype=dtype)
        freqs = torch.einsum('i,j->ij', t, self.inv_freq)
        # Complex numbers for rotation
        emb = torch.cat((freqs, freqs), dim=-1)
        return emb.cos(), emb.sin()

def apply_rotary_pos_emb(q, k, freqs):
    # freqs contains (cos, sin) values
    cos, sin = freqs

    # Reshape cos and sin for broadcasting: (1, 1, seq_len, dim)
    # q, k shape: (batch_size, num_heads, seq_len, dim_per_head)
    cos = cos.unsqueeze(0).unsqueeze(0)
    sin = sin.unsqueeze(0).unsqueeze(0)

    # Apply rotation to query and key using complex number multiplication equivalent
    # [x0, x1, x2, x3, ...] -> [x0*cos - x1*sin, x0*sin + x1*cos, x2*cos - x3*sin, ...]
    q_rot = q * cos - rotate_half(q) * sin
    k_rot = k * cos - rotate_half(k) * sin

    return q_rot, k_rot

def rotate_half(x):
    # Rotates the second half of the embedding dimension by 180 degrees.
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

In [9]:
class MultiHeadedAttention(nn.Module):
  # d_model = embedding dimensions
  def __init__(self, h, d_model, dropout=0.1):
    super(MultiHeadedAttention, self).__init__()
    assert d_model % h == 0
    self.d_k = d_model // h
    self.h = h
    self.linears = get_clones(nn.Linear(d_model, d_model), 4)
    self.attn = None
    self.dropout = nn.Dropout(p=dropout)
    # self.rotary = rotary # Store the RoPE instance

  def forward(self, query, key, value, mask= None):
    """
    q,k,v: batch_size x seq_length x d_model
    mask?
    output: batch_size x seq_length x d_model
    """
    if mask is not None:
      # Same mask applied to all h heads.
      mask = mask.unsqueeze(1)

    nbatches = query.size(0)

    query, key, value = [
        lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
        for lin, x in zip(self.linears, (query, key, value))
    ]
    # Apply attention on all the projected vectors in batch.
    x, self.attn = attention(
        query, key, value, mask=mask, dropout=self.dropout
    )

    #  "Concat" using a view and apply a final linear.
    x = (
        x.transpose(1, 2)
        .contiguous()
        .view(nbatches, -1, self.h * self.d_k)
    )
    del query
    del key
    del value
    return self.linears[-1](x)

In [10]:
# embedding before encoder
# lut = lookup table, x is the tensor to look up
#
class Embedding(nn.Module):
  def __init__(self, d_model, vocab):
    super(Embedding, self).__init__()
    self.lut = nn.Embedding(vocab, d_model)
    self.d_model = d_model
  def forward(self, x):
    return self.lut(x) * math.sqrt(self.d_model) # why???

In [11]:
class LayerNorm(nn.Module):
    # implement layer normalization
    # eps for numerical stability
    def __init__(self, features, eps=1e-6):
      super(LayerNorm, self).__init__()
      self.a_2 = nn.Parameter(torch.ones(features))
      self.b_2 = nn.Parameter(torch.zeros(features))
      self.eps = eps
    def forward(self, x):
      mean = x.mean(-1, keepdim=True)
      std = x.std(-1, keepdim=True)
      return self.a_2 * (x - mean) / (std + self.eps) + self.b_2


In [12]:
class SublayerConnection(nn.Module):
  # residual + norm
  # norm before or after? before!
  """ connect sub layers """
  def __init__(self, size, dropout):
    super(SublayerConnection, self).__init__()
    self.norm = LayerNorm(size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, sublayer):
    return x + self.dropout(sublayer(self.norm(x)))

In [13]:
class PositionwiseFeedForward(nn.Module):
  # add non-linearities to attention
  # just two linear transformation with an activation (ReLU)
  # we can try using something else like SiLU, but can we justify it?
  def __init__(self, d_model, d_ff, dropout=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.w_1 = nn.Linear(d_model, d_ff)
    self.w_2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(dropout)
  def forward(self, x):
    return self.w_2(self.dropout(self.w_1(x).relu()))

In [14]:
class Encoder(nn.Module):
    # nn.Module l√† l·ªõp c∆° s·ªü cho t·∫•t c·∫£ c√°c m·∫°ng neural trong PyTorch
    """multiple stacked layers of EncoderLayer"""
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = get_clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [15]:
class EncoderLayer(nn.Module):
  """a layer of encoder"""
  """self attention then ffw"""
  # size = d_model
  # x is input?
  def __init__(self, size, self_attn, ffw, dropout):
    super(EncoderLayer, self).__init__()
    self.self_attn = self_attn
    self.ffw = ffw
    self.sublayer = get_clones(SublayerConnection(size, dropout), 2)
    self.size = size
  def forward(self, x, mask):
    x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask)) # q,k,v is the same!
    return self.sublayer[1](x, lambda x: self.ffw(x))


In [16]:
class DecoderLayer(nn.Module):
  # self-explanatory
  # needs mask for self-attention
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = get_clones(SublayerConnection(size, dropout), 3)
    def forward(self, x, memory, src_mask, tgt_mask):
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, memory, memory, src_mask))
        return self.sublayer[2](x, self.feed_forward)

In [17]:
# mask helper
def subsequent_mask(size):
    "Mask out future positions"
    return torch.tril(torch.ones(size, size, dtype=torch.bool)).unsqueeze(0)

In [18]:
class Decoder(nn.Module):
  # c≈©ng nhi·ªÅu l·ªõp decoder x·∫øp ch·ªìng l√™n nhau
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = get_clones(layer, N)
        self.norm = LayerNorm(layer.size)
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [19]:
class EncoderDecoder(nn.Module):
    """
    wrap the things together
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        return self.decode(self.encode(src, src_mask), src_mask,
                            tgt, tgt_mask)

    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [20]:
class Generator(nn.Module):
  """final linear + softmax step"""
  def __init__(self, d_model, vocab):
      super(Generator, self).__init__()
      self.proj = nn.Linear(d_model, vocab)
  def forward(self, x):
      return nn.functional.log_softmax(self.proj(x), dim=-1)

# Hu·∫•n luy·ªán v√† ƒê√°nh gi√°

In [21]:
class Batch:
    def __init__(self, src, tgt=None, pad=PAD_ID):  # 2 = <blank>
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if tgt is not None:
            self.tgt = tgt[:, :-1]
            self.tgt_y = tgt[:, 1:]
            self.tgt_mask = self.make_std_mask(self.tgt, pad)
            self.ntokens = (self.tgt_y != pad).data.sum()

    # hide the PADDING too
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(
            tgt_mask.data
        )
        return tgt_mask

In [22]:
def make_model(src_vocab, tgt_vocab, layers , h, d_model , d_ff , dropout = 0.1):
  # just make a model :)
  c = copy.deepcopy
  # rotary_pe = RotaryPositionalEmbedding(d_model // h)
  attn = MultiHeadedAttention(h, d_model) # Pass rotary_pe to attention
  ff = PositionwiseFeedForward(d_model, d_ff, dropout)
  position = PositionalEncoding(d_model, dropout)
  model = EncoderDecoder(
    Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), layers),
    Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), layers),
    nn.Sequential(Embedding(d_model, src_vocab),c(position)),
    nn.Sequential(Embedding(d_model, tgt_vocab),c(position)),
    Generator(d_model, tgt_vocab),
  )
  for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
  return model

In [23]:
# used in inference
def beam_search(model, memory, src_mask, start_symbol, end_symbol, max_len=100, beam_size=5, device='cpu'):
    # memory: output of the encoder
    # src_mask: source mask
    # start_symbol: token for <s>
    # end_symbol: token for </s>
    # max_len: maximum length of generated sequence
    # beam_size: number of candidates to keep at each step

    # Ensure the model is in evaluation mode
    model.eval()

    # Initialize with the start symbol
    ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=device)

    # Store (log_probability, sequence_so_far)
    candidates = [(0.0, ys)]

    for _ in range(max_len - 1):
        new_candidates = []
        # Iterate through current best candidates
        for log_prob, current_sequence in candidates:

            # If the sequence already ended, keep it as is
            if current_sequence[0, -1].item() == end_symbol:
                new_candidates.append((log_prob, current_sequence))
                continue

            # Create target mask for the current sequence
            tgt_mask = subsequent_mask(current_sequence.size(-1)).type_as(memory)

            # Decode the next token
            out = model.decode(memory, src_mask, current_sequence, tgt_mask)
            prob = model.generator(out[:, -1]) # Get probabilities for the last token
            log_probs = prob.log_softmax(dim=-1) # Convert to log probabilities

            # Get top 'beam_size' next tokens and their log probabilities
            top_k_log_probs, top_k_indices = log_probs.topk(beam_size)

            for i in range(beam_size):
                next_token_log_prob = top_k_log_probs[0, i].item()
                next_token = top_k_indices[0, i].item()

                # Extend the current sequence with the new token
                extended_sequence = torch.cat(
                    [current_sequence, torch.full((1, 1), next_token, dtype=torch.long, device=device)], dim=1
                )
                new_candidates.append((log_prob + next_token_log_prob, extended_sequence))

        # Sort all new candidates by their log probability and select the top 'beam_size'
        candidates = sorted(new_candidates, key=lambda x: x[0], reverse=True)[:beam_size]

        # If all best candidates have ended, we can stop early (optional optimization)
        if all(cand[1][0, -1].item() == end_symbol for cand in candidates):
            break

    # Return the best sequence (highest log probability)
    return candidates[0][1].cpu().squeeze().tolist()


In [24]:
def rate(step, d_model, factor, warmup):
    """
    we have to default the step to 1 for LambdaLR function
    to avoid zero raising to negative power.
    """
    if step == 0:
        step = 1
    return factor * (
        d_model ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
    )

In [25]:
#loss
class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, padding_idx = 0, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim
        self.padding_idx = padding_idx

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # true_dist = pred.data.clone()
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 2))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
            true_dist[:, self.padding_idx] = 0
            mask = torch.nonzero(target.data == self.padding_idx, as_tuple=False)
            if mask.dim() > 0:
                true_dist.index_fill_(0, mask.squeeze(), 0.0)

        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [26]:
import math

def evaluate_loss(model, val_dataloader, criterion, device, pad_idx = PAD_ID):
    """
    Ch·∫°y model tr√™n t·∫≠p Val ƒë·ªÉ t√≠nh Loss v√† Perplexity
    """
    model.eval() # Chuy·ªÉn sang ch·∫ø ƒë·ªô ƒë√°nh gi√° (t·∫Øt Dropout)
    total_loss = 0
    
    with torch.no_grad(): # T·∫Øt t√≠nh to√°n gradient ƒë·ªÉ ti·∫øt ki·ªám RAM
        for batch_data in val_dataloader:
            src, tgt = batch_data[0].to(device), batch_data[1].to(device)
            batch = Batch(src, tgt, pad=pad_idx)
            
            # Forward pass
            out = model(batch.src, batch.tgt, batch.src_mask, batch.tgt_mask)
            out = model.generator(out)
            
            # T√≠nh Loss
            loss = criterion(out.contiguous().view(-1, out.size(-1)), 
                             batch.tgt_y.contiguous().view(-1))
            total_loss += loss.item()
            
    avg_loss = total_loss / len(val_dataloader)
    
    # T√≠nh Perplexity (PPL) = exp(Loss)
    # PPL c√†ng th·∫•p c√†ng t·ªët
    try:
        ppl = math.exp(avg_loss)
    except OverflowError:
        ppl = float('inf')
        
    model.train() # Quan tr·ªçng: Chuy·ªÉn l·∫°i v·ªÅ ch·∫ø ƒë·ªô train cho epoch sau
    return avg_loss, ppl

In [27]:
import torch

def translate_sentence(sentence, model, sp_model, device, max_len=100):
    model.eval() # Chuy·ªÉn sang ch·∫ø ƒë·ªô ƒë√°nh gi√°
    
    # 1. Tokenize c√¢u ƒë·∫ßu v√†o
    # Th√™m BOS v√† EOS gi·ªëng h·ªát l√∫c train
    tokens = [BOS_ID] + sp_model.encode_as_ids(sentence) + [EOS_ID]
    src = torch.tensor(tokens).long().unsqueeze(0).to(device) # (1, seq_len)
    
    # 2. T·∫°o mask
    src_mask = (src != PAD_ID).unsqueeze(-2)
    
    # 3. Encoder
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        
    # 4. Beam Search (H√†m n√†y b·∫°n ƒë√£ c√≥ trong code g·ªëc)
    # L∆∞u √Ω: C·∫ßn ƒë·∫£m b·∫£o h√†m beam_search ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a
    output_ids = beam_search(model, memory, src_mask, BOS_ID, EOS_ID, max_len, beam_size=3, device=device)
    
    # 5. Decode
    if EOS_ID in output_ids:
        output_ids = output_ids[:output_ids.index(EOS_ID)]
        
    translation = sp_model.decode(output_ids)
    
    model.train() # Chuy·ªÉn l·∫°i v·ªÅ mode train
    return translation

In [28]:
# Thi·∫øt l·∫≠p thi·∫øt b·ªã
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

D_MODEL = 128
VOCAB_SIZE = sp.get_piece_size() 
D_FF = 1024
LAYERS=4
H=4
# Quan tr·ªçng: LabelSmoothingLoss c·∫ßn input ƒë√∫ng k√≠ch th∆∞·ªõc vocab
criterion = LabelSmoothingLoss(classes=VOCAB_SIZE, padding_idx=PAD_ID, smoothing=0.1)

# Kh·ªüi t·∫°o m√¥ h√¨nh (ƒë·∫£m b·∫£o h√†m make_model ƒë√£ ƒë∆∞·ª£c s·ª≠a nh∆∞ m·ª•c 3)
model = make_model(src_vocab=VOCAB_SIZE, tgt_vocab=VOCAB_SIZE, 
                   layers=LAYERS, h=H, d_model=D_MODEL, d_ff=D_FF, dropout=0.1)

model.to(device)

print(f"Model created with {sum(p.numel() for p in model.parameters())} parameters.")

Using device: cpu
Model created with 9064576 parameters.


In [29]:
def train_with_validation_and_checkpointing(
    model, train_dataloader, val_dataloader, criterion, optimizer, lr_scheduler, 
    device, pad_idx, sp_model, num_epochs, sample_sentence,
    start_epoch=0, best_val_loss=float('inf')
):
    model.train()
    scaler = torch.amp.GradScaler("cuda")
    print(f"{'='*10} B·∫ÆT ƒê·∫¶U TRAINING V·ªöI  {'='*10}")

    # Kh·ªüi t·∫°o history ƒë·ªÉ l∆∞u k·∫øt qu·∫£
    history = {
        "train_loss": [],
        "val_loss": [],
        "val_ppl": []
    }
    print(f"{'='*10} B·∫ÆT ƒê·∫¶U TRAINING T·ª™ EPOCH {start_epoch + 1} {'='*10}")
    for epoch in range(start_epoch, num_epochs):
        print(f"\n--- Epoch {epoch + 1}/{num_epochs} ---")
        
        # 1. TRAINING LOOP
        model.train()
        train_loss = 0
        pbar = tqdm(train_dataloader, desc="Training")
        
        for i, batch_data in enumerate(pbar):
            src, tgt = batch_data[0].to(device), batch_data[1].to(device)
            # Skip c√¢u qu√° d√†i
            if src.size(1) > 150 or tgt.size(1) > 150: continue
            
            batch = Batch(src, tgt, pad=pad_idx)
            with torch.amp.autocast(device_type="cuda"):
            # with autocast():
                out = model(batch.src, batch.tgt, batch.src_mask, batch.tgt_mask)
                out = model.generator(out)
                loss = criterion(out.contiguous().view(-1, out.size(-1)), batch.tgt_y.contiguous().view(-1))

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            lr_scheduler.step()
            
            train_loss += loss.item()
            pbar.set_postfix({"loss": loss.item(), "lr": optimizer.param_groups[0]["lr"]})
            
        avg_train_loss = train_loss / len(train_dataloader)
        
        # 2. VALIDATION LOOP (Sau khi xong 1 epoch train)
        print("ƒêang ch·∫°y validate...")
        avg_val_loss, val_ppl = evaluate_loss(model, val_dataloader, criterion, device, pad_idx)
        
        print(f"üìå K·∫æT QU·∫¢ EPOCH {epoch+1}:")
        print(f"   - Train Loss: {avg_train_loss:.4f}")
        print(f"   - Val Loss:   {avg_val_loss:.4f} (Perplexity: {val_ppl:.2f})")
        
        # 1. L∆∞u checkpoint c·ªßa epoch cu·ªëi c√πng (ƒë·ªÉ ph√≤ng khi b·ªã crash gi·ªØa ch·ª´ng)
        torch.save({
            'epoch': epoch + 1, # L∆∞u epoch k·∫ø ti·∫øp s·∫Ω ch·∫°y
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': lr_scheduler.state_dict(),
            'best_val_loss': best_val_loss,
        }, "last_vlsp_checkpoint.pth")
        
        # 2. L∆∞u checkpoint t·ªët nh·∫•t (d·ª±a tr√™n val_loss)
        if avg_val_loss < best_val_loss:
            print(f"‚úÖ Loss gi·∫£m ({best_val_loss:.4f} -> {avg_val_loss:.4f}). L∆∞u best checkpoint!")
            best_val_loss = avg_val_loss
            # G√≥i t·∫•t c·∫£ v√†o m·ªôt dictionary
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': lr_scheduler.state_dict(),
                'best_val_loss': best_val_loss,
            }, "best_vlsp_checkpoint.pth")
        else:
            print(f"‚ö†Ô∏è Loss kh√¥ng gi·∫£m (Best: {best_val_loss:.4f}). Kh√¥ng c·∫≠p nh·∫≠t best checkpoint.")
            
        # L∆∞u k·∫øt qu·∫£ v√†o history
        history["train_loss"].append(avg_train_loss)
        history["val_loss"].append(avg_val_loss)
        history["val_ppl"].append(val_ppl)
        # 4. D·ªãch th·ª≠
        if sp_model:
            try:
                pred = translate_sentence(sample_sentence, model, sp_model, device)
                print(f"   - D·ªãch th·ª≠: {pred}")
            except: pass
            
    print(f"\nTraining ho√†n t·∫•t! Best Val Loss: {best_val_loss:.4f}")
    return history

In [30]:
def load_checkpoint(model, optimizer, scheduler, filename="last_checkpoint.pth"):
    start_epoch = 0
    best_val_loss = float('inf')
    
    if os.path.exists(filename):
        print(f"--- ƒêang t·∫£i checkpoint t·ª´: {filename} ---")
        checkpoint = torch.load(filename, map_location=device)
        
        # Kh√¥i ph·ª•c tr·∫°ng th√°i
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        
        # L·∫•y th√¥ng tin epoch v√† loss ƒë·ªÉ train ti·∫øp
        start_epoch = checkpoint['epoch']
        best_val_loss = checkpoint['best_val_loss']
        
        print(f"‚úÖ Kh√¥i ph·ª•c th√†nh c√¥ng! S·∫µn s√†ng train ti·∫øp t·ª´ Epoch {start_epoch + 1}.")
        print(f"   - Best Val Loss ƒë√£ ghi nh·∫≠n: {best_val_loss:.4f}")
        print(f"   - Learning Rate hi·ªán t·∫°i: {optimizer.param_groups[0]['lr']:.6f}")
    else:
        print("--- Kh√¥ng t√¨m th·∫•y checkpoint. B·∫Øt ƒë·∫ßu training t·ª´ ƒë·∫ßu. ---")
        
    return start_epoch, best_val_loss


In [31]:
import matplotlib.pyplot as plt

def plot_metrics(history):
    """
    V·∫Ω ƒë·ªì th·ªã Train Loss, Val Loss, v√† Val Perplexity.
    """
    # L·∫•y d·ªØ li·ªáu t·ª´ history
    train_loss = history['train_loss']
    val_loss = history['val_loss']
    val_ppl = history['val_ppl']
    epochs = range(1, len(train_loss) + 1)
    
    # T·∫°o figure v·ªõi 2 subplot
    plt.figure(figsize=(14, 5))
    
    # --- Bi·ªÉu ƒë·ªì 1: Loss ---
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_loss, 'bo-', label='Training Loss')
    plt.plot(epochs, val_loss, 'ro-', label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    
    # --- Bi·ªÉu ƒë·ªì 2: Perplexity ---
    plt.subplot(1, 2, 2)
    plt.plot(epochs, val_ppl, 'go-', label='Validation Perplexity')
    plt.title('Validation Perplexity')
    plt.xlabel('Epochs')
    plt.ylabel('Perplexity')
    plt.legend()
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()

# EXE

In [None]:
from tqdm import tqdm
optimizer = torch.optim.Adam(model.parameters(), lr=1.0, betas=(0.9, 0.98), eps=1e-9)
lr_scheduler = LambdaLR(optimizer=optimizer,
    lr_lambda=lambda step: rate(step, d_model=D_MODEL, factor=1, warmup=4000))

# --- 2. LOAD CHECKPOINT (N·∫æU C√ì) ---
# H√†m n√†y s·∫Ω t·ª± ƒë·ªông ƒëi·ªÅn tr·∫°ng th√°i ƒë√£ l∆∞u v√†o c√°c object ·ªü tr√™n
start_epoch, best_val_loss = load_checkpoint(model, optimizer, lr_scheduler,
                                             '/kaggle/input/best/pytorch/default/1/best_checkpoint.pth')
# Train l·∫°i t·ª´ ƒë·∫ßu ho·∫∑c train ti·∫øp 

try:
    history = train_with_validation_and_checkpointing(
        model=model,
        train_dataloader=train_loader,
        val_dataloader=val_loader,
        criterion=criterion,
        optimizer=optimizer,         # Truy·ªÅn optimizer ƒë√£ ƒë∆∞·ª£c load state
        lr_scheduler=lr_scheduler,   # Truy·ªÅn scheduler ƒë√£ ƒë∆∞·ª£c load state
        device=device,
        pad_idx=PAD_ID,
        sp_model=sp,
        num_epochs=25,               # V√≠ d·ª• mu·ªën train t·ªïng c·ªông 20 epochs
        sample_sentence="Knowledge, practices in public health service utilization among health insurance card‚Äôs holders and influencing factors in Vientiane, Lao",
        start_epoch=start_epoch,     # B·∫Øt ƒë·∫ßu t·ª´ epoch ƒë√£ load
        best_val_loss=best_val_loss  # D√πng best_loss ƒë√£ load ƒë·ªÉ so s√°nh
    )
except KeyboardInterrupt:
    print("ƒê√£ d·ª´ng training th·ªß c√¥ng.")
finally:
    # 2. Sau khi ch·∫°y xong (d√π th√†nh c√¥ng hay th·∫•t b·∫°i), h√£y ki·ªÉm tra xem 'history' c√≥ gi√° tr·ªã h·ª£p l·ªá kh√¥ng
    if history is not None:
        print("Training ho√†n t·∫•t. ƒêang v·∫Ω ƒë·ªì th·ªã...")
        plot_metrics(history)
    else:
        print("Kh√¥ng c√≥ d·ªØ li·ªáu history ƒë·ªÉ v·∫Ω ƒë·ªì th·ªã do training kh√¥ng th√†nh c√¥ng.")

--- ƒêang t·∫£i checkpoint t·ª´: /kaggle/input/best/pytorch/default/1/best_checkpoint.pth ---
‚úÖ Kh√¥i ph·ª•c th√†nh c√¥ng! S·∫µn s√†ng train ti·∫øp t·ª´ Epoch 21.
   - Best Val Loss ƒë√£ ghi nh·∫≠n: 1.2044
   - Learning Rate hi·ªán t·∫°i: 0.000218

--- Epoch 21/25 ---


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 282/282 [04:44<00:00,  1.01s/it, loss=1.85, lr=0.000218]

ƒêang ch·∫°y validate...





üìå K·∫æT QU·∫¢ EPOCH 21:
   - Train Loss: 1.9698
   - Val Loss:   1.8546 (Perplexity: 6.39)
‚ö†Ô∏è Loss kh√¥ng gi·∫£m (Best: 1.2044). Kh√¥ng c·∫≠p nh·∫≠t best checkpoint.
   - D·ªãch th·ª≠:  ‚Åá in c·∫£m ∆°n r·∫•t nhi·ªÅu l·∫ßn.

--- Epoch 22/25 ---


Training:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 135/282 [02:04<02:23,  1.02it/s, loss=1.87, lr=0.000218]

# Test

In [None]:
# import torch
# import os
# import pandas as pd  # D√πng pandas ƒë·ªÉ hi·ªÉn th·ªã b·∫£ng cho ƒë·∫πp

# # 1. C·∫•u h√¨nh Model (Ph·∫£i kh·ªõp 100% v·ªõi l√∫c train)
# # N·∫øu b·∫°n ƒë√£ ƒë·ªïi config l√∫c train th√¨ s·ª≠a l·∫°i ·ªü ƒë√¢y nh√©
# real_vocab_size = sp.get_piece_size()

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # 2. Kh·ªüi t·∫°o l·∫°i model r·ªóng
# model_eval = make_model(src_vocab=real_vocab_size, tgt_vocab=real_vocab_size, 
#                         layers=layers, h=h, d_model=d_model, d_ff=d_ff, dropout=0.1)
# model_eval.to(device)

# # 3. Danh s√°ch c√¢u c·∫ßn soi (Test Set)
# sentences_to_track = [
#     "Thank you very much for your time.",  # C√¢u "huy·ªÅn tho·∫°i" c·ªßa b·∫°n
#     "The world is changing very fast.",    # C√¢u v·ªÅ s·ª± thay ƒë·ªïi
#     "Dickhead.",                           # C√¢u kinh ƒëi·ªÉn
#     "We need to protect our environment.", # C√¢u ph·ª©c t·∫°p h∆°n ch√∫t
#     "Education is important."              # C√¢u ƒë∆°n gi·∫£n
# ]

# # 4. H√†m ch·∫°y v√≤ng l·∫∑p qua c√°c Epoch
# def compare_epochs(start_ep=1, end_ep=10):
#     results = {sent: [] for sent in sentences_to_track}
#     epochs_found = []

#     print(f"ƒêang so s√°nh t·ª´ Epoch {start_ep} ƒë·∫øn {end_ep}...\n")

#     for epoch in range(start_ep, end_ep + 1):
#         filename = f"transformer_epoch_{epoch}.pth"
        
#         if not os.path.exists(filename):
#             print(f"‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y file {filename}, b·ªè qua.")
#             continue
            
#         # Load weights v√†o model
#         model_eval.load_state_dict(torch.load(filename, map_location=device))
#         epochs_found.append(f"Ep {epoch}")
        
#         # D·ªãch t·ª´ng c√¢u
#         for sent in sentences_to_track:
#             try:
#                 # D√πng h√†m translate_sentence b·∫°n ƒë√£ c√≥
#                 pred = translate_sentence(sent, model_eval, sp, device)
#                 results[sent].append(pred)
#             except:
#                 results[sent].append("Error")

#     return results, epochs_found

# # --- TH·ª∞C THI V√Ä HI·ªÇN TH·ªä ---
# comparison_data, cols = compare_epochs(1, 10) # Ch·ªânh s·ªë epoch t√πy theo file b·∫°n c√≥

# # Hi·ªÉn th·ªã k·∫øt qu·∫£ (Gom nh√≥m theo t·ª´ng c√¢u ƒë·ªÉ d·ªÖ so s√°nh s·ª± ti·∫øn h√≥a)
# for sent, translations in comparison_data.items():
#     print(f"üî¥ SRC: {sent}")
#     print("-" * 60)
#     # T·∫°o DataFrame nh·ªè ƒë·ªÉ hi·ªÉn th·ªã cho ƒë·∫πp
#     df = pd.DataFrame(translations, index=cols, columns=["B·∫£n d·ªãch"])
#     print(df)
#     print("\n" + "="*60 + "\n")

In [None]:
# # Test th·ª≠ xem Tokenizer x·ª≠ l√Ω ch·ªØ Xin ra sao
# print("Tokenize 'Xin':", sp.encode_as_pieces("Xin"))
# print("Tokenize 'xin':", sp.encode_as_pieces("xin"))
# print("Tokenize 'Xin c·∫£m ∆°n':", sp.encode_as_pieces("Xin c·∫£m ∆°n"))

In [34]:
def translate_batch_greedy(src_batch, src_mask, model, max_len=100):
    """
    D·ªãch m·ªôt batch c√¢u b·∫±ng ph∆∞∆°ng ph√°p Greedy Search.
    R·∫•t nhanh nh∆∞ng ch·∫•t l∆∞·ª£ng c√≥ th·ªÉ k√©m h∆°n Beam Search m·ªôt ch√∫t.
    """
    batch_size = src_batch.size(0)
    device = src_batch.device
    
    # 1. Ch·∫°y Encoder M·ªòT L·∫¶N cho c·∫£ batch
    with torch.no_grad():
        memory = model.encode(src_batch, src_mask)

    # 2. Kh·ªüi t·∫°o decoder input v·ªõi token BOS cho t·∫•t c·∫£ c√°c c√¢u trong batch
    ys = torch.full((batch_size, 1), BOS_ID, dtype=torch.long, device=device)

    # 3. V√≤ng l·∫∑p gi·∫£i m√£ Greedy
    for _ in range(max_len - 1):
        with torch.no_grad():
            # Ch·∫°y decoder
            tgt_mask = subsequent_mask(ys.size(1)).type_as(src_mask.data)
            out = model.decode(memory, src_mask, ys, tgt_mask)
            
            # L·∫•y token cu·ªëi c√πng v√† ch·∫°y qua generator
            prob = model.generator(out[:, -1])
            
            # L·∫•y token c√≥ x√°c su·∫•t cao nh·∫•t (Greedy)
            _, next_word = torch.max(prob, dim=1)
            
            # N·ªëi token m·ªõi v√†o chu·ªói k·∫øt qu·∫£
            ys = torch.cat([ys, next_word.unsqueeze(1)], dim=1)
            
    return ys

def calculate_bleu_fast(model, dataloader, sp_model, device):
    """T√≠nh BLEU score b·∫±ng ph∆∞∆°ng ph√°p d·ªãch theo l√¥."""
    print("B·∫Øt ƒë·∫ßu t√≠nh BLEU score (phi√™n b·∫£n nhanh)...")
    
    bleu = BLEUScore(n_gram=4)
    model.eval()
    
    all_preds = []
    all_refs = []

    for batch in tqdm(dataloader, desc="D·ªãch theo l√¥"):
        src, tgt = batch[0].to(device), batch[1].to(device)
        
        # T·∫°o source mask
        src_mask = (src != PAD_ID).unsqueeze(-2)
        
        # D·ªãch c·∫£ batch b·∫±ng Greedy Search
        pred_ids_batch = translate_batch_greedy(src, src_mask, model)
        
        # Decode t·ª´ ID sang text v√† l∆∞u k·∫øt qu·∫£
        for i in range(pred_ids_batch.size(0)):
            # L·∫•y c√¢u g·ªëc (reference)
            ref_ids = tgt[i, 1:] # B·ªè BOS
            ref_text = sp_model.decode([id for id in ref_ids.tolist() if id != PAD_ID and id != EOS_ID])
            all_refs.append([ref_text])
            
            # L·∫•y c√¢u d·ªãch (prediction)
            pred_ids = pred_ids_batch[i, 1:] # B·ªè BOS
            pred_text = sp_model.decode([id for id in pred_ids.tolist() if id != PAD_ID and id != EOS_ID])
            all_preds.append(pred_text)
            
    # T√≠nh ƒëi·ªÉm BLEU cu·ªëi c√πng
    score = bleu(all_preds, all_refs)
    
    print("\n" + "="*30)
    print(f"K·∫æT QU·∫¢ BLEU SCORE (Nhanh): {score.item():.4f}")
    print("="*30)
    
    return score.item(), all_preds, all_refs

In [36]:
# 1. ƒê∆∞·ªùng d·∫´n file Test (ki·ªÉm tra l·∫°i trong folder input c·ªßa b·∫°n xem t√™n ch√≠nh x√°c l√† g√¨)
# Th∆∞·ªùng l√† tst2012 ho·∫∑c tst2013
test_src_file = "/kaggle/input/medicaldataset-vlsp/MedicalDataset_VLSP/public_test.en.txt"
test_tgt_file = "/kaggle/input/medicaldataset-vlsp/MedicalDataset_VLSP/public_test.vi.txt"

# 2. T·∫°o Dataset cho t·∫≠p Test
if os.path.exists(test_src_file):
    print("ƒêang t·∫°o Test Dataset...")
    test_loader_for_bleu = DataLoader(
        Subset(IWSLTDataset(test_src_file, test_tgt_file), range(64)), 
        batch_size= 8,
        shuffle=False, # Kh√¥ng shuffle khi test
        collate_fn=collate_batch
    )
    # 3. T√≠nh BLEU tr√™n t·∫≠p Test 
    print("ƒêang ch·∫•m ƒëi·ªÉm tr√™n t·∫≠p Test (D·ªØ li·ªáu l·∫°)...")
    score, preds, refs = calculate_bleu_fast(
        model=model,
        dataloader=test_loader_for_bleu,
        sp_model=sp,
        device=device
    )
else:
    print("Kh√¥ng t√¨m th·∫•y file Test. B·∫°n h√£y ki·ªÉm tra l·∫°i ƒë∆∞·ªùng d·∫´n trong Input.")

ƒêang t·∫°o Test Dataset...
ƒêang ƒë·ªçc v√† l·ªçc d·ªØ li·ªáu...
Ho√†n t·∫•t load data. S·ªë c·∫∑p c√¢u h·ª£p l·ªá: 3000
ƒêang ch·∫•m ƒëi·ªÉm tr√™n t·∫≠p Test (D·ªØ li·ªáu l·∫°)...
B·∫Øt ƒë·∫ßu t√≠nh BLEU score (phi√™n b·∫£n nhanh)...


D·ªãch theo l√¥: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [00:26<00:00,  3.36s/it]



K·∫æT QU·∫¢ BLEU SCORE (Nhanh): 0.1111


In [None]:
score, preds, refs = calculate_bleu_fast(
    model=model,
    dataloader=test_loader_for_bleu,
    sp_model=sp,
    device=device
)

In [None]:
# import os

# # N√©n to√†n b·ªô file trong working th√†nh output.zip
# os.system("zip -r output.zip .")

# from IPython.display import FileLink
# print("B·∫•m v√†o ƒë√¢y ƒë·ªÉ t·∫£i tr·ªçn b·ªô (ZIP):")
# FileLink(r'output.zip')

In [None]:
# os.remove("best_model_iwslt.pth")
