# Xây dựng kiến trúc Encoder-Decoder gồm 3 lớp LSTM cho module encoder và 3 lớp LSTM cho module decoder, với hidden size là 256, cho bài toán dịch máy từ tiếng Anh sang tiếng Việt. Module decoder được trang bị kỹ thuật attention theo mô tả của nghiên cứu "[Neural Machine Translation by Jointly Learning to Align and Translate](https://arxiv.org/abs/1409.0473)". Huấn luyện mô hình này trên bộ dữ liệu PhoMT sử dụng Adam làm phương thức tối ưu tham số. Đánh giá độ hiệu quả của mô hình sử dụn độ đo ROUGE-L.

In [1]:
# Cài đặt thư viện cần thiết
%pip -q install torch rouge-score tqdm

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import random
import re
from collections import Counter
from typing import List, Tuple

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from rouge_score import rouge_scorer
from tqdm import tqdm

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

HID_DIM = 256
EMB_DIM = 256
N_LAYERS = 3
DROPOUT = 0.2
BATCH_SIZE = 64
N_EPOCHS = 150
TEACHER_FORCING = 0.5
MAX_LEN = 120
LR = 1e-3

TRAIN_PATH = "/kaggle/input/small-phomt/train.json"
DEV_PATH = "/kaggle/input/small-phomt/dev.json"
TEST_PATH = "/kaggle/input/small-phomt/test.json"

In [3]:
def load_split(path: str):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

train_data = load_split(TRAIN_PATH)
dev_data = load_split(DEV_PATH)
test_data = load_split(TEST_PATH)

print(f"Train/dev/test sizes: {len(train_data)}/{len(dev_data)}/{len(test_data)}")

Train/dev/test sizes: 20000/2000/2000


In [4]:
def tokenize(text: str) -> List[str]:
    return re.findall(r"\w+|[^\w\s]", text.lower())

class Vocab:
    def __init__(self, texts: List[str], min_freq: int = 2):
        specials = ["<pad>", "<unk>", "<bos>", "<eos>"]
        counter = Counter()
        for line in texts:
            counter.update(tokenize(line))
        tokens = [tok for tok, cnt in counter.items() if cnt >= min_freq]
        self.itos = specials + sorted(tokens)
        self.stoi = {tok: idx for idx, tok in enumerate(self.itos)}
        self.pad_idx = self.stoi["<pad>"]
        self.unk_idx = self.stoi["<unk>"]
        self.bos_idx = self.stoi["<bos>"]
        self.eos_idx = self.stoi["<eos>"]

    def encode(self, text: str) -> List[int]:
        return [self.stoi.get(tok, self.unk_idx) for tok in tokenize(text)]

    def decode(self, ids: List[int]) -> str:
        tokens = []
        for idx in ids:
            if idx == self.eos_idx:
                break
            if idx in (self.pad_idx, self.bos_idx):
                continue
            tokens.append(self.itos[idx])
        return " ".join(tokens)

src_vocab = Vocab([ex["english"] for ex in train_data], min_freq=2)
tgt_vocab = Vocab([ex["vietnamese"] for ex in train_data], min_freq=2)

print(f"Src vocab: {len(src_vocab.itos)} | Tgt vocab: {len(tgt_vocab.itos)}")

Src vocab: 10015 | Tgt vocab: 4264


In [5]:
class TranslationDataset(Dataset):
    def __init__(self, data, src_vocab: Vocab, tgt_vocab: Vocab, max_len: int):
        self.data = data
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        src_ids = [self.src_vocab.bos_idx] + self.src_vocab.encode(sample["english"])[: self.max_len - 2] + [self.src_vocab.eos_idx]
        tgt_ids = [self.tgt_vocab.bos_idx] + self.tgt_vocab.encode(sample["vietnamese"])[: self.max_len - 2] + [self.tgt_vocab.eos_idx]
        return torch.tensor(src_ids, dtype=torch.long), torch.tensor(tgt_ids, dtype=torch.long)

def collate_fn(batch):
    src_seqs, tgt_seqs = zip(*batch)
    src_padded = nn.utils.rnn.pad_sequence(src_seqs, batch_first=True, padding_value=src_vocab.pad_idx)
    tgt_padded = nn.utils.rnn.pad_sequence(tgt_seqs, batch_first=True, padding_value=tgt_vocab.pad_idx)
    return src_padded, tgt_padded

train_dataset = TranslationDataset(train_data, src_vocab, tgt_vocab, MAX_LEN)
dev_dataset = TranslationDataset(dev_data, src_vocab, tgt_vocab, MAX_LEN)
test_dataset = TranslationDataset(test_data, src_vocab, tgt_vocab, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

src_vocab_size = len(src_vocab.itos)
tgt_vocab_size = len(tgt_vocab.itos)

print(f"Batches: train {len(train_loader)}, dev {len(dev_loader)}, test {len(test_loader)}")

Batches: train 313, dev 32, test 32


In [6]:
class Encoder(nn.Module):
    def __init__(self, input_dim: int, emb_dim: int, hid_dim: int, n_layers: int, dropout: float, pad_idx: int):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout, batch_first=True)

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

class BahdanauAttention(nn.Module):
    def __init__(self, hid_dim: int):
        super().__init__()
        self.W_enc = nn.Linear(hid_dim, hid_dim)
        self.W_dec = nn.Linear(hid_dim, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden: torch.Tensor, encoder_outputs: torch.Tensor):
        # hidden: (batch, hid), encoder_outputs: (batch, src_len, hid)
        src_len = encoder_outputs.size(1)
        hidden_expanded = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.W_enc(encoder_outputs) + self.W_dec(hidden_expanded))
        scores = self.v(energy).squeeze(-1)
        attn = torch.softmax(scores, dim=1)
        context = torch.bmm(attn.unsqueeze(1), encoder_outputs).squeeze(1)
        return context, attn

class Decoder(nn.Module):
    def __init__(self, output_dim: int, emb_dim: int, hid_dim: int, n_layers: int, dropout: float, pad_idx: int):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim + hid_dim, hid_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim * 2, output_dim)
        self.attn = BahdanauAttention(hid_dim)

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.embedding(input)
        # attention with top-layer hidden
        dec_hidden_top = hidden[-1]
        context, attn = self.attn(dec_hidden_top, encoder_outputs)
        lstm_input = torch.cat([embedded, context.unsqueeze(1)], dim=2)
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        output = output.squeeze(1)
        logits = self.fc_out(torch.cat([output, context], dim=1))
        return logits, hidden, cell, attn

class Seq2Seq(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, device: torch.device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio: float = 0.5):
        batch_size = src.size(0)
        trg_len = trg.size(1)
        vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(batch_size, trg_len, vocab_size, device=self.device)
        enc_outputs, hidden, cell = self.encoder(src)

        input_token = trg[:, 0]
        for t in range(1, trg_len):
            logits, hidden, cell, _ = self.decoder(input_token, hidden, cell, enc_outputs)
            outputs[:, t, :] = logits
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = logits.argmax(1)
            input_token = trg[:, t] if teacher_force else top1
        return outputs

    def greedy_decode(self, src, max_len: int):
        self.eval()
        with torch.no_grad():
            batch_size = src.size(0)
            enc_outputs, hidden, cell = self.encoder(src)
            input_token = torch.full((batch_size,), tgt_vocab.bos_idx, dtype=torch.long, device=self.device)
            outputs = []
            for _ in range(max_len):
                logits, hidden, cell, _ = self.decoder(input_token, hidden, cell, enc_outputs)
                next_token = logits.argmax(1)
                outputs.append(next_token.unsqueeze(1))
                input_token = next_token
            return torch.cat(outputs, dim=1)

In [7]:
encoder = Encoder(src_vocab_size, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, src_vocab.pad_idx)
decoder = Decoder(tgt_vocab_size, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, tgt_vocab.pad_idx)
model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.pad_idx)

def train_epoch(model, loader):
    model.train()
    total_loss = 0.0
    for src, tgt in tqdm(loader, desc="train", leave=False):
        src, tgt = src.to(DEVICE), tgt.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(src, tgt, teacher_forcing_ratio=TEACHER_FORCING)
        logits = outputs[:, 1:].reshape(-1, tgt_vocab_size)
        gold = tgt[:, 1:].reshape(-1)
        loss = criterion(logits, gold)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate_loss(model, loader):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for src, tgt in tqdm(loader, desc="eval", leave=False):
            src, tgt = src.to(DEVICE), tgt.to(DEVICE)
            outputs = model(src, tgt, teacher_forcing_ratio=0.0)
            logits = outputs[:, 1:].reshape(-1, tgt_vocab_size)
            gold = tgt[:, 1:].reshape(-1)
            loss = criterion(logits, gold)
            total_loss += loss.item()
    return total_loss / len(loader)

In [8]:
best_dev = float("inf")
for epoch in range(1, N_EPOCHS + 1):
    train_loss = train_epoch(model, train_loader)
    dev_loss = evaluate_loss(model, dev_loader)
    if dev_loss < best_dev:
        best_dev = dev_loss
        torch.save(model.state_dict(), "best_attn_seq2seq.pt")
    print(f"Epoch {epoch}: train {train_loss:.4f} | dev {dev_loss:.4f}")

model.load_state_dict(torch.load("best_attn_seq2seq.pt", map_location=DEVICE))

                                                     

Epoch 1: train 6.1592 | dev 6.2523


                                                     

Epoch 2: train 5.9397 | dev 6.1857


                                                     

Epoch 3: train 5.6490 | dev 6.0192


                                                     

Epoch 4: train 5.3720 | dev 5.8515


                                                     

Epoch 5: train 5.1088 | dev 5.7611


                                                     

Epoch 6: train 4.8986 | dev 5.6620


                                                     

Epoch 7: train 4.6984 | dev 5.5892


                                                     

Epoch 8: train 4.5126 | dev 5.5716


                                                     

Epoch 9: train 4.3591 | dev 5.5370


                                                     

Epoch 10: train 4.2077 | dev 5.5539


                                                     

Epoch 11: train 4.0750 | dev 5.5983


                                                     

Epoch 12: train 3.9552 | dev 5.5851


                                                     

Epoch 13: train 3.8364 | dev 5.6008


                                                     

Epoch 14: train 3.7231 | dev 5.6333


                                                     

Epoch 15: train 3.6314 | dev 5.6401


                                                     

Epoch 16: train 3.5513 | dev 5.6305


                                                     

Epoch 17: train 3.4721 | dev 5.7153


                                                     

Epoch 18: train 3.3883 | dev 5.7055


                                                     

Epoch 19: train 3.3183 | dev 5.7514


                                                     

Epoch 20: train 3.2525 | dev 5.7794


                                                     

Epoch 21: train 3.1886 | dev 5.8160


                                                     

Epoch 22: train 3.1401 | dev 5.8821


                                                     

Epoch 23: train 3.0816 | dev 5.8883


                                                     

Epoch 24: train 3.0253 | dev 5.9112


                                                     

Epoch 25: train 2.9798 | dev 5.9324


                                                     

Epoch 26: train 2.9349 | dev 5.9647


                                                     

Epoch 27: train 2.8914 | dev 5.9729


                                                     

Epoch 28: train 2.8307 | dev 6.0779


                                                     

Epoch 29: train 2.7892 | dev 6.0928


                                                     

Epoch 30: train 2.7555 | dev 6.0730


                                                     

Epoch 31: train 2.7125 | dev 6.1512


                                                     

Epoch 32: train 2.6565 | dev 6.1589


                                                     

Epoch 33: train 2.6306 | dev 6.2025


                                                     

Epoch 34: train 2.6007 | dev 6.2166


                                                     

Epoch 35: train 2.5569 | dev 6.2910


                                                     

Epoch 36: train 2.5279 | dev 6.2985


                                                     

Epoch 37: train 2.5070 | dev 6.3095


                                                     

Epoch 38: train 2.4722 | dev 6.3100


                                                     

Epoch 39: train 2.4327 | dev 6.3947


                                                     

Epoch 40: train 2.4021 | dev 6.4278


                                                     

Epoch 41: train 2.3702 | dev 6.4752


                                                     

Epoch 42: train 2.3556 | dev 6.4649


                                                     

Epoch 43: train 2.3289 | dev 6.5028


                                                     

Epoch 44: train 2.3012 | dev 6.5259


                                                     

Epoch 45: train 2.2702 | dev 6.5743


                                                     

Epoch 46: train 2.2509 | dev 6.5726


                                                     

Epoch 47: train 2.2239 | dev 6.6429


                                                     

Epoch 48: train 2.2097 | dev 6.6461


                                                     

Epoch 49: train 2.1828 | dev 6.6315


                                                     

Epoch 50: train 2.1594 | dev 6.7268


                                                     

Epoch 51: train 2.1361 | dev 6.7585


                                                     

Epoch 52: train 2.1228 | dev 6.7270


                                                     

Epoch 53: train 2.1033 | dev 6.8112


                                                     

Epoch 54: train 2.0686 | dev 6.8339


                                                     

Epoch 55: train 2.0585 | dev 6.8240


                                                     

Epoch 56: train 2.0540 | dev 6.8849


                                                     

Epoch 57: train 2.0287 | dev 6.8760


                                                     

Epoch 58: train 2.0014 | dev 6.9195


                                                     

Epoch 59: train 1.9970 | dev 6.9296


                                                     

Epoch 60: train 1.9788 | dev 6.9441


                                                     

Epoch 61: train 1.9533 | dev 6.9793


                                                     

Epoch 62: train 1.9426 | dev 6.9737


                                                     

Epoch 63: train 1.9229 | dev 7.0121


                                                     

Epoch 64: train 1.8973 | dev 7.0303


                                                     

Epoch 65: train 1.8724 | dev 7.0660


                                                     

Epoch 66: train 1.8788 | dev 7.0779


                                                     

Epoch 67: train 1.8607 | dev 7.0833


                                                     

Epoch 68: train 1.8493 | dev 7.1516


                                                     

Epoch 69: train 1.8389 | dev 7.1423


                                                     

Epoch 70: train 1.8144 | dev 7.1659


                                                     

Epoch 71: train 1.7997 | dev 7.1661


                                                     

Epoch 72: train 1.7949 | dev 7.1924


                                                     

Epoch 73: train 1.7769 | dev 7.2272


                                                     

Epoch 74: train 1.7699 | dev 7.2514


                                                     

Epoch 75: train 1.7523 | dev 7.2453


                                                     

Epoch 76: train 1.7426 | dev 7.2537


                                                     

Epoch 77: train 1.7321 | dev 7.3229


                                                     

Epoch 78: train 1.7085 | dev 7.3604


                                                     

Epoch 79: train 1.7053 | dev 7.3842


                                                     

Epoch 80: train 1.7017 | dev 7.3928


                                                     

Epoch 81: train 1.6831 | dev 7.3871


                                                     

Epoch 82: train 1.6628 | dev 7.4345


                                                     

Epoch 83: train 1.6517 | dev 7.4295


                                                     

Epoch 84: train 1.6502 | dev 7.4700


                                                     

Epoch 85: train 1.6249 | dev 7.4661


                                                     

Epoch 86: train 1.6194 | dev 7.5168


                                                     

Epoch 87: train 1.6049 | dev 7.4773


                                                     

Epoch 88: train 1.5911 | dev 7.5772


                                                     

Epoch 89: train 1.5814 | dev 7.6265


                                                     

Epoch 90: train 1.5806 | dev 7.5585


                                                     

Epoch 91: train 1.5698 | dev 7.5673


                                                     

Epoch 92: train 1.5539 | dev 7.6355


                                                     

Epoch 93: train 1.5347 | dev 7.6027


                                                     

Epoch 94: train 1.5374 | dev 7.6794


                                                     

Epoch 95: train 1.5422 | dev 7.6595


                                                     

Epoch 96: train 1.5232 | dev 7.6974


                                                     

Epoch 97: train 1.5099 | dev 7.7345


                                                     

Epoch 98: train 1.4990 | dev 7.7305


                                                     

Epoch 99: train 1.5004 | dev 7.7258


                                                     

Epoch 100: train 1.4799 | dev 7.7969


                                                     

Epoch 101: train 1.4696 | dev 7.8516


                                                     

Epoch 102: train 1.4538 | dev 7.7982


                                                     

Epoch 103: train 1.4605 | dev 7.7681


                                                     

Epoch 104: train 1.4594 | dev 7.8199


                                                     

Epoch 105: train 1.4483 | dev 7.8400


                                                     

Epoch 106: train 1.4400 | dev 7.7940


                                                     

Epoch 107: train 1.4485 | dev 7.8113


                                                     

Epoch 108: train 1.4171 | dev 7.8368


                                                     

Epoch 109: train 1.4177 | dev 7.9395


                                                     

Epoch 110: train 1.4005 | dev 7.9650


                                                     

Epoch 111: train 1.3884 | dev 7.9354


                                                     

Epoch 112: train 1.4016 | dev 7.9555


                                                     

Epoch 113: train 1.3842 | dev 7.9690


                                                     

Epoch 114: train 1.3737 | dev 8.0240


                                                     

Epoch 115: train 1.3773 | dev 8.0469


                                                     

Epoch 116: train 1.3469 | dev 8.0535


                                                     

Epoch 117: train 1.3486 | dev 8.0482


                                                     

Epoch 118: train 1.3483 | dev 8.0988


                                                     

Epoch 119: train 1.3411 | dev 8.0411


                                                     

Epoch 120: train 1.3347 | dev 8.0947


                                                     

Epoch 121: train 1.3288 | dev 8.0578


                                                     

Epoch 122: train 1.3097 | dev 8.0955


                                                     

Epoch 123: train 1.3013 | dev 8.1025


                                                     

Epoch 124: train 1.3059 | dev 8.1141


                                                     

Epoch 125: train 1.3066 | dev 8.1390


                                                     

Epoch 126: train 1.2976 | dev 8.1778


                                                     

Epoch 127: train 1.2755 | dev 8.1799


                                                     

Epoch 128: train 1.2642 | dev 8.1372


                                                     

Epoch 129: train 1.2776 | dev 8.1516


                                                     

Epoch 130: train 1.2826 | dev 8.1599


                                                     

Epoch 131: train 1.2588 | dev 8.2544


                                                     

Epoch 132: train 1.2590 | dev 8.2434


                                                     

Epoch 133: train 1.2548 | dev 8.3506


                                                     

Epoch 134: train 1.2372 | dev 8.3108


                                                     

Epoch 135: train 1.2261 | dev 8.2995


                                                     

Epoch 136: train 1.2345 | dev 8.2961


                                                     

Epoch 137: train 1.2254 | dev 8.3021


                                                     

Epoch 138: train 1.2130 | dev 8.3338


                                                     

Epoch 139: train 1.2090 | dev 8.3590


                                                     

Epoch 140: train 1.1952 | dev 8.3416


                                                     

Epoch 141: train 1.2027 | dev 8.3992


                                                     

Epoch 142: train 1.1967 | dev 8.3079


                                                     

Epoch 143: train 1.1873 | dev 8.4286


                                                     

Epoch 144: train 1.1776 | dev 8.3922


                                                     

Epoch 145: train 1.1759 | dev 8.4152


                                                     

Epoch 146: train 1.1834 | dev 8.4306


                                                     

Epoch 147: train 1.1632 | dev 8.4605


                                                     

Epoch 148: train 1.1694 | dev 8.4108


                                                     

Epoch 149: train 1.1626 | dev 8.5140


                                                     

Epoch 150: train 1.1485 | dev 8.4831




<All keys matched successfully>

In [9]:
def translate_batch(model, src_batch: torch.Tensor):
    src_batch = src_batch.to(DEVICE)
    preds = model.greedy_decode(src_batch, max_len=MAX_LEN)
    outputs = []
    for seq in preds:
        outputs.append(tgt_vocab.decode(seq.tolist()))
    return outputs

def compute_rouge(model, loader):
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    scores = []
    model.eval()
    with torch.no_grad():
        for src, tgt in tqdm(loader, desc="rouge", leave=False):
            pred_texts = translate_batch(model, src)
            tgt_texts = []
            for seq in tgt:
                tgt_texts.append(tgt_vocab.decode(seq.tolist()))
            for pred, ref in zip(pred_texts, tgt_texts):
                score = scorer.score(ref, pred)["rougeL"].fmeasure
                scores.append(score)
    return sum(scores) / len(scores)

rouge_l = compute_rouge(model, test_loader)
print(f"ROUGE-L trên tập test: {rouge_l:.4f}")

                                                      

ROUGE-L trên tập test: 0.3580


