<a href="https://colab.research.google.com/github/once-upon-an-april/Thuc-Hanh-Deep-Learning-trong-Khoa-Hoc-Du-Lieu-DS201.Q11.1/blob/main/Bai3/22520975_Lab4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CHUẨN BỊ MÔI TRƯỜNG

In [None]:
!pip install -q datasets pyvi torchmetrics

In [None]:
import gc
import random
import time
from collections import Counter

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from pyvi import ViTokenizer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torchmetrics.text.rouge import ROUGEScore

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device đang sử dụng: {device}")

# TIỀN XỬ LÝ DỮ LIỆU

In [None]:
# Maximum encoded length of one sentence, <SOS>/<EOS> included; passed as
# max_len to Vocab.encode from PhoMTDataset.__getitem__.
MAX_SEQ_LEN = 100
# Number of sentence pairs per batch in the train/validation DataLoaders.
BATCH_SIZE = 32

class Vocab:
    """Token <-> index mapping built from one text column of a dataset.

    Indices 0-3 are reserved for ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``.
    Corpus tokens are numbered from 4 in descending frequency order, limited
    to the ``max_size`` most common tokens that occur at least ``min_freq``
    times.

    Args:
        dataset_iterator: Iterable of records indexable by ``key``.
        key: Name of the text field to read from each record.
        is_src: ``True`` for whitespace tokenization, ``False`` to segment
            the text with ``ViTokenizer`` first.
        max_size: Number of most frequent tokens considered.
        min_freq: Minimum count a token needs to enter the vocabulary.
    """

    def __init__(self, dataset_iterator, key, is_src=True, max_size=20000, min_freq=3):
        specials = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")
        self.token2idx = {name: i for i, name in enumerate(specials)}
        self.idx2token = {i: name for i, name in enumerate(specials)}
        self.is_src = is_src

        print(f"Đang xây dựng từ điển ({'Tiếng Anh' if is_src else 'Tiếng Việt'})...")
        freqs = Counter()
        for record in dataset_iterator:
            freqs.update(self.tokenize(record[key]))

        # Ids continue right after the four reserved special tokens.
        next_id = 4
        for word, count in freqs.most_common(max_size):
            if count < min_freq:
                continue
            self.token2idx[word] = next_id
            self.idx2token[next_id] = word
            next_id += 1

        print(f"-> Hoàn tất. Kích thước Vocab: {len(self.token2idx)}")

    def tokenize(self, text):
        """Lower-case and strip ``text``, then split it into tokens."""
        normalized = str(text).lower().strip()
        if self.is_src:
            return normalized.split()
        return ViTokenizer.tokenize(normalized).split()

    def encode(self, text, max_len=None):
        """Return ``[<SOS>] + token ids + [<EOS>]`` for ``text``.

        When ``max_len`` is given, the token list is truncated so that the
        result, including the two boundary markers, has at most ``max_len``
        entries. Unknown tokens map to the ``<UNK>`` id (3).
        """
        words = self.tokenize(text)
        if max_len is not None:
            # Two slots are reserved for <SOS> and <EOS>.
            budget = max_len - 2
            if len(words) > budget:
                words = words[:budget]

        ids = [1]
        ids.extend(self.token2idx.get(word, 3) for word in words)
        ids.append(2)
        return ids

    def decode(self, indices):
        """Turn a sequence of ids (ints or tensors) back into a string.

        Decoding stops at the first ``<EOS>``; ``<PAD>``, ``<SOS>`` and
        ``<UNK>`` are skipped. Underscores in the joined text are replaced
        by spaces.
        """
        words = []
        for raw in indices:
            idx = raw.item() if isinstance(raw, torch.Tensor) else raw
            if idx == 2:
                break
            if idx in (0, 1, 3):
                continue
            words.append(self.idx2token.get(idx, "<UNK>"))
        return " ".join(words).replace("_", " ")

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.token2idx)

class PhoMTDataset(Dataset):
    """Torch dataset yielding encoded (source, target) id tensors.

    The source column is ``'en'`` when present, otherwise ``'src'``; the
    target column is ``'vi'`` when present, otherwise ``'tgt'``.

    Args:
        hf_dataset: Indexable dataset exposing ``column_names`` and ``len``.
        src_vocab: Source vocabulary to reuse. Built from ``hf_dataset``
            when ``None``.
        tgt_vocab: Target vocabulary to reuse. Built from ``hf_dataset``
            when ``None``.
    """

    def __init__(self, hf_dataset, src_vocab=None, tgt_vocab=None):
        self.data = hf_dataset
        columns = self.data.column_names
        self.src_key = 'en' if 'en' in columns else 'src'
        self.tgt_key = 'vi' if 'vi' in columns else 'tgt'

        # Resolve each vocabulary on its own: supplying only one of them must
        # neither leave the other as None nor discard the one that was given.
        if src_vocab is None:
            src_vocab = Vocab(self.data, self.src_key, is_src=True)
        if tgt_vocab is None:
            tgt_vocab = Vocab(self.data, self.tgt_key, is_src=False)
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D id tensors.

        Both sides are truncated to ``MAX_SEQ_LEN`` ids by ``Vocab.encode``.
        """
        item = self.data[idx]
        src_encoded = self.src_vocab.encode(item[self.src_key], max_len=MAX_SEQ_LEN)
        tgt_encoded = self.tgt_vocab.encode(item[self.tgt_key], max_len=MAX_SEQ_LEN)
        return torch.tensor(src_encoded), torch.tensor(tgt_encoded)

def collate_fn(batch):
    """Pad a list of (src, tgt) tensor pairs into two batch-first tensors.

    Shorter sequences are right-padded with 0, the ``<PAD>`` index.

    Returns:
        Tuple ``(src_pad, tgt_pad)`` of padded tensors.
    """
    sources, targets = zip(*batch)

    def _pad(sequences):
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    return _pad(sources), _pad(targets)

In [None]:
# Load the "default" configuration of the ura-hcmut/PhoMT dataset via
# Hugging Face `datasets`.
print("Đang tải dataset ura-hcmut/PhoMT từ Hugging Face...")
dataset = load_dataset("ura-hcmut/PhoMT", "default")

In [None]:
# Keep handles to the train and validation splits of the loaded dataset.
train_data_full = dataset['train']
valid_data_full = dataset['validation']

In [None]:
# Report how many samples each split contains.
print(f"Số lượng mẫu Train: {len(train_data_full)}")
print(f"Số lượng mẫu Validation: {len(valid_data_full)}")

In [None]:
print("Đang xử lý dữ liệu...")

# The training dataset builds both vocabularies; the validation dataset
# reuses them so both splits share the same token ids.
train_dataset = PhoMTDataset(train_data_full)
valid_dataset = PhoMTDataset(valid_data_full, src_vocab=train_dataset.src_vocab, tgt_vocab=train_dataset.tgt_vocab)

In [None]:
# Batches are padded by collate_fn; only the training loader is shuffled.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers=2)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, num_workers=2)

# MODEL

In [None]:
class Encoder(nn.Module):
    """Embedding + dropout + multi-layer batch-first LSTM over source ids.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_size: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=3, dropout=0.5):
        super().__init__()
        self.hidden_size, self.num_layers = hidden_size, num_layers
        # Index 0 is <PAD>; its embedding is kept at zero by padding_idx.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.dropout = nn.Dropout(p=dropout)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )

    def forward(self, src):
        """Encode ``src`` ids.

        Returns:
            Tuple ``(outputs, hidden, cell)``: the per-step LSTM outputs and
            the final hidden and cell states.
        """
        token_vectors = self.embedding(src)
        outputs, final_state = self.lstm(self.dropout(token_vectors))
        hidden, cell = final_state
        return outputs, hidden, cell

class DecoderBasic(nn.Module):
    """LSTM decoder without attention that predicts one step at a time.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and output
            logits).
        embedding_dim: Size of each embedding vector.
        hidden_size: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=3, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.dropout = nn.Dropout(p=dropout)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, input_token, hidden, cell, encoder_outputs=None):
        """Run one decoding step.

        ``encoder_outputs`` is accepted for signature parity with the
        attention decoders and is not used.

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` holds
            the vocabulary logits for this step.
        """
        step_input = self.dropout(self.embedding(input_token))
        step_output, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        # Drop the length-1 time axis before projecting to the vocabulary.
        logits = self.fc_out(step_output.squeeze(1))
        return logits, hidden, cell

class BahdanauAttention(nn.Module):
    """Additive attention: ``Va(tanh(Wa(query) + Ua(keys)))``.

    Args:
        hidden_size: Feature size of both the query and the keys.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.Ua = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.Va = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, query, keys):
        """Attend over ``keys`` with a single ``query`` vector per batch item.

        Returns:
            Tuple ``(context, weights)``: the weighted sum of ``keys`` and
            the attention weights normalized over dim 1 of ``keys``.
        """
        # unsqueeze(1) lets the query broadcast against every key position.
        projected_query = self.Wa(query.unsqueeze(1))
        projected_keys = self.Ua(keys)
        energy = self.Va(torch.tanh(projected_query + projected_keys))
        weights = torch.softmax(energy, dim=1)
        context = torch.bmm(weights.transpose(1, 2), keys)
        return context, weights

class DecoderBahdanau(nn.Module):
    """LSTM decoder that feeds an additive-attention context into the LSTM.

    The attention query is the top-layer hidden state from the previous
    step; the resulting context is concatenated with the token embedding to
    form the LSTM input.

    Args:
        vocab_size: Size of the target vocabulary.
        embedding_dim: Size of each embedding vector.
        hidden_size: LSTM hidden size (also the attention feature size).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=3, dropout=0.5):
        super().__init__()
        self.attention = BahdanauAttention(hidden_size)
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.dropout = nn.Dropout(p=dropout)
        # The LSTM consumes the embedding concatenated with the context.
        self.lstm = nn.LSTM(
            input_size=embedding_dim + hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, input_token, hidden, cell, encoder_outputs):
        """Run one decoding step with attention over ``encoder_outputs``.

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` holds
            the vocabulary logits for this step.
        """
        token_vectors = self.dropout(self.embedding(input_token))
        # hidden[-1] is the top LSTM layer's state, used as the query.
        context, _weights = self.attention(hidden[-1], encoder_outputs)
        lstm_input = torch.cat((token_vectors, context), dim=2)
        step_output, next_state = self.lstm(lstm_input, (hidden, cell))
        hidden, cell = next_state
        return self.fc_out(step_output.squeeze(1)), hidden, cell

class LuongAttention(nn.Module):
    """Multiplicative attention scoring ``query @ W(keys)^T``.

    Args:
        hidden_size: Feature size of both the query and the keys.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.W = nn.Linear(in_features=hidden_size, out_features=hidden_size)

    def forward(self, query, keys):
        """Attend over ``keys`` with a batched 3-D ``query``.

        Returns:
            Tuple ``(context, weights)``: the weighted sum of ``keys`` and
            the attention weights normalized over the last dimension.
        """
        scores = torch.bmm(query, self.W(keys).transpose(1, 2))
        # bmm output is 3-D, so the last axis is dim 2.
        weights = torch.softmax(scores, dim=-1)
        return torch.bmm(weights, keys), weights

class DecoderLuong(nn.Module):
    """LSTM decoder that applies multiplicative attention after the LSTM.

    The LSTM output of the current step queries the encoder outputs; the
    context is concatenated with that output, passed through a tanh
    projection, and mapped to vocabulary logits.

    Args:
        vocab_size: Size of the target vocabulary.
        embedding_dim: Size of each embedding vector.
        hidden_size: LSTM hidden size (also the attention feature size).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and between
            LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=3, dropout=0.5):
        super().__init__()
        self.attention = LuongAttention(hidden_size)
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.dropout = nn.Dropout(p=dropout)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        # Projects [lstm output ; context] back down to hidden_size.
        self.fc_concat = nn.Linear(in_features=hidden_size * 2, out_features=hidden_size)
        self.fc_out = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, input_token, hidden, cell, encoder_outputs):
        """Run one decoding step with attention over ``encoder_outputs``.

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` holds
            the vocabulary logits for this step.
        """
        token_vectors = self.dropout(self.embedding(input_token))
        step_output, next_state = self.lstm(token_vectors, (hidden, cell))
        hidden, cell = next_state
        context, _weights = self.attention(step_output, encoder_outputs)
        merged = torch.cat((step_output, context), dim=2)
        attentional = torch.tanh(self.fc_concat(merged))
        return self.fc_out(attentional.squeeze(1)), hidden, cell

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step.

    Args:
        encoder: Module returning ``(encoder_outputs, hidden, cell)`` for a
            batch of source ids.
        decoder: Module with an ``fc_out`` linear layer whose ``forward``
            accepts ``(input_token, hidden, cell, encoder_outputs)`` and
            returns ``(prediction, hidden, cell)``.
        device: Device handle kept on the instance for callers.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Decode ``tgt.shape[1] - 1`` steps, starting from ``tgt[:, 0]``.

        Args:
            src: Batch-first source ids.
            tgt: Batch-first target ids; column 0 seeds the decoder.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of
                the model's own argmax prediction.

        Returns:
            Tensor of shape ``(batch, tgt_len, vocab_size)``; position 0 is
            left as zeros because no prediction is made for it.
        """
        batch_size, tgt_len = src.shape[0], tgt.shape[1]
        vocab_size = self.decoder.fc_out.out_features
        # Allocate directly on the input's device instead of creating the
        # buffer on CPU and copying it over.
        outputs = torch.zeros(batch_size, tgt_len, vocab_size, device=src.device)

        encoder_outputs, hidden, cell = self.encoder(src)
        input_token = tgt[:, 0].unsqueeze(1)

        for t in range(1, tgt_len):
            # Every decoder accepts encoder_outputs (the basic one ignores
            # it), so no per-class dispatch is needed.
            prediction, hidden, cell = self.decoder(input_token, hidden, cell, encoder_outputs)
            outputs[:, t] = prediction

            teacher_force = random.random() < teacher_forcing_ratio
            if teacher_force:
                input_token = tgt[:, t].unsqueeze(1)
            else:
                input_token = prediction.argmax(1).unsqueeze(1)
        return outputs

# TRAINING

In [None]:
def train_epoch(model, iterator, optimizer, criterion, clip):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: Callable as ``model(src, tgt)`` returning logits shaped
            ``(batch, tgt_len, vocab)``.
        iterator: Sized iterable of ``(src, tgt)`` tensor batches.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss taking flattened logits and flattened target ids.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Mean loss per batch as a float, or ``0.0`` when ``iterator`` has no
        batches.
    """
    model.train()

    # Move batches to wherever the model lives rather than relying on a
    # notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # Free cached memory before training.
    torch.cuda.empty_cache()
    gc.collect()

    num_batches = len(iterator)
    if num_batches == 0:
        # Nothing to train on; avoid dividing by zero below.
        return 0.0

    epoch_loss = 0.0
    for i, (src, tgt) in enumerate(iterator):
        src, tgt = src.to(run_device), tgt.to(run_device)
        optimizer.zero_grad()
        output = model(src, tgt)

        # Position 0 holds <SOS> and has no prediction, so drop it on both
        # sides before flattening for the loss.
        output_dim = output.shape[-1]
        flat_output = output[:, 1:].reshape(-1, output_dim)
        flat_target = tgt[:, 1:].reshape(-1)

        loss = criterion(flat_output, flat_target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss

        if i % 100 == 0:
            print(f"Step {i}/{num_batches} | Loss: {batch_loss:.4f}")

    return epoch_loss / num_batches

def evaluate_rouge(model, iterator, vocab_tgt, max_batches=52):
    """Compute ROUGE-L F-measure of greedy decodes on a sample of batches.

    Args:
        model: Callable as ``model(src, tgt, teacher_forcing_ratio=0)``
            returning logits shaped ``(batch, tgt_len, vocab)``.
        iterator: Iterable of ``(src, tgt)`` tensor batches.
        vocab_tgt: Vocabulary whose ``decode`` turns id lists into text.
        max_batches: Number of batches to evaluate before stopping, to keep
            evaluation short. ``None`` evaluates the whole iterator.

    Returns:
        ROUGE-L F-measure as a float, or ``0.0`` when no batch was seen.
    """
    model.eval()

    # Move batches to wherever the model lives rather than relying on a
    # notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # The default ROUGEScore normalizer removes every character outside
    # [a-z0-9], which strips Vietnamese diacritics and breaks words apart.
    # Lower-casing plus whitespace splitting keeps the tokens intact.
    rouge = ROUGEScore(normalizer=str.lower, tokenizer=str.split, rouge_keys="rougeL")
    preds_text = []
    targets_text = []

    with torch.no_grad():
        for i, (src, tgt) in enumerate(iterator):
            src, tgt = src.to(run_device), tgt.to(run_device)
            # Ratio 0 disables teacher forcing: the decoder consumes its own
            # predictions after the first token.
            output = model(src, tgt, teacher_forcing_ratio=0)
            for pred_ids, tgt_ids in zip(output.argmax(2).tolist(), tgt.tolist()):
                preds_text.append(vocab_tgt.decode(pred_ids))
                targets_text.append(vocab_tgt.decode(tgt_ids))
            # Only evaluate a sample to avoid spending too much time.
            if max_batches is not None and i + 1 >= max_batches:
                break

    if not preds_text:
        return 0.0

    scores = rouge(preds_text, targets_text)
    return scores['rougeL_fmeasure'].item()

def run_experiment(task_name):
    """Build, train and evaluate the model selected by ``task_name``.

    Args:
        task_name: One of ``'Bai 1 (Basic)'``, ``'Bai 2 (Bahdanau)'`` or
            ``'Bai 3 (Luong)'``; selects the decoder class.

    Returns:
        The trained ``Seq2Seq`` model. If training is interrupted with
        ``KeyboardInterrupt``, the model in its current state is still
        evaluated and returned.

    Raises:
        ValueError: If ``task_name`` is not one of the known tasks.
    """
    decoder_classes = {
        'Bai 1 (Basic)': DecoderBasic,
        'Bai 2 (Bahdanau)': DecoderBahdanau,
        'Bai 3 (Luong)': DecoderLuong,
    }
    # Fail early with a clear message instead of hitting an unbound local
    # after the encoder has already been built.
    if task_name not in decoder_classes:
        raise ValueError(
            f"Unknown task_name {task_name!r}; expected one of {list(decoder_classes)}"
        )

    # Configuration
    ENC_EMB = 256
    DEC_EMB = 256
    HID_DIM = 256
    N_LAYERS = 3
    ENC_DROP = 0.5
    DEC_DROP = 0.5
    LR = 0.001
    N_EPOCHS = 1  # Even with a single epoch, press Stop if it takes too long

    INPUT_DIM = len(train_dataset.src_vocab)
    OUTPUT_DIM = len(train_dataset.tgt_vocab)

    # Free memory left over from a previous experiment
    torch.cuda.empty_cache()
    gc.collect()

    print(f"\n{'='*20}\nBẮT ĐẦU: {task_name}\n{'='*20}")

    # Build the model
    enc = Encoder(INPUT_DIM, ENC_EMB, HID_DIM, N_LAYERS, ENC_DROP)
    dec = decoder_classes[task_name](OUTPUT_DIM, DEC_EMB, HID_DIM, N_LAYERS, DEC_DROP)

    model = Seq2Seq(enc, dec, device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=LR)
    # Index 0 is <PAD>; padded positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    # Catch a manual stop so the partially trained model is still evaluated
    try:
        for epoch in range(N_EPOCHS):
            start_time = time.time()
            print(f"Epoch {epoch+1} đang chạy... (Bấm Stop để dừng sớm và giữ model)")

            # Train loop
            train_loss = train_epoch(model, train_loader, optimizer, criterion, 1)

            end_time = time.time()
            print(f'Epoch: {epoch+1:02} | Time: {end_time - start_time:.0f}s | Train Loss: {train_loss:.3f}')

    except KeyboardInterrupt:
        print("\n\n>>> ĐÃ DỪNG THỦ CÔNG! (Bạn đã bấm nút Stop)")
        print(">>> Đang lưu lại trạng thái Model hiện tại để đánh giá...")

    # Evaluate the model
    print("\nĐang đánh giá ROUGE-L trên tập Validation...")
    rouge_l = evaluate_rouge(model, valid_loader, train_dataset.tgt_vocab)
    print(f'>> KẾT QUẢ CUỐI CÙNG {task_name} - ROUGE-L: {rouge_l:.4f}')

    return model

# MAIN EXECUTION

In [None]:
# run_experiment and train_epoch call gc.collect(), so gc must be imported
# before the first experiment runs.
import gc

# Task 1: encoder-decoder with DecoderBasic (no attention).
run_experiment('Bai 1 (Basic)')

In [None]:
# Task 2: encoder-decoder with DecoderBahdanau (additive attention).
run_experiment('Bai 2 (Bahdanau)')

In [None]:
# Task 3: encoder-decoder with DecoderLuong (multiplicative attention).
run_experiment('Bai 3 (Luong)')