## Import và định nghĩa các hằng số, tham số

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

import numpy as np
import random
import math
import time
import os
import re
import requests
from tqdm.notebook import tqdm
import zipfile
import traceback

# --- Constants ---
# Filesystem locations for the parallel corpus, the trained tokenizers,
# the GloVe vectors and the model checkpoint.
DATA_DIR = 'data'
EN_FILE = os.path.join(DATA_DIR, 'en_sents')
VI_FILE = os.path.join(DATA_DIR, 'vi_sents')
TOKENIZER_DIR = 'tokenizers'
GLOVE_DIR = 'glove_data'
GLOVE_ZIP_URL = 'http://nlp.stanford.edu/data/glove.6B.zip'
GLOVE_ZIP_FILENAME = 'glove.6B.zip'
GLOVE_FILENAME = 'glove.6B.300d.txt' 
GLOVE_PATH = os.path.join(GLOVE_DIR, GLOVE_FILENAME)
MODEL_SAVE_PATH = 'seq2seq-gru-bidir-glove-hf.pt'

# Special tokens
# The order of this list is the order handed to the tokenizer trainer.
UNK_TOKEN = "[UNK]"
PAD_TOKEN = "[PAD]"
SOS_TOKEN = "[SOS]"
EOS_TOKEN = "[EOS]"
SPECIAL_TOKENS = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN, UNK_TOKEN]

# --- Hyperparameters ---
MAX_VOCAB_SIZE = 20000
EMBEDDING_DIM_GLOVE = 300
EMBEDDING_DIM_VI = 256
HIDDEN_DIM = 512
NUM_LAYERS = 2 # Number of layers for each direction of the encoder GRU and for the decoder GRU
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.001
BATCH_SIZE = 16 
NUM_EPOCHS = 2  
CLIP = 1.0
TEACHER_FORCING_RATIO = 0.5
FREEZE_GLOVE = True
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {DEVICE}")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Num Epochs: {NUM_EPOCHS}")

Using device: cpu
Batch Size: 16
Num Epochs: 2


## Định nghĩa các hàm tiện ích

In [3]:
# --- Utility Functions ---
def normalize_string(s):
    """Lower-case *s*, collapse every whitespace run to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", s.lower())
    return collapsed.strip()

def read_raw_data(file_path):
    """Read a UTF-8 text file and return its normalized, non-empty lines.

    Each line goes through ``normalize_string``; lines that are empty after
    normalization are dropped, so positions in the returned list need not
    match line numbers in the file.

    Returns:
        The list of normalized lines, or ``None`` when the file is missing
        or cannot be read (a message is printed in both cases).
    """
    try:
        lines = []
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                cleaned = normalize_string(raw_line)
                if cleaned:
                    lines.append(cleaned)
        print(f"Successfully read {len(lines)} lines from {file_path}")
        return lines
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return None
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

def download_file(url, dest_folder, filename):
    """Make sure the GloVe text file exists, downloading and unzipping if needed.

    The function is a no-op when ``GLOVE_PATH`` already exists. Otherwise it
    downloads ``url`` to ``dest_folder/filename`` (unless that archive is
    already present) and extracts ``GLOVE_FILENAME`` from it.

    Args:
        url: Location of the zip archive.
        dest_folder: Directory that receives the archive and extracted file.
        filename: Archive file name; must end in ``.zip``.

    Returns:
        ``True`` when ``GLOVE_PATH`` exists on return, ``False`` on any
        download or extraction failure (a message is printed).
    """
    os.makedirs(dest_folder, exist_ok=True)
    zip_dest_path = os.path.join(dest_folder, filename)

    if os.path.exists(GLOVE_PATH):
        print(f"{GLOVE_FILENAME} exists.")
        return True

    if os.path.exists(zip_dest_path):
        print(f"{filename} (zip) exists.")
    else:
        print(f"Downloading {filename} from {url}...")
        # Download into a temporary name so an interrupted transfer is never
        # mistaken for a complete archive on the next run.
        tmp_path = zip_dest_path + '.tmp'
        try:
            received = 0
            # The context managers release the HTTP connection, the file
            # handle and the progress bar even if the transfer fails midway.
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))
                with open(tmp_path, 'wb') as f, \
                        tqdm(total=total_size, unit='iB', unit_scale=True,
                             desc=f"Downloading {filename}") as t:
                    for data in response.iter_content(1024 * 10):
                        t.update(len(data))
                        f.write(data)
                        received += len(data)
            if total_size != 0 and received != total_size:
                print(f"ERROR: DL incomplete.")
                os.remove(tmp_path)
                return False
            os.rename(tmp_path, zip_dest_path)
            print(f"Downloaded {filename}.")
        except Exception as e:
            print(f"Download error: {e}")
            # Best-effort cleanup of the partial file.
            try:
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
            except OSError:
                pass
            return False

    if not filename.endswith('.zip'):
        print(f"Expected zip, got {filename}")
        return False
    if not os.path.exists(zip_dest_path):
        return False

    print(f"Unzipping {filename}...")
    try:
        with zipfile.ZipFile(zip_dest_path, 'r') as zf:
            if GLOVE_FILENAME in zf.namelist():
                zf.extract(GLOVE_FILENAME, dest_folder)
            else:
                print(f"Warn: {GLOVE_FILENAME} not in zip. Extract all...")
                zf.extractall(dest_folder)
        if not os.path.exists(GLOVE_PATH):
            print(f"Error: {GLOVE_FILENAME} not found after unzip.")
            return False
        print("Unzip successful.")
        return True
    except Exception as e:
        print(f"Unzip error: {e}")
        return False

# --- Tokenizer Training/Loading ---
def train_or_load_tokenizer(lang, sentences_iterator, vocab_size, min_frequency=2):
    """Load the WordPiece tokenizer for *lang* from disk, or train and save one.

    The tokenizer lives at ``TOKENIZER_DIR/<lang>_tokenizer.json``. If that
    file exists but cannot be loaded it is deleted and the tokenizer is
    retrained. After loading or training, right-padding with ``PAD_TOKEN`` is
    enabled and a WordPiece decoder is attached when none is set, so that
    ``decode()`` merges ``##`` continuation pieces back into whole words.

    Args:
        lang: Language tag used in the file name and in log messages.
        sentences_iterator: Iterable of training sentences (materialized
            into a list because the trainer is given its length).
        vocab_size: Target vocabulary size for the trainer.
        min_frequency: Minimum frequency passed to the trainer.

    Returns:
        The ready-to-use ``Tokenizer``, or ``None`` when training or saving
        fails or there are no sentences to train on.
    """
    # Local import: the file-level imports do not bring in a decoder class.
    from tokenizers.decoders import WordPiece as WordPieceDecoder

    tokenizer_path = os.path.join(TOKENIZER_DIR, f'{lang}_tokenizer.json')
    os.makedirs(TOKENIZER_DIR, exist_ok=True)

    if os.path.exists(tokenizer_path):
        print(f"Loading tokenizer for {lang} from {tokenizer_path}")
        try:
            tokenizer = Tokenizer.from_file(tokenizer_path)
        except Exception as e:
            print(f"Error loading tokenizer: {e}. Retraining...")
            # Remove the unreadable file so the recursive call takes the
            # training branch instead of failing on the same file again.
            os.remove(tokenizer_path)
            return train_or_load_tokenizer(lang, sentences_iterator, vocab_size, min_frequency)
    else:
        print(f"Training tokenizer for {lang}...")
        tokenizer = Tokenizer(WordPiece(unk_token=UNK_TOKEN))
        tokenizer.pre_tokenizer = Whitespace()
        # Set before saving so the persisted tokenizer decodes cleanly too.
        tokenizer.decoder = WordPieceDecoder()
        trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=SPECIAL_TOKENS,
                                   min_frequency=min_frequency)
        try:
            sentence_list = list(sentences_iterator)  # a list is needed to report its length
            if not sentence_list:
                print(f"No sentences for {lang} tokenizer.")
                return None
            tokenizer.train_from_iterator(sentence_list, trainer=trainer, length=len(sentence_list))
        except Exception as e:
            print(f"Error during {lang} tokenizer training: {e}")
            return None

        # Wrap every encoded sentence as "[SOS] ... [EOS]".
        sos_id, eos_id = tokenizer.token_to_id(SOS_TOKEN), tokenizer.token_to_id(EOS_TOKEN)
        if sos_id is not None and eos_id is not None:
            tokenizer.post_processor = TemplateProcessing(
                single=f"{SOS_TOKEN} $A {EOS_TOKEN}",
                special_tokens=[(SOS_TOKEN, sos_id), (EOS_TOKEN, eos_id)],
            )
            print(f"Set post-processor for {lang}.")
        else:
            print(f"Warning: SOS/EOS not in {lang} vocab.")

        try:
            tokenizer.save(tokenizer_path)
            print(f"Saved tokenizer for {lang}.")
        except Exception as e:
            print(f"Error saving tokenizer: {e}")
            return None

    # Tokenizer files saved by earlier runs carry no decoder; without one,
    # decode() leaves the "##" sub-word markers in the output text.
    if tokenizer.decoder is None:
        tokenizer.decoder = WordPieceDecoder()

    pad_id = tokenizer.token_to_id(PAD_TOKEN)
    if pad_id is not None:
        tokenizer.enable_padding(pad_id=pad_id, pad_token=PAD_TOKEN, direction='right')
        print(f"Enabled padding for {lang} (ID: {pad_id}).")
    else:
        print(f"Warning: {PAD_TOKEN} not in {lang} tokenizer.")
    return tokenizer

# --- GloVe Loading ---
def load_glove_embeddings(glove_path, embedding_dim, tokenizer):
    """Build an embedding matrix for the tokenizer vocabulary from a GloVe text file.

    Rows are indexed by tokenizer id. Tokens found in the GloVe file receive
    their GloVe vector; all other rows stay zero, except that UNK/SOS/EOS
    rows that are still all-zero get small random values. The PAD row is
    forced to zero.

    Args:
        glove_path: Path to the GloVe ``.txt`` file.
        embedding_dim: Expected vector length; lines with fewer fields, or
            whose vector has a different length, are skipped.
        tokenizer: Object providing ``get_vocab``, ``get_vocab_size`` and
            ``token_to_id``.

    Returns:
        A float tensor of shape ``(vocab_size, embedding_dim)``, or ``None``
        when the file is missing or reading it fails.
    """
    print(f"Loading GloVe embeddings from {glove_path}...")
    if not os.path.exists(glove_path):
        print(f"Error: GloVe file not found at {glove_path}")
        return None

    # token -> row index, taken from the trained tokenizer
    word_to_idx = tokenizer.get_vocab()
    vocab_size = tokenizer.get_vocab_size()
    print(f"Tokenizer vocab size (for embedding matrix): {vocab_size}")
    embeddings = np.zeros((vocab_size, embedding_dim), dtype=np.float32)
    found_words = 0

    try:
        line_count = 0
        print("Reading GloVe file (this may take a while)...")
        with open(glove_path, 'r', encoding='utf-8') as glove_file:
            for line_count, line in enumerate(glove_file, start=1):
                # Plain-text progress report every 100k lines.
                if line_count % 100000 == 0:
                    print(f"  Processed {line_count} lines...")

                fields = line.split()
                if len(fields) < embedding_dim + 1:
                    continue  # too short to hold a word plus a full vector
                word = fields[0]
                if word not in word_to_idx:
                    continue
                try:
                    vector = np.array(fields[1:], dtype=np.float32)
                except ValueError:
                    continue  # vector part is not numeric
                if vector.shape[0] != embedding_dim:
                    continue
                embeddings[word_to_idx[word]] = vector
                found_words += 1
        print(f"Finished reading {line_count} lines.")
    except Exception as e:
        print(f"An error occurred while reading GloVe file: {e}")
        traceback.print_exc()
        return None

    print(f"Loaded {found_words}/{len(word_to_idx)} words from GloVe file into embedding matrix.")

    # --- Special-token rows ---
    pad_idx = tokenizer.token_to_id(PAD_TOKEN)
    unk_idx = tokenizer.token_to_id(UNK_TOKEN)
    sos_idx = tokenizer.token_to_id(SOS_TOKEN)
    eos_idx = tokenizer.token_to_id(EOS_TOKEN)

    # The PAD row must be zero when nn.Embedding is built with padding_idx.
    if pad_idx is not None:
        embeddings[pad_idx] = np.zeros(embedding_dim)
        print(f"Set PAD token embedding (Index: {pad_idx}) to zeros.")

    # UNK, SOS and EOS get small random vectors when their row is still zero.
    for label, token_idx in (("UNK", unk_idx), ("SOS", sos_idx), ("EOS", eos_idx)):
        if token_idx is None:
            continue
        if np.all(embeddings[token_idx] == 0):
            print(f"Initializing {label} token embedding (Index: {token_idx}) randomly.")
            embeddings[token_idx] = np.random.randn(embedding_dim) * 0.01

    return torch.tensor(embeddings, dtype=torch.float)

# --- Dataset and Collate Function ---
class TranslationDatasetHF(Dataset):
    """Parallel-sentence dataset that tokenizes on access.

    Each item is a ``(src_ids, trg_ids)`` pair of 1-D ``long`` tensors
    produced by the two tokenizers. A sentence that encodes to nothing is
    replaced by a single PAD id so the tensor is never empty.
    """

    def __init__(self, src_sentences, trg_sentences, src_tokenizer: Tokenizer, trg_tokenizer: Tokenizer):
        """Store the sentence pairs and tokenizers.

        Raises:
            ValueError: if either list is empty or their lengths differ.
        """
        if not src_sentences or not trg_sentences:
            raise ValueError("Sentences list empty.")
        if len(src_sentences) != len(trg_sentences):
            raise ValueError("Sentence lists length mismatch.")
        self.src_s = src_sentences
        self.trg_s = trg_sentences
        self.src_tok = src_tokenizer
        self.trg_tok = trg_tokenizer
        # PAD ids are cached for the empty-encoding fallback in __getitem__.
        self.pad_id_src = src_tokenizer.token_to_id(PAD_TOKEN)
        self.pad_id_trg = trg_tokenizer.token_to_id(PAD_TOKEN)

    def __len__(self):
        return len(self.src_s)

    def __getitem__(self, idx):
        src_encoding = self.src_tok.encode(self.src_s[idx])
        trg_encoding = self.trg_tok.encode(self.trg_s[idx])

        if src_encoding and src_encoding.ids:
            src_ids = src_encoding.ids
        else:
            src_ids = [self.pad_id_src]

        if trg_encoding and trg_encoding.ids:
            trg_ids = trg_encoding.ids
        else:
            trg_ids = [self.pad_id_trg]

        src_tensor = torch.tensor(src_ids, dtype=torch.long)
        trg_tensor = torch.tensor(trg_ids, dtype=torch.long)
        return src_tensor, trg_tensor

def collate_fn_hf(batch, pad_idx_src, pad_idx_trg):
    """Pad a batch of ``(src, trg)`` tensor pairs into two batch-first tensors.

    Source sequences are padded with ``pad_idx_src`` and target sequences
    with ``pad_idx_trg``, each to the longest sequence in the batch.
    """
    sources = [pair[0] for pair in batch]
    targets = [pair[1] for pair in batch]
    padded_src = pad_sequence(sources, batch_first=True, padding_value=pad_idx_src)
    padded_trg = pad_sequence(targets, batch_first=True, padding_value=pad_idx_trg)
    return padded_src, padded_trg

# Cell status message: confirms that the definitions in this cell ran.
print("Utility functions, Tokenizer, GloVe loader, Dataset defined.")


Utility functions, Tokenizer, GloVe loader, Dataset defined.


## Định nghĩa các lớp model

In [4]:
# --- Model Definition (Encoder with Bidirectional GRU) ---
class EncoderGRU(nn.Module):
    """Bidirectional multi-layer GRU encoder with optional pre-trained embeddings.

    The final forward and backward hidden states of the top layer are
    concatenated, projected to ``hidden_dim`` through a tanh-activated linear
    layer, and repeated ``num_layers`` times to serve as the decoder's
    initial hidden state.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers, dropout,
                 embedding_weights=None, freeze_emb=True, pad_idx=0):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.actual_emb_dim = emb_dim

        has_pretrained = embedding_weights is not None
        if has_pretrained:
            print("Encoder: Using pre-trained embeddings.")
            pretrained_dim = embedding_weights.shape[1]
            if emb_dim != pretrained_dim:
                # The pre-trained matrix decides the embedding width.
                print(f"Warn: Enc emb({emb_dim}) != GloVe({pretrained_dim}). Using GloVe.")
                self.actual_emb_dim = pretrained_dim
            self.embedding = nn.Embedding.from_pretrained(
                embedding_weights, freeze=freeze_emb, padding_idx=pad_idx)
        else:
            print("Encoder: Random embeddings.")
            self.embedding = nn.Embedding(input_dim, self.actual_emb_dim, padding_idx=pad_idx)

        frozen_label = freeze_emb if has_pretrained else 'N/A'
        print(f"Encoder emb shape: {self.embedding.weight.shape}, Frozen: {frozen_label}")

        # nn.GRU applies dropout only between layers, so it is disabled for a single layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.gru = nn.GRU(self.actual_emb_dim, hidden_dim, num_layers,
                          dropout=inter_layer_dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc_hidden = nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, src_seq):
        """Encode a batch-first tensor of token ids.

        Returns:
            ``(enc_outputs, processed_hidden)``: the per-step GRU outputs and
            a ``(num_layers, batch, hidden_dim)`` initial decoder state.
        """
        embedded = self.dropout(self.embedding(src_seq))
        enc_outputs, hidden = self.gru(embedded)
        # hidden[-2] / hidden[-1] are the last two direction-layers of the stacked GRU state.
        last_forward, last_backward = hidden[-2], hidden[-1]
        merged = torch.cat((last_forward, last_backward), dim=1)
        context_vec = torch.tanh(self.fc_hidden(merged))
        processed_hidden = context_vec.unsqueeze(0).repeat(self.num_layers, 1, 1)
        return enc_outputs, processed_hidden

class DecoderGRU(nn.Module):
    """Plain GRU decoder (no attention) that predicts one target token per call."""

    def __init__(self, output_dim, emb_dim, hidden_dim, num_layers, dropout, pad_idx=0):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        print("Decoder: Random embeddings.")
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        print(f"Decoder emb shape: {self.embedding.weight.shape}")

        # nn.GRU applies dropout only between layers, so it is disabled for a single layer.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.gru = nn.GRU(emb_dim, hidden_dim, num_layers,
                          dropout=inter_layer_dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_step, hidden_state):
        """Run one decoding step.

        Args:
            input_step: 1-D tensor holding one token id per batch element.
            hidden_state: Current GRU hidden state.

        Returns:
            ``(logits, new_hidden)`` where ``logits`` has one row per batch
            element and ``output_dim`` columns.
        """
        # Add a length-1 time axis so the batch-first GRU sees a one-step sequence.
        step = input_step.unsqueeze(1)
        embedded = self.dropout(self.embedding(step))
        gru_out, new_hidden = self.gru(embedded, hidden_state)
        logits = self.fc_out(gru_out.squeeze(1))
        return logits, new_hidden

class Seq2SeqGRU(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        # The encoder's projected hidden state is fed straight into the decoder,
        # so both must agree on depth and width.
        assert encoder.num_layers == decoder.num_layers
        assert encoder.hidden_dim == decoder.hidden_dim

    def forward(self, src_seq, trg_seq, teacher_forcing_ratio=0.5):
        """Return decoder logits for every target position.

        Position 0 of the output stays zero: decoding starts from the first
        target token and predictions are written from position 1 onwards.
        At each step the next input is the ground-truth token with
        probability ``teacher_forcing_ratio``, else the model's own argmax.
        """
        batch_size, trg_len = trg_seq.shape[0], trg_seq.shape[1]
        trg_vocab_size = self.decoder.output_dim
        dec_outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        # The per-step encoder outputs are unused by this attention-free decoder.
        _, hidden = self.encoder(src_seq)
        step_input = trg_seq[:, 0]
        for t in range(1, trg_len):
            pred, hidden = self.decoder(step_input, hidden)
            dec_outputs[:, t, :] = pred
            use_ground_truth = random.random() < teacher_forcing_ratio
            step_input = trg_seq[:, t] if use_ground_truth else pred.argmax(1)
        return dec_outputs

# Cell status message: confirms that the model classes above were defined.
print("Model classes (EncoderGRU, DecoderGRU, Seq2SeqGRU) defined.")

Model classes (EncoderGRU, DecoderGRU, Seq2SeqGRU) defined.


## Định nghĩa các hàm khởi tạo, huấn luyện, đánh giá, timing, inference

In [5]:
# --- Weight Initialization ---
def init_weights(m: nn.Module):
    """Initialize the trainable parameters of *m* and keep padding rows at zero.

    Trainable tensors with more than one dimension get Xavier-uniform
    values and trainable biases are set to zero. Parameters with
    ``requires_grad=False`` (e.g. a frozen pre-trained embedding) are left
    untouched.

    The frozen check is made on the parameter itself rather than on its
    name: when this function is passed to ``model.apply`` it is called once
    per submodule and ``named_parameters`` yields names relative to that
    submodule, so a match on a full dotted name would miss the embedding
    and overwrite the pre-trained weights.

    Args:
        m: Module whose parameters (including those of its children) are
            initialized in place.
    """
    for name, param in m.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights must survive initialization
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
        elif 'bias' in name:
            nn.init.constant_(param, 0)

    # Xavier init also fills the padding row of trainable embeddings; reset it
    # so the padding token keeps a zero vector.
    for module in m.modules():
        if (isinstance(module, nn.Embedding) and module.padding_idx is not None
                and module.weight.requires_grad):
            with torch.no_grad():
                module.weight[module.padding_idx].zero_()

# --- Training & Evaluation Loops ---
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss over processed batches.

    Batches whose loss is NaN/Inf are skipped, and a batch that raises is
    reported and skipped. Gradients are clipped to norm ``clip`` before each
    optimizer step.

    Returns:
        The average batch loss, or ``float('inf')`` when no batch was
        processed.
    """
    model.train()
    total_loss = 0
    batches_done = 0
    print(f"Starting training epoch...")
    progress = tqdm(iterator, desc=f"Training", leave=False, dynamic_ncols=True)
    for batch_idx, batch in enumerate(progress):
        try:
            src, trg = batch
            src = src.to(DEVICE)
            trg = trg.to(DEVICE)

            optimizer.zero_grad()
            output = model(src, trg, TEACHER_FORCING_RATIO)

            # Position 0 is the start token and is never predicted, so it is
            # dropped from both logits and targets before flattening.
            vocab_dim = output.shape[-1]
            flat_logits = output[:, 1:].reshape(-1, vocab_dim)
            flat_targets = trg[:, 1:].reshape(-1)
            loss = criterion(flat_logits, flat_targets)

            if torch.isnan(loss) or torch.isinf(loss):
                print(f"NaN/Inf loss at train batch {batch_idx}. Skip.")
                continue

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            total_loss += loss.item()
            batches_done += 1
            progress.set_postfix(loss=f"{loss.item():.4f}")
        except Exception as e:
            print(f"\nTrain batch {batch_idx} err: {e}")
            traceback.print_exc()
            # Drop any partial gradients left by the failed batch.
            optimizer.zero_grad()
            continue

    if batches_done == 0:
        return float('inf')
    return total_loss / batches_done

def evaluate(model, iterator, criterion):
    """Compute the mean loss over *iterator* without updating the model.

    The model runs in eval mode under ``torch.no_grad`` with teacher forcing
    disabled. Batches with NaN/Inf loss, or that raise, are skipped.

    Returns:
        The average batch loss, or ``float('inf')`` when no batch was
        processed.
    """
    model.eval()
    total_loss = 0
    batches_done = 0
    print(f"Starting evaluation...")
    progress = tqdm(iterator, desc="Evaluating", leave=False, dynamic_ncols=True)
    with torch.no_grad():
        for batch_idx, batch in enumerate(progress):
            try:
                src, trg = batch
                src = src.to(DEVICE)
                trg = trg.to(DEVICE)

                # Ratio 0: the decoder always consumes its own predictions.
                output = model(src, trg, 0)

                vocab_dim = output.shape[-1]
                flat_logits = output[:, 1:].reshape(-1, vocab_dim)
                flat_targets = trg[:, 1:].reshape(-1)
                loss = criterion(flat_logits, flat_targets)

                if torch.isnan(loss) or torch.isinf(loss):
                    print(f"NaN/Inf loss at eval batch {batch_idx}. Skip.")
                    continue

                total_loss += loss.item()
                batches_done += 1
                progress.set_postfix(loss=f"{loss.item():.4f}")
            except Exception as e:
                print(f"\nEval batch {batch_idx} err: {e}")
                traceback.print_exc()
                continue

    if batches_done == 0:
        return float('inf')
    return total_loss / batches_done

# --- Helper function for timing ---
def epoch_time(start_time, end_time):
    """Split the elapsed time between two timestamps into whole minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - (minutes * 60))
    return minutes, seconds

# --- Inference Function ---
def translate_sentence_hf(sentence, src_tokenizer, trg_tokenizer, model, device, max_len=50):
    """Greedily translate one sentence with the trained encoder-decoder.

    Args:
        sentence: Raw source-language text.
        src_tokenizer: Tokenizer used to encode the source sentence.
        trg_tokenizer: Tokenizer used to look up SOS/EOS and decode the result.
        model: Object exposing ``encoder`` and ``decoder`` callables.
        device: Device the input tensors are moved to.
        max_len: Maximum number of decoding steps.

    Returns:
        The decoded translation, or a short diagnostic string when the input
        is invalid, tokenizes to nothing, SOS/EOS are missing, or decoding
        raises.
    """
    model.eval()
    if not (isinstance(sentence, str) and sentence.strip()):
        return "Invalid input."

    encoded = src_tokenizer.encode(normalize_string(sentence))
    if not (encoded and encoded.ids):
        return "Empty after tokenization."
    src_tensor = torch.LongTensor(encoded.ids).unsqueeze(0).to(device)

    sos_id = trg_tokenizer.token_to_id(SOS_TOKEN)
    eos_id = trg_tokenizer.token_to_id(EOS_TOKEN)
    if sos_id is None or eos_id is None:
        print("Err: Target SOS/EOS not found.")
        return "Translation setup err."

    generated_ids = []
    try:
        with torch.no_grad():
            # Only the initial hidden state is needed from the encoder.
            _, hidden = model.encoder(src_tensor)
            next_token = sos_id
            for _ in range(max_len):
                step_input = torch.LongTensor([next_token]).to(device)
                logits, hidden = model.decoder(step_input, hidden)
                next_token = logits.argmax(1).item()
                # EOS ends decoding and is not part of the result.
                if next_token == eos_id:
                    break
                generated_ids.append(next_token)
        return trg_tokenizer.decode(generated_ids, skip_special_tokens=True)
    except Exception as e:
        print(f"Inference err: {e}")
        traceback.print_exc()
        return "Translation err."

# Cell status message: confirms that the helper functions above were defined.
print("Helper functions (init, train, eval, time, translate) defined.")


Helper functions (init, train, eval, time, translate) defined.


## Chuẩn bị glove


In [6]:
# Prepare GloVe: make sure the vector file is on disk before continuing.
print("--- Step 1: Preparing GloVe ---")
if download_file(GLOVE_ZIP_URL, GLOVE_DIR, GLOVE_ZIP_FILENAME):
    print("GloVe preparation successful or file already exists.")
else:
    print("GloVe preparation failed. Please ensure the GloVe file exists or download is possible.")
    # Raising stops the run here, whether this is a notebook cell or a script.
    raise RuntimeError("GloVe preparation failed.")

--- Step 1: Preparing GloVe ---
glove.6B.300d.txt exists.
GloVe preparation successful or file already exists.


## Tải và phân chia dữ liệu 

In [7]:
# Load the raw parallel corpus and split it 80/10/10 into train/validation/test.
print("\n--- Step 2 & 3: Loading Raw Data and Splitting ---")
en_sents_raw = read_raw_data(EN_FILE)
vi_sents_raw = read_raw_data(VI_FILE)

if (
    en_sents_raw is None
    or vi_sents_raw is None
    or len(en_sents_raw) != len(vi_sents_raw)
    or not en_sents_raw
):
    raise RuntimeError("Data loading error or mismatch or empty. Exiting.")

# Pair the sentences up, then shuffle with a fixed seed for a reproducible split.
combined = list(zip(en_sents_raw, vi_sents_raw))
random.seed(42)
random.shuffle(combined)

total_len = len(combined)
train_len = int(0.8 * total_len)
valid_len = int(0.1 * total_len)
test_len = total_len - train_len - valid_len  # the test set takes the rounding remainder

if train_len == 0 or valid_len == 0 or test_len == 0:
    raise RuntimeError(f"Dataset too small ({total_len}) to split. Exiting.")

train_data = combined[:train_len]
valid_data = combined[train_len:train_len + valid_len]
test_data = combined[train_len + valid_len:]

# Unzip each split back into parallel (English, Vietnamese) tuples.
en_train_sents, vi_train_sents = zip(*train_data)
en_valid_sents, vi_valid_sents = zip(*valid_data)
en_test_sents, vi_test_sents = zip(*test_data)

print(f"Data Split - Train: {len(en_train_sents)}, Validation: {len(en_valid_sents)}, Test: {len(en_test_sents)}")


--- Step 2 & 3: Loading Raw Data and Splitting ---
Successfully read 254090 lines from data/en_sents
Successfully read 254090 lines from data/vi_sents
Data Split - Train: 203272, Validation: 25409, Test: 25409


## Chuẩn bị Tokenizers

In [8]:
# Prepare tokenizers: load the saved English/Vietnamese tokenizers, or train
# them on the training split only.
print("\n--- Step 4: Preparing Tokenizers ---")
# The sentence tuples are converted to lists before being handed over.
tokenizer_en = train_or_load_tokenizer("en", list(en_train_sents), vocab_size=MAX_VOCAB_SIZE)
tokenizer_vi = train_or_load_tokenizer("vi", list(vi_train_sents), vocab_size=MAX_VOCAB_SIZE)

if tokenizer_en is None or tokenizer_vi is None:
    raise RuntimeError("Tokenizer prep error. Exiting.")

# Vocabulary sizes and PAD ids drive the embedding shapes, padding and loss masking below.
INPUT_VOCAB_SIZE = tokenizer_en.get_vocab_size()
OUTPUT_VOCAB_SIZE = tokenizer_vi.get_vocab_size()
PAD_IDX_EN = tokenizer_en.token_to_id(PAD_TOKEN)
PAD_IDX_VI = tokenizer_vi.token_to_id(PAD_TOKEN)

if PAD_IDX_EN is None or PAD_IDX_VI is None:
    raise RuntimeError(f"Error: PAD token missing. Exiting.")

print(f"EN Vocab: {INPUT_VOCAB_SIZE}, VI Vocab: {OUTPUT_VOCAB_SIZE}, EN PAD: {PAD_IDX_EN}, VI PAD: {PAD_IDX_VI}")

# --- Keep the target-side SOS/EOS indices needed at inference time ---
SOS_IDX_VI = tokenizer_vi.token_to_id(SOS_TOKEN)
EOS_IDX_VI = tokenizer_vi.token_to_id(EOS_TOKEN)
if SOS_IDX_VI is None or EOS_IDX_VI is None:
     print("Warning: SOS or EOS index for target language not found in tokenizer.")


--- Step 4: Preparing Tokenizers ---
Loading tokenizer for en from tokenizers/en_tokenizer.json
Enabled padding for en (ID: 0).
Loading tokenizer for vi from tokenizers/vi_tokenizer.json
Enabled padding for vi (ID: 0).
EN Vocab: 20000, VI Vocab: 9660, EN PAD: 0, VI PAD: 0


## Load Glove Embedding

In [9]:
# Block 5.4: Load GloVe embeddings for the English (source) vocabulary.
print("\n--- Step 5: Loading GloVe Embeddings ---")
glove_embeddings = load_glove_embeddings(GLOVE_PATH, EMBEDDING_DIM_GLOVE, tokenizer_en)
# A failed load returns None, which makes the encoder fall back to randomly
# initialized embeddings; say so instead of degrading silently.
if glove_embeddings is None:
    print("Warning: GloVe failed. Encoder using random embeddings.")


--- Step 5: Loading GloVe Embeddings ---
Loading GloVe embeddings from glove_data/glove.6B.300d.txt...
Tokenizer vocab size (for embedding matrix): 20000
Reading GloVe file (this may take a while)...
  Processed 100000 lines...
  Processed 200000 lines...
  Processed 300000 lines...
  Processed 400000 lines...
Finished reading 400000 lines.
Loaded 14781/20000 words from GloVe file into embedding matrix.
Set PAD token embedding (Index: 0) to zeros.
Initializing UNK token embedding (Index: 3) randomly.
Initializing SOS token embedding (Index: 1) randomly.
Initializing EOS token embedding (Index: 2) randomly.


## Tạo dataset và dataloaders

In [10]:
# Block 5.5: Build the datasets and their DataLoaders.
print("\n--- Step 6: Creating Datasets and DataLoaders ---")
try:
    train_dataset = TranslationDatasetHF(en_train_sents, vi_train_sents, tokenizer_en, tokenizer_vi)
    valid_dataset = TranslationDatasetHF(en_valid_sents, vi_valid_sents, tokenizer_en, tokenizer_vi)
    test_dataset = TranslationDatasetHF(en_test_sents, vi_test_sents, tokenizer_en, tokenizer_vi)
except ValueError as e:
    print(f"Dataset creation error: {e}")
    raise e  # re-raise after reporting


def collate_with_padding_hf(batch):
    """Collate a batch using the module-level English/Vietnamese PAD ids."""
    return collate_fn_hf(batch, PAD_IDX_EN, PAD_IDX_VI)


# Only the training loader shuffles and drops the last incomplete batch.
train_iterator = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_with_padding_hf,
    drop_last=True,
)
valid_iterator = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_with_padding_hf,
    drop_last=False,
)
test_iterator = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_with_padding_hf,
    drop_last=False,
)
print("DataLoaders created.")


--- Step 6: Creating Datasets and DataLoaders ---
DataLoaders created.


## Khởi tạo Model, optimizer và criterion

In [11]:
# Initialize the model, optimizer and loss criterion.
print("\n--- Step 7: Initializing Model (with Bidirectional Encoder) ---")
encoder = EncoderGRU(INPUT_VOCAB_SIZE, EMBEDDING_DIM_GLOVE, HIDDEN_DIM, NUM_LAYERS, DROPOUT_RATE,
                     embedding_weights=glove_embeddings, freeze_emb=FREEZE_GLOVE, pad_idx=PAD_IDX_EN).to(DEVICE)
decoder = DecoderGRU(OUTPUT_VOCAB_SIZE, EMBEDDING_DIM_VI, HIDDEN_DIM, NUM_LAYERS, DROPOUT_RATE,
                     pad_idx=PAD_IDX_VI).to(DEVICE)
model = Seq2SeqGRU(encoder, decoder, DEVICE).to(DEVICE)

# Important: weights are initialized AFTER the model has been moved to DEVICE.
# init_weights is called once on the top-level model rather than through
# model.apply(): apply() would invoke it on every submodule, where parameter
# names are relative to that submodule, so the 'encoder.embedding.weight'
# check inside init_weights would not match and the frozen GloVe matrix
# would be re-initialized.
init_weights(model); print("Model weights initialized.")

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX_VI); print("Optimizer & Criterion set.")


--- Step 7: Initializing Model (with Bidirectional Encoder) ---
Encoder: Using pre-trained embeddings.
Encoder emb shape: torch.Size([20000, 300]), Frozen: True
Decoder: Random embeddings.
Decoder emb shape: torch.Size([9660, 256])
Model weights initialized.
Optimizer & Criterion set.


## Vòng lặp huấn luyện 

In [None]:
# Training loop: one train pass and one validation pass per epoch; the
# weights with the lowest validation loss seen so far are checkpointed.
best_valid_loss = float('inf')
print("\n--- Step 8: Starting Training Loop ---")
for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    print(f"\nEpoch: {epoch+1:02}/{NUM_EPOCHS}")
    # train() and evaluate() print their own progress messages
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time(); epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Save the model when the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss; torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"\t-> Saved Best Model (Val Loss: {valid_loss:.3f})")
    else: print(f"\t   Validation loss did not improve from {best_valid_loss:.3f}")

    # Print the epoch results; the loss is capped at 700 before exp() when
    # computing perplexity so math.exp cannot overflow
    print(f'\tEpoch Time: {epoch_mins}m {epoch_secs}s')
    if train_loss != float('inf'): print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(min(train_loss, 700)):7.3f}')
    else: print('\tTrain Loss: Inf')
    if valid_loss != float('inf'): print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(min(valid_loss, 700)):7.3f}')
    else: print('\t Val. Loss: Inf')
print("\n--- Training Finished ---")


--- Step 8: Starting Training Loop ---

Epoch: 01/2
Starting training epoch...


Training:   0%|          | 0/12704 [00:00<?, ?it/s]

## Đánh giá trên tập test

In [None]:
# Evaluate the best checkpoint on the held-out test set.
print(f"\n--- Step 9: Evaluating Best Model on Test Set ---")
try:
    print(f"Loading best model from {MODEL_SAVE_PATH} for final test evaluation...")
    # The checkpoint is loaded into the existing model instance, which already
    # has the matching architecture.
    model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=DEVICE))
    print("Best model loaded.")
    test_loss = evaluate(model, test_iterator, criterion)
    if test_loss == float('inf'):
        print("| Test Loss: Inf |")
    else:
        print(f'| Final Test Loss: {test_loss:.3f} | Final Test PPL: {math.exp(min(test_loss, 700)):7.3f} |')
except FileNotFoundError:
    print(f"Model file '{MODEL_SAVE_PATH}' not found. Skipping test set evaluation.")
except Exception as e:
    print(f"Error during test set evaluation: {e}")
    traceback.print_exc()

## Vòng lặp tương tác

In [None]:
# Interactive translation loop.
print(f"\n--- Step 10 & 11: Loading Best Model & Interactive Translation ---")
interactive_mode = False
try:
    # The model currently in memory is used as-is; uncomment the line below
    # to reload the best checkpoint explicitly.
    # model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=DEVICE))
    print("Using best model loaded previously for interactive mode.")
    # Both tokenizers must exist and be non-empty before translation can run.
    if 'tokenizer_en' in locals() and 'tokenizer_vi' in locals() and tokenizer_en and tokenizer_vi:
         interactive_mode = True
    else:
         print("Error: Tokenizers not available for interactive mode.")

except FileNotFoundError: print(f"Error: Model file '{MODEL_SAVE_PATH}' not found for interactive mode.")
except Exception as e: print(f"Error loading model for interactive mode: {e}"); traceback.print_exc()

if interactive_mode:
    print("\nEnter an English sentence to translate (or type 'quit' to exit):")
    while True:
        try:
            input_sentence = input("> ")
            if input_sentence.lower().strip() == 'quit': break
            if not input_sentence.strip(): continue
            # translate_sentence_hf is defined in the helper-functions cell.
            translated_sentence = translate_sentence_hf(input_sentence, tokenizer_en, tokenizer_vi, model, DEVICE)
            print(f"VI: {translated_sentence}")
        # EOFError (stdin closed or exhausted) must end the loop as well:
        # caught by the generic handler below, every later input() call would
        # fail the same way and the loop would never terminate.
        except (KeyboardInterrupt, EOFError): print("\nExiting interactive mode."); break
        except Exception as e: print(f"Translation error: {e}"); traceback.print_exc()
    print("\nExited interactive translation mode.")
else:
    print("\nInteractive mode could not be started.")