In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import re
import string
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
from tqdm import tqdm

2025-08-07 11:19:58.195436: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754565598.558658      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754565598.667925      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# --- DEVICE SETUP ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# --- CLEANING FUNCTION ---
def clean_text(text):
    text = text.lower()
    text = re.sub(rf"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# --- LOAD DATA ---
df = pd.read_csv("/kaggle/input/vietnamese-sentences/sentences.csv")
df = df.dropna(subset=["sentence_with_diacritics", "sentence_without_diacritics"]).reset_index(drop=True)

# --- SHUFFLE ---
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# --- SPLIT ---
n = len(df)
n_train = int(0.8 * n)
n_val   = int(0.1 * n)
n_test  = n - n_train - n_val

df_train = df[:n_train].copy()
df_val   = df[n_train:n_train + n_val].copy()
df_test  = df[n_train + n_val:].copy()

# --- CLEAN + TOKENIZE ---
for df_ in [df_train, df_val, df_test]:
    df_['no_diacritics_clean']   = df_['sentence_without_diacritics'].astype(str).apply(clean_text)
    df_['with_diacritics_clean'] = df_['sentence_with_diacritics'].astype(str).apply(clean_text)
    df_['with_diacritics_clean'] = df_['with_diacritics_clean'].apply(lambda x: '<start> ' + x + ' <end>')

In [4]:
# --- TOKENIZATION ---
filters = '"!#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'  # preserve <> for start/end
src_tokenizer = Tokenizer(oov_token='<unk>', filters=filters)
tgt_tokenizer = Tokenizer(oov_token='<unk>', filters=filters)
src_tokenizer.fit_on_texts(df_train['no_diacritics_clean'])
tgt_tokenizer.fit_on_texts(df_train['with_diacritics_clean'])

SRC_VOCAB_SIZE = len(src_tokenizer.word_index) + 1
TGT_VOCAB_SIZE = len(tgt_tokenizer.word_index) + 1
MAX_LEN = 70

with open("tokenized.pkl", "wb") as f:
    pickle.dump({
        "src_tokenizer": src_tokenizer,
        "tgt_tokenizer": tgt_tokenizer,
        "SRC_VOCAB_SIZE": SRC_VOCAB_SIZE,
        "TGT_VOCAB_SIZE": TGT_VOCAB_SIZE,
        "MAX_LEN": MAX_LEN
    }, f)

In [5]:
# encode & pad
def encode_and_pad(texts, tokenizer):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, maxlen=MAX_LEN, padding='post')

train_src = encode_and_pad(df_train['no_diacritics_clean'], src_tokenizer)
train_tgt = encode_and_pad(df_train['with_diacritics_clean'], tgt_tokenizer)
val_src   = encode_and_pad(df_val['no_diacritics_clean'], src_tokenizer)
val_tgt   = encode_and_pad(df_val['with_diacritics_clean'], tgt_tokenizer)

In [6]:
# Câu gốc không dấu và có dấu
no_diacritic_sentence = df_train['no_diacritics_clean'].iloc[0]
with_diacritic_sentence = df_train['with_diacritics_clean'].iloc[0]

# Tokenized & padded
src_tokens = train_src[0]
tgt_tokens = train_tgt[0]

print("Câu không dấu     :", no_diacritic_sentence)
print("Câu có dấu        :", with_diacritic_sentence)
print("Token hóa (src)   :", src_tokens)
print("Token hóa (tgt)   :", tgt_tokens)

# Nếu muốn xem lại dạng từ (decode ngược)
print("\nGiải mã lại từ token:")
print("src decode:", ' '.join(src_tokenizer.sequences_to_texts([src_tokens])[0].split()))
print("tgt decode:", ' '.join(tgt_tokenizer.sequences_to_texts([tgt_tokens])[0].split()))

Câu không dấu     : theo danh gia cua gioi chuyen mon nhieu kha nang viet nam se co them it nhat hai chiec hc vang nua
Câu có dấu        : <start> theo đánh giá của giới chuyên môn nhiều khả năng việt nam sẽ có thêm ít nhất hai chiếc hc vàng nữa <end>
Token hóa (src)   : [ 92 146  32   3 211 104 328  67 280 135 140  27  50   2 309 381  89  74
 397 859 362 314   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
Token hóa (tgt)   : [   2   58  280   89    4  176  355  471   40  459  225  120   80   24
    6  297  391   73   85  423 1569  515  377    3    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]

Giải mã lại từ token:
src decode: theo danh gia cua gioi chuyen mo

In [7]:
# --- DATASET & LOADER ---
class TranslationDataset(Dataset):
    def __init__(self, src, tgt):
        self.src = torch.LongTensor(src)
        self.tgt = torch.LongTensor(tgt)
    def __len__(self):
        return len(self.src)
    def __getitem__(self, idx):
        return self.src[idx], self.tgt[idx]

BATCH_SIZE = 64
train_loader = DataLoader(TranslationDataset(train_src, train_tgt), batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(TranslationDataset(val_src, val_tgt), batch_size=BATCH_SIZE)

In [8]:
# --- SCRATCH LSTM MODULE ---
class LSTMScratch(nn.Module):
    def __init__(self, input_size, hidden_size, sigma=0.01):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        def init_weight(*shape):
            return nn.Parameter(torch.randn(*shape) * sigma)

        def init_gate():
            return (
                init_weight(input_size, hidden_size),
                init_weight(hidden_size, hidden_size),
                nn.Parameter(torch.zeros(hidden_size))
            )

        # Input gate
        self.W_xi, self.W_hi, self.b_i = init_gate()
        # Forget gate
        self.W_xf, self.W_hf, self.b_f = init_gate()
        # Output gate
        self.W_xo, self.W_ho, self.b_o = init_gate()
        # Candidate cell state
        self.W_xc, self.W_hc, self.b_c = init_gate()

    def forward(self, X, H, C):
        I = torch.sigmoid(X @ self.W_xi + H @ self.W_hi + self.b_i)
        F = torch.sigmoid(X @ self.W_xf + H @ self.W_hf + self.b_f)
        O = torch.sigmoid(X @ self.W_xo + H @ self.W_ho + self.b_o)
        C_tilde = torch.tanh(X @ self.W_xc + H @ self.W_hc + self.b_c)
        C_next = F * C + I * C_tilde
        H_next = O * torch.tanh(C_next)
        return H_next, C_next

# --- ATTENTION MODULE ---
class LuongAttention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.linear = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, decoder_hidden, encoder_outputs):
        # decoder_hidden: (batch, hidden)
        # encoder_outputs: (seq_len, batch, hidden)
        seq_len, batch_size, hidden = encoder_outputs.size()
        encoder_outputs = encoder_outputs.permute(1, 0, 2)  # (batch, seq_len, hidden)
        decoder_hidden = decoder_hidden.unsqueeze(1)        # (batch, 1, hidden)
        # Score = h_t^T W h_s
        energy = torch.bmm(decoder_hidden, self.linear(encoder_outputs).transpose(1, 2))  # (batch, 1, seq_len)
        attn_weights = torch.softmax(energy, dim=-1)  # (batch, 1, seq_len)
        context = torch.bmm(attn_weights, encoder_outputs)  # (batch, 1, hidden)
        return context.squeeze(1), attn_weights.squeeze(1)  # (batch, hidden), (batch, seq_len)

# --- ENCODER WITH LSTM ---
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = LSTMScratch(embed_size, hidden_size)

    def forward(self, x):
        embeddings = self.embedding(x)
        H = torch.zeros(x.shape[1], self.lstm.hidden_size, device=x.device)
        C = torch.zeros_like(H)
        outputs = []
        for emb in embeddings:
            H, C = self.lstm(emb, H, C)
            outputs.append(H.unsqueeze(0))
        return torch.cat(outputs, dim=0), (H, C)

# --- DECODER WITH LSTM AND ATTENTION ---
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attn = LuongAttention(hidden_size)
        self.lstm = LSTMScratch(embed_size + hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, x, state, encoder_outputs):
        H, C = state
        embeddings = self.embedding(x)
        outputs = []
        for emb in embeddings:
            context, _ = self.attn(H, encoder_outputs)
            lstm_input = torch.cat([emb, context], dim=1)
            H, C = self.lstm(lstm_input, H, C)
            out = self.fc(torch.cat([H, context], dim=1))
            outputs.append(out.unsqueeze(0))
        return torch.cat(outputs, dim=0), (H, C)

# --- SEQ2SEQ ---
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        src = src.transpose(0, 1)
        tgt = tgt.transpose(0, 1)
        enc_outputs, (hidden, cell) = self.encoder(src)
        outputs, _ = self.decoder(tgt, (hidden, cell), enc_outputs)
        return outputs.transpose(0, 1)


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import csv
import os

# --- MODEL INIT ---
EMBED_SIZE = 256
HIDDEN_SIZE = 512
encoder = Encoder(SRC_VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE).to(device)
decoder = Decoder(TGT_VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE).to(device)
model = Seq2Seq(encoder, decoder).to(device)

# --- LOAD BEST CHECKPOINT ---
model.load_state_dict(torch.load("/kaggle/input/pre-model/best_model.pt"))
model.train()  

# --- TRAINING SETUP ---
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
scaler = GradScaler()

# --- LOG & CALLBACK SETUP ---
log_history = []
best_val_loss = float('inf')  # Nếu bạn nhớ giá trị epoch 5 thì gán lại
patience = 5
counter = 0

# --- FILE LOG ---
log_file = "training_log.csv"
write_header = not os.path.exists(log_file)
if write_header:
    with open(log_file, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["epoch", "train_loss", "train_acc", "val_loss", "val_acc"])
        writer.writeheader()

# --- TRAIN LOOP ---
EPOCHS = 50
START_EPOCH = 1 

for epoch in range(START_EPOCH, EPOCHS):
    model.train()criterion
    total_loss = total_tokens = total_correct = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} Training", leave=False)
    for src, tgt in progress_bar:
        src, tgt = src.to(device), tgt.to(device)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        optimizer.zero_grad()
        with autocast():
            logits = model(src, tgt_input)  # (batch, tgt_len-1, vocab)
            logits = logits.reshape(-1, TGT_VOCAB_SIZE)
            tgt_flat = tgt_output.reshape(-1)
            loss = criterion(logits, tgt_flat)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        # metrics
        mask = tgt_flat != 0
        total_loss += loss.item() * mask.sum().item()
        total_tokens += mask.sum().item()
        preds = logits.argmax(dim=1)
        total_correct += (preds == tgt_flat).masked_select(mask).sum().item()

        batch_loss = loss.item()
        batch_acc = (preds == tgt_flat).masked_select(mask).float().mean().item()
        progress_bar.set_postfix({
            "Batch Loss": f"{batch_loss:.4f}",
            "Batch Acc": f"{batch_acc:.4f}"
        })

    scheduler.step()
    train_loss = total_loss / total_tokens
    train_acc = total_correct / total_tokens
    print(f"\n📘 Epoch {epoch+1:02d} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")

    # --- VALIDATION ---
    model.eval()
    val_loss = val_tokens = val_correct = 0

    val_bar = tqdm(val_loader, desc=f"Epoch {epoch+1} Validation", leave=False)
    with torch.no_grad():
        for src, tgt in val_bar:
            src, tgt = src.to(device), tgt.to(device)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            logits = model(src, tgt_input).reshape(-1, TGT_VOCAB_SIZE)
            tgt_flat = tgt_output.reshape(-1)
            loss = criterion(logits, tgt_flat)

            mask = tgt_flat != 0
            val_loss += loss.item() * mask.sum().item()
            val_tokens += mask.sum().item()
            preds = logits.argmax(dim=1)
            val_correct += (preds == tgt_flat).masked_select(mask).sum().item()

            val_batch_loss = loss.item()
            val_batch_acc = (preds == tgt_flat).masked_select(mask).float().mean().item()
            val_bar.set_postfix({
                "Val Loss": f"{val_batch_loss:.4f}",
                "Val Acc": f"{val_batch_acc:.4f}"
            })

    val_loss_avg = val_loss / val_tokens
    val_acc_avg = val_correct / val_tokens
    print(f"✅ [Val] Loss: {val_loss_avg:.4f} | Acc: {val_acc_avg:.4f}")

    # --- LOGGING ---
    log_record = {
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "train_acc": train_acc,
        "val_loss": val_loss_avg,
        "val_acc": val_acc_avg
    }
    log_history.append(log_record)

    # Ghi log từng epoch vào CSV ngay
    with open(log_file, "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["epoch", "train_loss", "train_acc", "val_loss", "val_acc"])
        writer.writerow(log_record)

    # --- CALLBACK: EARLY STOPPING & CHECKPOINT ---
    if val_loss_avg < best_val_loss:
        best_val_loss = val_loss_avg
        counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
        print("💾 Model improved. Checkpoint saved.")
    else:
        counter += 1
        print(f"⏳ No improvement. EarlyStop counter: {counter}/{patience}")
        if counter >= patience:
            print("🛑 Early stopping triggered.")
            break

SyntaxError: invalid syntax (1382961625.py, line 45)

In [None]:
# --- Khởi tạo model ---
EMBED_SIZE = 256
HIDDEN_SIZE = 512
encoder = Encoder(SRC_VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE).to(device)
decoder = Decoder(TGT_VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE).to(device)
model = Seq2Seq(encoder, decoder).to(device)

# --- Load checkpoint tốt nhất ---
model.load_state_dict(torch.load("/kaggle/input/version1/best_model.pt"))
model.eval() 

In [None]:
test_src = encode_and_pad(df_test['no_diacritics_clean'], src_tokenizer)
test_tgt = encode_and_pad(df_test['with_diacritics_clean'], tgt_tokenizer)

test_loader = DataLoader(TranslationDataset(test_src, test_tgt), batch_size=32)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # bỏ qua <pad> token
total_loss = 0.0
total_correct = 0
total_tokens = 0

with torch.no_grad():
    for src, tgt in tqdm(test_loader, desc="Evaluating on test set"):
        src, tgt = src.to(device), tgt.to(device)
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        logits = model(src, tgt_input)
        logits = logits.reshape(-1, logits.shape[-1])
        tgt_flat = tgt_output.reshape(-1)

        loss = criterion(logits, tgt_flat)
        total_loss += loss.item()

        preds = torch.argmax(logits, dim=1)
        mask = tgt_flat != 0
        correct = (preds == tgt_flat) & mask
        total_correct += correct.sum().item()
        total_tokens += mask.sum().item()

avg_loss = total_loss / len(test_loader)
accuracy = total_correct / total_tokens

print(f"Test Loss: {avg_loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


In [None]:
idx2word = {idx: word for word, idx in tgt_tokenizer.word_index.items()}
idx2word[0] = '<pad>'

def beam_search_decode(model, input_sentence, src_tokenizer, tgt_tokenizer, idx2word, beam_width=3, max_len=70):
    model.eval()

    # --- Clean and encode input ---
    cleaned_input = input_sentence.strip().lower()
    input_seq = pad_sequences(src_tokenizer.texts_to_sequences([cleaned_input]), maxlen=max_len, padding='post')
    input_tensor = torch.LongTensor(input_seq).to(device)

    start_token = tgt_tokenizer.word_index.get('<start>', 1)
    end_token = tgt_tokenizer.word_index.get('<end>', 2)

    sequences = [[start_token]]
    scores = [0.0]
    completed_sequences = []

    for _ in range(max_len):
        all_candidates = []
        for seq, score in zip(sequences, scores):
            if seq[-1] == end_token:
                completed_sequences.append((seq, score))
                continue

            tgt_tensor = torch.LongTensor([seq]).to(device)
            with torch.no_grad():
                output = model(input_tensor, tgt_tensor)
                logits = output[0, -1, :]
                log_probs = torch.log_softmax(logits, dim=-1)

            topk_log_probs, topk_indices = torch.topk(log_probs, beam_width)
            for j in range(beam_width):
                next_token = topk_indices[j].item()
                next_score = score + topk_log_probs[j].item()
                all_candidates.append((seq + [next_token], next_score))

        ordered = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)
        sequences = [seq for seq, _ in ordered[:beam_width]]
        scores = [score for _, score in ordered[:beam_width]]

        if all(seq[-1] == end_token for seq in sequences):
            break

    if completed_sequences:
        best_seq = max(completed_sequences, key=lambda tup: tup[1])[0]
    else:
        best_seq = sequences[0]

    decoded = []
    for token in best_seq[1:]:
        if token == end_token:
            break
        decoded.append(idx2word.get(token, '<unk>'))

    return ' '.join(decoded)

# --- Example test ---
test_sentences = [
    "toi yeu tieng viet",
    "chung ta se chien thang",
    "ha noi la thu do cua viet nam"
]

print("\nKết quả dự đoán:")
for sent in test_sentences:
    print("Input:", sent)
    print("Output:", beam_search_decode(model, sent, src_tokenizer, tgt_tokenizer, {v:k for k,v in tgt_tokenizer.word_index.items()}, beam_width=5))
    print()

In [None]:
!pip install sacrebleu

In [None]:
# --- Advanced BLEU Evaluation Function (Optimized and Precise) ---
import sacrebleu
from tqdm import tqdm
import torch

def evaluate_bleu(model, df, src_tokenizer, tgt_tokenizer, idx2word, decode_fn, max_len=70):
    model.eval()
    references = []
    hypotheses = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating BLEU"):
        input_sentence = row['no_diacritics_clean']
        reference = row['with_diacritics_clean']

        # Clean target (remove <start> and <end>)
        reference = reference.replace('<start>', '').replace('<end>', '').strip()

        # Decode prediction from model
        prediction = decode_fn(model, input_sentence, src_tokenizer, tgt_tokenizer, idx2word, max_len=max_len)
        prediction = prediction.strip()

        references.append([reference])
        hypotheses.append(prediction)

    # Compute BLEU
    bleu = sacrebleu.corpus_bleu(hypotheses, list(map(list, zip(*references))))
    print(f"\nFinal BLEU Score: {bleu.score:.2f}")
    return bleu.score

In [None]:
# score = evaluate_bleu(
#     model,
#     df_test.sample(n=5000, random_state=42).reset_index(drop=True),
#     src_tokenizer,
#     tgt_tokenizer,
#     idx2word,
#     decode_fn=beam_search_decode
# )