In [None]:
import os
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches
# synchronously, so a GPU error is reported at the call that caused it.
# It is set here, before torch is imported, so it is already in the environment
# when CUDA starts up.
# NOTE(review): this slows GPU execution — presumably a debugging leftover;
# confirm whether it should stay for real training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import re
import string
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# --- DEVICE SETUP ---
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# --- DATA CLEANING ---
def clean_text(text):
    """Normalise a sentence: lowercase it, drop ASCII punctuation, squeeze whitespace.

    Args:
        text: Input string.

    Returns:
        The lowercased string with every character of ``string.punctuation``
        removed, runs of whitespace collapsed to one space, and no leading or
        trailing whitespace.
    """
    lowered = text.lower()
    # Character class built from string.punctuation; escaped so ']' and '\' are literal.
    unpunctuated = re.sub(rf"[{re.escape(string.punctuation)}]", "", lowered)
    collapsed = re.sub(r"\s+", " ", unpunctuated)
    return collapsed.strip()

# --- LOAD & PREPROCESS ---
path = "/kaggle/input/vietnamese-diacritics-dataset/"

# Read each split and keep a reproducible 5% sample of it for speed.
df_train, df_val, df_test = (
    pd.read_csv(path + filename)
      .sample(frac=0.05, random_state=42)
      .reset_index(drop=True)
    for filename in ("ViDiacritics_train.csv", "ViDiacritics_val.csv", "ViDiacritics_test.csv")
)

# Add cleaned source/target columns; targets are wrapped in <start> ... <end>
# AFTER cleaning, because clean_text would strip the angle brackets.
for df in (df_train, df_val, df_test):
    source_clean = df['no_diacritics'].astype(str).map(clean_text)
    target_clean = df['with_diacritics'].astype(str).map(clean_text)
    df['no_diacritics_clean'] = source_clean
    df['with_diacritics_clean'] = target_clean.map(lambda x: '<start> ' + x + ' <end>')

In [None]:
# --- TOKENIZATION ---
# Fixed length every encoded sequence is padded (or truncated) to.
MAX_LEN = 70

# Characters the tokenizers strip; '<' and '>' are deliberately absent so the
# <start>/<end> markers survive as tokens.
filters = '"!#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'  # preserve <> for start/end

# Source side (text without diacritics). Vocabulary is fitted on training data only.
src_tokenizer = Tokenizer(oov_token='<unk>', filters=filters)
src_tokenizer.fit_on_texts(df_train['no_diacritics_clean'])
SRC_VOCAB_SIZE = len(src_tokenizer.word_index) + 1  # +1 for the padding id 0

# Target side (text with diacritics, wrapped in <start>/<end>).
tgt_tokenizer = Tokenizer(oov_token='<unk>', filters=filters)
tgt_tokenizer.fit_on_texts(df_train['with_diacritics_clean'])
TGT_VOCAB_SIZE = len(tgt_tokenizer.word_index) + 1  # +1 for the padding id 0

In [None]:
# encode & pad
def encode_and_pad(texts, tokenizer):
    """Turn texts into integer id sequences of length ``MAX_LEN``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.

    Returns:
        The result of ``pad_sequences`` with ``maxlen=MAX_LEN`` and padding
        appended at the end of each sequence (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=MAX_LEN,
        padding='post',
    )

# Encode the source (no diacritics) and target (with diacritics) columns of the
# training and validation frames with their respective tokenizers.
train_src, val_src = (
    encode_and_pad(frame['no_diacritics_clean'], src_tokenizer)
    for frame in (df_train, df_val)
)
train_tgt, val_tgt = (
    encode_and_pad(frame['with_diacritics_clean'], tgt_tokenizer)
    for frame in (df_train, df_val)
)

In [None]:
# --- DATASET & LOADER ---
class TranslationDataset(Dataset):
    """Pairs of encoded source/target sequences held as ``LongTensor``s."""

    def __init__(self, src, tgt):
        """Convert both id arrays to ``torch.LongTensor`` and store them."""
        self.src = torch.LongTensor(src)
        self.tgt = torch.LongTensor(tgt)

    def __len__(self):
        """Number of examples, taken from the source tensor."""
        return len(self.src)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair stored at ``idx``."""
        source_ids = self.src[idx]
        target_ids = self.tgt[idx]
        return source_ids, target_ids

BATCH_SIZE = 64

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_dataset = TranslationDataset(train_src, train_tgt)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TranslationDataset(val_src, val_tgt)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
# --- SCRATCH LSTM MODULE ---
class LSTMScratch(nn.Module):
    """A single LSTM cell built from explicit weight matrices.

    For each gate ``g`` in (input ``i``, forget ``f``, output ``o``,
    candidate ``c``) the module owns:

    * ``W_xg`` of shape ``(input_size, hidden_size)`` — input projection,
    * ``W_hg`` of shape ``(hidden_size, hidden_size)`` — recurrent projection,
    * ``b_g``  of shape ``(hidden_size,)`` — bias, initialised to zero.

    Weight matrices are drawn from a standard normal and scaled by ``sigma``.
    """

    def __init__(self, input_size, hidden_size, sigma=0.01):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Gates are registered in the order input, forget, output, candidate;
        # within a gate the order is W_x*, W_h*, b_*.
        for gate in ("i", "f", "o", "c"):
            setattr(self, f"W_x{gate}", nn.Parameter(torch.randn(input_size, hidden_size) * sigma))
            setattr(self, f"W_h{gate}", nn.Parameter(torch.randn(hidden_size, hidden_size) * sigma))
            setattr(self, f"b_{gate}", nn.Parameter(torch.zeros(hidden_size)))

    def forward(self, X, H, C):
        """Advance the cell by one time step.

        Args:
            X: Step input; its last dimension must match ``input_size``.
            H: Previous hidden state; last dimension ``hidden_size``.
            C: Previous cell state, same shape as ``H``.

        Returns:
            ``(H_next, C_next)``: the new hidden and cell states.
        """
        input_gate = torch.sigmoid(X @ self.W_xi + H @ self.W_hi + self.b_i)
        forget_gate = torch.sigmoid(X @ self.W_xf + H @ self.W_hf + self.b_f)
        output_gate = torch.sigmoid(X @ self.W_xo + H @ self.W_ho + self.b_o)
        candidate = torch.tanh(X @ self.W_xc + H @ self.W_hc + self.b_c)

        # Keep part of the old memory, add part of the new candidate.
        C_next = forget_gate * C + input_gate * candidate
        return output_gate * torch.tanh(C_next), C_next

# --- ATTENTION MODULE ---
class LuongAttention(nn.Module):
    """Multiplicative attention with a learned bilinear score.

    The score between the decoder state ``h_t`` and an encoder state ``h_s``
    is ``h_t^T W h_s``, where ``W`` is a bias-free linear layer.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.linear = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, decoder_hidden, encoder_outputs):
        """Attend over the encoder states.

        Args:
            decoder_hidden: Tensor of shape ``(batch, hidden)``.
            encoder_outputs: Tensor of shape ``(seq_len, batch, hidden)``.

        Returns:
            ``(context, attn_weights)`` with shapes ``(batch, hidden)`` and
            ``(batch, seq_len)``; the weights are a softmax over ``seq_len``.
        """
        memory = encoder_outputs.transpose(0, 1)            # (batch, seq_len, hidden)
        query = decoder_hidden.unsqueeze(1)                 # (batch, 1, hidden)
        projected = self.linear(memory)                     # W h_s for every source position
        scores = torch.bmm(query, projected.transpose(1, 2))  # (batch, 1, seq_len)
        attn_weights = torch.softmax(scores, dim=-1)
        context = torch.bmm(attn_weights, memory)           # (batch, 1, hidden)
        return context.squeeze(1), attn_weights.squeeze(1)

# --- ENCODER WITH LSTM ---
class Encoder(nn.Module):
    """Embeds source ids and runs them through an ``LSTMScratch`` cell step by step."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = LSTMScratch(embed_size, hidden_size)

    def forward(self, x):
        """Encode a time-major batch of token ids.

        Args:
            x: Integer tensor indexed as ``(seq_len, batch)``; the batch size
               is read from ``x.shape[1]``.

        Returns:
            ``(outputs, (H, C))`` where ``outputs`` stacks the hidden state of
            every step along dim 0 and ``(H, C)`` is the state after the last step.
        """
        embedded = self.embedding(x)
        batch_size = x.shape[1]
        # Recurrent state starts at zero, on the same device as the input.
        H = torch.zeros(batch_size, self.lstm.hidden_size, device=x.device)
        C = torch.zeros_like(H)

        hidden_per_step = []
        for step_embedding in embedded:
            H, C = self.lstm(step_embedding, H, C)
            hidden_per_step.append(H)
        return torch.stack(hidden_per_step, dim=0), (H, C)

# --- DECODER WITH LSTM AND ATTENTION ---
class Decoder(nn.Module):
    """LSTM decoder that attends over the encoder states at every step.

    At each step the attention context (computed from the previous hidden
    state) is concatenated with the token embedding as LSTM input, and again
    with the new hidden state before the vocabulary projection.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attn = LuongAttention(hidden_size)
        self.lstm = LSTMScratch(embed_size + hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, x, state, encoder_outputs):
        """Decode a time-major batch of target ids.

        Args:
            x: Integer tensor iterated along dim 0, one step per slice.
            state: ``(H, C)`` pair used as the initial recurrent state.
            encoder_outputs: Encoder states passed to the attention module.

        Returns:
            ``(logits, (H, C))``: per-step vocabulary logits stacked along
            dim 0, and the recurrent state after the last step.
        """
        hidden, cell = state
        step_logits = []
        for token_embedding in self.embedding(x):
            # The context is computed from the hidden state *before* this step.
            context, _ = self.attn(hidden, encoder_outputs)
            hidden, cell = self.lstm(torch.cat((token_embedding, context), dim=1), hidden, cell)
            step_logits.append(self.fc(torch.cat((hidden, context), dim=1)))
        return torch.stack(step_logits, dim=0), (hidden, cell)

# --- SEQ2SEQ ---
class Seq2Seq(nn.Module):
    """Wires an encoder and a decoder together with a batch-first interface."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        """Run teacher-forced decoding.

        Args:
            src: Source ids, batch-first.
            tgt: Decoder input ids, batch-first.

        Returns:
            The decoder outputs with dims 0 and 1 swapped back to batch-first.
        """
        # The encoder and decoder iterate over dim 0, so go time-major first.
        src_time_major = src.transpose(0, 1)
        tgt_time_major = tgt.transpose(0, 1)

        encoded = self.encoder(src_time_major)
        enc_outputs, (hidden, cell) = encoded
        decoded, _ = self.decoder(tgt_time_major, (hidden, cell), enc_outputs)
        return decoded.transpose(0, 1)


In [None]:
from tqdm import tqdm

# --- MODEL INIT ---
EMBED_SIZE = 256
HIDDEN_SIZE = 512
encoder = Encoder(SRC_VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE).to(device)
decoder = Decoder(TGT_VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE).to(device)
model = Seq2Seq(encoder, decoder).to(device)

# --- TRAINING SETUP ---
EPOCHS = 5
# torch.cuda.amp only does anything on a GPU; disable it explicitly on CPU so
# autocast/GradScaler become clean no-ops instead of emitting warnings.
USE_AMP = device.type == "cuda"

criterion = nn.CrossEntropyLoss(ignore_index=0)  # id 0 is padding
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# One cosine cycle spanning the whole run (scheduler is stepped once per epoch).
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
scaler = GradScaler(enabled=USE_AMP)

# --- TRAIN LOOP ---
for epoch in range(EPOCHS):
    model.train()

    total_loss = total_tokens = total_correct = 0
    for src, tgt in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        src, tgt = src.to(device), tgt.to(device)
        # Teacher forcing: the decoder sees tokens [0, n-1) and predicts [1, n).
        tgt_input = tgt[:, :-1]
        tgt_output = tgt[:, 1:]

        optimizer.zero_grad()
        with autocast(enabled=USE_AMP):
            logits = model(src, tgt_input)  # (batch, tgt_len-1, vocab)
            logits = logits.reshape(-1, TGT_VOCAB_SIZE)
            tgt_flat = tgt_output.reshape(-1)
            loss = criterion(logits, tgt_flat)
        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale here; unscale them
        # first so the clipping threshold applies to the true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        # metrics (padding positions are excluded)
        mask = tgt_flat != 0
        n_tokens = mask.sum().item()
        total_loss += loss.item() * n_tokens
        total_tokens += n_tokens
        preds = logits.argmax(dim=1)
        total_correct += (preds == tgt_flat).masked_select(mask).sum().item()

    scheduler.step()
    train_denom = max(total_tokens, 1)  # avoid ZeroDivisionError on an empty loader
    print(f"Epoch {epoch+1} | Loss: {total_loss/train_denom:.4f} | Acc: {total_correct/train_denom:.4f}")

    # validation
    model.eval()
    val_loss = val_tokens = val_correct = 0
    with torch.no_grad():
        for src, tgt in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
            src, tgt = src.to(device), tgt.to(device)
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            logits = model(src, tgt_input).reshape(-1, TGT_VOCAB_SIZE)
            tgt_flat = tgt_output.reshape(-1)
            loss = criterion(logits, tgt_flat)
            mask = tgt_flat != 0
            n_tokens = mask.sum().item()
            val_loss += loss.item() * n_tokens
            val_tokens += n_tokens
            preds = logits.argmax(dim=1)
            val_correct += (preds == tgt_flat).masked_select(mask).sum().item()
    val_denom = max(val_tokens, 1)  # avoid ZeroDivisionError on an empty loader
    print(f"[Val] Loss: {val_loss/val_denom:.4f} | Acc: {val_correct/val_denom:.4f}\n")

In [None]:
# --- SAVE MODEL & TOKENIZERS ---
import pickle

# Model weights only (state_dict); the architecture must be rebuilt before loading.
torch.save(model.state_dict(), "lstm_seq2seq.pt")

# Both tokenizers are pickled so inference can reuse the exact vocabularies.
for filename, tokenizer in (
    ("src_tokenizer.pkl", src_tokenizer),
    ("tgt_tokenizer.pkl", tgt_tokenizer),
):
    with open(filename, "wb") as f:
        pickle.dump(tokenizer, f)

print("Training complete, model and tokenizers saved.")

In [None]:
# --- GREEDY DECODE (LSTM-Compatible with Attention) ---
# Inverse of the target vocabulary (id -> word); id 0 is mapped to '<pad>'.
idx2word = {
    **{idx: word for word, idx in tgt_tokenizer.word_index.items()},
    0: '<pad>',
}

def greedy_decode(model, sentence, src_tokenizer, tgt_tokenizer, idx2word, max_len=70):
    """Restore diacritics for one sentence by picking the arg-max token each step.

    Args:
        model: Seq2Seq model exposing ``encoder`` and ``decoder``.
        sentence: Raw input sentence; it is passed through ``clean_text``.
        src_tokenizer: Tokenizer used to encode the source sentence.
        tgt_tokenizer: Tokenizer whose ``word_index`` contains ``<start>``/``<end>``.
        idx2word: Mapping from target id to word; unknown ids become ``'<unk>'``.
        max_len: Maximum number of decoding steps.

    Returns:
        The predicted words joined by single spaces, without ``<start>``/``<end>``.
    """
    model.eval()

    # Encode the source as a single-example, time-major batch.
    encoded = encode_and_pad([clean_text(sentence)], src_tokenizer)
    src_tensor = torch.LongTensor(encoded).to(device).transpose(0, 1)

    vocab = tgt_tokenizer.word_index
    start_id, end_id = vocab['<start>'], vocab['<end>']

    words = []
    prev_id = start_id
    with torch.no_grad():
        encoder_outputs, (hidden, cell) = model.encoder(src_tensor)
        for _ in range(max_len):
            # Feed only the previous token; the recurrent state carries the history.
            step_input = torch.LongTensor([[prev_id]]).to(device)  # (1, 1)
            logits, (hidden, cell) = model.decoder(step_input, (hidden, cell), encoder_outputs)
            prev_id = logits[-1, 0].argmax().item()
            if prev_id == end_id:
                break
            words.append(idx2word.get(prev_id, '<unk>'))

    return ' '.join(words)

# --- TEST ---
# Quick smoke test: three undiacritised sentences through greedy decoding.
test_sentences = [
    "toi yeu tieng viet",
    "chung ta se chien thang",
    "ha noi la thu do cua viet nam",
]
for sent in test_sentences:
    print(f"Input: {sent}")
    restored = greedy_decode(model, sent, src_tokenizer, tgt_tokenizer, idx2word)
    print(f"Output: {restored}")

In [None]:
def beam_search_decode(model, input_sentence, src_tokenizer, tgt_tokenizer, idx2word, beam_width=3, max_len=70):
    """Restore diacritics for one sentence with beam search.

    The source is encoded once. Each live hypothesis carries its own decoder
    state, so extending it costs one decoder step rather than re-running the
    whole model on the full prefix.

    Args:
        model: Seq2Seq model exposing ``encoder`` and ``decoder``.
        input_sentence: Raw input sentence; normalised with ``clean_text`` so
            preprocessing matches ``greedy_decode``.
        src_tokenizer: Tokenizer used to encode the source sentence.
        tgt_tokenizer: Tokenizer providing the ``<start>``/``<end>`` ids
            (falling back to 1 and 2 when absent).
        idx2word: Mapping from target id to word; unknown ids become ``'<unk>'``.
        beam_width: Number of hypotheses kept after each step (must be >= 1).
        max_len: Source padding length and maximum number of decoding steps.

    Returns:
        The words of the best hypothesis joined by spaces, without
        ``<start>``/``<end>``. A finished hypothesis (one that produced
        ``<end>``) is preferred; among those the highest summed
        log-probability wins. If none finished, the best live hypothesis is used.

    Raises:
        ValueError: If ``beam_width`` is smaller than 1.
    """
    if beam_width < 1:
        raise ValueError("beam_width must be >= 1")

    model.eval()

    # --- Clean and encode input (time-major, batch of one) ---
    cleaned_input = clean_text(input_sentence)
    input_seq = pad_sequences(src_tokenizer.texts_to_sequences([cleaned_input]), maxlen=max_len, padding='post')
    src_tensor = torch.LongTensor(input_seq).to(device).transpose(0, 1)

    start_token = tgt_tokenizer.word_index.get('<start>', 1)
    end_token = tgt_tokenizer.word_index.get('<end>', 2)

    completed_sequences = []  # (token ids, score) for hypotheses that emitted <end>
    with torch.no_grad():
        encoder_outputs, init_state = model.encoder(src_tensor)
        # Each live beam: (token ids so far, summed log-prob, decoder (H, C) state).
        beams = [([start_token], 0.0, init_state)]

        for _ in range(max_len):
            candidates = []
            for seq, score, state in beams:
                step_input = torch.LongTensor([[seq[-1]]]).to(device)  # (1, 1)
                logits, next_state = model.decoder(step_input, state, encoder_outputs)
                log_probs = torch.log_softmax(logits[-1, 0], dim=-1)

                # topk cannot return more entries than the vocabulary holds.
                k = min(beam_width, log_probs.size(-1))
                topk_log_probs, topk_indices = torch.topk(log_probs, k)
                for log_prob, token in zip(topk_log_probs.tolist(), topk_indices.tolist()):
                    candidates.append((seq + [token], score + log_prob, next_state))

            candidates.sort(key=lambda cand: cand[1], reverse=True)

            # Record finished hypotheses immediately so none is lost when the
            # loop stops; only unfinished ones are expanded further.
            beams = []
            for cand in candidates[:beam_width]:
                if cand[0][-1] == end_token:
                    completed_sequences.append((cand[0], cand[1]))
                else:
                    beams.append(cand)

            if not beams:
                break

    if completed_sequences:
        best_seq = max(completed_sequences, key=lambda tup: tup[1])[0]
    else:
        best_seq = beams[0][0]  # beams are sorted by score, best first

    decoded = []
    for token in best_seq[1:]:  # skip <start>
        if token == end_token:
            break
        decoded.append(idx2word.get(token, '<unk>'))

    return ' '.join(decoded)

# --- Example test ---
test_sentences = [
    "toi yeu tieng viet",
    "chung ta se chien thang",
    "ha noi la thu do cua viet nam"
]

print("\nKết quả dự đoán:")
for sent in test_sentences:
    print("Input:", sent)
    # Reuse the shared idx2word mapping (built from tgt_tokenizer.word_index in
    # the greedy-decode cell) instead of rebuilding an inverse dict per sentence.
    print("Output:", beam_search_decode(model, sent, src_tokenizer, tgt_tokenizer, idx2word, beam_width=5))
    print()

In [None]:
!pip install sacrebleu

In [None]:
# --- Advanced BLEU Evaluation Function (Optimized and Precise) ---
import sacrebleu
from tqdm import tqdm
import torch

def evaluate_bleu(model, df, src_tokenizer, tgt_tokenizer, idx2word, decode_fn, max_len=70):
    """Decode every row of ``df`` and report corpus-level BLEU.

    Args:
        model: Model handed to ``decode_fn``; switched to eval mode first.
        df: DataFrame with ``no_diacritics_clean`` (input) and
            ``with_diacritics_clean`` (reference) columns.
        src_tokenizer, tgt_tokenizer, idx2word: Forwarded to ``decode_fn``.
        decode_fn: Callable ``(model, sentence, src_tokenizer, tgt_tokenizer,
            idx2word, max_len=...) -> str``.
        max_len: Forwarded to ``decode_fn``.

    Returns:
        The BLEU score (``.score`` of the sacrebleu result); it is also printed.
    """
    model.eval()
    gold_sentences = []
    predicted_sentences = []

    rows = tqdm(df.iterrows(), total=len(df), desc="Evaluating BLEU")
    for _, row in rows:
        source_text = row['no_diacritics_clean']
        # References are stored with <start>/<end>; strip them before scoring.
        gold = row['with_diacritics_clean'].replace('<start>', '').replace('<end>', '').strip()

        predicted = decode_fn(model, source_text, src_tokenizer, tgt_tokenizer, idx2word, max_len=max_len)

        gold_sentences.append(gold)
        predicted_sentences.append(predicted.strip())

    # sacrebleu takes a list of reference streams; there is one reference per sentence.
    reference_streams = [gold_sentences] if gold_sentences else []
    bleu = sacrebleu.corpus_bleu(predicted_sentences, reference_streams)
    print(f"\nFinal BLEU Score: {bleu.score:.2f}")
    return bleu.score

In [None]:
# Cap the sample at the number of available rows (sample() raises when asked for
# more rows than exist) and fix the seed so every run scores the same subset.
score = evaluate_bleu(model, df_test.sample(n=min(10000, len(df_test)), random_state=42), src_tokenizer, tgt_tokenizer, idx2word, decode_fn=greedy_decode)

In [None]:
# Cap the sample at the number of available rows (sample() raises when asked for
# more rows than exist) and fix the seed so every run scores the same subset.
score = evaluate_bleu(model, df_test.sample(n=min(10000, len(df_test)), random_state=42), src_tokenizer, tgt_tokenizer, idx2word, decode_fn=beam_search_decode)

In [None]:
# --- Advanced ChrF++ Evaluation Function ---
import sacrebleu
from tqdm import tqdm

def evaluate_chrf(model, df, src_tokenizer, tgt_tokenizer, idx2word, decode_fn, max_len=70):
    """Decode every row of ``df`` and report corpus-level chrF++.

    Args:
        model: Model handed to ``decode_fn``; switched to eval mode first.
        df: DataFrame with ``no_diacritics_clean`` (input) and
            ``with_diacritics_clean`` (reference) columns.
        src_tokenizer, tgt_tokenizer, idx2word: Forwarded to ``decode_fn``.
        decode_fn: Callable ``(model, sentence, src_tokenizer, tgt_tokenizer,
            idx2word, max_len=...) -> str``.
        max_len: Forwarded to ``decode_fn``.

    Returns:
        The chrF++ score (``.score`` of the sacrebleu result); it is also printed.
    """
    model.eval()
    references = []
    hypotheses = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating ChrF++"):
        input_sentence = row['no_diacritics_clean']
        # References are stored with <start>/<end>; strip them before scoring.
        reference = row['with_diacritics_clean'].replace('<start>', '').replace('<end>', '').strip()

        # Decode prediction using the provided decode_fn
        prediction = decode_fn(model, input_sentence, src_tokenizer, tgt_tokenizer, idx2word, max_len=max_len)
        prediction = prediction.strip()

        references.append([reference])  # list of references for each sentence
        hypotheses.append(prediction)   # single prediction per sentence

    # Transpose per-sentence reference lists into sacrebleu's "reference streams".
    reference_streams = list(map(list, zip(*references)))
    # word_order=2 adds word uni/bi-grams, which is what makes the metric chrF++;
    # the default (word_order=0) computes plain chrF.
    chrf = sacrebleu.corpus_chrf(hypotheses, reference_streams, word_order=2)
    print(f"\nFinal ChrF++ Score: {chrf.score:.2f}")
    return chrf.score


In [None]:
# Cap the sample at the number of available rows (sample() raises when asked for
# more rows than exist) and fix the seed so every run scores the same subset.
chrf_score = evaluate_chrf(model, df_test.sample(n=min(10000, len(df_test)), random_state=42), src_tokenizer, tgt_tokenizer, idx2word, decode_fn=greedy_decode)

In [None]:
# Cap the sample at the number of available rows (sample() raises when asked for
# more rows than exist) and fix the seed so every run scores the same subset.
chrf_score = evaluate_chrf(model, df_test.sample(n=min(10000, len(df_test)), random_state=42), src_tokenizer, tgt_tokenizer, idx2word, decode_fn=beam_search_decode)