In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import re
import string
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

2025-06-28 10:40:43.178786: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751107243.584917      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751107243.691809      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# --- DEVICE SETUP ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# --- LOAD DATA ---
path = "/kaggle/input/vietnamese-diacritics-dataset/"
df_train = pd.read_csv(path + "ViDiacritics_train.csv")
df_val   = pd.read_csv(path + "ViDiacritics_val.csv")
df_test  = pd.read_csv(path + "ViDiacritics_test.csv")

df_train = df_train.sample(frac=0.1, random_state=42).reset_index(drop=True)
df_val   = df_val.sample(frac=0.1, random_state=42).reset_index(drop=True)
df_test  = df_test.sample(frac=0.1, random_state=42).reset_index(drop=True)

In [4]:
# --- CLEAN TEXT ---
def clean_text(text):
    text = text.lower()
    text = re.sub(rf"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def apply_cleaning(df):
    df['no_diacritics_clean']   = df['no_diacritics'].astype(str).apply(clean_text)
    df['with_diacritics_clean'] = df['with_diacritics'].astype(str).apply(clean_text)
    df['with_diacritics_clean'] = df['with_diacritics_clean'].apply(lambda x: '<start> ' + x + ' <end>')
    return df

df_train = apply_cleaning(df_train)
df_val   = apply_cleaning(df_val)
df_test  = apply_cleaning(df_test)

In [5]:
# --- TOKENIZATION ---
tokenizer_filters = '"!#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'  # giữ <>
src_tokenizer = Tokenizer(oov_token='<unk>', filters=tokenizer_filters)
tgt_tokenizer = Tokenizer(oov_token='<unk>', filters=tokenizer_filters)
src_tokenizer.fit_on_texts(df_train['no_diacritics_clean'])
tgt_tokenizer.fit_on_texts(df_train['with_diacritics_clean'])

SRC_VOCAB_SIZE = len(src_tokenizer.word_index) + 1
TGT_VOCAB_SIZE = len(tgt_tokenizer.word_index) + 1
MAX_LEN = 70

In [6]:
# --- ENCODING ---
def encode_and_pad(texts, tokenizer):
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, maxlen=MAX_LEN, padding='post')

train_src = encode_and_pad(df_train['no_diacritics_clean'], src_tokenizer)
train_tgt = encode_and_pad(df_train['with_diacritics_clean'], tgt_tokenizer)
val_src   = encode_and_pad(df_val['no_diacritics_clean'], src_tokenizer)
val_tgt   = encode_and_pad(df_val['with_diacritics_clean'], tgt_tokenizer)

In [7]:
# --- DATASET ---
class TranslationDataset(Dataset):
    def __init__(self, src, tgt):
        self.src = torch.LongTensor(src)
        self.tgt = torch.LongTensor(tgt)
    def __len__(self): return len(self.src)
    def __getitem__(self, idx): return self.src[idx], self.tgt[idx]

BATCH_SIZE = 64
train_loader = DataLoader(TranslationDataset(train_src, train_tgt), batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(TranslationDataset(val_src, val_tgt), batch_size=BATCH_SIZE, shuffle=False)

In [8]:
# --- TRANSFORMER MODULES ---
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product(self, Q, K, V, mask=None):
        Q, K, V = Q.float(), K.float(), V.float()
        scores = Q.matmul(K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
        attn = torch.softmax(scores, dim=-1)
        return attn.matmul(V)

    def split_heads(self, x):
        b, seq_len, _ = x.size()
        return x.view(b, seq_len, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        b, h, seq_len, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(b, seq_len, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        out = self.scaled_dot_product(Q, K, V, mask)
        out = self.combine_heads(out)
        return self.W_o(out)

class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads)
        self.ff   = PositionWiseFeedForward(d_model, d_ff)
        self.norm1= nn.LayerNorm(d_model)
        self.norm2= nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        x = self.norm1(x + self.dropout(self.attn(x,x,x,mask)))
        x = self.norm2(x + self.dropout(self.ff(x)))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn  = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ff         = PositionWiseFeedForward(d_model, d_ff)
        self.norm1      = nn.LayerNorm(d_model)
        self.norm2      = nn.LayerNorm(d_model)
        self.norm3      = nn.LayerNorm(d_model)
        self.dropout    = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, tgt_mask):
        x = self.norm1(x + self.dropout(self.self_attn(x,x,x,tgt_mask)))
        x = self.norm2(x + self.dropout(self.cross_attn(x,enc_out,enc_out,src_mask)))
        x = self.norm3(x + self.dropout(self.ff(x)))
        return x

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, num_heads=8, num_layers=2, d_ff=512, max_len=70, dropout=0.1):
        super().__init__()
        self.enc_emb = nn.Embedding(src_vocab_size, d_model)
        self.dec_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len)
        self.enc_layers = nn.ModuleList([EncoderLayer(d_model,num_heads,d_ff,dropout) for _ in range(num_layers)])
        self.dec_layers = nn.ModuleList([DecoderLayer(d_model,num_heads,d_ff,dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(2)
        seq_len = tgt.size(1)
        nopeak = torch.tril(torch.ones(1,1,seq_len,seq_len, device=tgt.device)).bool()
        tgt_mask = tgt_mask & nopeak
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        x = self.dropout(self.pos_enc(self.enc_emb(src)))
        y = self.dropout(self.pos_enc(self.dec_emb(tgt)))
        for layer in self.enc_layers:
            x = layer(x, src_mask)
        for layer in self.dec_layers:
            y = layer(y, x, src_mask, tgt_mask)
        return self.fc_out(y)

In [9]:
# --- MODEL & TRAINING ---
from tqdm import tqdm

model     = Transformer(SRC_VOCAB_SIZE, TGT_VOCAB_SIZE).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
scaler    = GradScaler()

EPOCHS = 5
for epoch in range(EPOCHS):
    model.train()
    total_loss, total_tokens, total_correct = 0, 0, 0
    for src, tgt in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        src, tgt = src.to(device), tgt.to(device)
        tgt_input, tgt_output = tgt[:, :-1], tgt[:, 1:]

        optimizer.zero_grad()
        with autocast():
            logits = model(src, tgt_input).view(-1, TGT_VOCAB_SIZE)
            tgt_flat = tgt_output.reshape(-1)
            loss = criterion(logits, tgt_flat)
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        mask = tgt_flat != 0
        total_loss   += loss.item() * mask.sum().item()
        total_tokens += mask.sum().item()
        preds = logits.argmax(dim=1)
        total_correct+= (preds == tgt_flat).masked_select(mask).sum().item()

    scheduler.step()
    train_loss = total_loss / total_tokens
    train_acc  = total_correct / total_tokens
    print(f"Epoch {epoch+1} | Loss: {train_loss:.4f} | Acc: {train_acc:.4f}")

    # Validation
    model.eval()
    val_loss, val_tokens, val_correct = 0, 0, 0
    with torch.no_grad():
        for src, tgt in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
            src, tgt = src.to(device), tgt.to(device)
            tgt_input, tgt_output = tgt[:, :-1], tgt[:, 1:]
            logits = model(src, tgt_input).view(-1, TGT_VOCAB_SIZE)
            tgt_flat = tgt_output.reshape(-1)
            loss = criterion(logits, tgt_flat)
            mask = tgt_flat != 0
            val_loss   += loss.item() * mask.sum().item()
            val_tokens += mask.sum().item()
            preds = logits.argmax(dim=1)
            val_correct+= (preds == tgt_flat).masked_select(mask).sum().item()
    print(f"[Val] Loss: {val_loss/val_tokens:.4f} | Acc: {val_correct/val_tokens:.4f}\n")

  scaler    = GradScaler()
  with autocast():
Epoch 1 Training: 100%|██████████| 15688/15688 [46:09<00:00,  5.66it/s]


Epoch 1 | Loss: 1.1965 | Acc: 0.7878


Epoch 1 Validation: 100%|██████████| 1961/1961 [04:01<00:00,  8.12it/s]


[Val] Loss: 0.5842 | Acc: 0.8869



Epoch 2 Training: 100%|██████████| 15688/15688 [46:16<00:00,  5.65it/s]


Epoch 2 | Loss: 0.6247 | Acc: 0.8731


Epoch 2 Validation: 100%|██████████| 1961/1961 [04:02<00:00,  8.09it/s]


[Val] Loss: 0.4819 | Acc: 0.9063



Epoch 3 Training: 100%|██████████| 15688/15688 [46:04<00:00,  5.67it/s]


Epoch 3 | Loss: 0.5220 | Acc: 0.8925


Epoch 3 Validation: 100%|██████████| 1961/1961 [04:01<00:00,  8.11it/s]


[Val] Loss: 0.4305 | Acc: 0.9161



Epoch 4 Training: 100%|██████████| 15688/15688 [46:04<00:00,  5.68it/s]


Epoch 4 | Loss: 0.4614 | Acc: 0.9044


Epoch 4 Validation: 100%|██████████| 1961/1961 [04:01<00:00,  8.11it/s]


[Val] Loss: 0.3919 | Acc: 0.9239



Epoch 5 Training: 100%|██████████| 15688/15688 [46:15<00:00,  5.65it/s]


Epoch 5 | Loss: 0.4276 | Acc: 0.9111


Epoch 5 Validation: 100%|██████████| 1961/1961 [04:01<00:00,  8.11it/s]

[Val] Loss: 0.3758 | Acc: 0.9272






In [10]:
import os
import zipfile
import torch

# Tên file lưu model
model_filename = "transformer_model.pt"
torch.save(model.state_dict(), model_filename)
print(f"Model saved to {model_filename}")

# Lưu tokenizer (dùng pickle hoặc joblib)
import pickle
with open("src_tokenizer.pkl", "wb") as f:
    pickle.dump(src_tokenizer, f)
with open("tgt_tokenizer.pkl", "wb") as f:
    pickle.dump(tgt_tokenizer, f)

# Tạo danh sách file cần zip
files_to_zip = [
    "transformer_model.pt",
    "src_tokenizer.pkl",
    "tgt_tokenizer.pkl"
]

# Nén thành file zip
zip_filename = "transformer_model_package.zip"
with zipfile.ZipFile(zip_filename, "w") as zipf:
    for file in files_to_zip:
        if os.path.exists(file):
            zipf.write(file)

print(f"Model package zipped to {zip_filename}")

Model saved to transformer_model.pt
Model package zipped to transformer_model_package.zip


In [11]:
# --- UTILITY FUNCTIONS ---
idx2word = {idx:word for word,idx in tgt_tokenizer.word_index.items()}
idx2word[0] = '<pad>'

def greedy_decode(model, input_sentence, src_tokenizer, tgt_tokenizer, idx2word, max_len=70):
    model.eval()
    cleaned = clean_text(input_sentence)
    seq = pad_sequences(src_tokenizer.texts_to_sequences([cleaned]), maxlen=max_len, padding='post')
    src_tensor = torch.LongTensor(seq).to(device)

    start_token = tgt_tokenizer.word_index['<start>']
    end_token   = tgt_tokenizer.word_index['<end>']
    tgt_tokens  = [start_token]
    decoded     = []

    for _ in range(max_len):
        tgt_tensor = torch.LongTensor([tgt_tokens]).to(device)
        with torch.no_grad():
            out = model(src_tensor, tgt_tensor)
            logits = out[0, -1, :]
            next_token = torch.argmax(logits).item()

        if next_token == end_token:
            break
        decoded.append(idx2word.get(next_token, '<unk>'))
        tgt_tokens.append(next_token)

    return ' '.join(decoded)

# --- TEST PREDICTIONS ---
test_sentences = [
    "toi yeu tieng viet",
    "chung ta se chien thang",
    "ha noi la thu do cua viet nam"
]
print("Kết quả dự đoán:")
for sent in test_sentences:
    print(f"Input: {sent}\nOutput: {greedy_decode(model, sent, src_tokenizer, tgt_tokenizer, idx2word)}\n")

Kết quả dự đoán:
Input: toi yeu tieng viet
Output: tôi yêu tiếng việt

Input: chung ta se chien thang
Output: chúng ta sẽ chiến thắng

Input: ha noi la thu do cua viet nam
Output: hà nội là thủ đô của việt nam



In [12]:
def beam_search_decode(model, input_sentence, src_tokenizer, tgt_tokenizer, idx2word, beam_width=3, max_len=70):
    model.eval()

    # --- Clean and encode input ---
    cleaned_input = input_sentence.strip().lower()
    input_seq = pad_sequences(src_tokenizer.texts_to_sequences([cleaned_input]), maxlen=max_len, padding='post')
    input_tensor = torch.LongTensor(input_seq).to(device)

    start_token = tgt_tokenizer.word_index.get('<start>', 1)
    end_token = tgt_tokenizer.word_index.get('<end>', 2)

    sequences = [[start_token]]
    scores = [0.0]
    completed_sequences = []

    for _ in range(max_len):
        all_candidates = []
        for seq, score in zip(sequences, scores):
            if seq[-1] == end_token:
                completed_sequences.append((seq, score))
                continue

            tgt_tensor = torch.LongTensor([seq]).to(device)
            with torch.no_grad():
                output = model(input_tensor, tgt_tensor)
                logits = output[0, -1, :]
                log_probs = torch.log_softmax(logits, dim=-1)

            topk_log_probs, topk_indices = torch.topk(log_probs, beam_width)
            for j in range(beam_width):
                next_token = topk_indices[j].item()
                next_score = score + topk_log_probs[j].item()
                all_candidates.append((seq + [next_token], next_score))

        ordered = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)
        sequences = [seq for seq, _ in ordered[:beam_width]]
        scores = [score for _, score in ordered[:beam_width]]

        if all(seq[-1] == end_token for seq in sequences):
            break

    if completed_sequences:
        best_seq = max(completed_sequences, key=lambda tup: tup[1])[0]
    else:
        best_seq = sequences[0]

    decoded = []
    for token in best_seq[1:]:
        if token == end_token:
            break
        decoded.append(idx2word.get(token, '<unk>'))

    return ' '.join(decoded)

# --- Example test ---
test_sentences = [
    "toi yeu tieng viet",
    "chung ta se chien thang",
    "ha noi la thu do cua viet nam"
]

print("\nKết quả dự đoán:")
for sent in test_sentences:
    print("Input:", sent)
    print("Output:", beam_search_decode(model, sent, src_tokenizer, tgt_tokenizer, {v:k for k,v in tgt_tokenizer.word_index.items()}, beam_width=5))
    print()


Kết quả dự đoán:
Input: toi yeu tieng viet
Output: tôi yêu tiếng việt

Input: chung ta se chien thang
Output: chúng ta sẽ chiến thắng

Input: ha noi la thu do cua viet nam
Output: hà nội là thủ đô của việt nam



In [13]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.2.0 sacrebleu-2.5.1


In [14]:
# --- Advanced BLEU Evaluation Function (Optimized and Precise) ---
import sacrebleu
from tqdm import tqdm
import torch

def evaluate_bleu(model, df, src_tokenizer, tgt_tokenizer, idx2word, decode_fn, max_len=70):
    model.eval()
    references = []
    hypotheses = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating BLEU"):
        input_sentence = row['no_diacritics_clean']
        reference = row['with_diacritics_clean']

        # Clean target (remove <start> and <end>)
        reference = reference.replace('<start>', '').replace('<end>', '').strip()

        # Decode prediction from model
        prediction = decode_fn(model, input_sentence, src_tokenizer, tgt_tokenizer, idx2word, max_len=max_len)
        prediction = prediction.strip()

        references.append([reference])
        hypotheses.append(prediction)

    # Compute BLEU
    bleu = sacrebleu.corpus_bleu(hypotheses, list(map(list, zip(*references))))
    print(f"\nFinal BLEU Score: {bleu.score:.2f}")
    return bleu.score

In [15]:
score = evaluate_bleu(model, df_test.sample(10000), src_tokenizer, tgt_tokenizer, idx2word, decode_fn=greedy_decode)

Evaluating BLEU: 100%|██████████| 10000/10000 [20:43<00:00,  8.04it/s]



Final BLEU Score: 80.77


In [16]:
score = evaluate_bleu(model, df_test.sample(10000), src_tokenizer, tgt_tokenizer, idx2word, decode_fn=beam_search_decode)

Evaluating BLEU: 100%|██████████| 10000/10000 [1:06:06<00:00,  2.52it/s]



Final BLEU Score: 81.31


In [17]:
# --- Advanced ChrF++ Evaluation Function ---
import sacrebleu
from tqdm import tqdm

def evaluate_chrf(model, df, src_tokenizer, tgt_tokenizer, idx2word, decode_fn, max_len=70):
    model.eval()
    references = []
    hypotheses = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating ChrF++"):
        input_sentence = row['no_diacritics_clean']
        reference = row['with_diacritics_clean'].replace('<start>', '').replace('<end>', '').strip()

        # Decode prediction using the provided decode_fn
        prediction = decode_fn(model, input_sentence, src_tokenizer, tgt_tokenizer, idx2word, max_len=max_len)
        prediction = prediction.strip()  # already stripped from <start>/<end> if decode_fn is correct

        # Append for evaluation
        references.append([reference])  # list of references for each sentence
        hypotheses.append(prediction)   # single prediction per sentence

    # Compute ChrF++ score
    chrf = sacrebleu.corpus_chrf(hypotheses, list(map(list, zip(*references))))
    print(f"\nFinal ChrF++ Score: {chrf.score:.2f}")
    return chrf.score

In [18]:
chrf_score = evaluate_chrf(model, df_test.sample(10000), src_tokenizer, tgt_tokenizer, idx2word, decode_fn=greedy_decode)

Evaluating ChrF++: 100%|██████████| 10000/10000 [21:12<00:00,  7.86it/s]



Final ChrF++ Score: 88.41


In [19]:
chrf_score = evaluate_chrf(model, df_test.sample(10000), src_tokenizer, tgt_tokenizer, idx2word, decode_fn=beam_search_decode)

Evaluating ChrF++: 100%|██████████| 10000/10000 [1:06:24<00:00,  2.51it/s]



Final ChrF++ Score: 88.57


In [20]:
# import torch
# import pickle
# import pandas as pd
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# import sacrebleu

# # --- Load tokenizers ---
# with open("/kaggle/input/transformer-seq2seq-diacritics/transformers/default/1/src_tokenizer.pkl", "rb") as f:
#     src_tokenizer = pickle.load(f)

# with open("/kaggle/input/transformer-seq2seq-diacritics/transformers/default/1/tgt_tokenizer.pkl", "rb") as f:
#     tgt_tokenizer = pickle.load(f)

# # --- Load model ---
# model_path = "/kaggle/input/transformer-seq2seq-diacritics/transformers/default/1/transformer_model.pt"
# model = Transformer(len(src_tokenizer.word_index)+1, len(tgt_tokenizer.word_index)+1)
# model.load_state_dict(torch.load(model_path, map_location=device))
# model.to(device)
# model.eval()

# # --- Create idx2word mapping ---
# idx2word = {idx: word for word, idx in tgt_tokenizer.word_index.items()}