In [None]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/vlsp-2025-data/public_test.en.txt
/kaggle/input/vlsp-2025-data/train.en.txt
/kaggle/input/vlsp-2025-data/train.vi.txt
/kaggle/input/vlsp-2025-data/public_test.vi.txt
/kaggle/input/vlsp-2025-processed/vlsp_processed_train.csv
/kaggle/input/vlsp-2025-processed/vlsp_processed_val.csv


# Analysis

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

source_file = '/kaggle/input/vlsp-2025-data/train.en.txt'
target_file = '/kaggle/input/vlsp-2025-data/train.vi.txt'

with open(source_file, 'r', encoding='utf-8') as f:
    en_lines = [line.strip() for line in f.readlines()]

with open(target_file, 'r', encoding='utf-8') as f:
    vi_lines = [line.strip() for line in f.readlines()]

print("EN sentences:", len(en_lines))
print("VI sentences:", len(vi_lines))


df = pd.DataFrame({
    "en": en_lines,
    "vi": vi_lines
})
df.head()

EN sentences: 500000
VI sentences: 500000


Unnamed: 0,en,vi
0,"To evaluate clinical, subclinical symptoms of ...","Nghiên cứu đặc điểm lâm sàng, cận lâm sàng bện..."
1,"Evaluate clinical, subclinical symptoms of pat...","Đánh giá đặc điểm lâm sàng, cận lâm sàng bệnh ..."
2,There was a relation between vasodilatation an...,Có sự liên quan giữa độ quá phát V.a với mức đ...
3,Otittis media effusion on V a is a common dise...,Kết luận: Viêm tai ứ dịch trên viêm V.a là bện...
4,"Main symptoms are rhinitis, nasal congestion, ...","Triệu chứng cơ năng nổi bật là chảy mũi, ngạt ..."


# Load Pre-trained model and Config

In [None]:
!pip install -q sacrebleu nltk evaluate

In [4]:
from datasets import Dataset, Features, Value

train_src_file = '/kaggle/input/vlsp-2025-data/train.en.txt'
train_tgt_file = '/kaggle/input/vlsp-2025-data/train.vi.txt'
val_src_file   = '/kaggle/input/vlsp-2025-data/public_test.en.txt'
val_tgt_file   = '/kaggle/input/vlsp-2025-data/public_test.vi.txt'

def bidirectional_generator(src_path, tgt_path):
    with open(src_path, "r", encoding="utf-8") as fs, open(tgt_path, "r", encoding="utf-8") as ft:
        for s, t in zip(fs, ft):
            s = s.strip()
            t = t.strip()
            if not s or not t:
                continue
            yield {
                "src": s,
                "tgt": t,
                "src_lang": "English",
                "tgt_lang": "Vietnamese",
                "direction": "en-vi",
            }
            yield {
                "src": t,
                "tgt": s,
                "src_lang": "Vietnamese",
                "tgt_lang": "English",
                "direction": "vi-en",
            }

features = Features({
    "src": Value("string"),
    "tgt": Value("string"),
    "src_lang": Value("string"),
    "tgt_lang": Value("string"),
    "direction": Value("string"),
})

train_ds = Dataset.from_generator(
    bidirectional_generator,
    gen_kwargs={"src_path": train_src_file, "tgt_path": train_tgt_file},
    features=features
)

val_ds = Dataset.from_generator(
    bidirectional_generator,
    gen_kwargs={"src_path": val_src_file, "tgt_path": val_tgt_file},
    features=features
)

print(train_ds, val_ds)
print(type(train_ds), train_ds[0])


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['en', 'vi'],
    num_rows: 500000
}) Dataset({
    features: ['en', 'vi'],
    num_rows: 3000
})
<class 'datasets.arrow_dataset.Dataset'> {'en': 'To evaluate clinical, subclinical symptoms of patients with otitis media with effusion and V.a at otorhinolaryngology department – Thai Nguyen national hospital', 'vi': 'Nghiên cứu đặc điểm lâm sàng, cận lâm sàng bệnh nhân viêm tai ứ dịch trên viêm V.A tại Khoa Tai mũi họng - Bệnh viện Trung ương Thái Nguyên'}


In [5]:
MAX_TRAIN_SAMPLES = 50_000
SEED = 42

train_ds_50k = (
    train_ds
    .shuffle(seed=SEED)
    .select(range(min(MAX_TRAIN_SAMPLES, len(train_ds))))
)

print(train_ds_50k)
print(train_ds_50k[0])


Dataset({
    features: ['en', 'vi'],
    num_rows: 50000
})
{'en': 'Among ambulatory people ≥ 65, adverse drug effects occur at a rate of about 50 events per 1000 person-years.', 'vi': 'Trong số người bệnh ≥ 65 tuổi, tác dụng phụ bất lợi của thuốc xảy ra với tỉ lệ khoảng 50 lần / 1000 người-năm.'}


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
tokenizer.padding_side = "left"

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map=None,  
    trust_remote_code=True
)

# LoRA config (nhẹ, ổn cho 0.5B)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj"],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

2025-12-17 02:14:18.002561: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765937658.423096      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765937658.535075      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


In [7]:
SYSTEM_PROMPT = (
    "You are a professional medical translator specialized in clinical and biomedical text. "
    "Follow these rules:\n"
    "1) Translate faithfully without adding or omitting information.\n"
    "2) Preserve numbers, units, dosage, and formatting.\n"
    "3) Use standard medical terminology and keep abbreviations as in the source.\n"
    "4) Do not add explanations, comments, or extra markup.\n"
    "5) Output only the translation."
)

def build_chat_prompt(src_text: str, src_lang: str, tgt_lang: str):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": (
                f"Translate from {src_lang} to {tgt_lang}.\n"
                f"Source ({src_lang}): {src_text}\n"
                f"Translation ({tgt_lang}):"
            ),
        },
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return prompt

def tokenize_with_mask(example, max_length=512):
    prompt = build_chat_prompt(example["src"], example["src_lang"], example["tgt_lang"])
    answer = example["tgt"].strip()

    full_text = prompt + answer + tokenizer.eos_token

    enc = tokenizer(
        full_text,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )

    prompt_ids = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )["input_ids"]

    labels = enc["input_ids"].copy()
    prompt_len = len(prompt_ids)

    labels[:prompt_len] = [-100] * min(prompt_len, len(labels))

    enc["labels"] = labels
    return enc

MAX_LEN = 200

train_tok = train_ds_50k.map(lambda x: tokenize_with_mask(x, MAX_LEN), remove_columns=train_ds.column_names, num_proc=2)
val_tok   = val_ds.map(lambda x: tokenize_with_mask(x, MAX_LEN), remove_columns=val_ds.column_names, num_proc=2)

print(train_tok[0].keys())


Map (num_proc=2):   0%|          | 0/50000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/3000 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])


# Training

In [8]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=-100,
)

In [9]:
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable: {trainable:,} / Total: {total:,} ({trainable/total:.4%})")

model.config.use_cache = False
model.train()

# rất hay cần khi dùng gradient_checkpointing + PEFT
if hasattr(model, "enable_input_require_grads"):
    model.enable_input_require_grads()


Trainable: 8,798,208 / Total: 502,830,976 (1.7497%)


In [10]:
import numpy as np
from sacrebleu.metrics import BLEU, TER
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

try:
    from nltk.translate.meteor_score import meteor_score
    HAS_METEOR = True
except Exception as exc:
    HAS_METEOR = False
    print(f"METEOR disabled: {exc}")

bleu_metric = BLEU()
ter_metric = TER()

def safe_meteor(refs, hyps):
    if not HAS_METEOR:
        return float("nan")
    try:
        return float(np.mean([meteor_score([ref], pred) for ref, pred in zip(refs, hyps)]))
    except Exception as exc:
        print(f"METEOR failed: {exc}")
        return float("nan")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]

    bleu = bleu_metric.corpus_score(decoded_preds, [decoded_labels]).score
    ter = ter_metric.corpus_score(decoded_preds, [decoded_labels]).score
    meteor = safe_meteor(decoded_labels, decoded_preds)

    gen_lens = [len(p.split()) for p in decoded_preds]

    return {
        "bleu": bleu,
        "ter": ter,
        "meteor": meteor,
        "gen_len": float(np.mean(gen_lens)),
    }

args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/qwen_mt_lora",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=1,
    warmup_steps=500,
    lr_scheduler_type="cosine",
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,
    fp16=True,
    gradient_checkpointing=True,
    predict_with_generate=True,
    generation_max_length=256,
    generation_num_beams=1,
    report_to="none",
    dataloader_num_workers=2,
    ddp_find_unused_parameters=False,
)

model.config.use_cache = False
if hasattr(model, "generation_config") and model.generation_config is not None:
    model.generation_config.do_sample = False
    model.generation_config.temperature = None
    model.generation_config.top_p = None
    model.generation_config.top_k = None
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("/kaggle/working/qwen_mt_lora/final")
tokenizer.save_pretrained("/kaggle/working/qwen_mt_lora/final")


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss


('/kaggle/working/qwen_mt_lora/final/tokenizer_config.json',
 '/kaggle/working/qwen_mt_lora/final/special_tokens_map.json',
 '/kaggle/working/qwen_mt_lora/final/chat_template.jinja',
 '/kaggle/working/qwen_mt_lora/final/vocab.json',
 '/kaggle/working/qwen_mt_lora/final/merges.txt',
 '/kaggle/working/qwen_mt_lora/final/added_tokens.json',
 '/kaggle/working/qwen_mt_lora/final/tokenizer.json')

# Evaluation

In [None]:
import math
import matplotlib.pyplot as plt

def plot_training_curves(trainer):
    history = trainer.state.log_history
    train_steps, train_loss = [], []
    eval_steps, eval_loss = [], []
    eval_bleu, eval_ter, eval_meteor = [], [], []

    for entry in history:
        if "loss" in entry and "eval_loss" not in entry:
            train_steps.append(entry["step"])
            train_loss.append(entry["loss"])
        if "eval_loss" in entry:
            eval_steps.append(entry["step"])
            eval_loss.append(entry["eval_loss"])
            eval_bleu.append(entry.get("eval_bleu"))
            eval_ter.append(entry.get("eval_ter"))
            eval_meteor.append(entry.get("eval_meteor"))

    if train_steps:
        plt.figure(figsize=(7, 4))
        plt.plot(train_steps, train_loss, label="train_loss")
        if eval_steps:
            plt.plot(eval_steps, eval_loss, label="eval_loss")
        plt.title("Training Loss")
        plt.xlabel("Step")
        plt.ylabel("Loss")
        plt.legend()
        plt.grid(True)
        plt.show()

    if eval_steps:
        ppl = [math.exp(x) for x in eval_loss]
        fig, ax1 = plt.subplots(figsize=(7, 4))
        ax1.plot(eval_steps, eval_bleu, color="tab:blue", label="BLEU")
        ax1.set_xlabel("Step")
        ax1.set_ylabel("BLEU", color="tab:blue")
        ax1.tick_params(axis="y", labelcolor="tab:blue")
        ax1.grid(True)

        ax2 = ax1.twinx()
        ax2.plot(eval_steps, ppl, color="tab:orange", label="Perplexity")
        ax2.set_ylabel("Perplexity", color="tab:orange")
        ax2.tick_params(axis="y", labelcolor="tab:orange")

        plt.title("BLEU and Perplexity on Eval")
        fig.tight_layout()
        plt.show()

plot_training_curves(trainer)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM  

model_dir = "/kaggle/working/qwen_mt_lora/final"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_dir, 
    dtype="auto", 
    device_map="auto" # Tự động đẩy sang GPU (thay cho .to("cuda"))
)

if hasattr(model, "generation_config") and model.generation_config is not None:
    model.generation_config.do_sample = False
    model.generation_config.temperature = None
    model.generation_config.top_p = None
    model.generation_config.top_k = None

model.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=896, out_features=896, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=896, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=896, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=896, out_features=128, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropou

In [13]:
from tqdm.auto import tqdm

@torch.no_grad()
def translate_batch(examples, max_input_len=512, max_new_tokens=192, batch_size=8):
    model.eval()
    hyps = []

    for i in range(0, len(examples), batch_size):
        chunk = examples[i:i+batch_size]
        prompts = [
            build_chat_prompt(x["src"], x["src_lang"], x["tgt_lang"]) for x in chunk
        ]

        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_input_len
        ).to(model.device)

        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

        attn = inputs["attention_mask"]
        for b in range(out.size(0)):
            prompt_len = int(attn[b].sum().item())
            gen_ids = out[b, prompt_len:]
            pred = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
            hyps.append(pred)

    return hyps


## BLEU

In [14]:
import numpy as np
from sacrebleu.metrics import BLEU, TER

try:
    from nltk.translate.meteor_score import meteor_score
    HAS_METEOR = True
except Exception as exc:
    HAS_METEOR = False
    print(f"METEOR disabled: {exc}")

bleu_metric = BLEU()
ter_metric = TER()

def safe_meteor(refs, hyps):
    if not HAS_METEOR:
        return float("nan")
    try:
        return float(np.mean([meteor_score([ref], pred) for ref, pred in zip(refs, hyps)]))
    except Exception as exc:
        print(f"METEOR failed: {exc}")
        return float("nan")

def compute_metrics_on_dataset(ds, n_samples=None, batch_size=8, max_input_len=512, max_new_tokens=192):
    N = len(ds) if n_samples is None else min(n_samples, len(ds))
    examples = [ds[i] for i in range(N)]
    refs = [ex["tgt"] for ex in examples]

    hyps = []
    for i in tqdm(range(0, N, batch_size), desc="Generating"):
        chunk = examples[i:i+batch_size]
        hyps.extend(
            translate_batch(
                chunk,
                max_input_len=max_input_len,
                max_new_tokens=max_new_tokens,
                batch_size=len(chunk)
            )
        )

    bleu = bleu_metric.corpus_score(hyps, [refs]).score
    ter = ter_metric.corpus_score(hyps, [refs]).score
    meteor = safe_meteor(refs, hyps)

    return {"bleu": bleu, "ter": ter, "meteor": meteor}, hyps, refs, examples

metrics_all, hyps_all, refs_all, examples_all = compute_metrics_on_dataset(
    val_ds,
    n_samples=3000,
    batch_size=16,
    max_input_len=512,
    max_new_tokens=192
)
print("Metrics (all directions) @3000:", metrics_all)

val_en_vi = val_ds.filter(lambda x: x["direction"] == "en-vi")
val_vi_en = val_ds.filter(lambda x: x["direction"] == "vi-en")

metrics_en_vi, _, _, _ = compute_metrics_on_dataset(val_en_vi, n_samples=1500, batch_size=16)
metrics_vi_en, _, _, _ = compute_metrics_on_dataset(val_vi_en, n_samples=1500, batch_size=16)

print("Metrics EN->VI @1500:", metrics_en_vi)
print("Metrics VI->EN @1500:", metrics_vi_en)


Generating:   0%|          | 0/188 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
The following generation 

BLEU@3000: 26.129559864978646


In [15]:
for i in range(5):
    ex = examples_all[i]
    print("SRC:", ex["src"])
    print("REF:", refs_all[i])
    print("HYP:", hyps_all[i])
    print("DIR:", ex["direction"])
    print("-"*80)


EN: Knowledge, practices in public health service utilization among health insurance card’s holders and influencing factors in Vientiane, Lao
REF: Thực trạng kiến thức và thực hành của người có thẻ bảo hiểm y tế trong sử dụng dịch vụ khám chữa bệnh ở các cơ sở y tế công và một số yếu tố ảnh hưởng tại tỉnh Viêng Chăn, CHDCND Lào, năm 2017
HYP: 1. Khảo sát thực trạng sử dụng dịch vụ y tế của người dân trong nước và các yếu tố ảnh hưởng đến việc sử dụng dịch vụ y tế tại Vũng Tàu, Việt Nam
--------------------------------------------------------------------------------
EN: Describe knowledge, practices in public health service utilization among health insurance card's holders and influencing factors in Vientiane, Lao PDR, 2017.
REF: Mô tả thực trạng kiến thức, thực hành của người có thẻ bảo hiểm y tế trong sử dụng dịch vụ khám chữa bệnh ở các cơ sở y tế công và một số yếu tố liên quan tại tỉnh Viêng Chăn, Cộng hoà Dân chủ Nhân dân Lào năm 2017.
HYP: 1. Mô tả kiến thức, thực hành về sử dụng