# Package Installation

In [1]:
!pip install -q -U bitsandbytes transformers peft accelerate datasets trl sacrebleu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m112.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.0/557.0 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.9/532.9 kB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's depe

In [2]:
import pandas as pd
import sacrebleu
import numpy as np
import torch
import transformers
import peft
import datasets
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments


print("torch", torch.__version__)
print("transformers", transformers.__version__, "| peft", peft.__version__)
print("datasets", datasets.__version__)
print("GPU count:", torch.cuda.device_count())

torch 2.8.0+cu126
transformers 5.0.0 | peft 0.18.1
datasets 4.5.0
GPU count: 2


In [None]:
from huggingface_hub import login
hf_token = ""
login(token=hf_token)

# Loading and preparing model

In [4]:
model = "Qwen/Qwen2.5-1.5B-Instruct"
MODEL_NAME = model

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True
)
model.config.use_cache = False

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/338 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [6]:
import re
def get_num_layers(model):
    numbers = set()
    for name, _ in model.named_parameters():
        for number in re.findall(r'\d+', name):
            numbers.add(int(number))
    return max(numbers)

def get_last_layer_linears(model):
    names = []
    
    num_layers = get_num_layers(model)
    for name, module in model.named_modules():
        if str(num_layers) in name and not "encoder" in name:
            if isinstance(module, torch.nn.Linear):
                names.append(name)
    return names

In [7]:
get_num_layers(model)

27

In [8]:
get_last_layer_linears(model)

['model.layers.27.self_attn.q_proj',
 'model.layers.27.self_attn.k_proj',
 'model.layers.27.self_attn.v_proj',
 'model.layers.27.self_attn.o_proj',
 'model.layers.27.mlp.gate_proj',
 'model.layers.27.mlp.up_proj',
 'model.layers.27.mlp.down_proj']

# LORA config

In [9]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 18,464,768 || all params: 1,562,179,072 || trainable%: 1.1820


In [10]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 1536)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1536, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(

# Load data

In [11]:
def load_txt(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]

train_en = load_txt('/kaggle/input/vlsp-dataset/data/train.en.txt')
train_vi = load_txt('/kaggle/input/vlsp-dataset/data/train.vi.txt')
test_en = load_txt('/kaggle/input/vlsp-dataset/data/public_test.en.txt')
test_vi = load_txt('/kaggle/input/vlsp-dataset/data/public_test.vi.txt')

df_train = pd.DataFrame({'en': train_en, 'vi': train_vi}).drop_duplicates().reset_index(drop=True)
df_test = pd.DataFrame({'en': test_en, 'vi': test_vi}).reset_index(drop=True)


train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [12]:
example_en = df_train.iloc[0]['en']
example_vi = df_train.iloc[0]['vi']
print(example_en, example_vi, sep='\n\n')

To evaluate clinical, subclinical symptoms of patients with otitis media with effusion and V.a at otorhinolaryngology department – Thai Nguyen national hospital

Nghiên cứu đặc điểm lâm sàng, cận lâm sàng bệnh nhân viêm tai ứ dịch trên viêm V.A tại Khoa Tai mũi họng - Bệnh viện Trung ương Thái Nguyên


# Generate example

In [13]:
PROMPT_TEMPLATE = "### Task: Translate this sentence into Vietnamese accurately\n### English: {en}\n### Vietnamese:"

In [14]:
tokenizer.pad_token_id

151645

In [15]:
generation_config = model.generation_config
generation_config.max_new_tokens = 256
generation_config.temperature = 0.3
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.pad_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [16]:
%%time
prompt = f"""
### Task: Translate this sentence into Vietnamese accurately\n
### English: {example_en}\n
### Vietnamese:
"""
messages = [
    {"role": "system", "content": "You are a professional medical translator."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(model.device)

with torch.no_grad():
    generated_ids = model.generate(
        input_ids=model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        generation_config=generation_config
    )
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response.strip())

Để đánh giá các triệu chứng y tế và triệu chứng không triệu chứng của bệnh nhân với viêm họng khí hư và V. a tại khoa Otorinolaringologia - Bệnh viện Quốc Gia Thái Nguyên
CPU times: user 4.63 s, sys: 409 ms, total: 5.04 s
Wall time: 5.61 s


In [17]:
def tokenize_fn(batch):
    sources = batch['en']
    targets = batch['vi']
    
    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
    
    for en, vi in zip(sources, targets):
        prompt = PROMPT_TEMPLATE.format(en=en)
        prompt_ids = tokenizer(prompt, add_special_tokens=False)['input_ids']
        target_ids = tokenizer(vi + tokenizer.eos_token, add_special_tokens=False)['input_ids']
        input_ids = prompt_ids + target_ids
        attention_mask = [1] * len(input_ids)
        
        labels = [-100] * len(prompt_ids) + target_ids
        
        if len(input_ids) > 512:
            input_ids = input_ids[:512]
            attention_mask = attention_mask[:512]
            labels = labels[:512]
            
        model_inputs["input_ids"].append(input_ids)
        model_inputs["attention_mask"].append(attention_mask)
        model_inputs["labels"].append(labels)
        
    return model_inputs

# Format finetuning data

In [18]:
tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=train_dataset.column_names)
tokenized_test = test_dataset.map(tokenize_fn, batched=True, remove_columns=test_dataset.column_names)

Map:   0%|          | 0/348224 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

# Train

In [19]:
from transformers import Trainer, DataCollatorForSeq2Seq

In [20]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    final_preds = [p.strip() for p in decoded_preds]
    final_labels = [[l.strip()] for l in decoded_labels]

    bleu = sacrebleu.corpus_bleu(final_preds, final_labels)
    return {"bleu": bleu.score}

In [21]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    pad_to_multiple_of=8
)

In [22]:
model.config.pad_token_id = tokenizer.pad_token_id

In [23]:
targs = Seq2SeqTrainingArguments(
    output_dir="./finetuned_model",
    fp16=True,       
    bf16=False,
    per_device_train_batch_size=4, 
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8, 
    learning_rate=2e-4,
    num_train_epochs=1,    
    logging_steps=250,
    save_strategy="steps",
    eval_strategy="epoch",
    save_steps=250,        
    save_total_limit=2,
    report_to="none",
    group_by_length=True, 
    ddp_find_unused_parameters=False,
    dataloader_num_workers=4,
    remove_unused_columns=False,
    predict_with_generate=True,
    generation_max_length=512
)

In [24]:
trainer = Seq2SeqTrainer(
    model=model,
    args=targs,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test, 
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [25]:
print("Start training...")
trainer.train()

Start training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss,Bleu
1,0.76972,0.741176,56.490553


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=10882, training_loss=0.871387998401801, metrics={'train_runtime': 27080.3361, 'train_samples_per_second': 12.859, 'train_steps_per_second': 0.402, 'total_flos': 2.709045937885348e+17, 'train_loss': 0.871387998401801, 'epoch': 1.0})

# Save model

In [26]:
save_path = "trained-model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('trained-model/tokenizer_config.json',
 'trained-model/chat_template.jinja',
 'trained-model/tokenizer.json')

# Loading and using the model later

In [27]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

In [28]:
model.save_pretrained("trained-model")

PEFT_MODEL = "/kaggle/working/trained-model"

config = PeftConfig.from_pretrained(PEFT_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

tokenizer=AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(model, PEFT_MODEL)

Loading weights:   0%|          | 0/338 [00:00<?, ?it/s]

In [29]:
generation_config = model.generation_config
generation_config.max_new_tokens = 256
generation_config.temperature = 0.3
generation_config.top_p = 0.7
generation_config.do_sample = True
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.pad_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [30]:
example_en = df_train.iloc[0]['en']
example_vi = df_train.iloc[0]['vi']
print(example_en, example_vi, sep='\n\n')

To evaluate clinical, subclinical symptoms of patients with otitis media with effusion and V.a at otorhinolaryngology department – Thai Nguyen national hospital

Nghiên cứu đặc điểm lâm sàng, cận lâm sàng bệnh nhân viêm tai ứ dịch trên viêm V.A tại Khoa Tai mũi họng - Bệnh viện Trung ương Thái Nguyên


In [31]:
%%time
prompt = f"""
### Task: Translate this sentence into Vietnamese accurately\n
### English: {example_en}\n
### Vietnamese:
"""
messages = [
    {"role": "system", "content": "You are a professional medical translator."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True).to(model.device)

with torch.no_grad():
    generated_ids = model.generate(
        input_ids=model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        generation_config=generation_config
    )
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response.strip())

Đánh giá đặc điểm lâm sàng, cận lâm sàng của bệnh nhân viêm tai giữa tiết dịch có nhiễm vi khuẩn V.A tại khoa Tai Mũi Họng - Bệnh viện Trung ương Thái Nguyên
CPU times: user 3.72 s, sys: 2.93 ms, total: 3.73 s
Wall time: 3.72 s
