In [1]:
!pip uninstall -y tensorflow tensorflow-cpu tensorflow-gpu tensorflow-intel keras keras-nightly keras-preprocessing keras-vis tf-nightly tf-estimator-nightly tensorflow-estimator
!pip install -q transformers datasets sacrebleu sentencepiece accelerate evaluate protobuf==3.20.3

Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.18.0
[0mFound existing installation: keras 3.8.0
Uninstalling keras-3.8.0:
  Successfully uninstalled keras-3.8.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
import numpy as np
from accelerate import notebook_launcher
import torch
import os

In [3]:
# Location of the model checkpoint that training continues from. The same path
# is used to load both the tokenizer and the model weights further down.
CHECKPOINT_PATH = "/kaggle/input/finetune-tmnam20-vipubmed-700k/vipubmed_checkpoint/final_model"

In [4]:
# Report whether the checkpoint path is present before any heavy work starts.
# This is informational only: nothing is raised when the path is missing.
if not os.path.exists(CHECKPOINT_PATH):
    print(f"KHÔNG TÌM THẤY đường dẫn: {CHECKPOINT_PATH}")
    print("Vui lòng kiểm tra lại bước 'Add Input' và copy đúng path.")
else:
    print(f"Đã tìm thấy model checkpoint tại: {CHECKPOINT_PATH}")
    print("Sẵn sàng train tiếp!")

Đã tìm thấy model checkpoint tại: /kaggle/input/finetune-tmnam20-vipubmed-700k/vipubmed_checkpoint/final_model
Sẵn sàng train tiếp!


In [5]:
# Root directory of the parallel corpus. File names handed to read_parallel()
# are appended verbatim, so they must start with "/".
base_path = "/kaggle/input/vlsp-dataset/data"


def _read_lines(path):
    """Return the lines of a UTF-8 text file, without trailing blank lines."""
    with open(path, encoding="utf-8") as handle:
        # Split on "\n" only. str.splitlines() also breaks on U+2028, U+0085,
        # form feed and similar characters; a sentence containing one of them
        # would become two rows and shift every following sentence pair.
        lines = handle.read().split("\n")
    # Only trailing blank lines are dropped. Leading/interior blank lines are
    # kept because removing them from one side would misalign the corpus.
    while lines and not lines[-1].strip():
        lines.pop()
    return lines


def read_parallel(src_file, tgt_file):
    """Load a line-aligned English/Vietnamese corpus into a DataFrame.

    Args:
        src_file: Name of the English file, appended to ``base_path``.
        tgt_file: Name of the Vietnamese file, appended to ``base_path``.

    Returns:
        pandas.DataFrame with columns ``en`` and ``vi``; row *i* holds line *i*
        of each file. If the files differ in length, the longer one is
        truncated to the shorter one and a warning is emitted.
    """
    import warnings

    src = _read_lines(base_path + src_file)
    tgt = _read_lines(base_path + tgt_file)

    if len(src) != len(tgt):
        # A length mismatch usually means the pairs are not aligned; keep the
        # original truncating behaviour but make the problem visible.
        warnings.warn(
            f"Parallel files differ in length ({src_file}: {len(src)} lines, "
            f"{tgt_file}: {len(tgt)} lines); truncating to the shorter one."
        )

    n = min(len(src), len(tgt))
    return pd.DataFrame({'en': src[:n], 'vi': tgt[:n]})

# Read both splits from disk and wrap them in a DatasetDict keyed by split name.
print("Đang load dữ liệu VLSP...")
train_df = read_parallel("/train.en.txt", "/train.vi.txt")
test_df = read_parallel("/public_test.en.txt", "/public_test.vi.txt")

split_frames = {"train": train_df, "test": test_df}
raw = DatasetDict(
    {split: Dataset.from_pandas(frame) for split, frame in split_frames.items()}
)

train_rows = len(raw["train"])
test_rows = len(raw["test"])
print(f"Dữ liệu train: {train_rows} dòng | Test: {test_rows} dòng")

Đang load dữ liệu VLSP...
Dữ liệu train: 500000 dòng | Test: 3000 dòng


In [6]:
# Prefer the tokenizer stored next to the checkpoint so the vocabulary matches
# the model weights that are loaded from the same path later on.
try:
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH)
except Exception as err:
    # `Exception` instead of a bare `except:` so Ctrl-C / SystemExit still
    # propagate, and the real cause of the failure is shown to the user.
    print(" Không load được tokenizer từ checkpoint, dùng tokenizer gốc.")
    print(f"   ({type(err).__name__}: {err})")
    # NOTE(review): this fallback assumes the checkpoint was fine-tuned from
    # Helsinki-NLP/opus-mt-en-vi; a different base model would mean a
    # mismatched vocabulary — confirm.
    tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-vi")



In [7]:
# Passed to the tokenizer as `max_length`, with truncation enabled.
max_length = 128


def preprocess(batch):
    """Tokenize one batch of sentence pairs.

    Args:
        batch: Mapping with an ``"en"`` sequence (source sentences) and a
            ``"vi"`` sequence (target sentences).

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with the
        English sentences as inputs and the Vietnamese ones as ``text_target``.
    """
    source_texts = list(batch["en"])
    target_texts = list(batch["vi"])
    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )

# Tokenize every split in batches, drop the raw text columns, and have the
# resulting datasets return torch-formatted values.
text_columns = raw["train"].column_names
tokenized = raw.map(
    preprocess,
    batched=True,
    remove_columns=text_columns,
).with_format("torch")

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [8]:
# Training hyper-parameters, collected in a plain dict so they can be inspected
# or logged before the arguments object is built.
training_config = {
    # Where checkpoints are written, and how many are kept.
    "output_dir": "/kaggle/working/vlsp_final_result",
    "save_total_limit": 5,
    # Optimisation schedule.
    "learning_rate": 2e-5,
    "num_train_epochs": 5,
    "per_device_train_batch_size": 16,
    "gradient_accumulation_steps": 2,
    "fp16": True,
    # Evaluation: once per epoch, scoring generated text.
    "eval_strategy": "epoch",
    "per_device_eval_batch_size": 16,
    "predict_with_generate": True,
    # Logging.
    "logging_steps": 100,
    "report_to": "none",
}
args = Seq2SeqTrainingArguments(**training_config)

In [9]:
def train_fn():
    """Continue fine-tuning the checkpoint on the tokenized data and save the result.

    Uses the module-level ``CHECKPOINT_PATH``, ``tokenizer``, ``tokenized`` and
    ``args``. The model is loaded from ``CHECKPOINT_PATH``, trained with
    ``Seq2SeqTrainer`` (evaluated on ``tokenized["test"]`` with sacreBLEU), and
    the final model plus tokenizer are written to
    ``/kaggle/working/vlsp_complete_model`` by the process with index 0.
    """
    model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT_PATH)
    metric = evaluate.load("sacrebleu")

    def compute_metrics(eval_pred):
        """Decode predictions and references and return their sacreBLEU score."""
        preds, labels = eval_pred
        # Some models return a tuple of outputs; the generated ids come first.
        if isinstance(preds, tuple):
            preds = preds[0]
        # -100 marks padded/ignored positions and is not a real token id, so it
        # must be replaced before decoding. This applies to the predictions as
        # well as the labels, because batches of generated sequences with
        # different lengths can be padded with -100 when they are gathered.
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        decoded_preds = [p.strip() for p in decoded_preds]
        # sacreBLEU expects a list of references for every prediction.
        decoded_labels = [[l.strip()] for l in decoded_labels]
        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
        return {"bleu": result["score"]}

    # Pad every batch to its longest sequence.
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="longest")

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
        # `tokenizer=` is deprecated in favour of `processing_class=`.
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("Bắt đầu train...")
    trainer.train()

    save_path = "/kaggle/working/vlsp_complete_model"
    # Only the process with index 0 writes the final artefacts.
    if trainer.args.process_index == 0:
        trainer.save_model(save_path)
        tokenizer.save_pretrained(save_path)
        print(f"HOÀN TẤT! Model cuối cùng nằm tại: {save_path}")

In [10]:
# Start fine-tuning by calling train_fn() directly from this cell
# (notebook_launcher is imported above but is not used for this call).
train_fn()

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(


Bắt đầu train...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss,Bleu
1,1.4238,1.372699,44.246947
2,1.2956,1.282049,45.325951
3,1.2184,1.23651,46.090044
4,1.1967,1.214096,46.427772
5,1.1787,1.206002,46.569962




HOÀN TẤT! Model cuối cùng nằm tại: /kaggle/working/vlsp_complete_model
