In [None]:
!pip install transformers datasets
!pip install evaluate
!pip install bert-score
!pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Opens the interactive Hugging Face Hub login widget inside the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import transformers

# Record the installed transformers version alongside the results.
print(transformers.__version__)

4.50.3


In [None]:
# Hugging Face Hub id of the pretrained PhoBERT checkpoint this notebook builds on.
model_checkpoint = 'vinai/phobert-base-v2'

In [None]:
from transformers import AutoTokenizer
import pandas as pd

# Load the tokenizer for the checkpoint named in `model_checkpoint`, so the
# checkpoint id is defined in exactly one place. use_fast=False requests the
# non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)

In [None]:
from google.colab import drive
# Mount Google Drive; every data, tokenizer and checkpoint path below lives under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Tokenizers stored on Drive, one directory per data split.
# NOTE(review): unlike the other tokenizer loads in this notebook, these two do
# not pass use_fast=False - confirm that the same tokenizer class is returned.
tokenizer_train = AutoTokenizer.from_pretrained('/content/drive/MyDrive/data/tokenizer/train/')
tokenizer_val = AutoTokenizer.from_pretrained('/content/drive/MyDrive/data/tokenizer/valid/')

In [None]:
import pandas as pd

# Peek at the training tokenizer's vocab file, parsed as two space-separated columns.
# NOTE(review): the second column is labelled 'id' here; confirm whether the file
# actually stores a token id or some other per-token number (e.g. a count).
df = pd.read_csv('/content/drive/MyDrive/data/tokenizer/train/vocab.txt', sep=' ', header=None, names=['token', 'id'])
df.sample(5)

Unnamed: 0,token,id
27024,Nguyễn_X@@,920
8176,Mahrez,3012
31365,作@@,207
8781,Vụ_Bản,2943
22142,Chase,1908


In [None]:
from transformers import EncoderDecoderModel

# Warm-start both the encoder and the decoder from the checkpoint named in
# `model_checkpoint` and tie their weights (tie_encoder_decoder=True).
shared = EncoderDecoderModel.from_encoder_decoder_pretrained(
    model_checkpoint, model_checkpoint, tie_encoder_decoder=True
)

In [None]:
# Special-token ids (decoder start / end of sequence / padding), all taken from
# the tokenizer. Padding deliberately reuses the EOS id, as before.
shared.config.update({
    "decoder_start_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,
})

# Decoding parameters stored on the model config, plus the vocabulary size
# copied from the encoder's config.
shared.config.update({
    "max_length": 64,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
    "num_beams": 4,
    "vocab_size": shared.config.encoder.vocab_size,
})

In [None]:
import sys
sys.path.append('/content/drive/MyDrive/seq2seq')

from transformers import TrainingArguments
from transformers import Seq2SeqTrainer

from dataclasses import dataclass, field
from typing import Optional
from datasets import Dataset

In [None]:
# Local copy of the Seq2SeqTrainingArguments class from the legacy
# seq2seq_trainer.py example, extended with the generation-related fields that
# transformers' Seq2SeqTrainer reads from `args`.
@dataclass
class Seq2SeqTrainingArguments(TrainingArguments):
    """TrainingArguments plus the sequence-to-sequence options used in this notebook.

    `generation_max_length` and `generation_num_beams` default to None; they are
    declared so that code which reads them from the arguments object during
    evaluation/prediction finds the attribute instead of raising AttributeError.
    """

    label_smoothing: Optional[float] = field(default=0.0)
    sortish_sampler: bool = field(default=False)
    # When True, evaluation/prediction uses generated sequences.
    predict_with_generate: bool = field(default=False)
    adafactor: bool = field(default=False)
    encoder_layerdrop: Optional[float] = field(default=None)
    decoder_layerdrop: Optional[float] = field(default=None)
    dropout: Optional[float] = field(default=None)
    attention_dropout: Optional[float] = field(default=None)
    lr_scheduler: Optional[str] = field(default="linear")
    generation_config: Optional[str] = field(default=None)
    # Optional generation overrides; None leaves the model's own settings in charge.
    generation_max_length: Optional[int] = field(default=None)
    generation_num_beams: Optional[int] = field(default=None)

In [None]:
from evaluate import load
import numpy as np
from bert_score import score

def _rouge_metrics(name, value):
    """Flatten one aggregated ROUGE entry into rounded scalar metrics.

    Handles both result shapes: an aggregate object exposing
    `.mid.precision/.recall/.fmeasure`, and the plain F-measure float that the
    `evaluate` ROUGE metric returns.
    """
    mid = getattr(value, "mid", None)
    if mid is not None:
        return {
            f"{name}_precision": round(mid.precision, 4),
            f"{name}_recall": round(mid.recall, 4),
            f"{name}_f1": round(mid.fmeasure, 4),
        }
    return {f"{name}_f1": round(float(value), 4)}


def compute_metrics(pred):
    """Compute ROUGE and BERTScore for a batch of generated titles.

    Args:
        pred: prediction object exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids, with -100 on masked positions).

    Returns:
        dict of rounded floats: `<rouge type>_f1` for rouge1/rouge2/rougeL/rougeLsum
        (plus `_precision`/`_recall` when the ROUGE backend provides them) and
        `bertscore_precision` / `bertscore_recall` / `bertscore_f1`.
    """
    import torch  # local import: the notebook only imports torch in a later cell

    # Load metrics
    rouge = load("rouge")
    bertscore = load("bertscore")

    # -100 marks positions excluded from the loss and is not a real token id;
    # swap it for the pad id so the tokenizer can decode the sequences.
    pad_id = tokenizer.pad_token_id
    pred_ids = np.asarray(pred.predictions)
    labels_ids = np.asarray(pred.label_ids)
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    labels_ids = np.where(labels_ids == -100, pad_id, labels_ids)

    # Decode to text
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # ROUGE for every variant, averaged over the batch. A whitespace tokenizer is
    # passed because the default one keeps only ASCII letters/digits, which
    # breaks Vietnamese words apart.
    rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
    rouge_results = rouge.compute(
        predictions=pred_str,
        references=label_str,
        rouge_types=rouge_types,
        use_aggregator=True,
        tokenizer=lambda text: text.split(),
    )

    # BERTScore with the metric's default model for Vietnamese, matching the
    # test-set evaluation cell.
    # NOTE(review): the previous version passed model_type=model_checkpoint;
    # bert_score appears to need an explicit num_layers for models outside its
    # built-in table - confirm before switching back.
    bertscore_results = bertscore.compute(
        predictions=pred_str,
        references=label_str,
        lang="vi",
        device="cuda" if torch.cuda.is_available() else "cpu",
    )

    metrics = {}
    for rouge_type in rouge_types:
        metrics.update(_rouge_metrics(rouge_type, rouge_results[rouge_type]))

    # BERTScore is returned per sample; report the batch mean.
    for part in ("precision", "recall", "f1"):
        metrics[f"bertscore_{part}"] = round(float(np.mean(bertscore_results[part])), 4)
    return metrics

In [None]:
import os

# Disable Weights & Biases logging.
# NOTE: transformers reports this environment variable as deprecated (see the
# warning printed when the training arguments are created); `report_to` is the
# suggested replacement.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Hyperparameters
# batch_size: rows per `Dataset.map` call and per-device train/eval batch size.
batch_size = 16
# Token length that article inputs are padded/truncated to.
encoder_max_length = 256
# Token length that target titles are padded/truncated to.
decoder_max_length = 80

In [None]:
# Load the pre-split training and validation tables from Drive.
train_df = pd.read_csv('/content/drive/MyDrive/data/splitting/train.csv')
val_df = pd.read_csv('/content/drive/MyDrive/data/splitting/val.csv')

# Displays the first rows of both frames as a tuple.
train_df.head(), val_df.head()

(                                                 URL  \
 0  https://thanhnien.vn/tuyen-sinh-lop-10-tp-hcm-...   
 1  https://thanhnien.vn/nph-lien-minh-huyen-thoai...   
 2  https://tuoitre.vn/danh-giac-corona-theo-phong...   
 3  https://thanhnien.vn/toyota-gioi-thieu-he-thon...   
 4  https://tuoitre.vn/covid-19-khien-ti-le-thai-l...   
 
                                                Title  \
 0  tuyển_sinh lớp 10 tphcm đề thi_môn tiếng anh k...   
 1  nph liên_minh huyền_thoại trêu game_thủ bằng t...   
 2           đánh giặc corona theo phong_cách ngành y   
 3  toyota giới_thiệu hệ_thống sạc điện_thoại khôn...   
 4  covid19 khiến tỉ_lệ thai lưu_sản_phụ tử_vong t...   
 
                                             Abstract  \
 0  chiều 116 gần 94000 học_sinh lớp 9 ở tphcm đã ...   
 1  sau liên_minh huyền_thoại fifa online 3 và chi...   
 2  tto phòng_khám đa_khoa trường đại_học y_khoa p...   
 3  các hệ_thống sạc pin điện_thoại không dây đã b...   
 4  tto theo một nghiên_cứu

In [None]:
def _prepare_title_frame(raw_df):
    """Return a cleaned copy of `raw_df` ready for conversion to a Dataset.

    - drops rows without a Title (the generation target);
    - adds `input_text` = Category + " " + Abstract + " " + Content, with
      missing values treated as empty strings;
    - drops the URL column if present (Category is kept);
    - resets the index so row positions and labels agree and no stray index
      column is carried into the Dataset.

    The input frame is not modified, so the cell can be re-run safely.
    """
    kept = raw_df.dropna(subset=["Title"])
    input_text = (
        kept["Category"].fillna("")
        + " "
        + kept["Abstract"].fillna("")
        + " "
        + kept["Content"].fillna("")
    )
    return (
        kept.assign(input_text=input_text)
        .drop(columns=["URL"], errors="ignore")
        .reset_index(drop=True)
    )


train_df = _prepare_title_frame(train_df)
val_df = _prepare_title_frame(val_df)

# Convert to Hugging Face datasets
train_data = Dataset.from_pandas(train_df)
val_data = Dataset.from_pandas(val_df)

In [None]:
train_data

Dataset({
    features: ['Title', 'Abstract', 'Content', 'Category', 'input_text'],
    num_rows: 48000
})

In [None]:
val_data

Dataset({
    features: ['Title', 'Abstract', 'Content', 'Category', 'input_text'],
    num_rows: 6000
})

In [None]:
# Process data
def process_data_to_model_inputs(batch, tokenizer, max_input_length=None, max_target_length=None):
    """Tokenize a batch of articles and titles into model inputs.

    Args:
        batch: mapping with "input_text" and "Title" lists of strings.
        tokenizer: callable tokenizer exposing `pad_token_id`.
        max_input_length: length the inputs are padded/truncated to; defaults to
            the notebook-level `encoder_max_length`.
        max_target_length: length the titles are padded/truncated to; defaults to
            the notebook-level `decoder_max_length`.

    Returns:
        The same `batch`, with "input_ids", "attention_mask" and "labels" added.
        Padding positions in "labels" are replaced by -100 to mask them from the loss.
    """
    if max_input_length is None:
        max_input_length = encoder_max_length
    if max_target_length is None:
        max_target_length = decoder_max_length

    inputs = tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=max_input_length)
    outputs = tokenizer(batch["Title"], padding="max_length", truncation=True, max_length=max_target_length)

    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]
    # Mask the loss on padding: a fresh list is built, so the tokenizer output is untouched.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_id else token for token in title_ids]
        for title_ids in outputs["input_ids"]
    ]
    return batch

In [None]:
def _tokenize_split(split, split_tokenizer):
    """Tokenize one dataset split and expose the model columns as torch tensors.

    The raw text columns are dropped once they have been encoded.
    """
    encoded = split.map(
        lambda batch: process_data_to_model_inputs(batch, split_tokenizer),
        batched=True,
        batch_size=batch_size,
        remove_columns=["Title", "Abstract", "Content", "Category"],  # drop the raw columns
    )
    encoded.set_format(
        type="torch", columns=["input_ids", "attention_mask", "labels"],
    )
    return encoded


# Training split, encoded with the training tokenizer
train_data_batch = _tokenize_split(train_data, tokenizer_train)

# Validation split, encoded with the validation tokenizer
val_data_batch = _tokenize_split(val_data, tokenizer_val)

Map:   0%|          | 0/48000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
batch_size = 16
# Short checkpoint name (text after the last "/"); not referenced by the cells visible here.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    output_dir='/content/drive/MyDrive/training/',
    per_device_train_batch_size=batch_size,  # increase on a GPU with more memory
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=500,  # log every 500 steps
    save_steps=2000,  # save a checkpoint every 2000 steps
    # NOTE(review): no evaluation strategy is set and the training log shows only
    # training loss, so eval_steps appears to have no effect - confirm whether
    # periodic evaluation was intended.
    eval_steps=2000,
    warmup_steps=2000,  # warm up over the first 2000 steps
    num_train_epochs=5,
    learning_rate=2e-5,
    overwrite_output_dir=True,
    save_total_limit=10,  # keep at most 10 checkpoints
    fp16=True,  # mixed-precision training
    # Disable logging integrations explicitly instead of relying only on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
import torch
# Confirm that a CUDA device is visible before starting training.
print(torch.cuda.is_available())

True


In [None]:
# Wire the tied encoder-decoder model, the training arguments, both tokenized
# splits and the metric function into the trainer, then start fine-tuning.
trainer = Seq2SeqTrainer(
    model=shared,
    args=args,
    train_dataset=train_data_batch,
    eval_dataset=val_data_batch,
    compute_metrics=compute_metrics,
)
trainer.train()

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss
500,9.0019
1000,5.7672
1500,4.9248
2000,4.5853
2500,4.3087


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss
500,9.0019
1000,5.7672
1500,4.9248
2000,4.5853
2500,4.3087
3000,4.1513
3500,3.788
4000,3.6849
4500,3.6163
5000,3.5498


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)
  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


TrainOutput(global_step=15000, training_loss=3.452384236653646, metrics={'train_runtime': 5847.5617, 'train_samples_per_second': 41.043, 'train_steps_per_second': 2.565, 'total_flos': 4.227178991616e+16, 'train_loss': 3.452384236653646, 'epoch': 5.0})

In [None]:
# Grab the trainer's state object and print it under a header line.
training_stats = trainer.state

print("🔥🔥🔥:", training_stats, sep="\n")

In [None]:
import os

import torch
from transformers import RobertaTokenizer, EncoderDecoderModel, AutoTokenizer

output_dir = '/content/drive/MyDrive/training/'
tokenizer_test = AutoTokenizer.from_pretrained('/content/drive/MyDrive/data/tokenizer/test/', use_fast=False)
# Reload the fine-tuned weights from the last checkpoint. os.path.join avoids the
# doubled "/" that plain concatenation produced with the trailing slash above.
model = EncoderDecoderModel.from_pretrained(
    os.path.join(output_dir, "checkpoint-15000"), tie_encoder_decoder=True
)
# Use the GPU when there is one, otherwise stay on CPU instead of failing.
model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
!pip install transformers datasets evaluate bert-score

In [None]:
import pandas as pd
# Import dataset from huggingface
from datasets import Dataset
from evaluate import load
import numpy as np
from bert_score import score

In [None]:
test_df = pd.read_csv('/content/drive/MyDrive/data/splitting/test.csv')

# Drop rows without a Title (the reference), then build the model input as
# Category + " " + Abstract + " " + Content with missing values as "".
test_df = test_df.dropna(subset=["Title"])
test_df = test_df.assign(
    input_text=(
        test_df['Category'].fillna("")
        + " "
        + test_df["Abstract"].fillna("")
        + " "
        + test_df["Content"].fillna("")
    )
)

# Drop URL (Category is kept) and reset the index so positional lookups such as
# `references[0]` stay valid and no stray index column reaches the Dataset.
test_df = test_df.drop(columns=['URL']).reset_index(drop=True)

# Convert to a Hugging Face dataset
test_data = Dataset.from_pandas(test_df)

test_data

Dataset({
    features: ['Title', 'Abstract', 'Content', 'Category', 'input_text'],
    num_rows: 6000
})

In [None]:
!pip install rouge_score

In [None]:
# The notebook only does `from evaluate import load`, so the module name itself
# must be imported before `evaluate.load` can be called.
import evaluate

batch_size = 16
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# Tokenize the test split with the test tokenizer and expose tensors for the model.
test_data_batch = test_data.map(
    lambda batch: process_data_to_model_inputs(batch, tokenizer_test),
    batched=True,
    batch_size=batch_size,
    remove_columns=["Title", "Abstract", "Content", "Category"],  # drop the raw columns
)
test_data_batch.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"],
)

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
# `test_result` was never defined anywhere in the notebook, so this cell failed on
# a fresh kernel. It is rebuilt here by generating a title for every test article.
# NOTE(review): reconstruction of a missing step - confirm it matches how the
# reported predictions were originally produced.
def _generate_titles(batch):
    """Add a "predicted_title" column generated by the fine-tuned model."""
    encoded = tokenizer_test(
        batch["input_text"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    output_ids = model.generate(
        encoded.input_ids.to(model.device),
        attention_mask=encoded.attention_mask.to(model.device),
    )
    batch["predicted_title"] = tokenizer_test.batch_decode(output_ids, skip_special_tokens=True)
    return batch


test_result = test_data.map(_generate_titles, batched=True, batch_size=batch_size)

# Reference titles and predicted titles
references = test_df["Title"]
predictions = test_result["predicted_title"]

In [None]:
# Compute ROUGE. A whitespace tokenizer is passed because the default one keeps
# only ASCII letters/digits, which splits Vietnamese words at every accented
# character; the English stemmer option is dropped for the same reason.
rouge_scores = rouge.compute(
    predictions=predictions,
    references=references,
    tokenizer=lambda text: text.split(),
)
print("ROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge1']}")
print(f"ROUGE-2: {rouge_scores['rouge2']}")
print(f"ROUGE-L: {rouge_scores['rougeL']}")
print(f"ROUGE-LSUM: {rouge_scores['rougeLsum']}")

# Compute BERTScore (per-sample values, averaged for display)
bert_scores = bertscore.compute(predictions=predictions, references=references, lang="vi")
print("\nBERTScore:")
print(f"Precision: {sum(bert_scores['precision']) / len(bert_scores['precision']):.4f}")
print(f"Recall: {sum(bert_scores['recall']) / len(bert_scores['recall']):.4f}")
print(f"F1: {sum(bert_scores['f1']) / len(bert_scores['f1']):.4f}")

ROUGE Scores:
ROUGE-1: 0.47417685514167707
ROUGE-2: 0.23544511057228462
ROUGE-L: 0.3842766715838296
ROUGE-LSUM: 0.38416691497546707


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]


BERTScore:
Precision: 0.7693
Recall: 0.7551
F1: 0.7617


In [None]:
# Show the first reference title next to the model's prediction, one item per line.
print("- Thực tế:", references[0], "- Dự đoán:", predictions[0], sep="\n")

- Thực tế:
htc công_bố smartphone tầm_trung mới
- Dự đoán:
htc trình làng mẫu tablet mới


In [None]:
# trainer.train(resume_from_checkpoint='/content/drive/MyDrive/training/checkpoint-<last_checkpoint>')