In [3]:
%pip install evaluate rouge_score

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
     AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)
import evaluate

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [5]:
# Paths
MODEL_DIR = "../models/summarizer"
CACHE_DIR = "../cache"

# Load processed summarization dataset
df = pd.read_csv('../data/processed/articles_clean.csv')

# Quick peek
df.head()

Unnamed: 0,Contents,Summary
0,"Chiều 15.6, Nguyễn Viết Hiển - Giám đốc Sở GDĐ...",công bố thi bảng ghi chuẩn duyệt tuyển sinh lớ...
1,Vòng kết giải bóng đá mini 2019 vòng 3 cụm 9 đ...,"29.6, Công đoàn Tổng công ty Lương thực Miền N..."
2,Vận hàng hóa đóng vai trò trọng yếu khâu phân ...,Dịch vụ vận hàng hóa đời đáp ứng nhu cầu vận t...
3,Báo Lao Động cập nhật lịch nghỉ Tết Nguyên đán...,"thềm 2021, tỉnh thành công bố lịch nghỉ Tết Ng..."
4,"23.2.2021, Công an huyện Tri Tôn (An Giang) bắ...",Công an huyện Tri Tôn bắt vụ giả dạng bắp nổ v...


In [6]:
#check if dataset still have na
print("Number of samples:", len(df))
print(df.isna().sum())

Number of samples: 43943
Contents    0
Summary     0
dtype: int64


In [6]:
# train/validation split
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

#convert it into huggingface dataset for finetune
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "validation": Dataset.from_pandas(val_df.reset_index(drop=True))
})

dataset

DatasetDict({
    train: Dataset({
        features: ['Contents', 'Summary'],
        num_rows: 39548
    })
    validation: Dataset({
        features: ['Contents', 'Summary'],
        num_rows: 4395
    })
})

In [7]:
torch.cuda.empty_cache()
# Load tokenizer + model
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base", cache_dir=CACHE_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-base", cache_dir=CACHE_DIR).to('cuda')

In [9]:
# Tokenization function
MAX_INPUT = 512
MAX_TARGET = 128

def preprocess(batch):
    inputs = tokenizer(
        batch["Contents"], max_length=MAX_INPUT, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        batch["Summary"], max_length=MAX_TARGET, truncation=True, padding="max_length"
    )
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_ds = dataset.map(preprocess, batched=True, remove_columns=["Contents", "Summary"])

Map:   0%|          | 0/39548 [00:00<?, ? examples/s]

Map:   0%|          | 0/4395 [00:00<?, ? examples/s]

In [10]:
# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Metric (ROUGE)
rouge = evaluate.load("rouge")
import numpy as np
import gc
import torch
gc.collect()
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    print(f"Type of predictions: {type(predictions)}")
    print(f"Shape of predictions: {predictions.shape if hasattr(predictions, 'shape') else 'N/A'}")
    print(f"Type of labels: {type(labels)}")
    print(f"Shape of labels: {labels.shape if hasattr(labels, 'shape') else 'N/A'}")

    # Extract predictions if it's a tuple
    if isinstance(predictions, tuple):
        predictions = predictions[0]
        print(f"Extracted predictions from tuple. Type: {type(predictions)}, Shape: {predictions.shape if hasattr(predictions, 'shape') else 'N/A'}")

    # Apply argmax to convert logits to predicted token IDs
    # Ensure predictions is a numpy array before applying argmax
    if not isinstance(predictions, np.ndarray):
        predictions = np.array(predictions)

    # Check if predictions is 3D (logits) and apply argmax
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)
        print(f"Applied argmax to predictions. New Shape: {predictions.shape}")

    # Replace ignored tokens (-100) with pad_token_id so decoding works
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace (ROUGE is sensitive to it)
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Compute ROUGE scores
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Debug: Check what the ROUGE result looks like
    print(f"ROUGE result type: {type(result)}")
    print(f"ROUGE result keys: {result.keys()}")
    for key, value in result.items():
        print(f"Key: {key}, Value type: {type(value)}, Value: {value}")

    # Handle different ROUGE output formats
    processed_result = {}
    for key, value in result.items():
        if hasattr(value, 'mid') and hasattr(value.mid, 'fmeasure'):
            # Old format: value.mid.fmeasure
            processed_result[key] = round(value.mid.fmeasure * 100, 2)
        elif isinstance(value, (float, np.floating)):
            # New format: direct float value
            processed_result[key] = round(value * 100, 2)
        else:
            # Fallback: try to extract the value
            try:
                processed_result[key] = round(float(value) * 100, 2)
            except (ValueError, TypeError):
                processed_result[key] = value

    return processed_result

In [17]:
# Training args
training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    num_train_epochs=3,
    learning_rate=0.00004,
    warmup_steps=500,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    bf16=True,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    eval_accumulation_steps=4,
    torch_empty_cache_steps=100,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="rouge1"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"].select(range(1000)),
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [19]:
# Train
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
500,3.5001,0.640229,70.74,45.47,61.81,61.83
1000,0.6493,0.602712,71.8,47.05,63.07,63.08
1500,0.6124,0.591448,71.96,47.45,63.41,63.42


Type of predictions: <class 'tuple'>
Shape of predictions: N/A
Type of labels: <class 'numpy.ndarray'>
Shape of labels: (1000, 128)
Extracted predictions from tuple. Type: <class 'numpy.ndarray'>, Shape: (1000, 128, 36096)
Applied argmax to predictions. New Shape: (1000, 128)
ROUGE result type: <class 'dict'>
ROUGE result keys: dict_keys(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])
Key: rouge1, Value type: <class 'numpy.float64'>, Value: 0.7073804365908857
Key: rouge2, Value type: <class 'numpy.float64'>, Value: 0.4547128073587927
Key: rougeL, Value type: <class 'numpy.float64'>, Value: 0.6180666347677285
Key: rougeLsum, Value type: <class 'numpy.float64'>, Value: 0.6182504634676119
Type of predictions: <class 'tuple'>
Shape of predictions: N/A
Type of labels: <class 'numpy.ndarray'>
Shape of labels: (1000, 128)
Extracted predictions from tuple. Type: <class 'numpy.ndarray'>, Shape: (1000, 128, 36096)
Applied argmax to predictions. New Shape: (1000, 128)
ROUGE result type: <class 'dict

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1854, training_loss=1.3968556145891273, metrics={'train_runtime': 1971.5355, 'train_samples_per_second': 60.178, 'train_steps_per_second': 0.94, 'total_flos': 7.224919996760064e+16, 'train_loss': 1.3968556145891273, 'epoch': 3.0})

In [20]:
metrics = trainer.evaluate()
print(metrics)

Type of predictions: <class 'tuple'>
Shape of predictions: N/A
Type of labels: <class 'numpy.ndarray'>
Shape of labels: (1000, 128)
Extracted predictions from tuple. Type: <class 'numpy.ndarray'>, Shape: (1000, 128, 36096)
Applied argmax to predictions. New Shape: (1000, 128)
ROUGE result type: <class 'dict'>
ROUGE result keys: dict_keys(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])
Key: rouge1, Value type: <class 'numpy.float64'>, Value: 0.719595889935803
Key: rouge2, Value type: <class 'numpy.float64'>, Value: 0.47445729122065916
Key: rougeL, Value type: <class 'numpy.float64'>, Value: 0.6340633509470275
Key: rougeLsum, Value type: <class 'numpy.float64'>, Value: 0.6342037876385411
{'eval_loss': 0.5914480090141296, 'eval_rouge1': 71.96, 'eval_rouge2': 47.45, 'eval_rougeL': 63.41, 'eval_rougeLsum': 63.42, 'eval_runtime': 81.8932, 'eval_samples_per_second': 12.211, 'eval_steps_per_second': 0.769, 'epoch': 3.0}


In [None]:
def generate_summary(text, max_length=128, num_beams=4, temperature=0.7):
    """
    Generate a summary for given text using the fine-tuned model
    
    Args:
        text (str): Input text to summarize
        max_length (int): Maximum length of generated summary
        num_beams (int): Number of beams for beam search
        temperature (float): Randomness in generation (0.0-1.0)
    
    Returns:
        str: Generated summary
    
    Raises:
        ValueError: If input parameters are invalid
    """
    # Input validation
    if not isinstance(text, str) or not text.strip():
        raise ValueError("Input text must be a non-empty string")
    if not 0.0 <= temperature <= 1.0:
        raise ValueError("Temperature must be between 0.0 and 1.0")
    
    # Clean input text
    text = clean_vietnamese_text(text)
    
    # Prepare input text
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        max_length=512
    ).to(model.device)
    
    # Generate summary with improved parameters
    summary_ids = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams,
        temperature=temperature,
        length_penalty=1.5,  # Adjusted for better length control
        no_repeat_ngram_size=3,
        repetition_penalty=2.5,  # Added to reduce repetition
        early_stopping=True,
        do_sample=True  # Enable sampling for more natural output
    )
    
    # Decode and clean summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summary = clean_vietnamese_text(summary)
    
    return summary

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated Summary: *.+___.Z.[ nhầm màu] cho tôi.! nhầm màu cho tôi>Tôi nóij thì nói bán lại.Tôi nói tôi không bán lại cho Shopee thì nói không bán.Tôi muốn Shopee giao hàng lại cho tôi thì nói.Tôi không muốn bán lại thì nói& tôi nói tôi chặn thì nóiW thì nóiỴ thì nóiẼ thì nóiỠ thì nóiẪẲ choẰ thì nóiÕTôiẺ tôiẶ nhầm màuỖ nhầm màu.Tôi giao nhầm màuẸỸ thì nóiÈỮỮ màu Xám mà giao hàng nhầm màuỰTôi giaoỢỬTôi đặt màu X
