#### T5-SMALL - TASK: TEXT SUMMARIZATION 

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"


In [None]:
import warnings
warnings.filterwarnings('ignore')


32512

In [4]:
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, T5ForConditionalGeneration
from tqdm.auto import tqdm
import json
import os


In [5]:
raw_datasets = load_dataset("cnn_dailymail", "3.0.0")


In [6]:
tokenizer_path = "./checkpoint/tokenizer"
tokenized_dataset_path = "./checkpoint/tokenized_dataset"

if os.path.exists(tokenizer_path):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
else:
    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    tokenizer.save_pretrained(tokenizer_path)


#### Data Preprocessing

In [7]:
prefix = "summarize: "

def preprocess(example):
    inputs = [prefix + doc for doc in example["article"]]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors=None
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["highlights"],
            max_length=128,
            padding="max_length",
            truncation=True,
            return_tensors=None
        )

    # Replace pad_token_id (typically 0) with -100 to ignore padding in loss
    labels["input_ids"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


if os.path.exists(tokenized_dataset_path):
    # print("Loading tokenized dataset from checkpoint...")
    tokenized_datasets = load_from_disk(tokenized_dataset_path)
else:
    tokenized_datasets = raw_datasets.map(preprocess, batched=True, remove_columns=["article", "highlights", "id"])
    tokenized_datasets.save_to_disk(tokenized_dataset_path)

# train_data = tokenized_datasets["train"]
# eval_data = tokenized_datasets["validation"]
train_data = tokenized_datasets["train"]
eval_data = tokenized_datasets["validation"]



Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/287113 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/13368 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
import evaluate

def get_trainer(model, output_dir):
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        predict_with_generate=True,
        num_train_epochs=20,
        learning_rate=3e-5,
        weight_decay=0.01,
        logging_dir=f'{output_dir}/logs',
        logging_steps=10,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="rougeL",
        generation_max_length=128,       # <-- set appropriate max length
        generation_num_beams=4,  
        report_to="none",
        local_rank=-1,  
    )
    rouge = evaluate.load("rouge")

    import torch
    import numpy as np
    from torch.nn.parallel import DataParallel

    def compute_metrics(eval_pred):
        preds, labels = eval_pred

        # Handle tuple output (common in Hugging Face models)
        if isinstance(preds, tuple):
            preds = preds[0]

        # Ensure tensors are on CPU and handle multi-GPU gathering
        if isinstance(preds, torch.Tensor):
            if torch.cuda.device_count() > 1:
                # If using DataParallel, ensure proper gathering
                preds = preds if preds.dim() > 0 else preds.unsqueeze(0)
            preds = preds.cpu().numpy()
        if isinstance(labels, torch.Tensor):
            if torch.cuda.device_count() > 1:
                labels = labels if labels.dim() > 0 else labels.unsqueeze(0)
            labels = labels.cpu().numpy()

        # Convert to lists
        preds = preds.tolist() if isinstance(preds, np.ndarray) else preds
        labels = labels.tolist() if isinstance(labels, np.ndarray) else labels

        # Ensure token IDs are within valid range
        vocab_size = tokenizer.vocab_size
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id  # Fallback if pad_token_id is None
        preds = [
            [token if token != -100 else tokenizer.pad_token_id for token in seq]
            for seq in preds
        ]
        labels = [
            [token if token != -100 else tokenizer.pad_token_id for token in seq]
            for seq in labels
        ]

        # Clamp token IDs to valid range [0, vocab_size - 1]
        preds = [[int(min(max(token, 0), vocab_size - 1)) for token in seq] for seq in preds]
        labels = [[int(min(max(token, 0), vocab_size - 1)) for token in seq] for seq in labels]

        try:
            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        except Exception as e:
            return {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

        # Compute ROUGE scores
        result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        
        return {
            "rouge1": round(result["rouge1"], 4),
            "rouge2": round(result["rouge2"], 4),
            "rougeL": round(result["rougeL"], 4),
        }



    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )
    return trainer


In [10]:
import os
os.system("your_command_here 2>/dev/null")

32512

### Prompt Tuning

In [11]:
# 1. Prompt Tuning
from peft import PromptTuningConfig, get_peft_model, TaskType

model_pt = T5ForConditionalGeneration.from_pretrained("t5-small")
model_pt.save_pretrained("./checkpoint_t5_small/t5-prompt-base")
prompt_config = PromptTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    num_virtual_tokens=20,
    tokenizer_name_or_path="t5-small"
)
model_pt = get_peft_model(model_pt, prompt_config)

trainer_pt = get_trainer(model_pt, "./checkpoint_t5_small/t5-prompt-tuning")
trainer_pt.train()
results_pt = trainer_pt.evaluate()
print("Prompt Tuning Results:", results_pt)


No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,2.669,2.529305,0.3748,0.1613,0.2577
2,2.5846,2.432097,0.3938,0.174,0.2715
3,2.5027,2.383007,0.3903,0.1722,0.2686
4,2.4568,2.346484,0.3854,0.1691,0.267


Prompt Tuning Results: {'eval_loss': 2.4320971965789795, 'eval_rouge1': 0.3938, 'eval_rouge2': 0.174, 'eval_rougeL': 0.2715, 'eval_runtime': 350.8431, 'eval_samples_per_second': 38.102, 'eval_steps_per_second': 0.596, 'epoch': 4.0}


### Layer Freezing 

In [12]:
# 2. Layer Freezing (freeze encoder)
model_lf = T5ForConditionalGeneration.from_pretrained("t5-base")
model_lf.save_pretrained("./checkpoint_t5_small/t5-layer-base")
for param in model_lf.encoder.parameters():
    param.requires_grad = False

trainer_lf = get_trainer(model_lf, "./checkpoint_t5_small/t5-layer-freeze")
trainer_lf.train()
results_lf = trainer_lf.evaluate()
print("Layer Freezing Results:", results_lf)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,1.5579,1.543,0.4357,0.2095,0.3049
2,1.56,1.534623,0.4345,0.2085,0.3048
3,1.5367,1.532105,0.4357,0.2095,0.3054
4,1.5207,1.529674,0.4358,0.2093,0.3054
5,1.5234,1.529449,0.4353,0.2089,0.305


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Layer Freezing Results: {'eval_loss': 1.5321046113967896, 'eval_rouge1': 0.4357, 'eval_rouge2': 0.2095, 'eval_rougeL': 0.3054, 'eval_runtime': 961.4272, 'eval_samples_per_second': 13.904, 'eval_steps_per_second': 0.217, 'epoch': 5.0}


### LoRA Fine-tuning

In [13]:
# 3. LoRA Fine-tuning
from peft import LoraConfig

model_lora = T5ForConditionalGeneration.from_pretrained("t5-small")
model_lora.save_pretrained("./checkpoint_t5_small/t5-lora-base")
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
model_lora = get_peft_model(model_lora, lora_config)

trainer_lora = get_trainer(model_lora, "./checkpoint_t5_small/t5-lora")
trainer_lora.train()
results_lora = trainer_lora.evaluate()
print("LoRA Results:", results_lora)


No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,2.0621,1.864986,0.4177,0.1933,0.2894
2,2.0845,1.853907,0.419,0.1944,0.2908
3,2.0517,1.85006,0.4189,0.1944,0.2907
4,2.0281,1.846388,0.4188,0.1943,0.2907


LoRA Results: {'eval_loss': 1.8539073467254639, 'eval_rouge1': 0.419, 'eval_rouge2': 0.1944, 'eval_rougeL': 0.2908, 'eval_runtime': 402.5113, 'eval_samples_per_second': 33.211, 'eval_steps_per_second': 0.519, 'epoch': 4.0}


### Summary Table

In [14]:
pip install tabulate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [15]:
from tabulate import tabulate
print(tabulate([
    ["Prompt Tuning", results_pt['eval_rouge1'], results_pt['eval_rouge2'], results_pt['eval_rougeL']],
    ["Layer Freezing", results_lf['eval_rouge1'], results_lf['eval_rouge2'], results_lf['eval_rougeL']],
    ["LoRA", results_lora['eval_rouge1'], results_lora['eval_rouge2'], results_lora['eval_rougeL']]
], headers=["Method", "ROUGE-1", "ROUGE-2", "ROUGE-L"]))


Method            ROUGE-1    ROUGE-2    ROUGE-L
--------------  ---------  ---------  ---------
Prompt Tuning      0.3938     0.174      0.2715
Layer Freezing     0.4357     0.2095     0.3054
LoRA               0.419      0.1944     0.2908


In [16]:
from transformers import pipeline

summarizer = pipeline("summarization", model=model_lora, tokenizer=tokenizer)
article = """
NASA's Perseverance rover has successfully collected samples from Mars that may contain signs of ancient microbial life. Scientists are now preparing to bring the samples back to Earth for further analysis, hoping to answer the age-old question of whether life ever existed on the red planet.
"""
summary = summarizer("summarize: " + article, max_length=128, min_length=30, do_sample=False)
print("\nExample Article:", article)
print("\nExample Summary:\n", summary[0]['summary_text'])


Device set to use cuda:0
Your max_length is set to 128, but your input_length is only 67. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Example Article: 
NASA's Perseverance rover has successfully collected samples from Mars that may contain signs of ancient microbial life. Scientists are now preparing to bring the samples back to Earth for further analysis, hoping to answer the age-old question of whether life ever existed on the red planet.


Example Summary:
 Perseverance rover has successfully collected samples from Mars that may contain signs of ancient microbial life . Scientists are now preparing to bring the samples back to Earth for further analysis .


#### CONCLUSION

We compared three fine-tuning methods for the T5-small model on text summarization, evaluated by ROUGE scores:

- **Layer Freezing**  
  - ROUGE-1: **0.4357**, ROUGE-2: **0.2095**, ROUGE-L: **0.3054**  
  - Achieved the best overall performance.  
  - Recommended when high-quality summarization is the main goal.

- **LoRA**  
  - ROUGE-1: **0.4190**, ROUGE-2: **0.1944**, ROUGE-L: **0.2908**  
  - Offers a strong balance between performance and efficiency.  
  - Suitable for multitask setups or environments with limited memory.

- **Prompt Tuning**  
  - ROUGE-1: **0.3938**, ROUGE-2: **0.1740**, ROUGE-L: **0.2715**  
  - Lowest performance among the three methods.  
  - Extremely parameter-efficient and useful for highly resource-constrained scenarios.

Choose the method based on your prioritiesâ€”quality (Layer Freezing), efficiency and scalability (LoRA), or parameter/resource constraints (Prompt Tuning).
