In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import transformers
print(transformers.__version__)


4.44.2


In [3]:
import torch
print(torch.__version__)

2.9.0+cu128


In [4]:
model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-base')
tokenizer = T5Tokenizer.from_pretrained(
    'google/flan-t5-base',
    skip_chat_template=True
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
input_text = "summarize: Parents face video game lessons Ways of ensuring that parents know which video games are suitable for children are to be considered by the games industry.The issue was discussed at a meeting between UK government officials, industry representatives and the British Board of Film Classification. It follows concerns that children may be playing games aimed at adults which include high levels of violence. In 2003, Britons spent £1,152m on games, more than ever before. And this Christmas, parents are expected to spend millions on video games and consoles."
inputs = tokenizer(input_text, return_tensors='pt')

output = model.generate(**inputs, max_length=50)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

print(summary)

Parents are being asked to consider whether video games are suitable for children.


## Read and combine the dataset

In [6]:
import os
import pandas as pd


base_path = "BBC News Summary" 

articles_dir = os.path.join(base_path, "News Articles")
summaries_dir = os.path.join(base_path, "Summaries")

data = []

# Loop through categories (business, politics, sport, etc.)
for category in os.listdir(articles_dir):
    article_cat_dir = os.path.join(articles_dir, category)
    summary_cat_dir = os.path.join(summaries_dir, category)
    
    for fname in os.listdir(article_cat_dir):
        article_path = os.path.join(article_cat_dir, fname)
        summary_path = os.path.join(summary_cat_dir, fname)
        
        if os.path.exists(article_path) and os.path.exists(summary_path):
            
            with open(article_path, "r", encoding="utf-8", errors="replace") as f:
                article = f.read().strip()
            with open(summary_path, "r", encoding="utf-8", errors="replace") as f:
                summary = f.read().strip()
            
            data.append({
                "category": category,
                "article": article,
                "summary": summary
            })


df = pd.DataFrame(data)


print(f"Total samples: {len(df)}")
print("Categories:", df['category'].unique())
print(df.head())


Total samples: 2225
Categories: ['business' 'entertainment' 'politics' 'sport' 'tech']
   category                                            article  \
0  business  Ad sales boost Time Warner profit\n\nQuarterly...   
1  business  Dollar gains on Greenspan speech\n\nThe dollar...   
2  business  Yukos unit buyer faces loan claim\n\nThe owner...   
3  business  High fuel prices hit BA's profits\n\nBritish A...   
4  business  Pernod takeover talk lifts Domecq\n\nShares in...   

                                             summary  
0  TimeWarner said fourth quarter sales rose 2% t...  
1  The dollar has hit its highest level against t...  
2  Yukos' owner Menatep Group says it will ask Ro...  
3  Rod Eddington, BA's chief executive, said the ...  
4  Pernod has reduced the debt it took on to fund...  


## Clean and inspect text

In [7]:
import re

def clean_text(text):
    text = re.sub(r'\s+', ' ', text) 
    text = re.sub(r'([.,!?;:])', r' \1 ', text)  
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

df["article"] = df["article"].apply(clean_text)
df["summary"] = df["summary"].apply(clean_text)

print("Average article length:", df["article"].apply(lambda x: len(x.split())).mean())
print("Average summary length:", df["summary"].apply(lambda x: len(x.split())).mean())

df.sample(5)


Average article length: 425.78426966292136
Average summary length: 189.6485393258427


Unnamed: 0,category,article,summary
1042,politics,Labour accused of broken pledge Labour has alr...,Home Secretary Charles Clarke has been quoted ...
933,politics,Borders rail link campaign rally Campaigners a...,"Anne Borthwick , of Campaign for Borders Rail ..."
1217,politics,Parties build up poll war chests The Labour Pa...,Registered political parties are required to s...
1991,tech,A decade of good website design The web looks ...,Dr Nielsen said the success of sites such as G...
1693,sport,England claim Dubai Sevens glory England beat ...,England beat Fiji 26-21 in a dramatic final in...


## Split into train, validation, test sets

In [8]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


Train: 1801, Val: 201, Test: 223


## Save cleaned data for later stages

In [9]:
train_df.to_csv("data/train.csv", index=False)
val_df.to_csv("data/val.csv", index=False)
test_df.to_csv("data/test.csv", index=False)


## Quick sanity check

In [10]:
for i in range(3):
    print(f"\nCATEGORY: {train_df.iloc[i]['category']}")
    print("ARTICLE:", train_df.iloc[i]['article'][:500], "...")
    print("SUMMARY:", train_df.iloc[i]['summary'])


CATEGORY: tech
ARTICLE: Norway upholds 'Napster' ruling A Norwegian student who ran a website which linked to downloadable MP3 files has been ordered to pay compensation by the country's Supreme Court . Frank Allan Bruvik was ordered to pay 100 , 000 kroner (£8 , 000) to the music industry in Norway . He was a student when he set up his napster . no site , which allowed users to submit and receive links to MP3 files . Bruvik had earlier been cleared on appeal after a lower court had found for the music industry . Music ...
SUMMARY: Frank Allan Bruvik was ordered to pay 100 , 000 kroner (£8 , 000) to the music industry in Norway . Norway's music industry said it was satisfied with the ruling , because showed that music piracy would not be accepted . A Norwegian court ruled in 2003 that Bruvik would have to pay 100 , 000 kroner to the music industry , but the country's Court of Appeal cleared him , saying that the copyright violation occurred when others posted the music . Bruvik's site

## Prepare for tokenization (T5/BART) 

In [11]:
from transformers import AutoTokenizer


model_name_t5 = "google/flan-t5-base"
model_name_bart = "facebook/bart-base"

tokenizer_t5 = AutoTokenizer.from_pretrained(model_name_t5)
tokenizer_bart = AutoTokenizer.from_pretrained(model_name_bart)

# Example tokenization test
sample = train_df.iloc[0]["article"]
inputs = tokenizer_t5("summarize: " + sample, max_length=512, truncation=True, return_tensors="pt")
print(inputs["input_ids"].shape)




torch.Size([1, 512])


## Tokenize and prepare Hugging Face Datasets

In [12]:
from datasets import Dataset
import pandas as pd
#Load CSVs into Hugging Face Datasets

train_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/val.csv")
test_df = pd.read_csv("data/test.csv")

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

# Define tokenization function

def tokenize_function_t5(batch):
    inputs = ["summarize: " + text for text in batch["article"]]
    model_inputs = tokenizer_t5(inputs, max_length=512, truncation=True)
    labels = tokenizer_t5(batch["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def tokenize_function_bart(batch):
    inputs = batch["article"]
    model_inputs = tokenizer_bart(inputs, max_length=512, truncation=True)
    labels = tokenizer_bart(batch["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

#  Apply tokenization

tokenized_train_t5 = train_ds.map(tokenize_function_t5, batched=True)
tokenized_val_t5 = val_ds.map(tokenize_function_t5, batched=True)
tokenized_test_t5 = test_ds.map(tokenize_function_t5, batched=True)

tokenized_train_bart = train_ds.map(tokenize_function_bart, batched=True)
tokenized_val_bart = val_ds.map(tokenize_function_bart, batched=True)
tokenized_test_bart = test_ds.map(tokenize_function_bart, batched=True)

# Quick sanity check

print("T5 sample input_ids:", tokenized_train_t5[0]["input_ids"][:10])
print("BART sample input_ids:", tokenized_train_bart[0]["input_ids"][:10])


Map: 100%|██████████| 1801/1801 [00:00<00:00, 2227.27 examples/s]
Map: 100%|██████████| 201/201 [00:00<00:00, 2777.13 examples/s]
Map: 100%|██████████| 223/223 [00:00<00:00, 2752.43 examples/s]
Map: 100%|██████████| 1801/1801 [00:00<00:00, 2927.58 examples/s]
Map: 100%|██████████| 201/201 [00:00<00:00, 2991.25 examples/s]
Map: 100%|██████████| 223/223 [00:00<00:00, 2593.14 examples/s]


T5 sample input_ids: [21603, 10, 16491, 95, 6134, 7, 3, 31, 567, 9]
BART sample input_ids: [0, 29723, 1970, 16060, 29, 128, 37549, 3121, 108, 2255]


## Fine-tuning setup with LoRA for T5 and BART

In [14]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model
import torch


#  Load base models

model_t5 = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model_bart = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")


# Define LoRA configuration

lora_config_t5 = LoraConfig(
    r=16,              # rank (usually 8–32)
    lora_alpha=32,     # scaling factor
    target_modules=["q", "v"],  # which layers to adapt (for seq2seq models)
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

lora_config_bart = LoraConfig(
    r=16,              # rank (usually 8–32)
    lora_alpha=32,     # scaling factor
    target_modules=["q_proj", "v_proj"],  # which layers to adapt (for seq2seq models)
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

#  Wrap models with LoRA adapters

model_t5 = get_peft_model(model_t5, lora_config_t5)
model_bart = get_peft_model(model_bart, lora_config_bart)

#  Training arguments

training_args_t5 = Seq2SeqTrainingArguments(
    output_dir="./results_t5",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs_t5",
    logging_strategy="steps",
    logging_steps=100,
    fp16=torch.cuda.is_available(),  
)

training_args_bart = Seq2SeqTrainingArguments(
    output_dir="./results_bart",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs_bart",
    logging_strategy="steps",
    logging_steps=100,
    fp16=torch.cuda.is_available(),
)


#  Data collators

data_collator_t5 = DataCollatorForSeq2Seq(tokenizer_t5, model=model_t5)
data_collator_bart = DataCollatorForSeq2Seq(tokenizer_bart, model=model_bart)


#  Define trainers

trainer_t5 = Seq2SeqTrainer(
    model=model_t5,
    args=training_args_t5,
    train_dataset=tokenized_train_t5,
    eval_dataset=tokenized_val_t5,
    tokenizer=tokenizer_t5,
    data_collator=data_collator_t5,
)

trainer_bart = Seq2SeqTrainer(
    model=model_bart,
    args=training_args_bart,
    train_dataset=tokenized_train_bart,
    eval_dataset=tokenized_val_bart,
    tokenizer=tokenizer_bart,
    data_collator=data_collator_bart,
)


#  Print trainable parameters 

def print_trainable_params(model):
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")

print("T5:")
print_trainable_params(model_t5)
print("BART:")
print_trainable_params(model_bart)




T5:
Trainable params: 1,769,472 / 249,347,328 (0.71%)
BART:
Trainable params: 884,736 / 140,305,152 (0.63%)


## Train and Evaluate LoRA models

In [19]:
import evaluate
import numpy as np
import torch


# Load ROUGE metric

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    # Convert to numpy arrays
    preds = np.array(preds)
    labels = np.array(labels)

    # Replace -100 in labels with pad_token_id to make decoding safe
    labels = np.where(labels == -100, tokenizer_t5.pad_token_id, labels)

    # Decode predictions and labels safely
    decoded_preds = tokenizer_t5.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer_t5.batch_decode(labels, skip_special_tokens=True)

    # Post-process: clean up spacing and ensure valid strings
    decoded_preds = ["\n".join(pred.strip().split(". ")) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip().split(". ")) for label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value * 100 for key, value in result.items()}

    prediction_lens = [np.count_nonzero(pred != tokenizer_t5.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}



#  Attach metrics to trainer

trainer_t5.compute_metrics = compute_metrics
trainer_bart.compute_metrics = compute_metrics


#  Train models 

print("Fine-tuning T5...")
trainer_t5.train()

print("Fine-tuning BART...")
trainer_bart.train()


# Evaluate on validation set

print("Evaluating T5...")
metrics_t5 = trainer_t5.evaluate()
print(metrics_t5)

print("Evaluating BART...")
metrics_bart = trainer_bart.evaluate()
print(metrics_bart)


#  Generate sample summaries

def generate_summary(model, tokenizer, text, max_length=128):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_length, num_beams=4)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

sample_text = test_df.iloc[0]["article"]

print("\n📰 Original article:\n", sample_text[:500], "...")
print("\n🧾 Reference summary:\n", test_df.iloc[0]["summary"])
print("\n🤖 T5 summary:\n", generate_summary(model_t5, tokenizer_t5, "summarize: " + sample_text))
print("\n🤖 BART summary:\n", generate_summary(model_bart, tokenizer_bart, sample_text))


Fine-tuning T5...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.0,,18.3317,10.8636,15.9043,17.4978,18.3284
2,0.0,,18.3317,10.8636,15.9043,17.4978,18.3284
3,0.0,,18.3317,10.8636,15.9043,17.4978,18.3284




Fine-tuning BART...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.6001,0.399232,17.5666,14.4048,17.097,17.3823,19.0
2,0.5477,0.37404,18.4097,15.4784,17.9245,18.2741,19.0
3,0.5514,0.361583,18.9312,15.9855,18.3655,18.7701,19.0




Evaluating T5...


{'eval_loss': nan, 'eval_rouge1': 18.3317, 'eval_rouge2': 10.8636, 'eval_rougeL': 15.9043, 'eval_rougeLsum': 17.4978, 'eval_gen_len': 18.3284, 'eval_runtime': 65.9953, 'eval_samples_per_second': 3.046, 'eval_steps_per_second': 0.773, 'epoch': 3.0}
Evaluating BART...


{'eval_loss': 0.36158257722854614, 'eval_rouge1': 18.9312, 'eval_rouge2': 15.9855, 'eval_rougeL': 18.3655, 'eval_rougeLsum': 18.7701, 'eval_gen_len': 19.0, 'eval_runtime': 40.4132, 'eval_samples_per_second': 4.974, 'eval_steps_per_second': 1.262, 'epoch': 3.0}

📰 Original article:
 UK house prices dip in November UK house prices dipped slightly in November , the Office of the Deputy Prime Minister (ODPM) has said . The average house price fell marginally to £180 , 226 , from £180 , 444 in October . Recent evidence has suggested that the UK housing market is slowing after interest rate increases , and economists forecast a drop in prices during 2005 . But while the monthly figures may hint at a cooling of the market , annual house price inflation is still strong , up 13 . 8 ...

🧾 Reference summary:
 All areas saw a rise in annual house price inflation in November except for Northern Ireland and the West Midlands , where the rate was unchanged , the ODPM said . It said annual inflation 