In [1]:
%%bash

pip install nltk
pip install datasets
pip install transformers[torch]
pip install tokenizers
pip install evaluate
pip install rouge_score
pip install sentencepiece
pip install huggingface_hub

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 491.5/491.5 kB 25.9 MB/s eta 0:00:00
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 10.3 MB/s eta 0:00:00
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 193.6/193.6 kB 19.8 MB/s eta 0:00:00
Do

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-runtime-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-runtime-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system 

In [2]:
# load the packages
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [65]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [70]:
# Acquire the training data from Hugging Face
dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")
train_dataset = dataset["train"].select(range(20000)) ## Select 20 k samples for training
val_dataset = dataset["validation"].select(range(2000)) ## Select 2k for validation
test_dataset = dataset["test"].select(range(1000)) ## Select 1k for testing

In [71]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [72]:
# We prefix our tasks with "answer the question"
prefix = "Give the summary of the article: "

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["article"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)

   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["highlights"],
                      max_length=512,
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [73]:
# Map the preprocessing function across our dataset
train_tokenized_dataset = train_dataset.map(preprocess_function, batched=True)
test_tokenized_dataset = test_dataset.map(preprocess_function, batched=True)
val_tokenized_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [74]:
# Load nltk for texts
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True) #
metric = evaluate.load("rouge")

In [75]:
# Define compute metrics funtion to get the rouge score while training and validation
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels, handling unknown tokens
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

In [80]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 4
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3
LOG_STEPS=100
import os
os.environ["WANDB_DISABLED"] = "true"

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./flant-t5-finetuned-cnn-dailymail_20k",
   eval_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   logging_steps=LOG_STEPS,
   predict_with_generate=True,
   push_to_hub=False,
   report_to="none"
)

In [81]:
# Define model trainer
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
  #  train_dataset=tokenized_dataset["train"],
  #  eval_dataset=tokenized_dataset["test"],
   train_dataset=train_tokenized_dataset,
   eval_dataset=val_tokenized_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [82]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.8035,2.606337,0.23326,0.087826,0.18841,0.215092
2,2.549,2.607017,0.234063,0.089764,0.188738,0.216568
3,2.4964,2.605431,0.234718,0.089318,0.18842,0.216778


TrainOutput(global_step=15000, training_loss=2.648099143473307, metrics={'train_runtime': 2098.1579, 'train_samples_per_second': 28.597, 'train_steps_per_second': 7.149, 'total_flos': 2788357570560000.0, 'train_loss': 2.648099143473307, 'epoch': 3.0})

In [84]:
# We see last epoch gives the best Rouge scores, so we will use this for the testing
last_checkpoint = "./flant-t5-finetuned-cnn-dailymail_20k/checkpoint-15000"
finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

In [85]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from evaluate import load  # or use evaluate if newer
import nltk

# Fine-tuned model
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("./flant-t5-finetuned-cnn-dailymail_20k/checkpoint-15000")
finetuned_tokenizer = AutoTokenizer.from_pretrained("./flant-t5-finetuned-cnn-dailymail_20k/checkpoint-15000")

# Original model
original_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
original_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")


In [87]:
# Define the function to generate model summaries

def generate_summaries(model, tokenizer, inputs, max_input_length=512, max_target_length=128):
    model.eval()
    inputs_tokenized = tokenizer(inputs, padding="max_length", truncation=True, max_length=max_input_length, return_tensors="pt")
    input_ids = inputs_tokenized["input_ids"].to(model.device)
    attention_mask = inputs_tokenized["attention_mask"].to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_target_length)

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)


In [88]:
# Generate summaries from the finetuned and original model
original_preds = generate_summaries(original_model, original_tokenizer, val_dataset["article"])
finetuned_preds = generate_summaries(finetuned_model, finetuned_tokenizer, val_dataset["article"])
references = val_dataset["highlights"]


In [89]:
# Compute rouge scores for original and finetuned model for all samples from val datasets
from evaluate import load  # Or `load_metric` if using older versions

rouge = load("rouge")

original_score = rouge.compute(predictions=original_preds, references=references)
finetuned_score = rouge.compute(predictions=finetuned_preds, references=references)


In [90]:
# Compare the model metrics between fine-tuned and original model
print("Original Model ROUGE Scores:")
for k, v in original_score.items():
    print(f"{k}: {v:.4f}")

print("\nFine-Tuned Model ROUGE Scores:")
for k, v in finetuned_score.items():
    print(f"{k}: {v:.4f}")


Original Model ROUGE Scores:
rouge1: 0.2345
rouge2: 0.0857
rougeL: 0.1787
rougeLsum: 0.2103

Fine-Tuned Model ROUGE Scores:
rouge1: 0.2699
rouge2: 0.0922
rougeL: 0.1984
rougeLsum: 0.2376


In [91]:
# Combine into a DataFrame
comparison_df = pd.DataFrame({
    "Metric": list(original_score.keys()),
    "Original_Model": list(original_score.values()),
    "FineTuned_Model": list(finetuned_score.values())
})

# Display the DataFrame
print(comparison_df)

# Save as CSV
comparison_df.to_csv("rouge_comparison.csv", index=False)

# Save as JSON
comparison_df.to_json("rouge_comparison.json", orient="records", lines=True)

      Metric  Original_Model  FineTuned_Model
0     rouge1        0.234540         0.269925
1     rouge2        0.085651         0.092203
2     rougeL        0.178660         0.198397
3  rougeLsum        0.210323         0.237573


In [92]:
# Download the model for reference

import shutil

# Path to your model checkpoint directory
model_dir = "./flant-t5-finetuned-cnn-dailymail_20k/checkpoint-15000"  # replace with your actual checkpoint

# Output zip file name
zip_file = "finetuned_model_cnn_news_20k"

# Zip the directory
shutil.make_archive("finetuned_model_cnn_news_20k", 'zip', model_dir)

'/content/finetuned_model_cnn_news_20k.zip'

In [93]:
from google.colab import files
files.download("finetuned_model_cnn_news_20k.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [94]:
# Generate master files of generated summaries from original and fine-tuned model
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from evaluate import load as load_metric
import nltk
nltk.download("punkt")


# Generate summaries
def generate(model, tokenizer, inputs, max_input_length=512, max_target_length=128):
    inputs_tokenized = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_input_length).to(model.device)
    with torch.no_grad():
        outputs = model.generate(input_ids=inputs_tokenized["input_ids"],
                                 attention_mask=inputs_tokenized["attention_mask"],
                                 max_length=max_target_length)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Compute ROUGE scores
rouge = load_metric("rouge")
original_scores = rouge.compute(predictions=original_preds, references=references, use_stemmer=True)
finetuned_scores = rouge.compute(predictions=finetuned_preds, references=references, use_stemmer=True)

# Build DataFrame
df = pd.DataFrame({
    "original_summary": original_preds,
    "finetuned_summary": finetuned_preds,
    "reference_summary": references
})

# Add summary-level scores
for metric, score in original_scores.items():
    df[f"original_{metric}"] = score
for metric, score in finetuned_scores.items():
    df[f"finetuned_{metric}"] = score

# Show first few rows
print(df.head())

# save to CSV
df.to_csv("model_comparison_finetune_20k_results.csv", index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                    original_summary  \
0                                   Zully Broussard.   
1  ESPN's MLS team has been a key player in the M...   
2                  Swansea re-signed him in January.   
3  Ryan Holmes scored a 71 to finish the day thre...   
4  Cayman Naib, and if you read this please know ...   

                                   finetuned_summary  \
0  The super swap works on a simple swapping prin...   
1  Phil Rawlins: MLS has a new generation of play...   
2  Bafetimbi Gomis has been under a great deal of...   
3  Rory McIlroy pulls his second shot on the eigh...   
4  NEW: Police say Cayman was last seen wearing a...   

                                   reference_summary  original_rouge1  \
0  Zully Broussard decided to give a kidney to a ...         0.243266   
1  The 20th MLS season begins this weekend .\nLea...         0.243266   
2  Bafetimbi Gomis collapses within 10 minutes of...         0.243266   
3  Rory McIlroy throws club into w