In [2]:
%%bash

pip install nltk
pip install datasets
pip install transformers[torch]
pip install tokenizers
pip install evaluate
pip install rouge_score
pip install sentencepiece
pip install huggingface_hub



In [3]:
!pip install --upgrade datasets



In [7]:
# load the packages
import nltk
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [5]:
# Load the tokenizer, model, and data collator
MODEL_NAME = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
# Acquire the training data from Hugging Face
dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")
train_dataset = dataset["train"].select(range(50000)).shuffle(seed=20)
val_dataset = dataset["validation"].select(range(5000)).shuffle(seed=20)
test_dataset = dataset["test"].select(range(5000)).shuffle(seed=20)

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [12]:
# We prefix our tasks with "answer the question"
prefix = "Give the summary of the article: "

# Define the preprocessing function

def preprocess_function(examples):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:
   inputs = [prefix + doc for doc in examples["article"]]
   model_inputs = tokenizer(inputs, max_length=128, truncation=True)

   # The "labels" are the tokenized outputs:
   labels = tokenizer(text_target=examples["highlights"],
                      max_length=512,
                      truncation=True)

   model_inputs["labels"] = labels["input_ids"]
   return model_inputs

In [10]:
# Map the preprocessing function across our dataset
train_tokenized_dataset = train_dataset.map(preprocess_function, batched=True)
test_tokenized_dataset = test_dataset.map(preprocess_function, batched=True)
val_tokenized_dataset = val_dataset.map(preprocess_function, batched=True)

NameError: name 'preprocess_function' is not defined

In [14]:
# Load nltk for texts
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True) #
metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [15]:
# Define compute metrics funtion to get the rouge score while training and validation
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # decode preds and labels, handling unknown tokens
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

In [18]:
# Global Parameters
L_RATE = 3e-4
BATCH_SIZE = 4
PER_DEVICE_EVAL_BATCH = 4
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3
NUM_EPOCHS = 3
LOG_STEPS=100
import os
os.environ["WANDB_DISABLED"] = "true"

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
   output_dir="./flant-t5-finetuned-cnn-dailymail_50k",
   eval_strategy="epoch",
   learning_rate=L_RATE,
   per_device_train_batch_size=BATCH_SIZE,
   per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
   weight_decay=WEIGHT_DECAY,
   save_total_limit=SAVE_TOTAL_LIM,
   num_train_epochs=NUM_EPOCHS,
   logging_steps=LOG_STEPS,
   predict_with_generate=True,
   push_to_hub=False,
   report_to="none"
)

In [19]:
# Define model trainer
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
  #  train_dataset=tokenized_dataset["train"],
  #  eval_dataset=tokenized_dataset["test"],
   train_dataset=train_tokenized_dataset,
   eval_dataset=val_tokenized_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [20]:
# Train the model
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.7601,2.489434,0.24479,0.099537,0.195291,0.227901
2,2.6042,2.486202,0.246003,0.101461,0.196036,0.229156
3,2.4983,2.47622,0.247507,0.101306,0.196453,0.230313


TrainOutput(global_step=37500, training_loss=2.6563773942057294, metrics={'train_runtime': 5237.199, 'train_samples_per_second': 28.641, 'train_steps_per_second': 7.16, 'total_flos': 6970893926400000.0, 'train_loss': 2.6563773942057294, 'epoch': 3.0})

In [5]:
# We see last epoch gives the best Rouge scores, so we will use this for the testing
last_checkpoint = "./flant-t5-finetuned-cnn-dailymail_50k/checkpoint-36500"
finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from evaluate import load  # or use evaluate if newer
import nltk

# Fine-tuned model
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("./flant-t5-finetuned-cnn-dailymail_50k/checkpoint-37500")
finetuned_tokenizer = AutoTokenizer.from_pretrained("./flant-t5-finetuned-cnn-dailymail_50k/checkpoint-37500")

# Original model
original_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
original_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
# Define the function to generate model summaries
import torch
def generate_summaries(model, tokenizer, inputs, max_input_length=512, max_target_length=128):
    model.eval()
    inputs_tokenized = tokenizer(inputs, padding="max_length", truncation=True, max_length=max_input_length, return_tensors="pt")
    input_ids = inputs_tokenized["input_ids"].to(model.device)
    attention_mask = inputs_tokenized["attention_mask"].to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_target_length)

    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)


In [None]:
# Generate summaries from the finetuned and original model
original_preds = generate_summaries(original_model, original_tokenizer, val_dataset["article"])
finetuned_preds = generate_summaries(finetuned_model, finetuned_tokenizer, val_dataset["article"])
references = val_dataset["highlights"]


In [None]:
# Compute rouge scores for original and finetuned model for all samples from val datasets
from evaluate import load  # Or `load_metric` if using older versions

rouge = load("rouge")

original_score = rouge.compute(predictions=original_preds, references=references)
finetuned_score = rouge.compute(predictions=finetuned_preds, references=references)


In [None]:
# Compare the model metrics between fine-tuned and original model
print("Original Model ROUGE Scores:")
for k, v in original_score.items():
    print(f"{k}: {v:.4f}")

print("\nFine-Tuned Model ROUGE Scores:")
for k, v in finetuned_score.items():
    print(f"{k}: {v:.4f}")


In [None]:
# Combine into a DataFrame
comparison_df = pd.DataFrame({
    "Metric": list(original_score.keys()),
    "Original_Model": list(original_score.values()),
    "FineTuned_Model": list(finetuned_score.values())
})

# Display the DataFrame
print(comparison_df)

# Save as CSV
comparison_df.to_csv("rouge_comparison.csv", index=False)

# Save as JSON
comparison_df.to_json("rouge_comparison.json", orient="records", lines=True)

In [None]:
# Download the model for reference

import shutil

# Path to your model checkpoint directory
model_dir = "./flant-t5-finetuned-cnn-dailymail_20k/checkpoint-15000"  # replace with your actual checkpoint

# Output zip file name
zip_file = "finetuned_model_cnn_news_20k"

# Zip the directory
shutil.make_archive("finetuned_model_cnn_news_20k", 'zip', model_dir)

'/content/finetuned_model_cnn_news_20k.zip'

In [None]:
from google.colab import files
files.download("finetuned_model_cnn_news_20k.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Generate master files of generated summaries from original and fine-tuned model
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from evaluate import load as load_metric
import nltk
nltk.download("punkt")


# Generate summaries
def generate(model, tokenizer, inputs, max_input_length=512, max_target_length=128):
    inputs_tokenized = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_input_length).to(model.device)
    with torch.no_grad():
        outputs = model.generate(input_ids=inputs_tokenized["input_ids"],
                                 attention_mask=inputs_tokenized["attention_mask"],
                                 max_length=max_target_length)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Compute ROUGE scores
rouge = load_metric("rouge")
original_scores = rouge.compute(predictions=original_preds, references=references, use_stemmer=True)
finetuned_scores = rouge.compute(predictions=finetuned_preds, references=references, use_stemmer=True)

# Build DataFrame
df = pd.DataFrame({
    "original_summary": original_preds,
    "finetuned_summary": finetuned_preds,
    "reference_summary": references
})

# Add summary-level scores
for metric, score in original_scores.items():
    df[f"original_{metric}"] = score
for metric, score in finetuned_scores.items():
    df[f"finetuned_{metric}"] = score

# Show first few rows
print(df.head())

# save to CSV
df.to_csv("model_comparison_finetune_20k_results.csv", index=False)


In [22]:
# Download the model for reference

import shutil

# Path to your model checkpoint directory
model_dir = "./flant-t5-finetuned-cnn-dailymail_50k/checkpoint-37500"  # replace with your actual checkpoint

# Output zip file name
zip_file = "finetuned_model_cnn_news_50k"

# Zip the directory
shutil.make_archive("finetuned_model_cnn_news_50k", 'zip', model_dir)

from google.colab import files
files.download("finetuned_model_cnn_news_50k.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>