# News Summarization using PEGASUS

In [None]:
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

## Load Data Locally (disabled; the cell below is fully commented out and kept for reference)

In [None]:
# df_train1 = pd.read_csv('train.csv')
# df_train1.head()
# # Drop the 'id' column from the dataframe df_train1 to create a new dataframe df_train
# df_train = df_train1.drop(columns=['id'])

# # Select the first 1000 rows of the dataframe df_train
# df_train = df_train.iloc[:1000, :]

# # Define a prefix string to be added to each article
# prefix = 'summarize: '

# # Add the prefix to each article in the 'article' column
# df_train['article'] = prefix + df_train['article']

# # Display the first few rows of the modified dataframe df_train
# df_train.head()

# # Loading the testing dataset
# df_test = pd.read_csv('test.csv')
# df_test = df_test.drop(columns=['id'])
# # Loading the validation dataset
# df_val = pd.read_csv('validation.csv')

# # Select the first 200 rows of the dataframe df_val
# df_val = df_val.iloc[:200, :]

# # Drop the 'id' column from the dataframe
# df_val = df_val.drop(columns=['id'])
# df_val.head()

## Import data from Hugging Face

In [None]:
# Fetch the CNN/DailyMail corpus, configuration "3.0.0"
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

# Show which splits came back
print(dataset)


In [None]:
model_name = "google/pegasus-large"

# Use Apple's Metal (MPS) backend when this machine exposes it; otherwise stay
# on the CPU so the notebook also runs on machines without MPS instead of
# raising on a hard-coded "mps" device.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the PEGASUS tokenizer and model for summarization
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Create a summarization pipeline around the already-loaded model and tokenizer
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Count the model's parameters
def get_model_parameters(model):
    """Return ``(total_params, trainable_params)`` for ``model``.

    Every object yielded by ``model.parameters()`` contributes its
    ``numel()`` to the total; those whose ``requires_grad`` is truthy are
    also added to the trainable count.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    return total, trainable

# Report the size of the loaded model, one figure per line
total_params, trainable_params = get_model_parameters(model)
print(
    f"Total Parameters: {total_params}",
    f"Trainable Parameters: {trainable_params}",
    sep="\n",
)


In [None]:
# Re-key the loaded splits: the source "validation" split is exposed as "val"
split_names = {"train": "train", "test": "test", "val": "validation"}
train_val_test_dataset = DatasetDict(
    {alias: dataset[split] for alias, split in split_names.items()}
)

print(type(train_val_test_dataset))

## PEGASUS fine-tuning

In [None]:
def prepare_dataset(data):
    """Tokenize a batch of articles and their reference summaries.

    ``data["article"]`` is encoded as the model input (truncated to 512
    tokens) and ``data["highlights"]`` as the target text (truncated to 128
    tokens). The target token ids are stored under ``"labels"`` in the
    returned encoding.
    """
    # Source side: the article text
    encoded = tokenizer(data["article"], max_length=512, truncation=True)

    # Target side: the highlights, tokenized as target text
    targets = tokenizer(text_target=data["highlights"], max_length=128, truncation=True)

    encoded["labels"] = targets["input_ids"]
    return encoded

In [None]:
# Tokenize every split, handing prepare_dataset whole batches at a time
tokenized_data = train_val_test_dataset.map(function=prepare_dataset, batched=True)

In [None]:
# Dynamic per-batch padding of inputs and labels.
# `model` must be the model object, not the checkpoint name: the collator only
# uses it to call the model's `prepare_decoder_input_ids_from_labels` hook, and
# a plain string has no such method, so passing the name silently disables it.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# ROUGE scorer reused by every evaluation pass
rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Decode generated and reference summaries and score them with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        The result of ``rouge.compute`` on the decoded texts.
    """
    predictions, labels = eval_pred

    # Some models hand back (generated ids, extra outputs); keep only the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is an ignore/padding marker, not a vocabulary id. It can appear in
    # the predictions as well as the labels (e.g. when batches of different
    # lengths are concatenated), and the tokenizer cannot decode it, so swap
    # it for the pad token on both sides before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    return rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

In [None]:
# Hyper-parameters for the fine-tuning run
training_args = Seq2SeqTrainingArguments(
    # Output location, checkpoint retention and reporting
    output_dir="pegasus-news",
    save_total_limit=2,
    report_to="none",
    # Optimisation schedule
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate once per epoch on generated text so compute_metrics can decode it
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Device selection. Options tried and left disabled: fp16=False, use_mps_device=True
    # NOTE(review): confirm that disabling CUDA here matches the device the model was loaded onto.
    no_cuda=True,
)

In [None]:
# Wire the model, tokenized splits, collator and metric into the trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
    compute_metrics=compute_metrics,
)

In [None]:
# Launch fine-tuning with the trainer configured in the previous cell
trainer.train()

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory
model_path = "pegasus-news"
trainer.save_model(output_dir=model_path)
tokenizer.save_pretrained(save_directory=model_path)