# News Summarization using PEGASUS

In [1]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict, load_dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


## Load Data Locally

In [2]:
# df_train1 = pd.read_csv('train.csv')
# df_train1.head()
# # Drop the 'id' column from the dataframe df_train1 to create a new dataframe df_train
# df_train = df_train1.drop(columns=['id'])

# # Select the first 1000 rows of the dataframe df_train
# df_train = df_train.iloc[:1000, :]

# # Define a prefix string to be added to each article
# prefix = 'summarize: '

# # Add the prefix to each article in the 'article' column
# df_train['article'] = prefix + df_train['article']

# # Display the first few rows of the modified dataframe df_train
# df_train.head()

# # Loading the testing dataset
# df_test = pd.read_csv('test.csv')
# df_test = df_test.drop(columns=['id'])
# # Loading the validation dataset
# df_val = pd.read_csv('validation.csv')

# # Select the first 200 rows of the dataframe df_val
# df_val = df_val.iloc[:200, :]

# # Drop the 'id' column from the dataframe
# df_val = df_val.drop(columns=['id'])
# df_val.head()

## Import data from Hugging Face

In [3]:
# Fetch the CNN/DailyMail corpus (configuration "3.0.0") via the datasets library
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

# Show the splits, their columns and their sizes
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [4]:
model_name = "google/pegasus-large"

# Pick the device at runtime instead of hard-coding "mps": use Apple's Metal
# backend when this PyTorch build/machine supports it, otherwise fall back to
# the CPU so the cell does not fail on hardware without MPS.
# CUDA is deliberately not selected here because the training arguments in
# this notebook set no_cuda=True.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load the PEGASUS tokenizer and model for summarization
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Create a summarization pipeline around the already-loaded model and tokenizer
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Extract model parameters
def get_model_parameters(model):
    """Count the parameters of ``model``.

    Args:
        model: Any object exposing ``parameters()`` that yields tensors with
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        A ``(total, trainable)`` tuple: the number of elements across all
        parameters, and the number of elements across only those parameters
        whose ``requires_grad`` flag is set.
    """
    total_count = 0
    trainable_count = 0
    for param in model.parameters():
        size = param.numel()
        total_count += size
        if param.requires_grad:
            trainable_count += size
    return total_count, trainable_count

# Report the size of the loaded model: all parameters vs. those that will be updated
total_params, trainable_params = get_model_parameters(model)
print(
    f"Total Parameters: {total_params}",
    f"Trainable Parameters: {trainable_params}",
    sep="\n",
)


  return torch.load(checkpoint_file, map_location=map_location)
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Parameters: 570797056
Trainable Parameters: 568699904


In [5]:
# Re-pack the splits into a DatasetDict keyed 'train' / 'test' / 'val'
# (the Hub's 'validation' split is exposed under the shorter key 'val').
train_val_test_dataset = DatasetDict({
    target_key: dataset[source_key]
    for target_key, source_key in (
        ('train', 'train'),
        ('test', 'test'),
        ('val', 'validation'),
    )
})

print(type(train_val_test_dataset))

<class 'datasets.dataset_dict.DatasetDict'>


## PEGASUS fine-tuning

In [6]:
def prepare_dataset(data, max_input_length=512, max_target_length=128):
    """Tokenize a batch of articles and their reference summaries.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        data: Batch indexed by column name; must provide the ``"article"``
            and ``"highlights"`` columns.
        max_input_length: Maximum number of tokens kept per article; longer
            articles are truncated. Defaults to 512.
        max_target_length: Maximum number of tokens kept per summary; longer
            summaries are truncated. Defaults to 128.

    Returns:
        The tokenizer output for the articles, with an extra ``"labels"``
        entry holding the token ids of the tokenized highlights.
    """
    # Tokenize the source articles (the model inputs)
    model_inputs = tokenizer(
        data["article"], max_length=max_input_length, truncation=True
    )

    # Tokenize the 'highlights' column as targets; text_target makes the
    # tokenizer treat the text as labels rather than as inputs
    labels = tokenizer(
        text_target=data["highlights"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
# Tokenize every split; batched=True hands prepare_dataset a batch of rows per call
tokenized_data = train_val_test_dataset.map(function=prepare_dataset, batched=True)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:  11%|█         | 31000/287113 [00:30<04:08, 1031.97 examples/s]


KeyboardInterrupt: 

In [13]:
# Padding: the collator pads inputs and labels per batch.
# `model` must be the model object itself, not the checkpoint name: the
# collator checks it for `prepare_decoder_input_ids_from_labels` and uses that
# method to build `decoder_input_ids`; a plain string never has that method.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [14]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores for a batch of generated summaries.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays as
            passed by the trainer. ``predictions`` may also be a tuple whose
            first element holds the generated token ids.

    Returns:
        The result of ``rouge.compute`` for the decoded predictions against
        the decoded reference summaries (stemming enabled).
    """
    predictions, labels = eval_pred

    # Some models return extra outputs alongside the generated ids
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a real token id, and cannot be
    # decoded. Replace it with the pad token in BOTH arrays before decoding
    # (the original only did this for the labels).
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

In [15]:
# Hyper-parameters for fine-tuning, grouped by purpose
training_args = Seq2SeqTrainingArguments(
    # --- output / checkpoints ---
    output_dir="pegasus-news",
    save_total_limit=2,
    # --- optimisation ---
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- evaluation: once per epoch, scoring generated text ---
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # --- logging integrations disabled ---
    report_to="none",
    # --- hardware: CUDA is switched off; alternatives left disabled ---
    #fp16=False,
    #use_mps_device = True,
    no_cuda=True,
)



In [16]:
# Assemble the trainer from the model, arguments, tokenized splits,
# batch collator and the ROUGE metric function
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [17]:
# Run fine-tuning with the trainer built above: training_args requests
# num_train_epochs=1 with evaluation_strategy="epoch".
trainer.train()

  0%|          | 0/63 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


KeyboardInterrupt: 

In [18]:
# Persist the model weights/config and the tokenizer files side by side
# in one directory so they can be reloaded together.
model_path = "pegasus-news"
trainer.save_model(output_dir=model_path)
tokenizer.save_pretrained(save_directory=model_path)

Non-default generation parameters: {'max_length': 256, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


('pegasus-news/tokenizer_config.json',
 'pegasus-news/special_tokens_map.json',
 'pegasus-news/spiece.model',
 'pegasus-news/added_tokens.json',
 'pegasus-news/tokenizer.json')