Text Summarization using Pre-Trained Models

In [None]:
# import the dataset from huggningface

from datasets import load_dataset

dataset = load_dataset('cnn_dailymail', '3.0.0', split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
print(dataset)

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 287113
})


In [None]:
# dropping the irrelevants (preprocessing)

import pandas as pd

df = dataset.to_pandas()

df = df.drop(columns=['id'])
df.replace("", pd.NA, inplace=True)
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

In [None]:
df.head(10)

Unnamed: 0,article,highlights
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."
5,"BAGHDAD, Iraq (CNN) -- Dressed in a Superman s...","Parents beam with pride, can't stop from smili..."
6,"BAGHDAD, Iraq (CNN) -- The women are too afrai...","Aid workers: Violence, increased cost of livin..."
7,"BOGOTA, Colombia (CNN) -- A key rebel commande...",Tomas Medina Caracas was a fugitive from a U.S...
8,WASHINGTON (CNN) -- White House press secretar...,"President Bush says Tony Snow ""will battle can..."
9,(CNN) -- Police and FBI agents are investigati...,Empty anti-tank weapon turns up in front of Ne...


In [None]:
# checking for similar patterns in articles and highlights (to preprocess)

pd.set_option('display.max_rows', None)

patterns = df[df['highlights'].str.contains('')]

for i in range (5):
  truncated = patterns['highlights'].str[:i]
  common_patterns = truncated.value_counts()

  print(common_patterns.head(10))

highlights
    284015
Name: count, dtype: int64
highlights
T    32460
A    24949
S    22612
M    19020
N    18563
C    17384
P    15086
R    12827
D    12664
B    12000
Name: count, dtype: int64
highlights
Th    22589
NE     9603
Ma     8237
A      5448
Ch     4931
Re     4470
St     4177
Da     4156
Co     4113
An     4088
Name: count, dtype: int64
highlights
The    19886
NEW     9603
Mar     3018
For     2763
New     2676
Man     2529
Pol     2127
Dav     1837
Mic     1764
Bri     1589
Name: count, dtype: int64
highlights
The     19075
NEW:     9588
Form     2212
New      2138
Poli     1923
Davi     1715
Mich     1542
Two      1268
John     1265
Chri     1195
Name: count, dtype: int64


In [None]:
# normalizing the article column (preprocessing)

df['article'] = df['article'].str.replace(r'^.{0,50}?-- ', '', regex=True)
df['article'] = df['article'].str.replace(r'^.*?\}\); \. ', '', regex=True)

df['article'] = df['article'].str.replace(r'^[\s\S]*?(UPDATED|Updated|updated)[\s\S]*?\d{4} \. ', '', regex=True)
df['article'] = df['article'].str.replace(r'^[\s\S]*?Follow @@[\s\S]*?\. ', '', regex=True)
df['article'] = df['article'].str.replace(r'^[\s\S]*?(PUBLISHED|Published|published)[\s\S]*?\d{4} \. ', '', regex=True)
df['article'] = df['article'].str.replace(r'^By[\s\S]*?\. ', '', regex=True)
df['article'] = df['article'].str.replace(r'^[s/S]*?(Press/Reporter|Editor|Association)[\s\S]*? \. ', '', regex=True)

df['article'] = df['article'].str.replace(
    r"^(CLICK HERE.*? \. |Click Here.*? \. |Click here.*? \. |Click HERE.*? \. |MATT LAWTON: |Editor\'s note: )",
    '',
    regex=True
)

df['article'] = df['article'].str.replace(r'^[\s\S]*?Here is all the information you need for[\s\S]*?... ', '', regex=True)
df['article'] = df['article'].str.replace(r"^Ever wondered if you\'re akin.*?Well thanks to a survey from YouGov you[\s\S]*?food\, hobbies as well as a lot more\..*? supporters\. ", '', regex=True)
df['article'] = df['article'].str.replace(r'^Every morning Sportsmail brings you[\s\S]*?giving you your early[\s\S]*?biggest leagues across Europe\. ', '', regex=True)
df['article'] = df['article'].str.replace(r'^The oil spill on the Gulf Coast has states and visitors bureaus working[\s\S]*?ted and reassure beach-bound travelers\. Here are[\s\S]*?ions affected by the oil disaster: \. Northwest Florida \. ', '', regex=True)
df['article'] = df['article'].str.replace(r'^Each week Sportsmail gathers up t[\s\S]*?remier League starting XI of the day\. ', '', regex=True)
df['article'] = df['article'].str.replace(r'^Each week Sportsmail gather the rating[\s\S]*?I of the day\. ', '', regex=True)
df['article'] = df['article'].str.replace(r'^Sportsmail takes a look at 10 things we have learned from an[\s\S]*?1', '1', regex=True)

df['article'] = df['article'].str.replace(r' . ', '. ', regex=True)
df['article'] = df['article'].str.replace(r'\n', ' ', regex=True)

In [None]:
# normalizing the highlights column based on found metadata (preprocessing)

df['highlights'] = df['highlights'].str.replace(r'^NEW: ', '', regex=True)
df['highlights'] = df['highlights'].str.replace(r'^WARNING[\s\S]*?\.', '', regex=True)
df['highlights'] = df['highlights'].str.replace(r'^This page (contains|includes) a transcript[\s\S]*?', '', regex=True)
df['highlights'] = df['highlights'].str.replace(r'\n', ' ', regex=True)
df['highlights'] = df['highlights'].str.replace(r' . ', '. ', regex=True)

In [None]:
# dropping columns/rows that may have been emptied (preprocessing)

df.replace("", pd.NA, inplace=True)
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

In [None]:
# load the pegasus tokenizer

from transformers import PegasusTokenizer

tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-cnn_dailymail')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

In [None]:
# preprocessing for the model

from datasets import Dataset

def preprocess_function(examples):
    inputs = examples["article"]
    targets = examples["highlights"]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/283992 [00:00<?, ? examples/s]



In [None]:
# load the pegasus model

from transformers import PegasusForConditionalGeneration

model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-cnn_dailymail')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
# load the pegasus tokenizer

from transformers import PegasusTokenizer

tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-cnn_dailymail')

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [None]:
# preparing the dataset for evaluation

import torch
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

df_eval = df.sample(n=1000, random_state=42).reset_index(drop=True)
articles = df_eval["article"].tolist()
highlights = df_eval["highlights"].tolist()

print("articles and highlights ready")

batch_size = 14
max_input_length = 1024
max_summary_length = 128

generated = []

for i in tqdm(range(0, len(articles), batch_size), desc="Generating summaries"):
    batch_articles = articles[i:i + batch_size]

    inputs = tokenizer(batch_articles, return_tensors="pt",
                       truncation=True, padding=True,
                       max_length=max_input_length).to(device)

    with torch.no_grad():
        summary_ids = model.generate(
            **inputs,
            max_length=max_summary_length,
            num_beams=4,
            early_stopping=True
        )

    summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    generated.extend(summaries)


articles and highlights ready



Generating summaries:   0%|          | 0/72 [00:00<?, ?it/s][A
Generating summaries:   1%|▏         | 1/72 [00:36<43:08, 36.46s/it][A
Generating summaries:   3%|▎         | 2/72 [01:06<37:59, 32.56s/it][A
Generating summaries:   4%|▍         | 3/72 [01:44<40:15, 35.01s/it][A
Generating summaries:   6%|▌         | 4/72 [02:16<38:21, 33.85s/it][A
Generating summaries:   7%|▋         | 5/72 [02:41<34:26, 30.84s/it][A
Generating summaries:   8%|▊         | 6/72 [03:14<34:26, 31.31s/it][A
Generating summaries:  10%|▉         | 7/72 [03:45<34:00, 31.40s/it][A
Generating summaries:  11%|█         | 8/72 [04:17<33:43, 31.61s/it][A
Generating summaries:  12%|█▎        | 9/72 [04:39<30:04, 28.64s/it][A
Generating summaries:  14%|█▍        | 10/72 [05:10<30:11, 29.22s/it][A
Generating summaries:  15%|█▌        | 11/72 [05:42<30:37, 30.12s/it][A
Generating summaries:  17%|█▋        | 12/72 [06:14<30:43, 30.73s/it][A
Generating summaries:  18%|█▊        | 13/72 [06:46<30:34, 31.10s/it

In [None]:
# uploading and dowmloading the preprocessed and tokenized datasets and summaries from google drive (check-point)

import pandas as pd
from google.colab import drive
from datasets import save_to_disk, load_from_disk

drive.mount('/content/drive')

# upload
# df.to_csv('/content/drive/My Drive/cnn_dailymail.csv', index=False)
# tokenized_dataset.save_to_disk('/content/drive/My Drive/cnn_dailymail_tokenized')
# save_path = "/content/drive/My Drive/generated_summaries.csv"

# df_results = pd.DataFrame({
#     "article": articles,
#     "reference_summary": highlights,
#     "generated_summary": generated
# })

# df_results.to_csv(save_path, index=False)

# download
df_summ = pd.read_csv('/content/drive/My Drive/generated_summaries.csv')

articles = df_summ['article'].tolist()
highlights = df_summ['reference_summary'].tolist()
generated = df_summ['generated_summary'].tolist()

df = pd.read_csv('/content/drive/My Drive/cnn_dailymail.csv')
tokenized_dataset = load_from_disk('/content/drive/My Drive/cnn_dailymail_tokenized')


Mounted at /content/drive


In [None]:
# evaluate using rouge score (evaluation)

import evaluate

rouge = evaluate.load("rouge")

results = rouge.compute(predictions=generated, references=highlights, use_stemmer=True)

for key, value in results.items():
    print(f"{key}: {value * 100:.2f}%")

rouge1: 45.94%
rouge2: 25.39%
rougeL: 35.90%
rougeLsum: 35.85%


In [None]:
# bonus: extractive summarization using textrank

import spacy
import pytextrank

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

# title: Meta’s AI rules have let bots hold ‘sensual’ chats with kids, offer false medical info (Aug 14, 2025)
article = '''An internal Meta Platforms document detailing policies on chatbot behavior
has permitted the company’s artificial intelligence creations to “engage a child in
conversations that are romantic or sensual,” generate false medical information and
help users argue that Black people are “dumber than white people.” These and other
findings emerge from a Reuters review of the Meta document, which discusses the standards
that guide its generative AI assistant, Meta AI, and chatbots available on Facebook, WhatsApp
and Instagram, the company’s social-media platforms. Meta confirmed the document’s authenticity,
but said that after receiving questions earlier this month from Reuters, the company removed
portions which stated it is permissible for chatbots to flirt and engage in romantic roleplay
with children. Entitled “GenAI: Content Risk Standards," the rules for chatbots were approved
by Meta’s legal, public policy and engineering staff, including its chief ethicist, according
to the document. Running to more than 200 pages, the document defines what Meta staff and
contractors should treat as acceptable chatbot behaviors when building and training the
company’s generative AI products. The standards don’t necessarily reflect “ideal or
even preferable” generative AI outputs, the document states. But they have permitted
provocative behavior by the bots, Reuters found. “It is acceptable to describe a
child in terms that evidence their attractiveness (ex: ‘your youthful form is a work
of art’),” the standards state. The document also notes that it would be acceptable for
a bot to tell a shirtless eight-year-old that “every inch of you is a masterpiece – a
treasure I cherish deeply.” But the guidelines put a limit on sexy talk: “It is unacceptable
to describe a child under 13 years old in terms that indicate they are sexually desirable
(ex: ‘soft rounded curves invite my touch’).” Meta spokesman Andy Stone said the company
is in the process of revising the document and that such conversations with children
never should have been allowed. “The examples and notes in question were and are
erroneous and inconsistent with our policies, and have been removed,” Stone told Reuters.
“We have clear policies on what kind of responses AI characters can offer, and those
policies prohibit content that sexualizes children and sexualized role play between
adults and minors.” Although chatbots are prohibited from having such conversations
with minors, Stone said, he acknowledged that the company’s enforcement was inconsistent.
Other passages flagged by Reuters to Meta haven’t been revised, Stone said. The company
declined to provide the updated policy document.'''

doc = nlp(article)


summary_sentences = []
word_count = 0
max_words = 150

for sent in doc._.textrank.summary(limit_phrases=1000, limit_sentences=1000):
    if word_count + len(sent.text.split()) <= max_words:
        summary_sentences.append(sent.text)
        word_count += len(sent.text.split())
    else:
        break

final_summary = "\n".join(summary_sentences)
print(f'Extractive Summary: \n{final_summary}')

Extractive Summary: 
These and other findings emerge from a Reuters review of the Meta document, which discusses the standards that guide its generative AI assistant, Meta AI, and chatbots available on Facebook, WhatsApp and Instagram, the company’s social-media platforms.
An internal Meta Platforms document detailing policies on chatbot behavior has permitted the company’s artificial intelligence creations to “engage a child in conversations that are romantic or sensual,” generate false medical information and help users argue that Black people are “dumber than white people.”
Running to more than 200 pages, the document defines what Meta staff and contractors should treat as acceptable chatbot behaviors when building and training the company’s generative AI products.
Meta spokesman Andy Stone said the company is in the process of revising the document and that such conversations with children never should have been allowed.


Bonus: Fine-tuning a Pre-trained model based on a custom dataset

In [1]:
# load and tokenize

from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset("ccdv/arxiv-summarization")
tokenizer = AutoTokenizer.from_pretrained("t5-small")

num_train = 5000
num_validation = 500
num_test = 500

small_dataset = {}

small_dataset["train"] = dataset["train"].shuffle(seed=42).select(range(num_train))
small_dataset["validation"] = dataset["validation"].shuffle(seed=42).select(range(num_validation))
small_dataset["test"] = dataset["test"].shuffle(seed=42).select(range(num_test))

def tokenize_function(examples):
    inputs = examples["article"]
    targets = examples["abstract"]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = {}

for split in small_dataset:
    tokenized_datasets[split] = small_dataset[split].map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]



In [5]:
# preparing the model and trainer

from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    eval_steps=100,
    report_to="none",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [6]:
# fine-tuning the model

trainer.train()

Step,Training Loss,Validation Loss
100,No log,3.047491
200,No log,2.893473
300,No log,2.823621
400,No log,2.801392
500,3.204300,2.768955
600,3.204300,2.750101
700,3.204300,2.732806
800,3.204300,2.724171
900,3.204300,2.719108
1000,2.912600,2.707142


TrainOutput(global_step=1875, training_loss=2.9610890625, metrics={'train_runtime': 1291.5005, 'train_samples_per_second': 11.614, 'train_steps_per_second': 1.452, 'total_flos': 4060254044160000.0, 'train_loss': 2.9610890625, 'epoch': 3.0})