In [1]:
import nltk

In [2]:
from datasets import load_dataset

# billsum = load_dataset("billsum", split="ca_test")
# billsum = billsum.train_test_split(test_size=0.2)

# Load the "train" split of CNN/DailyMail (config "2.0.0") and carve a
# held-out 20% "test" split out of it.
cnn_dailymail = load_dataset('cnn_dailymail', '2.0.0', split="train")
# A fixed seed makes the train/test partition identical on every
# Restart & Run All; without it, each run trains and evaluates on
# different rows and the reported metrics are not comparable.
cnn_dailymail = cnn_dailymail.train_test_split(test_size=0.2, seed=42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
from transformers import AutoTokenizer

# T5 comes in different sizes:
# google-t5/t5-small
# google-t5/t5-base
# google-t5/t5-large
# google-t5/t5-3b
# google-t5/t5-11b.

# Hub id of the pretrained checkpoint; the tokenizer here and the model
# loaded in the training cell are both created from it.
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


from transformers import DataCollatorForSeq2Seq

# Pads inputs and labels per batch. The collator's optional `model`
# argument expects a model *instance*; the checkpoint string that used to be
# passed here has none of the model's methods, so it was silently ignored.
# It is omitted rather than left in as a misleading no-op.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [4]:
# Preprocess
prefix = "summarize: "

def tokenize_function_billsum(examples):
    """Tokenize a batch of billsum records for seq2seq training.

    Args:
        examples: Batch mapping with a "text" column (documents) and a
            "summary" column (reference summaries).

    Returns:
        The tokenizer output for the prefixed documents (truncated to 512
        tokens) with a "labels" entry holding the token ids of the
        summaries (truncated to 64 tokens).
    """
    prefixed_docs = []
    for doc in examples["text"]:
        prefixed_docs.append(prefix + doc)

    encoded = tokenizer(prefixed_docs, truncation=True, max_length=512)
    target_encoding = tokenizer(
        text_target=examples["summary"],
        truncation=True,
        max_length=64,
    )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

# tokenized_billsum = billsum.map(tokenize_function_billsum, batched=True)


def tokenize_function_cnn(examples):
    """Tokenize a batch of CNN/DailyMail records for seq2seq training.

    Args:
        examples: Batch mapping with an "article" column (source documents)
            and a "highlights" column (reference summaries).

    Returns:
        The tokenizer output for the prefixed articles (truncated to 1024
        tokens) with a "labels" entry holding the token ids of the
        highlights (truncated to 128 tokens).
    """
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # Encode the summaries through `text_target`, the same way
    # tokenize_function_billsum does. This replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the CNN tokenizer function over every split, one batch of rows per call.
tokenized_cnn = cnn_dailymail.map(
    function=tokenize_function_cnn,
    batched=True,
)


Map:   0%|          | 0/229690 [00:00<?, ? examples/s]



Map:   0%|          | 0/57423 [00:00<?, ? examples/s]

In [5]:
# Evaluate
import evaluate

rouge = evaluate.load("rouge")

import numpy as np


def compute_metrics(eval_pred):
    """Decode generated ids and references, then score them with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is itself a tuple, its first element is used.

    Returns:
        dict: The scores returned by ``rouge.compute`` plus ``gen_len``
        (mean count of non-pad tokens per prediction), each rounded to
        4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Swap it for the pad token in BOTH arrays: leaving it in the
    # predictions would break decoding and would also be counted as a real
    # token by the gen_len computation below.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Mean generated length, counting only non-pad positions.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [6]:
from transformers import TrainerCallback
from copy import deepcopy

class CustomCallback(TrainerCallback):
    """Callback that also evaluates on the training set at the end of an epoch.

    The extra evaluation is logged under the "train" metric prefix and only
    runs when the trainer has already flagged an evaluation for this epoch.
    """

    def __init__(self, trainer) -> None:
        """Keep a reference to the trainer whose ``evaluate`` is re-used."""
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Evaluate on the trainer's train dataset when evaluation is due.

        Returns:
            A deep copy of ``control`` taken *before* the extra evaluation
            ran, or ``None`` when no evaluation is scheduled.
        """
        if not control.should_evaluate:
            return None

        # Snapshot first so the control object handed back is the one from
        # before the nested evaluate() call.
        snapshot = deepcopy(control)
        self._trainer.evaluate(
            eval_dataset=self._trainer.train_dataset,
            metric_key_prefix="train",
        )
        return snapshot

In [7]:
import torch

# Report GPU allocator statistics before training starts.
# Guarded because the summary needs a CUDA device; on a CPU-only runtime the
# cell now reports that instead of failing. The report is printed because it
# is a multi-line table that is unreadable as an escaped string repr.
if torch.cuda.is_available():
    cuda_memory_report = torch.cuda.memory_summary()
else:
    cuda_memory_report = "CUDA is not available; no GPU memory statistics to report."
print(cuda_memory_report)



In [None]:
# Train
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Start from the pretrained weights of the checkpoint chosen earlier.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

training_args = Seq2SeqTrainingArguments(
    # Output, checkpoint retention and logging
    output_dir="cnn_model",
    save_total_limit=3,
    logging_dir="./logs",  # directory for storing logs
    logging_strategy="steps",
    logging_steps=50,  # log loss every 50 steps
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation: once per epoch, scoring generated text
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_cnn["train"],
    eval_dataset=tokenized_cnn["test"],
)

# Also evaluate on the training set whenever an end-of-epoch evaluation runs.
trainer.add_callback(CustomCallback(trainer))
trainer.train()

# Save the model and tokenizer side by side so the "cnn_model" directory can
# be loaded on its own later.
model.save_pretrained("cnn_model")
tokenizer.save_pretrained("cnn_model")

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.7954,1.664208,0.2386,0.109,0.1978,0.1978,18.9806




In [None]:
# Show the log history stored on the trainer's state as the cell output.
trainer.state.log_history

In [None]:
# Sample news article used as input by the inference cells below.
# NOTE(review): the string itself starts with "summarize:" and is passed
# unchanged to every summarization pipeline in this notebook, including the
# non-T5 one -- confirm the prefix is wanted for each of them.
text = """summarize:
A bodyguard who was shot at the Kuala Lumpur airport on April 14 while protecting his high-risk client from her husband was assigned with the job because of his training and extensive experience, said his employer. Mr Mohd Nur Hadith, nicknamed Along, is in critical condition. He has been working with Elite Bodyguard Service Company for the past four years and has 18 years of experience as a bodyguard, the company’s director Rashid Ratag told Utusan Malaysia. Mr Nur’s profile on the company’s Facebook says he is 37 years old and fluent in Malay and English. His stated talents are being a personal driver for VVIPs as well as being a bodyguard cum driver. Mr Rashid said that as the client – who had filed a police report against her husband for criminal intimidation – was considered a high-risk case, Mr Nur was picked to be her bodyguard. Mr Nur and his client, a travel agency owner who goes by the username Farah Cie on social media, had been at the arrival hall of Kuala Lumpur International Airport Terminal 1 to receive Muslim pilgrims returning from Mecca at about 1.30am on April 14 when her husband pulled the trigger on her. But Mr Nur took the bullet in the abdomen instead. The bullet reportedly missed his vital organs, but left him in critical condition. “I didn’t expect Along to be so committed to the task given to him that he was willing to risk his life,” said Mr Rashid. Mr Nur’s colleagues told Bernama he had not been supposed to be working on the day of the shooting, but had to stand in for another colleague who was on holiday because of Hari Raya Aidilfitri. Police are now hunting down the suspected shooter, whom they identified as 38-year-old Hafizul Harawi.
"""

In [None]:
# Inference
from transformers import pipeline

# Build a summarization pipeline from the locally saved "cnn_model" directory
# and run it on the sample article; the result is the cell output.
summarizer = pipeline(
    task="summarization",
    model="cnn_model",
)
summarizer(text)

In [None]:
from transformers import pipeline

# Summarization pipeline backed by the google/pegasus-large checkpoint,
# for comparison with the fine-tuned model.
pipe = pipeline(
    task="summarization",
    model="google/pegasus-large",
)

In [None]:
# Summarize the sample article with whatever pipeline `pipe` is bound to;
# in a top-to-bottom run that is the google/pegasus-large pipeline.
pipe(text)

In [None]:
# Fine-tuning a model for summarization

In [None]:
from datasets import load_dataset
from evaluate import load

# All splits of CNN/DailyMail (config "2.0.0"), untokenized.
raw_datasets = load_dataset(path="cnn_dailymail", name="2.0.0")
# ROUGE scorer used by the metrics function of this section.
metric = load(path="rouge")

In [None]:
# Display the loaded dataset object to inspect what was loaded.
raw_datasets

In [None]:
# Look at the first record of the "train" split.
raw_datasets["train"][0]

In [None]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    """Render ``num_examples`` distinct, randomly chosen rows as an HTML table.

    Args:
        dataset: Indexable dataset exposing ``__len__``, list indexing and a
            ``features`` mapping.
        num_examples: How many distinct rows to show; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If more rows are requested than the dataset holds.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # Draw indices until enough distinct ones are collected; duplicates are
    # simply redrawn.
    chosen = []
    while len(chosen) < num_examples:
        candidate = random.randint(0, len(dataset) - 1)
        if candidate not in chosen:
            chosen.append(candidate)

    frame = pd.DataFrame(dataset[chosen])
    for column, feature in dataset.features.items():
        if not isinstance(feature, datasets.ClassLabel):
            continue
        # Show class names instead of integer class ids.
        frame[column] = frame[column].transform(lambda idx: feature.names[idx])
    display(HTML(frame.to_html()))

In [None]:
# Render a random sample (default: 5 rows) of the "train" split as an HTML table.
show_random_elements(raw_datasets["train"])

In [None]:
# Token budgets for the encoder input and the target summary.
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of articles and their highlights.

    Args:
        examples: Batch mapping with "article" and "highlights" columns.

    Returns:
        The tokenizer output for the prefixed articles (truncated to
        ``max_input_length``) with a "labels" entry holding the token ids of
        the highlights (truncated to ``max_target_length``).
    """
    sources = []
    for article in examples["article"]:
        sources.append(prefix + article)

    encoded = tokenizer(sources, truncation=True, max_length=max_input_length)

    # Targets go through `text_target` so they are tokenized as labels.
    targets = tokenizer(
        text_target=examples["highlights"],
        truncation=True,
        max_length=max_target_length,
    )

    encoded["labels"] = targets["input_ids"]
    return encoded

In [None]:
# Tokenize every split, one batch of rows per call.
tokenized_datasets = raw_datasets.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
model_checkpoint = "google-t5/t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

batch_size = 16
# Last path component of the checkpoint id, used to name the output folder.
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-cnndaillymail",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and checkpoint retention
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=3,
)

# Here the collator receives the model instance itself.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


# nltk.sent_tokenize (used below) needs the Punkt sentence-tokenizer data,
# and nothing else in this notebook downloads it. Probe once here so a
# missing resource is fetched now instead of raising LookupError at the
# first evaluation, i.e. after a whole training epoch has already run.
try:
    nltk.sent_tokenize("Probe sentence.")
except LookupError:
    # Resource name differs between nltk releases; request both.
    for _resource in ("punkt", "punkt_tab"):
        nltk.download(_resource, quiet=True)


def compute_metrics(eval_pred):
    """Decode generated ids and references, then score them with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is itself a tuple, its first element is used.

    Returns:
        dict: The scores returned by ``metric.compute`` multiplied by 100,
        plus ``gen_len`` (mean count of non-pad tokens per prediction), each
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Replace -100 as we can't decode it. This is done for the predictions
    # as well as the labels: a -100 left in the predictions would break
    # decoding and be counted as a real token in gen_len below.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Express the scores as percentages
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Trainer for the second run: trains on the dataset's own "train" split and
# evaluates on its "validation" split.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# High-level route: a ready-made summarization pipeline.
pipe = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

# Low-level route: load the tokenizer and model objects directly.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# NOTE: these assignments rebind the notebook-level `tokenizer` and `model`
# names that earlier cells rely on; re-running those cells afterwards will
# pick up the BART objects.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")