In [1]:
import transformers
from transformers import (
    pipeline,
    set_seed,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
import datasets
from datasets import load_dataset, load_from_disk, load_metric

import matplotlib.pyplot as plt
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

In [2]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/migue/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
type(model_pegasus)

transformers.models.pegasus.modeling_pegasus.PegasusForConditionalGeneration

In [13]:
# Original dataset
# dataset_ckpt = "Samsung/samsum"

dataset = load_from_disk("../data/samsum_dataset/")

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [7]:
print("\nDialogue:")

print(dataset["test"][1]["dialogue"])

print("\nSummary:")

print(dataset["test"][1]["summary"])


Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


In [8]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(
        example_batch["dialogue"], max_length=1024, truncation=True
    )

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(
            example_batch["summary"], max_length=128, truncation=True
        )

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }

In [9]:
dataset_pt = dataset.map(convert_examples_to_features, batched=True)

In [10]:
dataset_pt

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

## **Training**

In [10]:
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [11]:
args = TrainingArguments(
    output_dir="pegasus-samsum",
    num_train_epochs=1,
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1e6,
)



In [14]:
trainer = Trainer(
    model=model_pegasus,
    args=args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_pt["train"],
    eval_dataset=dataset_pt["validation"],
)

In [None]:
trainer.train()

## **Evaluation**

In [25]:
# load model and tokenizer
model_path = "pegasus-samsum/checkpoint-14732"
trained_model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [23]:
def generate_batch_sized_chuncks(list_of_elements: list, batch_size: int):
    """Split dataset into smaller batches that can be processed simultaneously
    Yield successive batch-sized chunks from list of elements.

    Args:
        list_of_elements (list): List with elements to be split on batches
        batch_size (int): Number of elements per batch

    Yield:
        list: Batches
    """
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]


def calculate_test_metric(
    dataset: datasets.arrow_dataset.Dataset,
    metric,
    model: transformers.models.pegasus.modeling_pegasus.PegasusForConditionalGeneration,
    tokenizer: transformers.models.pegasus.tokenization_pegasus_fast.PegasusTokenizerFast,
    batch_size: int = 16,
    device: str = "cuda",
    column_text: str = "article",
    column_summary: str = "highlights",
) -> float:
    """_summary_

    Args:
        dataset (datasets.arrow_dataset.Dataset): _description_
        metric (str): _description_
        model (transformers.models.pegasus.modeling_pegasus.PegasusForConditionalGeneration): _description_
        tokenizer (transformers.models.pegasus.tokenization_pegasus_fast.PegasusTokenizerFast): _description_
        batch_size (int): _description_
        column_text (str, optional): _description_. Defaults to "article".
        column_summary (str, optional): _description_. Defaults to "highlights".

    Returns:
        float: _description_
    """
    article_batches = list(
        generate_batch_sized_chuncks(dataset[column_text], batch_size)
    )
    target_batches = list(
        generate_batch_sized_chuncks(dataset[column_summary], batch_size)
    )

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)
    ):

        inputs = tokenizer(
            article_batch,
            max_length=1024,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        summaries = model.generate(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            length_penalty=0.8,
            num_beams=8,
            max_length=128,  # avoid long sequences
        )

        decoded_summaries = [
            tokenizer.decode(
                s,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            for s in summaries
        ]

        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    return metric.compute()

In [11]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = load_metric("rouge")

In [26]:
score = calculate_test_metric(
    dataset["test"],
    rouge_metric,
    trained_model,
    tokenizer,
    batch_size=2,
    device="cuda",
    column_text="dialogue",
    column_summary="summary",
)

rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

pd.DataFrame(rouge_dict, index=["pegasus"])

100%|██████████| 410/410 [04:25<00:00,  1.55it/s]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.01829,0.000369,0.018192,0.018176


In [27]:
trained_model.save_pretrained("pegasus-samsum-model")
tokenizer.save_pretrained("pegasus-samsum-model")

Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


('pegasus-samsum-model/tokenizer_config.json',
 'pegasus-samsum-model/special_tokens_map.json',
 'pegasus-samsum-model/spiece.model',
 'pegasus-samsum-model/added_tokens.json',
 'pegasus-samsum-model/tokenizer.json')

## **Prediction**

In [30]:
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_length": 128}


sample_text = dataset["test"][5]["dialogue"]

reference = dataset["test"][5]["summary"]

pipe = pipeline("summarization", model="pegasus-samsum-model", tokenizer=tokenizer)

##
print("Dialogue:")
print(sample_text)


print("\nReference Summary:")
print(reference)


print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[5]["summary_text"])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Dialogue:
Benjamin: Hey guys, what are we doing with the keys today?
Hilary: I've got them. Whoever wants them can meet me at lunchtime or after
Elliot: I'm ok. We're meeting for the drinks in the evening anyway and I guess we'll be going back to the apartment together?
Hilary: Yeah, I guess so
Daniel: I'm with Hilary atm and won't let go of her for the rest of the day, so any option you guys choose is good for me
Benjamin: Hmm I might actually pass by at lunchtime, take the keys and go take a nap. I'm sooo tired after yesterday
Hilary: Sounds good. We'll be having lunch with some French people (the ones who work on the history of food in colonial Mexico - I already see you yawning your head off)
Benjamin: YAAAAWN 🙊 Where and where are you meeting?
Hilary: So I'm meeting them at the entrance to the conference hall at 2 pm and then we'll head to this place called La Cantina. Italian cuisine, which is quite funny, but that's what they've chosen
Benjamin: Interesting 😱 To be honest, Hilar