In [1]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [2]:

from transformers import pipeline, set_seed

import matplotlib.pyplot as plt

import pandas as pd
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
2023-06-30 15:57:05.571619: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-30 15:57:05.614590: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/amahi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## [The CNN/DailyMail Dataset](https://huggingface.co/datasets/cnn_dailymail)

##  An important aspect of the dataset is that the summaries are abstractive and not extractive, which means that they consist of new sentences instead of simple excerpts.

Extractive Summarization: the extractive approach selects the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.

Abstractive Summarization: The abstractive approach uses new phrases and terms that are different from the original document, keeping the meaning the same, just like how humans do in summarization. So, it is much harder than the extractive approach.

In [3]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", version="3.0.0")

print(f"Features in cnn_dailymail : {dataset['train'].column_names}")

Found cached dataset cnn_dailymail (/home/amahi/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
100%|██████████| 3/3 [00:00<00:00, 63.48it/s]

Features in cnn_dailymail : ['article', 'highlights', 'id']





In [4]:
sample = dataset["train"][25]
print(f"""
Full Article (excerpt of 500 characters, total length: {len(sample["article"])}):
""")
print(sample["article"][:4500])
print(f'\nSummary (Highlights) (length: {len(sample["highlights"])}):')
print(sample["highlights"])


Full Article (excerpt of 500 characters, total length: 3881):

LOS ANGELES, California (Reuters)  -- "To insure or not to insure?" It isn't Shakespeare, but it is the dramatic question Hollywood filmmakers are asking about Lindsay Lohan following her legal troubles this week. Lindsay Lohan may still get work after her legal problems are settled, but the cost will be high. It is an important question, too, because whether companies insure Lohan's future movies may determine whether she will quickly fall off Hollywood's A-list. But Lohan fans have little to fear because no actor is uninsurable, say underwriting experts. While some producers may balk at conditions for hiring problematic stars, experts say that unless an actor is serving time in prison, even the most volatile can be covered -- albeit at a high cost. "For a price, anything can be done, although an insurance carrier can make things so unpalatable that at times the makers of the film just won't be interested," said Ross Mill

-------------------

## Text Summarization Pipelines


In [5]:
sample_text = dataset["train"][25]["article"][:1000]

# We'll collect the generated summaries of each model in a dictionary
summaries = {}

### Summarization Baseline


In [6]:
def baseline_summary_three_sent(text):
    return "\n".join(sent_tokenize(text)[:3])

In [7]:
summaries['baseline'] = baseline_summary_three_sent(sample_text)

summaries['baseline']

'LOS ANGELES, California (Reuters)  -- "To insure or not to insure?"\nIt isn\'t Shakespeare, but it is the dramatic question Hollywood filmmakers are asking about Lindsay Lohan following her legal troubles this week.\nLindsay Lohan may still get work after her legal problems are settled, but the cost will be high.'

# huggingface pipeline

The pipelines are a great and easy way to use models for inference.

Stating ”summarization”: will return a `SummarizationPipeline`

”text-generation”: will return a TextGenerationPipeline

-----------------------


# Distilbart-cnn-12-6

In [8]:
pipe = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
pipe_out = pipe(sample_text)

In [9]:
pipe_out

[{'summary_text': ' Lohan fans have little to fear because no actor is uninsurable, say underwriting experts . Unless an actor is serving time in prison, even the most volatile can be covered -- albeit at a high cost . "For a price, anything can be done, although an insurance carrier can make things so unpalatable," expert says .'}]

In [10]:
summaries["distilbart-cnn-12-6"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

# BERT

In [11]:
from transformers import BertTokenizerFast, EncoderDecoderModel
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizerFast.from_pretrained('mrm8488/bert-small2bert-small-finetuned-cnn_daily_mail-summarization')
model = EncoderDecoderModel.from_pretrained('mrm8488/bert-small2bert-small-finetuned-cnn_daily_mail-summarization').to(device)

def generate_summary(text):
    # cut off at BERT max length 512
    inputs = tokenizer([text], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    output = model.generate(input_ids, attention_mask=attention_mask)

    return tokenizer.decode(output[0], skip_special_tokens=True)
  
generate_summary(sample_text)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



"lindsay lohan's future movies may determine whether she will quickly fall off hollywood's a - list. some producers may balk at conditions for hiring problematic stars. expert : if an actor is serving time in prison, even the most volatile can be covered - - albeit at a high cost."

In [12]:
pipe_out = pipe(sample_text)

In [13]:
pipe_out

[{'summary_text': ' Lohan fans have little to fear because no actor is uninsurable, say underwriting experts . Unless an actor is serving time in prison, even the most volatile can be covered -- albeit at a high cost . "For a price, anything can be done, although an insurance carrier can make things so unpalatable," expert says .'}]

In [14]:
summaries["mrm8488/bert-small2bert-small-finetuned-cnn_daily_mail-summarization"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

# Roberta

In [15]:
#pipe = pipeline("Text2TextGeneration", model="Ayham/roberta_roberta_summarization_cnn_dailymail")
#pipe_out = pipe(sample_text)

In [16]:
#pipe_out

In [17]:
#Smaries["ahmeddbahaa/xlmroberta-finetune-en-cnn"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

# ProphetNet

In [18]:
pipe = pipeline("summarization", model="microsoft/prophetnet-large-uncased-cnndm")
pipe_out = pipe(sample_text)

In [19]:
pipe_out

[{'summary_text': "lohan ' s future movies may determine whether she will quickly fall off hollywood ' s a -"}]

In [20]:
summaries["microsoft/prophetnet-large-uncased-cnndm"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

# GPT-3

In [21]:
pipe = pipeline("summarization", model="minhtoan/gpt3-small-finetune-cnndaily-news")
pipe_out = pipe(sample_text)

The model 'GPTNeoForCausalLM' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 207, but `max_length` is set to 20. This can lead to

In [22]:
pipe_out

[{'summary_text': 'LOS ANGELES, California (Reuters)  -- "To insure or not to insure?" It isn\'t Shakespeare, but it is the dramatic question Hollywood filmmakers are asking about Lindsay Lohan following her legal troubles this week. Lindsay Lohan may still get work after her legal problems are settled, but the cost will be high. It is an important question, too, because whether companies insure Lohan\'s future movies may determine whether she will quickly fall off Hollywood\'s A-list. But Lohan fans have little to fear because no actor is uninsurable, say underwriting experts. While some producers may balk at conditions for hiring problematic stars, experts say that unless an actor is serving time in prison, even the most volatile can be covered -- albeit at a high cost. "For a price, anything can be done, although an insurance carrier can make things so unpalatable that at times the makers of the film just won\'t be interested," said Ross Miller, partner with insurance brokerage D.R.

In [23]:
summaries["minhtoan/gpt3-small-finetune-cnndaily-news"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

# GPT-2

We can use GPT-2 it to generate summaries by simply appending “TL;DR” at the end of the input text.

The expression “TL;DR” (too long; didn’t read) is often used on platforms like
Reddit to indicate a short version of a long post. We will start our
summarization experiment by re-creating the procedure of the original paper
with the pipeline() function from Transformers

We create a text generation pipeline and load the GPT-2 model:

In [24]:
from transformers import pipeline, set_seed

set_seed(42)

pipe = pipeline('text-generation', model = 'gpt2-medium' )

gpt2_query = sample_text + "\nTL;DR:\n"

pipe_out = pipe(gpt2_query, max_length = 512, clean_up_tokenization_spaces = True)




Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [25]:
pipe_out

[{'generated_text': 'LOS ANGELES, California (Reuters)  -- "To insure or not to insure?" It isn\'t Shakespeare, but it is the dramatic question Hollywood filmmakers are asking about Lindsay Lohan following her legal troubles this week. Lindsay Lohan may still get work after her legal problems are settled, but the cost will be high. It is an important question, too, because whether companies insure Lohan\'s future movies may determine whether she will quickly fall off Hollywood\'s A-list. But Lohan fans have little to fear because no actor is uninsurable, say underwriting experts. While some producers may balk at conditions for hiring problematic stars, experts say that unless an actor is serving time in prison, even the most volatile can be covered -- albeit at a high cost. "For a price, anything can be done, although an insurance carrier can make things so unpalatable that at times the makers of the film just won\'t be interested," said Ross Miller, partner with insurance brokerage D.

In [26]:
pipe_out[0]["generated_text"][len(gpt2_query) :]

'Lawsuit seeks to avoid lawsuit if actors in upcoming movies embody a character that "inspire" sexual assault. "If they\'re not allowed to work, a young actor with the right lifestyle or temperament is going to go to Hollywood. I suspect the same fate will befall Lohan, if the lawyers don\'t settle fast enough and Lindsay goes on to become a bigger star," said Miller. (Source)\nA man named Charles "Goliath" Suggs, a former U.S. Army officer and prison guard turned self-made billionaire who died earlier this month in his New York basement, has been listed on the National Insurance Institute\'s most outstanding life insurance policies database. The New York Post cites "Goliath" as having $31 million in cash in 2011. Miller said that it appears that other life policies are being sold to "Goliath" via his estate andAlpha Reivers Trust, which does business with Goliath family members, which are listed on the National Insurance Institute\'s website as "C.S.S. Trust." Miller said that "A priv

In [27]:
summaries['gpt2'] = "\n".join(sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :]))

# T5

T5 (Text-To-Text Transfer Transformer) is a transformer model that is trained in an end-to-end manner with text as input and modified text as output, in contrast to BERT-style models that can only output either a class label or a span of the input. This text-to-text formatting makes the T5 model fit for multiple NLP tasks like Summarization, Question-Answering, Machine Translation, and Classification problems.

How T5 is different from BERT?
Both T5 and BERT are trained with MLM (Masked Language Model) approach.

What is MLM?

The MLM is a fill-in-the-blank task, where the model masks part of the input text and tries to predict what that masked word should be.

Example:

“I like to eat peanut butter and <MASK> sandwiches,”
“I like to eat peanut butter and jelly sandwiches,”


The only difference is that T5 replaces multiple consecutive tokens with the single Mask Keyword, unlike, BERT which uses Mask token for each word. This illustration is shown below.


### T5 expects a prefix before the input text to understand the task given by the user. For example,

- “summarize:” for the summarization,
- “cola sentence:” for the classification,
- “translate English to Spanish:” for the machine translation, etc.,


--------------


But here in this case, I can directly load T5 for summarization with the pipeline() function, which also takes care of formatting the inputs in the text-to-text format so we don’t
need to prepend them with "summarize":


# T5 Small

In [28]:
pipe = pipeline('summarization', model = 't5-small' )

pipe_out = pipe(sample_text)

In [29]:
pipe_out

[{'summary_text': "if companies insure Lindsay Lohan's future movies may determine if she will quickly fall off Hollywood's A-list . if an actor is serving time in prison, even the most volatile can be covered -- at a high cost . some producers may balk at conditions for hiring problematic stars, experts say ."}]

In [30]:
summaries['t5'] = 'n'.join(sent_tokenize(pipe_out[0]['summary_text']))

In [31]:
### T5 Base ( French T5 Abstractive Text Summarization )
#pipe = pipeline('summarization', model = 'plguillou/t5-base-fr-sum-cnndm' )

#pipe_out = pipe(sample_text)

In [32]:
#pipe_out

In [33]:
#summaries['t5'] = 'n'.join(sent_tokenize(pipe_out[0]['summary_text']))

# BART

BART is a denoising autoencoder for pretraining sequence-to-sequence models. It is trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. It uses a standard Transformer-based neural machine translation architecture.

That means, It uses a standard seq2seq/NMT architecture with a bidirectional encoder (like BERT) and a left-to-right decoder (like GPT). This means the encoder's attention mask is fully visible, like BERT, and the decoder's attention mask is causal, like GPT2.


This means that a fine-tuned BART model can take a text sequence (for example, English) as input and produce a different text sequence at the output (for example, French).

This type of model is relevant for machine translation, question-answering , text summarization, or sequence classification (categorizing input text sentences or tokens).

Another task is sentence entailment which, given two or more sentences, evaluates whether the sentences are logical extensions or are logically related to a given statement.

In [34]:
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)


In [35]:
pipe_out

[{'summary_text': 'Lohan fans have little to fear because no actor is uninsurable, say underwriting experts. Unless an actor is serving time in prison, even the most volatile can be covered -- albeit at a high cost. "For a price, anything can be done," says insurance broker Ross Miller.'}]

In [36]:
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

In [37]:
summaries["bart"]

'Lohan fans have little to fear because no actor is uninsurable, say underwriting experts.\nUnless an actor is serving time in prison, even the most volatile can be covered -- albeit at a high cost.\n"For a price, anything can be done," says insurance broker Ross Miller.'

# PEGASUS

The PEGASUS model’s pre-training task is very similar to summarization, i.e. important sentences are removed and masked from an input document and are later generated together as one output sequence from the remaining sentences, which is fairly similar to a summary. In PEGASUS, several whole sentences are removed from documents during pre-training, and the model is tasked with recovering them. The Input for such pre-training is a document with missing sentences, while the output consists of the missing sentences being concatenated together. The advantage of this self-supervision is that you can create as many examples as there are documents without any human intervention, which often becomes a bottleneck problem in purely supervised systems.

In [38]:
pipe = pipeline('summarization', model="google/pegasus-cnn_dailymail"  )

pipe_out = pipe(sample_text)

In [39]:
pipe_out

[{'summary_text': "Companies insure Lindsay Lohan's future movies .<n>Lohan fans have little to fear because no actor is uninsurable .<n>Unless an actor is serving time in prison, even the most volatile can be covered ."}]

In [40]:
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .<n>", ".\n")

## Comparing Different Summaries

In [41]:
print("GROUND TRUTH")

print(dataset['train'][1]['highlights'])


ent Summaries
print("GROUND TRUTH")for model_name in summaries:
    print(model_name.upper())
    print(summaries[model_name])


SyntaxError: invalid syntax (3858891016.py, line 6)


# SacreBLEU

The bleu_metric object is an instance of the Metric class, and works like an
aggregator: you can add single instances with add() or whole batches via
add_batch(). Once you have added all the samples you need to evaluate, you
then call compute() and the metric is calculated. This returns a dictionary with
several values, such as the precision for each n-gram, the length penalty, as
well as the final BLEU score. Let’s look at the example from before:

In [None]:

from datasets import load_metric

bleu_metric = load_metric("sacrebleu")

In [None]:
bleu_metric.add(prediction = [summaries["pegasus"]], reference = [dataset['train'][1]['highlights'] ])

results = bleu_metric.compute(smooth_method = 'floor', smooth_value = 0 )

results['precision'] = [np.round(p , 2) for p in results['precisions'] ]

pd.DataFrame.from_dict(results, orient = 'index', columns = ['Value'] )

# ROUGE

# ROUGE vs BLEU

Bleu measures precision: how much the words (and/or n-grams) in the machine generated summaries appeared in the human reference summaries.

Rouge measures recall: how much the words (and/or n-grams) in the human reference summaries appeared in the machine generated summaries.

### Interpretation of Rouge Score

ROUGE-n recall=40% means that 40% of the n-grams in the reference summary are also present in the generated summary.

--------

The ROUGE score was specifically developed for applications like
summarization where high recall is more important than just precision.5

The approach is very similar to the BLEU score in that we look at different n-grams
and compare their occurrences in the generated text and the reference texts.


The difference is that with ROUGE we check how many n-grams in the
reference text also occur in the generated text. For BLEU we looked at how
many n-grams in the generated text appear in the reference

In [None]:
rouge_metric = load_metric('rouge')

## ROUGE-N

With ROUGE-N, the N represents the n-gram that we are using. For ROUGE-1 we would be measuring the match-rate of unigrams between our model output and reference.

ROUGE-2 and ROUGE-3 would use bigrams and trigrams respectively.


## ROUGE-L

ROUGE-L measures the longest common subsequence (LCS) between our model output and reference. All this means is that we count the longest sequence of tokens that is shared between both:


In the HF Datasets implementation, two variations of ROUGE are
calculated: one calculates the score per sentence and averages it for the
summaries (ROUGE-L), and the other calculates it directly over the whole
summary (ROUGE-Lsum).


In [None]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

reference = dataset['train'][1]['highlights']

records = []

for model_name in summaries:
    rouge_metric.add(prediction = summaries[model_name], reference = reference )
    score = rouge_metric.compute()
    rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )
    print('rouge_dict ', rouge_dict )
    records.append(rouge_dict)

pd.DataFrame.from_records(records, index = summaries.keys() )

# Evaluationg on the TEST set of the CNN/DailyMail Dataset

In [None]:
def calculate_metric_on_baseline_test_ds(dataset, metric, column_text = 'article', column_summary = 'highlights' ):
    """
    This function calculates a specified metric on a baseline test dataset for a Natural Language Processing (NLP) task.
    It assumes the task is a text summarization task, where the goal is to generate a summary (e.g., highlights) from a text (e.g., article).

    Parameters:
    dataset (pandas.DataFrame): The test dataset. It should contain a column for the text and a column for the true summary.
    metric (datasets.Metric): The metric to calculate. This should be a metric object from the Hugging Face datasets library.
    column_text (str, optional): The name of the column in the dataset that contains the text. Defaults to 'article'.
    column_summary (str, optional): The name of the column in the dataset that contains the true summary. Defaults to 'highlights'.

    Returns:
    score (float): The calculated score of the metric on the test dataset.
    """
    summaries = [baseline_summary_three_sent(text) for text in dataset[column_text] ]

    metric.add_batch(predictions = summaries, references = dataset[column_summary] )

    score = metric.compute()
    return score

In [None]:
test_sampled = dataset['train'].shuffle(seed = 42).select(range(1000))

score = calculate_metric_on_baseline_test_ds(test_sampled, rouge_metric )

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame.from_dict(rouge_dict, orient = 'index' , columns = ['baseline'] ).T

## Strategy to calculate the ROUGE Metric on test dataset for the other Models

In [None]:
from tqdm import tqdm
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements.

    Generator function to yield successive batch-sized chunks from list_of_elements.

    Parameters:
    list_of_elements (list): List of elements to be divided into chunks.
    batch_size (int): The size of each chunk.

    Yields:
    list: Batch-sized chunk from list_of_elements.

    """
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """
    Function to calculate a specified metric on a test dataset for a Natural Language Processing (NLP) task.
    It assumes the task is a text summarization task, where the goal is to generate a summary from a text.

    Parameters:
    dataset (pandas.DataFrame): The test dataset. It should contain a column for the text and a column for the true summary.
    metric (datasets.Metric): The metric to calculate. This should be a metric object from the Hugging Face datasets library.
    model (transformers.PreTrainedModel): The transformer model to use for text generation.
    tokenizer (transformers.PreTrainedTokenizer): The tokenizer corresponding to the model.
    batch_size (int, optional): The size of the batches to use for processing. Defaults to 16.
    device (str, optional): The device to run the model on. Defaults to the output of torch.cuda.is_available().
    column_text (str, optional): The name of the column in the dataset that contains the text. Defaults to 'article'.
    column_summary (str, optional): The name of the column in the dataset that contains the true summary. Defaults to 'highlights'.

    Returns:
    score (float): The calculated score of the metric on the test dataset.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''

        # Finally, we decode the generated texts,
        # replace the <n> token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
               for s in summaries]

        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]


        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [None]:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_ckpt = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

score = calculate_metric_on_test_ds(test_sampled, rouge_metric,
                                   model_pegasus, tokenizer, batch_size=8)

rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)

# At the end, we compute and return the ROUGE scores.
pd.DataFrame(rouge_dict, index=["pegasus"])