In [6]:
from datasets import load_dataset

# Load XSum dataset
dataset_samsum = load_dataset('xsum')

# Define the percentage of the data to keep
train_precent = 0.4
test_and_val_percent = 0.6

# Reduce the size of the train split
dataset_samsum_train = dataset_samsum['train'].shuffle(seed=42).select(range(int(len(dataset_samsum['train']) * (train_precent/100))))

# Reduce the size of the validation split
dataset_samsum_validation = dataset_samsum['validation'].shuffle(seed=42).select(range(int(len(dataset_samsum['validation']) * (test_and_val_percent/100))))

# Reduce the size of the test split
dataset_samsum_test = dataset_samsum['test'].shuffle(seed=42).select(range(int(len(dataset_samsum['test']) * (test_and_val_percent/100))))

# Print the new split sizes
print(f"New split sizes: {[len(dataset_samsum_train), len(dataset_samsum_validation), len(dataset_samsum_test)]}")
print(f"Features: {dataset_samsum['train'].column_names}")

print(f"\ndocument:")
print(dataset_samsum["test"][0]["document"])
print("\nsummary")
print(dataset_samsum["test"][0]["summary"])

  0%|          | 0/3 [00:00<?, ?it/s]

New split sizes: [816, 67, 68]
Features: ['document', 'summary', 'id']

document:
Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.
The Welsh Government said more people than ever were getting help to address housing problems.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.
Prison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.
However, the same could not be said for men, the charity said, because issues which often affect them, such as post traumatic stress disorder or drug dependency, were often viewed as less of a priority.
Andrew Stevens, who works in Welsh

In [7]:
from transformers import pipeline

# Evaluate this using PEGASUS
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail", framework='pt')
pipe_out = pipe(dataset_samsum["test"][0]["document"])
print("summary:")
print(pipe_out[0]["summary_text"].replace(" .<n>", ".\n"))

summary:
Some ex-offenders are living rough for up to a year before finding accommodation.
Prison Link Cymru had 1,099 referrals in 2015-16.
Charity workers say investment in housing would be cheaper than jailing homeless repeat offenders.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation .


In [8]:
print(pipe_out[0])

{'summary_text': 'Some ex-offenders are living rough for up to a year before finding accommodation .<n>Prison Link Cymru had 1,099 referrals in 2015-16 .<n>Charity workers say investment in housing would be cheaper than jailing homeless repeat offenders .<n>Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation .'}


In [9]:
from tqdm import tqdm
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

def chunks(list_of_elements, batch_size):
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def evaluate_summaries(dataset, metric, model, tokenizer,
                       batch_size=16, device=device,
                       column_text="article", column_summary="highlights"):
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                        padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]

        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        
    return metric.compute(predictions=decoded_summaries, references=target_batch)

In [10]:
# Load the model directly
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_ckpt = "ainize/bart-base-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [11]:
!pip install evaluate
!pip install rouge_score
import evaluate

rouge_metric = evaluate.load("rouge")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [12]:
score = evaluate_summaries(dataset_samsum["test"], rouge_metric, model,
                           tokenizer, column_text="document",
                           column_summary="summary", batch_size=8)

100%|██████████| 1417/1417 [1:18:22<00:00,  3.32s/it]


In [13]:
import pandas as pd

pd.DataFrame(score, index=["bart"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
bart,0.190064,0.021783,0.115375,0.144644


In [14]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["document"], truncation=True,
                                max_length=1024)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["summary"], max_length=128,
                                     truncation=True)

    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features,
                                       batched=True)

columns = ["input_ids", "labels", "attention_mask"]

  0%|          | 0/205 [00:00<?, ?ba/s]



  0%|          | 0/12 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [15]:
from transformers import DataCollatorForSeq2Seq

seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [16]:
from transformers import TrainingArguments, Trainer

# Gradient accumulation saves memory by updating the model only every X batches
training_args = TrainingArguments(
    output_dir="bart-samsum", num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10, push_to_hub=False,
    evaluation_strategy="steps", eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16)

In [None]:
trainer = Trainer(model=model, args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_collator,
                  train_dataset=dataset_samsum_pt["train"],
                  eval_dataset=dataset_samsum_pt["validation"])

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmeetpatel05431[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,2.3166,2.071031
1000,2.1515,2.021414
1500,2.278,1.983023
2000,2.1026,1.956639
2500,2.1705,1.94055
3000,2.0544,1.921085


In [18]:
# Evaluate after finetuning
score = evaluate_summaries(
    dataset_samsum["test"], rouge_metric, trainer.model, tokenizer,
    batch_size=2, column_text="document", column_summary="summary")
pd.DataFrame(score, index=[f"bart_finetuned"])

100%|██████████| 5667/5667 [40:50<00:00,  2.31it/s]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
bart_finetuned,0.224641,0.046612,0.112321,0.112321


In [19]:
sample_text = dataset_samsum["test"][0]["document"]
reference = dataset_samsum["test"][0]["summary"]

inputs = tokenizer(sample_text, max_length=1024, truncation=True,
                   padding="max_length", return_tensors="pt")

summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                           attention_mask=inputs["attention_mask"].to(
    device),
    length_penalty=0.8, num_beams=8, max_length=128)

decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                      clean_up_tokenization_spaces=True)
                     for s in summaries]

decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

In [20]:
print(decoded_summaries)


['More than 1,000 homeless people have been referred for accommodation in Wales since their release from prison, a charity has said.']


In [21]:
#References
# This code was inspired by ajdillhoff/CSE6363:
#https://github.com/ajdillhoff/CSE6363/blob/main/natural_language_processing/finetune_summarization.ipynb