In [11]:
!pip install datasets torch pandas evaluate rouge_score sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.0-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.0


In [2]:
import evaluate
from google.colab import drive
from datasets import load_dataset
from transformers import AutoTokenizer, BartForConditionalGeneration
from tqdm import tqdm

import pandas as pd
import torch
import random

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
papers_dataset = load_dataset("scientific_papers","arxiv")

feature_to_remove = "section_names"

for split, dataset in papers_dataset.items():
    papers_dataset[split] = dataset.remove_columns(feature_to_remove)

def truncate_dataset(dataset, fraction_to_keep):
  num_samples = len(dataset)
  num_samples_to_keep = int(num_samples * fraction_to_keep)
  indices_to_keep = random.sample(range(num_samples), num_samples_to_keep)
  truncated_dataset = dataset.select(indices_to_keep)
  return truncated_dataset

fraction_to_keep = 0.0005
for split in papers_dataset.keys():
    papers_dataset[split] = truncate_dataset(papers_dataset[split], fraction_to_keep)


# Removing Articles with blank text from train dataset
i = 0
indices_to_remove = []
while i < len(papers_dataset["train"]):
  if papers_dataset["train"][i]["article"] == "" or papers_dataset["train"][i]["abstract"] == "":
    indices_to_remove.append(i)
  i += 1

filtered_dataset = papers_dataset["train"].filter(lambda example, idx: idx not in indices_to_remove, with_indices=True)
papers_dataset["train"] = filtered_dataset

i = 0
indices_to_remove = []
while i < len(papers_dataset["validation"]):
  if papers_dataset["validation"][i]["article"] == "" or papers_dataset["validation"][i]["abstract"] == "":
    indices_to_remove.append(i)
  i += 1

filtered_dataset = papers_dataset["validation"].filter(lambda example, idx: idx not in indices_to_remove, with_indices=True)
papers_dataset["validation"] = filtered_dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

Filter:   0%|          | 0/101 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3 [00:00<?, ? examples/s]

In [17]:
papers_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 101
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 3
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 3
    })
})

In [5]:
model_path = "./drive/MyDrive/Submission/BE Project Group No 31/Model/bart-papers-trained-model"
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained(model_path).to(device)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """split the dataset into smaller batches that we can process simultaneously
    Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''

        # Finally, we decode the generated texts,
        # replace the  token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
               for s in summaries]

        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]


        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

### Rouge Score

In [7]:
rouge_metric = evaluate.load("rouge")

rouge_score = calculate_metric_on_test_ds(papers_dataset['test'], rouge_metric, model, tokenizer, column_text = 'article', column_summary='abstract', batch_size=8)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

100%|██████████| 1/1 [00:05<00:00,  5.51s/it]


In [9]:
pd.DataFrame(rouge_score,index=["Trained Model"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
Trained Model,0.03867,0.0,0.03367,0.03867


### Bleu Score

In [12]:
bleu_metric = evaluate.load("sacrebleu")

bleu_score = calculate_metric_on_test_ds(papers_dataset['test'], bleu_metric, model, tokenizer, column_text = 'article', column_summary='abstract', batch_size=8)

100%|██████████| 1/1 [00:05<00:00,  5.62s/it]


In [16]:
print("Bleu Score : ",bleu_score["score"])

Bleu Score :  0.09077334319264609
