In [1]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l- \ done
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=6eeede71b6782e0ddea020c0d87e256a55f1c98212c790372ef8d615e6e02a44
  Stored in directory: /root/.cache/pip/wheels/8e/6b/70/59daa7c90a238610e34bac5916e001fe3d9bb0ec59c8cf5518
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
[0m

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [3]:
from torch import cuda
from transformers import AutoTokenizer
from datasets import load_dataset, load_metric

# Run on the GPU when torch can see one, otherwise stay on the CPU.
device = 'cpu'
if cuda.is_available():
    device = 'cuda'

# Checkpoint name used for the tokenizer here and for the model further down.
model_checkpoint = "t5-small"

# Task prefix. NOTE(review): no later cell in view reads this variable;
# SummaryDataset spells out the same text itself.
prefix = "summarize: "

# Evaluation batch size and the token budgets for inputs / summaries.
batch_size = 16
max_input_length = 1024
max_target_length = 184

# Both calls below download their assets on first use.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
metric = load_metric("rouge")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [4]:
class SummaryDataset(Dataset):
    """Map-style dataset that tokenizes one article/abstract pair per access.

    Args:
        dataframe: Container indexable by column name that provides an
            ``'article'`` and an ``'abstract'`` column; each column must
            support ``len()`` and integer indexing.
        tokenizer: Callable tokenizer. It is invoked with a one-element list
            of text plus ``max_length``, ``padding='max_length'``,
            ``truncation=True`` and ``return_tensors='pt'`` and must return a
            mapping holding ``'input_ids'`` and ``'attention_mask'`` tensors
            with a leading batch dimension of size 1.
        source_len: Fixed token length of every encoded article.
        summ_len: Fixed token length of every encoded abstract.
        prefix: Text placed in front of each article before tokenization.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len, prefix='summarize: '):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.prefix = prefix
        self.text = dataframe['article']
        self.summarytext = dataframe['abstract']

    def __len__(self):
        """Return the number of article/abstract pairs."""
        return len(self.text)

    def _encode(self, text, max_length):
        """Tokenize ``text`` padded/truncated to exactly ``max_length`` tokens."""
        # `padding='max_length'` is the current spelling of the deprecated
        # `pad_to_max_length=True` flag.
        return self.tokenizer(
            [text],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return the encoded pair at ``index`` as a dict of 1-D long tensors.

        Keys: ``'source_ids'``, ``'source_mask'``, ``'target_ids'`` and
        ``'target_ids_y'`` (the last two hold the same abstract token ids).
        """
        # Collapse every run of whitespace (including newlines) to one space.
        text = ' '.join((self.prefix + str(self.text[index])).split())
        summarytext = ' '.join(str(self.summarytext[index]).split())

        source = self._encode(text, self.source_len)
        target = self._encode(summarytext, self.summ_len)

        # Drop only the batch axis: a bare squeeze() would also remove the
        # sequence axis whenever the configured length is 1.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long),
        }

In [5]:
# Fetch only the test split of the arXiv summarization corpus ('document' config).
test_dataset = load_dataset(
    path="ccdv/arxiv-summarization",
    name='document',
    split="test",
)

Downloading builder script:   0%|          | 0.00/5.14k [00:00<?, ?B/s]

Downloading and preparing dataset arxiv_summarization_dataset/document to /root/.cache/huggingface/datasets/ccdv___arxiv_summarization_dataset/document/1.0.0/fa2c9abf4312afb8660ef8e041d576b8e3943ea96ae771bd3cd091b5798e7cc3...


Downloading data:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset arxiv_summarization_dataset downloaded and prepared to /root/.cache/huggingface/datasets/ccdv___arxiv_summarization_dataset/document/1.0.0/fa2c9abf4312afb8660ef8e041d576b8e3943ea96ae771bd3cd091b5798e7cc3. Subsequent calls will reuse this data.


In [6]:
# Display the loaded split's summary (feature names and row count).
test_dataset

Dataset({
    features: ['article', 'abstract'],
    num_rows: 6440
})

In [7]:
# Wrap the raw split so each item is tokenized to the configured lengths.
test_set = SummaryDataset(
    dataframe=test_dataset,
    tokenizer=tokenizer,
    source_len=max_input_length,
    summ_len=max_target_length,
)

In [8]:
# DataLoader settings for evaluation: fixed order, loading in the main process.
test_params = dict(
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)

In [9]:
# Batch the tokenized test set using the settings in `test_params`.
test_loader = DataLoader(dataset=test_set, **test_params)

In [10]:
def predict(tokenizer, model, device, loader, max_length=None):
    """Generate a summary for every batch in ``loader`` and decode the results.

    Args:
        tokenizer: Object whose ``decode(ids, skip_special_tokens=...,
            clean_up_tokenization_spaces=...)`` turns token ids into text.
        model: Model exposing ``eval()`` and ``generate(input_ids=...,
            attention_mask=..., max_length=...)``.
        device: Device the source ids and mask are moved to before generation.
        loader: Iterable of batches; each batch is a mapping with
            ``'source_ids'``, ``'source_mask'`` and ``'target_ids'`` tensors.
        max_length: Value passed to ``generate`` as ``max_length``. When
            ``None`` the notebook-level ``max_target_length`` is used.

    Returns:
        ``(predictions, actuals)``: two lists of decoded strings, in loader
        order, for the generated and the reference sequences respectively.
    """
    if max_length is None:
        # Backward-compatible default: the global this function used to read.
        max_length = max_target_length

    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch_idx, data in enumerate(loader):
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=max_length,
            )
            preds = [
                tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for g in generated_ids
            ]
            # References are only decoded, so they are read straight from the
            # batch instead of being copied to the device first.
            target = [
                tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for t in data['target_ids']
            ]
            if batch_idx % 100 == 0:
                print(f'Completed {batch_idx}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [11]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq checkpoint and move it to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

# Run generation over the whole test loader; yields decoded predictions and references.
predictions, actuals = predict(
    tokenizer=tokenizer,
    model=model,
    device=device,
    loader=test_loader,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Completed 0
Completed 100
Completed 200
Completed 300
Completed 400


In [12]:
# Pair each generated summary with its reference and save the table to disk.
final_df = pd.DataFrame(
    data={
        'Generated Text': predictions,
        'Actual Text': actuals,
    }
)
final_df.to_csv(path_or_buf='predictions_pretrained.csv')

In [13]:
# Score the generated summaries against the references with ROUGE (stemming on).
result = metric.compute(
    predictions=predictions,
    references=actuals,
    use_stemmer=True,
)
# Reduce each ROUGE entry to its `.mid.fmeasure`, expressed as a percentage.
result = dict((name, score.mid.fmeasure * 100) for name, score in result.items())

result

{'rouge1': 26.71075866109905,
 'rouge2': 6.1547706688140496,
 'rougeL': 16.47846825124016,
 'rougeLsum': 16.475238430509233}