In [None]:
import string
import re 
import csv
import collections

import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import islice

import nltk
from nltk.tokenize import sent_tokenize

from datasets import load_metric, load_dataset

### Load Summaries

Transformers
- BART: sshleiferdistilbart-xsum-12-1
- Pegasus: sshleiferdistill-pegasus-xsum-16-4
- GPT-5: t5-small
- BigBird: googlebigbird-pegasus-large-bigpatent

Baselines

- Lead 3

In [None]:
#Transformers
bart    = pd.read_csv('output_sshleiferdistilbart-xsum-12-1.csv')
pegasus = pd.read_csv('output_sshleiferdistill-pegasus-xsum-16-4.csv')
t5      = pd.read_csv('output_t5-small.csv')
bird    = pd.read_csv('output_googlebigbird-pegasus-large-bigpatent.csv')

#Baselines
lead_three = pd.read_csv('baseline_pubmed.csv')

print(len(pegasus))
print(len(bart))
print(len(t5))
print(len(lead_three))
print(len(bird))

# #Adjust columns
bart = bart[['ground_truth', 'summaries']]
bart.columns = ['ground_truth', 'bart']

pegasus = pegasus[['ground_truth', 'summaries']]
pegasus.columns = ['ground_truth', 'pegasus']

t5 = t5[['ground_truth', 'summaries']]
t5.columns = ['ground_truth', 't5']

bird = bird[['ground_truth','summaries']]
bird.columns = ['ground_truth', 'bird']

6658
6658
6658
6658
6658


In [None]:
print(bart.isnull().sum())
print(pegasus.isnull().sum())
print(t5.isnull().sum())
print(lead_three.isnull().sum())

ground_truth    0
bart            0
dtype: int64
ground_truth    0
pegasus         0
dtype: int64
ground_truth    0
t5              0
dtype: int64
ground_truth    0
lead3           0
dtype: int64


In [None]:
bart = bart.drop_duplicates(subset='ground_truth', keep='first')
pegasus = pegasus.drop_duplicates(subset='ground_truth', keep='first')
t5 = t5.drop_duplicates(subset='ground_truth', keep='first')

lead_three = lead_three.drop_duplicates(subset='ground_truth', keep='first')

print(len(pegasus))
print(len(bart))
print(len(t5))
print(len(lead_three))

6656
6656
6656
6656


In [None]:
temp  = bart.merge(pegasus, how='inner', on='ground_truth')
temp2 = temp.merge(t5, how='inner', on='ground_truth')
temp3 = temp2.merge(lead_three, how='inner', on='ground_truth')
temp4 = temp3.merge(bird, how='inner', on='ground_truth')

merged = temp4

merged

Unnamed: 0,ground_truth,bart,pegasus,t5,lead3,bird
0,research on the implications of anxiety in pa...,The impact of anxiety in people with mental h...,Anxiety is one of the most common forms of men...,", apathy and anhedonia in pd patients have bee...",anxiety affects quality of life in those livin...,A method of treating anxiety in a parkinsons p...
1,"small non - coding rnas include sirna , mirna...",A group of small non-coded rna has been ident...,"mirna, a class of multifunctional helper rnas,...",are transcribed into mrna but remain untransla...,small non - coding rnas are transcribed into m...,Micrornas are a class of small non-coding rnas...
2,objective : to evaluate the efficacy and safe...,The condition of a rare form of in vitro fert...,Ohsss is one of the most common iatrogenic con...,ohss is a serious complication of ovulation in,ohss is a serious complication of ovulation in...,The present invention relates to the use of hu...
3,congenital adrenal hyperplasia is a group of ...,A study of a group of patients diagnosed with...,A study of 29 patients diagnosed with congenit...,congenital adrenal hyperplasia ( cah ) refers ...,congenital adrenal hyperplasia ( cah ) refers ...,Adreno cortico trophic hormone levels in patie...
4,objective(s):pentoxifylline is an immunomodul...,The path of type 1 diabetes (1d) is an import...,research into the pathogenesis of type 1 diabe...,pentoxifylline ( ptx ) have immunomodulatory and,type 1 diabetes ( t1d ) results from the destr...,The present invention relates to the use of pe...
...,...,...,...,...,...,...
6653,cysticercosis is an infection with larval cys...,The study of a tapeworm that causes severe br...,The granulomatous response to dead parasite t....,granulomatous inflammation is associated with ...,neurocysticercosis is the most common parasiti...,It is disclosed that substance p protein withi...
6654,background : congenital heart disease ( chd )...,The prevalence of chd at birth in China's Heb...,chd is one of the most common birth defects in...,"77,836 3-month - old infants in the district w...",most chd prevalence data are based on populati...,In a cross- sectional study of chd prevalence ...
6655,we describe a successful living donor liver t...,"The results of a liver transplant in Japan, i...",A patient with end-stage liver disease has had...,anti - cd 20 antibody ( rituximab ),renal transplantation rates are low among pati...,A living donor liver transplantation using a l...
6656,objective : the aim was to investigate the ef...,The number of cases of acute kidney injury in...,A systematic review and meta-analysis of the e...,"aki, defined as an abrupt drop of renal functi...","acute kidney injury ( aki ) , defined as an ab...","A method of preventing acute kidney injury, de..."


In [None]:
len(merged)

6658

In [None]:
merged.isnull().any()

ground_truth    False
bart            False
pegasus         False
t5              False
lead3           False
bird            False
dtype: bool

In [None]:
merged.isnull().sum()

ground_truth    0
bart            0
pegasus         0
t5              0
lead3           0
bird            0
dtype: int64

### Cleaning

Cleaning function replaces white spaces and newlines

In [None]:
def clean(s):
    a = s.translate(str.maketrans(' ', ' ', string.punctuation))
    return re.sub('\s+',' ', a).replace('\n', '').strip()

In [None]:
for c in list(merged.columns):
    merged[c] = [clean(series) for series in list(merged[c])]

In [None]:
merged_dict = merged.to_dict('index')

### Example

Below is a series of summaries for a given article

In [None]:
merged_dict[0]

{'ground_truth': 'research on the implications of anxiety in parkinson s disease pd has been neglected despite its prevalence in nearly 50 of patients and its negative impact on quality of life previous reports have noted that neuropsychiatric symptoms impair cognitive performance in pd patients however to date no study has directly compared pd patients with and without anxiety to examine the impact of anxiety on cognitive impairments in pd this study compared cognitive performance across 50 pd participants with and without anxiety 17 pda 33 pda who underwent neurological and neuropsychological assessment group performance was compared across the following cognitive domains simple attention visuomotor processing speed executive function eg set shifting working memory language and memory new verbal learning results showed that pda performed significantly worse on the digit span forward and backward test and part b of the trail making task tmt b compared to the pda group there were no gr

### ROUGE Batching

In [None]:
def chunks(dictionary, batch_size=10):
    """Yield successive batch-sized chunks from dictionary."""
    it = iter(dictionary)   
    for i in range(0, len(dictionary), batch_size):
        yield {k: dictionary[k] for k in islice(it, batch_size)}
    
chunk_list = [c for c in chunks(merged_dict)]
print(f"Number of batches: {len(chunk_list)}")

Number of batches: 666


In [None]:
results = {}

rouge_metric = load_metric("rouge")

#ROUGE Metric Batching
for summarizer in ['bart', 'pegasus', 't5', 'bird', 'lead3']:
    for chunk in tqdm(chunk_list):
        refs  = [i['ground_truth'] for i in list(chunk.values())]
        preds = [i[summarizer] for i in list(chunk.values())]
        rouge_metric.add_batch(predictions=preds, references=refs)

    scores = rouge_metric.compute()
    results[summarizer] = [
        np.round(scores['rouge1'].mid.fmeasure * 100, 1), 
        np.round(scores['rougeL'].mid.fmeasure * 100, 1)]

100%|██████████| 666/666 [00:00<00:00, 2425.62it/s]
100%|██████████| 666/666 [00:00<00:00, 2412.78it/s]
100%|██████████| 666/666 [00:00<00:00, 2467.19it/s]
100%|██████████| 666/666 [00:00<00:00, 2473.36it/s]
100%|██████████| 666/666 [00:00<00:00, 2433.08it/s]


### BLEU Batching

In [None]:
bleu_metric  = load_metric('bleu')

#BLEU Metric Batching
for summarizer in ['bart', 'pegasus', 't5', 'bird', 'lead3']:

    for chunk in tqdm(chunk_list):
        refs  = [i['ground_truth'].split() for i in list(chunk.values())]        
        refs  = list(map(lambda item: [item], refs))
        preds = [i[summarizer].split() for i in list(chunk.values())]

        bleu_metric.add_batch(predictions=preds, references=refs)

    bleus = bleu_metric.compute()   
    results[summarizer].extend([
        np.round(bleus['bleu'] * 100, 3),
        np.round(bleus['precisions'][0] * 100, 1),
        np.round(bleus['brevity_penalty'], 3),
        np.round(bleus['length_ratio'], 3)])

100%|██████████| 666/666 [00:03<00:00, 216.51it/s]
100%|██████████| 666/666 [00:03<00:00, 208.11it/s]
100%|██████████| 666/666 [00:02<00:00, 231.37it/s]
100%|██████████| 666/666 [00:03<00:00, 178.57it/s]
100%|██████████| 666/666 [00:03<00:00, 174.19it/s]


### Perplexity

In [None]:
### Unigram and Perplexity
def unigram(tokenized_corpus):
    model = collections.defaultdict(lambda: 0.01)
    for f in tqdm(tokenized_corpus):
        try: model[f] += 1
        except KeyError: 
            model[f] = 1
            continue
    N = float(sum(model.values()))
    for word in model:
        model[word] = model[word]/N
    return model

def calc_perplexity(string, model):
    tokens = string.split()
    perplexity = 1
    N = 0
    for w in tokens:
        N += 1
        perplexity = perplexity * (1 / model[w])
    perplexity = pow(perplexity, 1 / float(N))
    return perplexity

In [None]:
DATA_CACHE_PATH = 'cached_data'

### EDIT THIS LINE FOR YOUR DATASET
dataset = load_dataset('scientific_papers', 'pubmed', 'test', cache_dir=DATA_CACHE_PATH)

Using custom data configuration pubmed-data_dir=test
Reusing dataset scientific_papers (cached_data/scientific_papers/pubmed-data_dir=test/1.1.1/306757013fb6f37089b6a75469e6638a553bd9f009484938d8f75a4c5e84206f)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
#We need all of the words from both articles and their abstracts to form a vocabulary
#for the unigram model

#You may need to adjust the column names (e.g. description)
all_texts = list(dataset['test']['abstract']) + list(dataset['test']['abstract'])
cleaned = [clean(s) for s in all_texts]

token_list = [s.split() for s in cleaned]
flattened = [a for sublist in token_list for a in sublist]

In [None]:
unigram_model = unigram(flattened)

100%|██████████| 2368272/2368272 [00:00<00:00, 2828994.37it/s]


In [None]:
tokens = merged['bird'][871].split()

In [None]:
merged.columns

Index(['ground_truth', 'bart', 'pegasus', 't5', 'lead3', 'bird'], dtype='object')

In [None]:
perplexity_results = {}

for summarizer in ['bart', 'pegasus', 't5', 'lead3', 'bird']:
    merged_filtered = merged[merged[summarizer] != '']
    column = list(merged_filtered[summarizer])
    perplexity_results[summarizer] = [calc_perplexity(i, unigram_model) for i in column]

In [None]:
print(np.mean(perplexity_results['bart']))
print(np.mean(perplexity_results['pegasus']))
print(np.mean(perplexity_results['t5']))
print(np.mean(perplexity_results['bird']))
print(np.mean(perplexity_results['lead3']))

print("")

print(np.median(perplexity_results['bart']))
print(np.median(perplexity_results['pegasus']))
print(np.median(perplexity_results['t5']))
print(np.median(perplexity_results['bird']))
print(np.median(perplexity_results['lead3']))

421.1422723632759
804.4497063318031
7391.0500986382685
inf
inf

374.80908203586193
586.3908850203911
1944.9207212450144
1549.32215644382
1793.5315680671715


In [None]:
for k, v in perplexity_results.items():
    results[k].append(int(np.round(np.median(perplexity_results[k]))))

In [None]:
labels = [
    'rouge1', 'rougeL',
    'bleu', 'precisions', 'brevity_penalty', 'length_ratio',
    'perplexity'
]

df = pd.DataFrame.from_dict(results)
df = df.T
df.columns = labels
df

Unnamed: 0,rouge1,rougeL,bleu,precisions,brevity_penalty,length_ratio,perplexity
bart,14.3,10.3,0.011,50.2,0.002,0.137,375.0
pegasus,17.9,12.7,0.155,50.2,0.014,0.189,586.0
t5,5.9,5.2,0.0,52.7,0.0,0.053,1945.0
bird,22.7,15.6,2.335,35.7,0.256,0.423,1549.0
lead3,26.5,16.7,3.904,41.1,0.317,0.465,1794.0
