In [22]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import BartTokenizer, BartForConditionalGeneration
from bert_score import BERTScorer
import numpy as np
from tqdm import tqdm
from evaluate import load


### Helper Functions

---

In [23]:
def generate_bart_summary(text, model, tokenizer):
    """Generate a summary using the BART model."""
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True, padding=True)
    summary_ids = model.generate(
        inputs['input_ids'], 
        max_length=150, 
        min_length=30, 
        length_penalty=2.0, 
        num_beams=4, 
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [24]:
def evaluate_with_bert_score(generated_summaries, reference_summaries):
    """Calculate BERT Score for generated summaries against references."""
    bertscore = load("bertscore")
    
    # Calculate BERT Score
    scores = bertscore.compute(
        predictions=generated_summaries, 
        references=reference_summaries, 
        lang="en", 
        verbose=True
    )
    
    # Convert to numpy for easier handling
    precision = np.array(scores['precision'])
    recall = np.array(scores['recall'])
    f1 = np.array(scores['f1'])
    
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "mean_precision": precision.mean(),
        "mean_recall": recall.mean(),
        "mean_f1": f1.mean()
    }


In [25]:
def evaluate_bart_summarizer(sampled_data, text_column, reference_column, model_path='fine_tuned_bart'):
    # Load the fine-tuned BART model
    model = BartForConditionalGeneration.from_pretrained(model_path)
    tokenizer = BartTokenizer.from_pretrained(model_path)
    
    # Create a new column for generated summaries
    sampled_data['Generated Summaries'] = None
    
    # Generate summaries for each row
    print("Generating summaries...")
    for i, row in tqdm(sampled_data.iterrows(), total=len(sampled_data)):
        text = row[text_column]
        
        # Generate summary for this text
        generated_summary = generate_bart_summary(text, model, tokenizer)
        
        # Store the generated summary
        sampled_data.at[i, 'Generated Summaries'] = generated_summary
    
    # Evaluate using BERT Score
    print("Calculating BERT Score...")
    bert_scores = evaluate_with_bert_score(
        generated_summaries=sampled_data['Generated Summaries'].tolist(),
        reference_summaries=sampled_data[reference_column].tolist()
    )
    
    # Add scores to the dataframe
    sampled_data['bert_precision'] = bert_scores['precision']
    sampled_data['bert_recall'] = bert_scores['recall']
    sampled_data['bert_f1'] = bert_scores['f1']
    
    # Print average scores
    print(f"Average BERT Score Precision: {bert_scores['mean_precision']:.4f}")
    print(f"Average BERT Score Recall: {bert_scores['mean_recall']:.4f}")
    print(f"Average BERT Score F1: {bert_scores['mean_f1']:.4f}")
    
    return sampled_data

## Evaluation on BBC Dataset
https://www.kaggle.com/datasets/pariza/bbc-news-summary

---

In [26]:
news_df = pd.read_csv('news_test.csv',index_col=False)
news_df.head(5)

Unnamed: 0,text,summary
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ..."
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...


In [29]:
# BERT score

news_results = evaluate_bart_summarizer(
    sampled_data=news_df,
    text_column='text',  
    reference_column='summary',  
    model_path='fine_tuned_bart' 
    )

Generating summaries...


100%|██████████| 25/25 [01:20<00:00,  3.21s/it]


Calculating BERT Score...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:09<00:00,  9.77s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 78.33it/s]

done in 42980.15 seconds, 0.00 sentences/sec
Average BERT Score Precision: 0.9233
Average BERT Score Recall: 0.8914
Average BERT Score F1: 0.9069





In [30]:
news_results.to_csv('bart_news_extractive_summarization.csv', index=False)

## Evaluation on SciSumm Dataset
https://www.kaggle.com/datasets/jawakar/scisummnet-corpus/data

---

In [31]:
sci_df = pd.read_csv('sci_test.csv',index_col=False)
sci_df.head(5)

Unnamed: 0,text,summary
0,TnT - A Statistical Part-Of-Speech Tagger Trig...,TnT - A Statistical Part-Of-Speech Tagger\nTri...
1,Mildly Non-Projective Dependency Structures Sy...,Mildly Non-Projective Dependency Structures\nS...
2,Using Corpus Statistics And WordNet Relations ...,Using Corpus Statistics And WordNet Relations ...
3,Automatic Labeling Of Semantic Roles present a...,Automatic Labeling Of Semantic Roles\nWe prese...
4,Generative Models For Statistical Parsing With...,Generative Models For Statistical Parsing With...


In [32]:
# Test BART on SciTest dataset

sci_results = evaluate_bart_summarizer(
        sampled_data=sci_df,
        text_column='text',  
        reference_column='summary',  
        model_path='fine_tuned_bart'
        )

Generating summaries...


100%|██████████| 25/25 [02:28<00:00,  5.93s/it]


Calculating BERT Score...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:15<00:00, 15.78s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 55.52it/s]

done in 43258.83 seconds, 0.00 sentences/sec
Average BERT Score Precision: 0.8595
Average BERT Score Recall: 0.8569
Average BERT Score F1: 0.8578





In [33]:
# Save results
sci_results.to_csv('bart_science_extractive_summarization.csv', index=False)