In [21]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import BartTokenizer, BartForConditionalGeneration
from bert_score import BERTScorer
import numpy as np
from tqdm import tqdm


In [27]:
# Read the cleaned article/summary corpus
df = pd.read_csv('cleaned_data.csv')

# News categories from which evaluation examples are drawn
categories = ['tech', 'business', 'politics', 'sport', 'entertainment']

# Draw 5 rows per category (fixed seed, so the draw is repeatable) and stack
# the per-category samples in the order the categories are listed above
sampled_data = pd.concat(
    [
        df.loc[df['category'] == category].sample(n=5, random_state=42)
        for category in categories
    ],
    ignore_index=True,
)

# Persist the evaluation sample without the positional index
sampled_data.to_csv('evaluation_sample.csv', index=False)

# Preview the first rows of the sample
sampled_data.head(5)


Unnamed: 0,category,article,summary
0,tech,"Peer-to-peer (P2P) networks are here to stay, ...",But they have slowly realised that P2P is a go...
1,tech,"Dublin's hi-tech research laboratory, Media La...","In a statement, Media Labs Europe said the dec..."
2,tech,A rapid alerting service that tells home compu...,A rapid alerting service that tells home compu...
3,tech,"Faster, better or funkier hardware alone is no...",Dr Bjorn said that people also used their came...
4,tech,"First it was the humble home video, then it wa...","But currently, putting a master feature film o..."


In [28]:
def generate_bart_summary(text, model, tokenizer, max_input_length=1024,
                          max_length=150, min_length=30,
                          length_penalty=2.0, num_beams=4):
    """Generate a summary of ``text`` using the BART model.

    Args:
        text: The article text to summarise.
        model: A seq2seq model exposing ``generate`` and ``device``
            (e.g. ``BartForConditionalGeneration``).
        tokenizer: The tokenizer paired with ``model``; it is called to encode
            ``text`` and its ``decode`` method turns generated ids into text.
        max_input_length: Token limit applied to the input; longer inputs are
            truncated.
        max_length: Upper bound on the generated summary length (in tokens).
        min_length: Lower bound on the generated summary length (in tokens).
        length_penalty: Length penalty forwarded to beam search.
        num_beams: Number of beams forwarded to beam search.

    Returns:
        The decoded summary string with special tokens stripped.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
        padding=True,
    )
    # Inputs must live on the same device as the model weights, otherwise
    # generation fails as soon as the model is moved off the CPU.
    encoded = encoded.to(model.device)

    summary_ids = model.generate(
        encoded['input_ids'],
        # Forward the mask explicitly so padded positions (if any) are
        # ignored instead of relying on generate() to guess it.
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )
    # Only one text was encoded, so the first sequence is the summary.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [29]:
def evaluate_with_bert_score(generated_summaries, reference_summaries):
    """Calculate BERT Score for generated summaries against references.

    Args:
        generated_summaries: Sequence of candidate summary strings.
        reference_summaries: Sequence of reference summary strings, aligned
            position-by-position with ``generated_summaries``.

    Returns:
        dict with per-example numpy arrays under ``"precision"``,
        ``"recall"`` and ``"f1"``, plus their averages under
        ``"mean_precision"``, ``"mean_recall"`` and ``"mean_f1"``.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    predictions = list(generated_summaries)
    references = list(reference_summaries)
    if len(predictions) != len(references):
        raise ValueError(
            "generated_summaries and reference_summaries must have the same "
            f"length (got {len(predictions)} and {len(references)})"
        )

    # Use the BERTScorer class that the notebook already imports. The former
    # `load("bertscore")` call relied on a name that is never imported in this
    # file, so it raised NameError on a fresh kernel.
    scorer = BERTScorer(lang="en")

    # Calculate BERT Score; score() yields (precision, recall, f1) tensors
    # with one entry per candidate/reference pair.
    precision_t, recall_t, f1_t = scorer.score(predictions, references, verbose=True)

    # Convert to numpy for easier handling (via Python floats -> float64)
    precision = np.array(precision_t.tolist())
    recall = np.array(recall_t.tolist())
    f1 = np.array(f1_t.tolist())

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "mean_precision": precision.mean(),
        "mean_recall": recall.mean(),
        "mean_f1": f1.mean()
    }


In [31]:
def evaluate_bart_summarizer(sampled_data, text_column, reference_column, model_path='fine_tuned_bart'):
    """Summarise every row with a fine-tuned BART model and score the output.

    The model and tokenizer are loaded from ``model_path``, a summary is
    generated for each value in ``text_column``, and the summaries are scored
    against ``reference_column`` with BERT Score.

    Note: ``sampled_data`` is modified in place. The columns
    ``'Generated Summaries'``, ``'bert_precision'``, ``'bert_recall'`` and
    ``'bert_f1'`` are added to it, and the same object is returned.

    Args:
        sampled_data: DataFrame holding the texts and reference summaries.
        text_column: Name of the column with the texts to summarise.
        reference_column: Name of the column with the reference summaries.
        model_path: Path or identifier passed to ``from_pretrained`` for both
            the model and the tokenizer.

    Returns:
        ``sampled_data`` with the generated summaries and per-row scores added.

    Raises:
        KeyError: If ``text_column`` or ``reference_column`` is not a column
            of ``sampled_data``.
    """
    # Fail fast on a bad column name before paying for the model load.
    missing = [
        column for column in (text_column, reference_column)
        if column not in sampled_data.columns
    ]
    if missing:
        raise KeyError(f"Column(s) not found in sampled_data: {missing}")

    # Load the fine-tuned BART model
    model = BartForConditionalGeneration.from_pretrained(model_path)
    tokenizer = BartTokenizer.from_pretrained(model_path)

    # Generate summaries for each row. Collecting them in a list and assigning
    # the column once is positional, so it also works when the index contains
    # duplicate labels (label-based .at writes would not).
    print("Generating summaries...")
    generated_summaries = [
        generate_bart_summary(text, model, tokenizer)
        for text in tqdm(sampled_data[text_column], total=len(sampled_data))
    ]
    sampled_data['Generated Summaries'] = generated_summaries

    # Evaluate using BERT Score
    print("Calculating BERT Score...")
    bert_scores = evaluate_with_bert_score(
        generated_summaries=generated_summaries,
        reference_summaries=sampled_data[reference_column].tolist()
    )

    # Add scores to the dataframe
    sampled_data['bert_precision'] = bert_scores['precision']
    sampled_data['bert_recall'] = bert_scores['recall']
    sampled_data['bert_f1'] = bert_scores['f1']

    # Print average scores
    print(f"Average BERT Score Precision: {bert_scores['mean_precision']:.4f}")
    print(f"Average BERT Score Recall: {bert_scores['mean_recall']:.4f}")
    print(f"Average BERT Score F1: {bert_scores['mean_f1']:.4f}")

    return sampled_data

In [32]:
# Example usage: score the fine-tuned model on the saved evaluation sample
if __name__ == "__main__":
    # Evaluation sample written by the sampling cell
    df_test = pd.read_csv('evaluation_sample.csv')

    # Summarise each article and score it against its reference summary
    results = evaluate_bart_summarizer(
        df_test,
        'article',
        'summary',
        model_path='fine_tuned_bart',
    )

Generating summaries...


100%|██████████| 25/25 [01:18<00:00,  3.13s/it]


Calculating BERT Score...


Downloading builder script: 100%|██████████| 7.95k/7.95k [00:00<00:00, 14.7MB/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:12<00:00, 12.95s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 51.37it/s]

done in 47053.84 seconds, 0.00 sentences/sec
Average BERT Score Precision: 0.9222
Average BERT Score Recall: 0.8902
Average BERT Score F1: 0.9056





In [33]:
# Write the per-row summaries and BERT Score columns to disk (index omitted)
results.to_csv(path_or_buf='bertscore_evaluation_results.csv', index=False)