## Evaluating with N-gram metrics (ROUGE, BLEU)

In [1]:
import pandas as pd

# CSV with the questions, the reference answers and the LLM-generated answers.
csv_path = "new_questions_with_LLM_answers.csv"
df = pd.read_csv(csv_path)

# Display the column names as the cell output.
columns = list(df.columns)
columns


['Question',
 'Source Docs',
 'Question Type',
 'Source Chunk Type',
 'Answer',
 'Generated Answers']

In [7]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk
from typing import List, Dict

def preprocess_text(text: str) -> List[str]:
    """
    Turn a piece of text into a list of lowercase word tokens.

    The input is coerced with ``str()`` (so non-string values are accepted),
    lowercased, and then split with ``nltk.word_tokenize``.
    """
    lowered = str(text).lower()
    return nltk.word_tokenize(lowered)

def compute_bleu_score(reference: str, candidate: str) -> float:
    """
    Sentence-level BLEU of ``candidate`` against a single ``reference``.

    Both texts are tokenized with ``preprocess_text``. The 1- to 4-gram
    precisions are weighted equally, and ``SmoothingFunction().method1`` is
    passed to ``sentence_bleu`` as the smoothing function.
    """
    # Equal weight on the 1-, 2-, 3- and 4-gram precisions.
    ngram_weights = (0.25, 0.25, 0.25, 0.25)

    # sentence_bleu expects a list of tokenized references; only one is given here.
    reference_list = [preprocess_text(reference)]
    hypothesis = preprocess_text(candidate)

    return sentence_bleu(
        reference_list,
        hypothesis,
        weights=ngram_weights,
        smoothing_function=SmoothingFunction().method1,
    )


def compute_rouge_scores(reference: str, candidate: str) -> Dict[str, float]:
    """
    ROUGE-1, ROUGE-2 and ROUGE-L F-measures of ``candidate`` vs ``reference``.

    Both inputs are coerced with ``str()`` before scoring. The scorer is built
    with ``use_stemmer=True``. Only the ``fmeasure`` field of each score is
    kept; the result is keyed by ``'rouge1'``, ``'rouge2'`` and ``'rougeL'``.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    result = rouge.score(str(reference), str(candidate))

    # Keep only the F-measure of each score.
    return {name: result[name].fmeasure for name in metric_names}

def _answer_text(value) -> str:
    """Return ``value`` as a string, mapping missing cells (NaN/None) to ``""``."""
    # A blank CSV cell is read as NaN; str(NaN) would otherwise be scored as
    # the literal token "nan".
    return "" if pd.isna(value) else str(value)


def evaluate_qa_metrics(csv_path: str,
                        reference_col: str = 'Answer',
                        generated_col: str = 'Generated Answers') -> pd.DataFrame:
    """
    Compute BLEU and ROUGE metrics for QA pairs in a CSV file.

    Args:
        csv_path: Path (or anything ``pd.read_csv`` accepts) of the CSV to score.
        reference_col: Column holding the reference answers.
        generated_col: Column holding the generated answers.

    Returns:
        The loaded DataFrame with four added columns: ``'BLEU'``,
        ``'ROUGE-1'``, ``'ROUGE-2'`` and ``'ROUGE-L'`` (one score per row).

    Raises:
        KeyError: If ``reference_col`` or ``generated_col`` is not in the CSV.

    Side effects:
        Calls ``nltk.download('punkt')`` and prints the column averages.

    Missing answers are scored as empty strings, the same convention
    ``QADataFrameEvaluator.evaluate_dataframe`` uses.
    """
    # NOTE(review): newer NLTK releases may require the 'punkt_tab' resource
    # for word_tokenize instead of 'punkt' -- confirm against the installed version.
    nltk.download('punkt')
    df = pd.read_csv(csv_path)

    references = [_answer_text(value) for value in df[reference_col]]
    candidates = [_answer_text(value) for value in df[generated_col]]

    bleu_scores = []
    rouge_rows = []
    for reference, generated in zip(references, candidates):
        bleu_scores.append(compute_bleu_score(reference, generated))
        rouge_rows.append(compute_rouge_scores(reference, generated))

    # Add metrics to DataFrame
    df['BLEU'] = bleu_scores
    df['ROUGE-1'] = [scores['rouge1'] for scores in rouge_rows]
    df['ROUGE-2'] = [scores['rouge2'] for scores in rouge_rows]
    df['ROUGE-L'] = [scores['rougeL'] for scores in rouge_rows]

    # Calculate average metrics
    metrics_summary = {
        'Average BLEU': df['BLEU'].mean(),
        'Average ROUGE-1': df['ROUGE-1'].mean(),
        'Average ROUGE-2': df['ROUGE-2'].mean(),
        'Average ROUGE-L': df['ROUGE-L'].mean()
    }

    print("\nMetrics Summary:")
    for metric, value in metrics_summary.items():
        print(f"{metric}: {value:.4f}")

    return df

In [8]:
# Score every QA pair in the CSV; the returned frame carries the per-row metrics.
results_df = evaluate_qa_metrics(csv_path=csv_path)

[nltk_data] Downloading package punkt to /home/nikunj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Metrics Summary:
Average BLEU: 0.0561
Average ROUGE-1: 0.2936
Average ROUGE-2: 0.1221
Average ROUGE-L: 0.1789


## BERT Scorer and Semantic Similarity

In [11]:
import pandas as pd
import numpy as np
from bert_score import BERTScorer
from sentence_transformers import SentenceTransformer
from nltk.translate.meteor_score import meteor_score
import nltk
from tqdm import tqdm
import torch
from typing import Dict

class QADataFrameEvaluator:
    """
    Embedding-based evaluation of generated answers against reference answers.

    ``evaluate_dataframe`` scores each row with the cosine similarity between
    sentence embeddings of the generated and the reference answer.
    ``calculate_meteor`` is available as a standalone per-pair metric.
    """

    def __init__(self):
        # NOTE(review): bert_scorer is constructed here but no method of this
        # class uses it; it is kept so existing attribute access keeps working.
        self.bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Presumably required by calculate_meteor (NLTK's METEOR uses WordNet) -- confirm.
        nltk.download('wordnet')

    def _batch_similarity(self, preds: list, refs: list) -> list:
        """Cosine similarity of each ``preds[i]`` / ``refs[i]`` pair, as floats."""
        pred_embeddings = torch.as_tensor(
            self.sentence_model.encode(preds), dtype=torch.float32
        )
        ref_embeddings = torch.as_tensor(
            self.sentence_model.encode(refs), dtype=torch.float32
        )
        # Default dim=1: one similarity per row of the (n, dim) embedding matrices.
        similarity = torch.nn.functional.cosine_similarity(pred_embeddings, ref_embeddings)
        return [float(score) for score in similarity]

    def semantic_similarity(self, pred: str, ref: str) -> float:
        """
        Cosine similarity between the sentence embeddings of ``pred`` and ``ref``.

        Both inputs are coerced with ``str()`` before being encoded.
        """
        return self._batch_similarity([str(pred)], [str(ref)])[0]

    def calculate_meteor(self, pred: str, ref: str) -> float:
        """
        METEOR score of ``pred`` against the single reference ``ref``.

        Both texts are coerced with ``str()`` and split on whitespace.
        """
        return meteor_score([str(ref).split()], str(pred).split())

    def evaluate_dataframe(self, df: pd.DataFrame, reference_col: str = 'Answer', generated_col: str = 'Generated Answers', batch_size: int = 32) -> pd.DataFrame:
        """
        Score every row of ``df`` and report the average semantic similarity.

        Args:
            df: Frame holding the reference and generated answers. Not modified.
            reference_col: Column with the reference answers.
            generated_col: Column with the generated answers.
            batch_size: Number of rows encoded per model call.

        Returns:
            A copy of ``df`` with an added ``'Semantic Similarity'`` column
            holding the per-row score.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
            KeyError: If either column is missing from ``df``.

        Missing answers (NaN/None) are scored as empty strings. The average is
        printed; for an empty frame it is reported as ``nan``.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        preds = ["" if pd.isna(value) else str(value) for value in df[generated_col]]
        refs = ["" if pd.isna(value) else str(value) for value in df[reference_col]]

        semantic_similarities = []

        for start in tqdm(range(0, len(df), batch_size), desc="Evaluating answers"):
            stop = start + batch_size
            # Encode the whole batch at once instead of one row per model call.
            semantic_similarities.extend(
                self._batch_similarity(preds[start:stop], refs[start:stop])
            )

        avg_scores = {
            # np.mean([]) emits a RuntimeWarning; report nan directly for empty input.
            'Average Semantic Similarity': (
                np.mean(semantic_similarities) if semantic_similarities else float("nan")
            ),
        }

        for metric, value in avg_scores.items():
            print(f"{metric}: {value:.3f}")

        result = df.copy()
        result['Semantic Similarity'] = semantic_similarities
        return result

In [12]:
# Build the evaluator, then score the reference / generated answer columns of `df`.
evaluator = QADataFrameEvaluator()
evaluator.evaluate_dataframe(
    df,
    reference_col='Answer',
    generated_col='Generated Answers',
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package wordnet to /home/nikunj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Evaluating answers: 100%|██████████| 2/2 [00:04<00:00,  2.04s/it]

Average Semantic Similarity: 0.765



