# Calculate EM, Precision, Recall, and F1 Score for PQuad HippoRAG Results


In [15]:
import pandas as pd
import json
import re
import string
from collections import Counter
import numpy as np

In [16]:
df = pd.read_csv('output/HippoRAG/HippoRAG_PQuad/evaluated_results_no_reasoning_RAG.csv')

In [17]:
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation + "،")
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def convert_digits_en2fa(text):
    english_digits = '0123456789'
    persian_digits = '۰۱۲۳۴۵۶۷۸۹'
    
    translation_table = str.maketrans(english_digits, persian_digits)
    return text.translate(translation_table)


def parse_result(gold, generated_text) -> bool:
    if pd.isna(gold) or pd.isna(generated_text):
        return False
        
    gold = str(gold)
    generated_text = str(generated_text)
    
    gold = convert_digits_en2fa(gold)
    generated_text = convert_digits_en2fa(generated_text)

    if gold in generated_text:
        return True
    if gold == "بلی":
        if "بله" in generated_text:
            return True

    return False

In [18]:
def f1_score(prediction, ground_truth):
    # Handle None/NaN values
    if pd.isna(prediction) or pd.isna(ground_truth):
        return 0, 0, 0
    
    prediction = str(prediction)
    ground_truth = str(ground_truth)
    
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    ZERO_METRIC = (0, 0, 0)

    # Handle special cases for yes/no answers
    if normalized_prediction in ['بله', 'خیر', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC
    if normalized_ground_truth in ['بله', 'خیر', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    
    # If either is empty, return zeros
    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return ZERO_METRIC
    
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    
    if num_same == 0:
        return ZERO_METRIC
    
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    
    return f1, precision, recall


def exact_match_score(prediction, ground_truth):
    # Handle None/NaN values
    if pd.isna(prediction) or pd.isna(ground_truth):
        return False
    
    prediction = str(prediction)
    ground_truth = str(ground_truth)
    
    return normalize_answer(prediction) == normalize_answer(ground_truth)

In [19]:
em_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
parse_result_scores = []

# Calculate metrics for each row
for index, row in df.iterrows():
    answer = row['answer']
    model_answer = row['model_answer'] 
    
    # Calculate EM
    em = exact_match_score(model_answer, answer)
    em_scores.append(em)
    
    # Calculate F1, Precision, Recall
    f1, prec, rec = f1_score(model_answer, answer)
    f1_scores.append(f1)
    precision_scores.append(prec)
    recall_scores.append(rec)
    
    # Calculate parse result (substring matching)
    parse_res = parse_result(answer, model_answer)
    parse_result_scores.append(parse_res)

# Add the calculated metrics to the dataframe
df['em_score'] = em_scores
df['f1_score'] = f1_scores
df['precision_score'] = precision_scores
df['recall_score'] = recall_scores
df['parse_result'] = parse_result_scores

In [20]:
# Calculate overall aggregated metrics
total_questions = len(df)

# Filter out questions with empty/null answers for meaningful metric calculation
valid_questions = df.dropna(subset=['answer'])
valid_questions = valid_questions[valid_questions['answer'] != '']

print(f"Total questions: {total_questions}")
print(f"Questions with valid answers: {len(valid_questions)}")
print(f"Questions with empty/null answers: {total_questions - len(valid_questions)}")


# Calculate metrics on valid questions only
overall_em = valid_questions['em_score'].mean()
overall_f1 = valid_questions['f1_score'].mean()
overall_precision = valid_questions['precision_score'].mean()
overall_recall = valid_questions['recall_score'].mean()
overall_parse_accuracy = valid_questions['parse_result'].mean()

overall_is_correct = valid_questions['is_correct'].mean()

print(f"\n=== Overall Metrics (on {len(valid_questions)} valid questions) ===")
print(f"Exact Match (EM): {overall_em:.4f}")
print(f"F1 Score: {overall_f1:.4f}")
print(f"Precision: {overall_precision:.4f}")
print(f"Recall: {overall_recall:.4f}")
print(f"Parse Accuracy: {overall_parse_accuracy:.4f}")
print(f"LLM Judge Accuracy (is_correct): {overall_is_correct:.4f}")


Total questions: 500
Questions with valid answers: 404
Questions with empty/null answers: 96

=== Overall Metrics (on 404 valid questions) ===
Exact Match (EM): 0.1460
F1 Score: 0.3083
Precision: 0.3260
Recall: 0.3312
Parse Accuracy: 0.2252
LLM Judge Accuracy (is_correct): 0.3564
