# Calculate EM, Precision, Recall, and F1 Score for PQuad Baseline Results

In [3]:
import pandas as pd
import json
import re
import string
from collections import Counter
import numpy as np

In [4]:
df = pd.read_csv('output/Baseline/google_api/pqaud_evaluated_results_baseline.csv')

In [5]:
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation + "،")
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def convert_digits_en2fa(text):
    english_digits = '0123456789'
    persian_digits = '۰۱۲۳۴۵۶۷۸۹'
    
    translation_table = str.maketrans(english_digits, persian_digits)
    return text.translate(translation_table)


def parse_result(gold, generated_text) -> bool:
    if pd.isna(gold) or pd.isna(generated_text):
        return False
        
    gold = str(gold)
    generated_text = str(generated_text)
    
    gold = convert_digits_en2fa(gold)
    generated_text = convert_digits_en2fa(generated_text)

    if gold in generated_text:
        return True
    if gold == "بلی":
        if "بله" in generated_text:
            return True

    return False

In [6]:
def clean_model_answer(text):
    if pd.isna(text):
        return text
    
    text = str(text)
    # Remove <ANSWER> and </ANSWER> tags
    text = re.sub(r'<ANSWER>', '', text)
    text = re.sub(r'</ANSWER>', '', text)
    # Clean up any extra whitespace
    text = text.strip()
    
    return text

In [7]:
def f1_score(prediction, ground_truth):
    # Handle None/NaN values
    if pd.isna(prediction) or pd.isna(ground_truth):
        return 0, 0, 0
    
    prediction = str(prediction)
    ground_truth = str(ground_truth)
    
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    ZERO_METRIC = (0, 0, 0)

    # Handle special cases for yes/no answers
    if normalized_prediction in ['بله', 'خیر', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC
    if normalized_ground_truth in ['بله', 'خیر', 'noanswer'] and normalized_prediction != normalized_ground_truth:
        return ZERO_METRIC

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    
    # If either is empty, return zeros
    if len(prediction_tokens) == 0 or len(ground_truth_tokens) == 0:
        return ZERO_METRIC
    
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    
    if num_same == 0:
        return ZERO_METRIC
    
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    
    return f1, precision, recall


def exact_match_score(prediction, ground_truth):
    # Handle None/NaN values
    if pd.isna(prediction) or pd.isna(ground_truth):
        return False
    
    prediction = str(prediction)
    ground_truth = str(ground_truth)
    
    return normalize_answer(prediction) == normalize_answer(ground_truth)

In [8]:
# Calculate metrics for each row
f1_scores = []
precisions = []
recalls = []
exact_matches = []

for idx, row in df.iterrows():
    prediction = clean_model_answer(row['model_answer'])
    ground_truth = row['answer']
    
    # Calculate F1, precision, recall
    f1, precision, recall = f1_score(prediction, ground_truth)
    f1_scores.append(f1)
    precisions.append(precision)
    recalls.append(recall)
    
    # Calculate exact match
    em = exact_match_score(prediction, ground_truth)
    exact_matches.append(em)

# Add to dataframe
df['f1_score'] = f1_scores
df['precision'] = precisions
df['recall'] = recalls
df['exact_match'] = exact_matches


In [11]:
# Calculate overall metrics
overall_f1 = np.mean(f1_scores)
overall_precision = np.mean(precisions)
overall_recall = np.mean(recalls)
overall_em = np.mean(exact_matches)

print("=" * 60)
print("MHQA Baseline Evaluation Results")
print("=" * 60)
print(f"Total samples: {len(df)}")
print("-" * 40)
print(f"Exact Match (EM): {overall_em:.4f} ({overall_em*100:.2f}%)")
print(f"F1 Score:         {overall_f1:.4f}")
print(f"Precision:        {overall_precision:.4f}")
print(f"Recall:           {overall_recall:.4f}")


MHQA Baseline Evaluation Results
Total samples: 500
----------------------------------------
Exact Match (EM): 0.0760 (7.60%)
F1 Score:         0.2034
Precision:        0.2405
Recall:           0.2099
