# HotpotQA Evaluation

This notebook evaluates LLM performance on the HotpotQA dataset: it loads the test set, queries a model for each question, and scores the predictions with exact-match (EM) and token-level F1 metrics.

In [23]:
# --- 1. Metric Functions ---
import re
import string
from collections import Counter
import os
from openai import OpenAI

# Read the key up front and fail fast when it is missing. Passing
# api_key=None would make the OpenAI client fall back to the OPENAI_API_KEY
# environment variable, so an unrelated credential could be sent to the
# OpenRouter endpoint configured below.
_openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not _openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; export it before running this notebook."
    )

# The OpenAI client is pointed at the OpenRouter endpoint through base_url.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=_openrouter_api_key,
)

def normalize_answer(s):
    """Canonicalize an answer string for metric comparison.

    Steps, applied in this order: lowercase, drop every character found in
    ``string.punctuation``, replace the standalone words "a", "an" and "the"
    with a space, then collapse all runs of whitespace to single spaces and
    trim the ends.

    Args:
        s: The raw answer text.

    Returns:
        The normalized string (possibly empty).
    """
    lowered = s.lower()

    # Punctuation is removed outright (not replaced by a space), so "a-b"
    # becomes "ab".
    punctuation = set(string.punctuation)
    unpunctuated = ''.join(ch for ch in lowered if ch not in punctuation)

    # Word boundaries keep words such as "theatre" intact.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', unpunctuated)

    return ' '.join(without_articles.split())

def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and a reference answer.

    Both strings are passed through ``normalize_answer`` and split on
    whitespace; the overlap is the multiset intersection of the two token
    lists.

    Args:
        prediction: The model's answer text.
        ground_truth: The reference answer text.

    Returns:
        A float in [0.0, 1.0]. 0.0 is returned when no token is shared, or
        when either side normalizes to "yes", "no" or "noanswer" and the two
        normalized strings are not identical.
    """
    pred_text = normalize_answer(prediction)
    gold_text = normalize_answer(ground_truth)

    # These answers are all-or-nothing: partial token overlap with a longer
    # sentence must not earn credit.
    special_answers = ('yes', 'no', 'noanswer')
    if pred_text != gold_text and (
        pred_text in special_answers or gold_text in special_answers
    ):
        return 0.0

    pred_tokens = pred_text.split()
    gold_tokens = gold_text.split()

    # Counter & Counter keeps each token at the minimum of its two counts.
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        # Also covers empty token lists, so the divisions below are safe.
        return 0.0

    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    """Return True when both answers are identical after ``normalize_answer``.

    Args:
        prediction: The model's answer text.
        ground_truth: The reference answer text.

    Returns:
        bool: whether the two normalized strings are equal.
    """
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_ground_truth

# --- 2. Dataset Loading (project-style) ---

import yaml

# Prompt templates for the run. The inference loop reads the "system" and
# "user" entries of this mapping. The encoding is pinned so the file is read
# the same way regardless of the platform's default locale.
with open('hotpot_qa.yaml', encoding='utf-8') as stream:
    prompt = yaml.safe_load(stream)

import sys
# Make the project importable; adjust as needed for your project structure.
# Guarded so re-running the cell does not keep appending duplicate entries.
if "../../" not in sys.path:
    sys.path.append("../../")

from src.prompt_ops.datasets.hotpotqa.adapter import HotpotQAAdapter

# Path to the test set. Set the HOTPOTQA_PATH environment variable to point
# somewhere else; by default the file is expected in the working directory
# (the same place 'hotpot_qa.yaml' is read from) rather than at a
# machine-specific absolute path.
hotpotqa_path = os.environ.get("HOTPOTQA_PATH", "hotpot_dev_distractor_v1.json")
adapter = HotpotQAAdapter(hotpotqa_path)
dataset_test = adapter.adapt()


# --- 3. Model Inference Loop ---
from tqdm.auto import tqdm

# Tunable run parameters, kept in one place instead of inline literals.
MODEL_NAME = "meta-llama/llama-3.3-70b-instruct"
N_EVAL_EXAMPLES = 30  # number of leading dataset entries to evaluate

results = []
for entry in tqdm(dataset_test[:N_EVAL_EXAMPLES]):
    # entry["inputs"] fills the placeholders of the user prompt template;
    # entry["outputs"]["answer"] is the reference answer.
    output = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": prompt["system"]},
            {"role": "user", "content": prompt["user"].format(**entry["inputs"])},
        ],
        temperature=0.0,
    )
    # The message content can be None; treat that as an empty answer so the
    # metric functions (which call str methods) score it 0 instead of raising.
    prediction = output.choices[0].message.content or ""
    gt = entry["outputs"]["answer"]
    f1 = f1_score(prediction, gt)
    em = float(exact_match_score(prediction, gt))
    results.append({"question": entry["inputs"]["question"], "f1": f1, "em": em, "prediction": prediction, "ground_truth": gt})

# --- 4. Aggregate and Display Results ---
import pandas as pd

# One row per evaluated question, built from the dicts collected in `results`.
df = pd.DataFrame(results)

# Report the average of each metric column.
for label, column in (("Mean F1:", "f1"), ("Mean EM:", "em")):
    print(label, df[column].mean())

# Trailing expression: the notebook renders the first five rows as a table.
df.head()

100%|██████████| 30/30 [00:29<00:00,  1.01it/s]

Mean F1: 0.7524603174603175
Mean EM: 0.6333333333333333





Unnamed: 0,question,f1,em,prediction,ground_truth
0,Were Scott Derrickson and Ed Wood of the same nationality?,1.0,1.0,Yes,yes
1,What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?,1.0,1.0,Chief of Protocol,Chief of Protocol
2,"What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?",1.0,1.0,Animorphs,Animorphs
3,Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?,1.0,1.0,No,no
4,"The director of the romantic comedy ""Big Stone Gap"" is based in what New York city?",0.75,0.0,New York City,"Greenwich Village, New York City"


In [20]:
# --- 4. Aggregate and Display Results ---
import pandas as pd

# Lift the pandas display limits so no row, column, or cell text is truncated.
# pd.set_option changes the setting globally, so it stays in effect for the
# rest of the kernel session.
for option_name in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(option_name, None)

df = pd.DataFrame(results)

mean_f1 = df["f1"].mean()
print("Mean F1:", mean_f1)
mean_em = df["em"].mean()
print("Mean EM:", mean_em)

# Trailing expression: the notebook renders the whole DataFrame as a table.
df

Mean F1: 0.27261839084594397
Mean EM: 0.13333333333333333


Unnamed: 0,question,f1,em,prediction,ground_truth
0,Were Scott Derrickson and Ed Wood of the same nationality?,0.0,0.0,"Yes, Scott Derrickson and Ed Wood were of the same nationality; both were American.",yes
1,What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?,1.0,1.0,Chief of Protocol,Chief of Protocol
2,"What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?",0.08,0.0,"The science fantasy young adult series told in first person, with a set of companion books narrating the stories of enslaved worlds and alien species, is ""Animorphs.""",Animorphs
3,Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?,1.0,1.0,No,no
4,"The director of the romantic comedy ""Big Stone Gap"" is based in what New York city?",1.0,1.0,"Greenwich Village, New York City","Greenwich Village, New York City"
5,2014 S/S is the debut album of a South Korean boy group that was formed by who?,0.222222,0.0,"The South Korean boy group WINNER, whose debut album is ""2014 S/S,"" was formed by YG Entertainment.",YG Entertainment
6,Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?,0.173913,0.0,"Eenasul Fateh, also known by his stage name Aladin, was a Bangladeshi-British cultural practitioner, magician, live artist, and former international management consultant.",Eenasul Fateh
7,The arena where the Lewiston Maineiacs played their home games can seat how many people?,0.117647,0.0,"The arena where the Lewiston Maineiacs played their home games, the Androscoggin Bank Colisée, can seat 3,677 people.","3,677 seated"
8,"Who is older, Annie Morton or Terry Richardson?",0.173913,0.0,"Terry Richardson is older. Annie Morton was born on October 8, 1970, and Terry Richardson was born on August 14, 1965.",Terry Richardson
9,Are Local H and For Against both from the United States?,0.0,0.0,"Yes, both Local H and For Against are from the United States. Local H is from Zion, Illinois, and For Against is from Lincoln, Nebraska.",yes
