In [None]:
# Copyright (c) 2025 Microsoft Corporation.
import os
from typing import cast

import pandas as pd
from rich import print as rich_print

from benchmark_qed.autoe.score import analyze_criteria, get_pairwise_scores
from benchmark_qed.cli.utils import print_df
from benchmark_qed.config.llm_config import LLMConfig, LLMProvider
from benchmark_qed.config.model.score import pairwise_scores_criteria
from benchmark_qed.llm.factory import ModelFactory

In [None]:
import nest_asyncio

# Patch asyncio so an event loop can be re-entered. Jupyter kernels already
# run one, and the scoring helpers below presumably drive async LLM calls
# from synchronous code -- TODO confirm against get_pairwise_scores.
nest_asyncio.apply()

In [None]:
# Load the dotenv IPython extension and read variables from a local .env file
# into the process environment. OPENAI_API_KEY is read from os.environ in the
# next cell, so it is expected to be defined there (or already exported).
%load_ext dotenv
%dotenv

In [None]:
# Chat model configuration shared by every scoring call in this notebook.
# The API key is read from the environment so it never lands in the notebook;
# a missing key raises KeyError here rather than failing later mid-run.
llm_config = LLMConfig(
    model="o3-mini",
    api_key=os.environ["OPENAI_API_KEY"],
    llm_provider=LLMProvider.OpenAIChat,
    concurrent_requests=100,
    # o3-mini is a reasoning model: the OpenAI API rejects sampling parameters
    # such as `temperature` for it, so no sampling overrides are sent.
    # An empty dict is passed explicitly in case LLMConfig's default call_args
    # include sampling parameters -- TODO confirm against LLMConfig.
    call_args={},
)
llm_client = ModelFactory.create_chat_model(llm_config)

In [None]:
# Experiment parameters -- edit here, then re-run the cells below.
trials = 4  # forwarded to get_pairwise_scores(trials=...)
alpha = 0.05  # forwarded to analyze_criteria(alpha=...); presumably the significance threshold

# Criteria set used for every pairwise comparison.
criteria = pairwise_scores_criteria()

# Method names double as directory names under ./autoe_example_data/.
base = "vector_rag"
others = ["drift_search", "global_search_C2"]

# Question-set names double as JSON file stems inside each method directory.
question_sets = ["activity_global", "activity_local"]

In [None]:
# Score the base method against each competitor on each question set,
# then analyse the pooled results and print a summary table.
all_results = []
for question_set in question_sets:
    for other in others:
        rich_print(f"Processing {base} vs {other} for question set: {question_set}")

        # Answers are stored as one JSON file per method and question set.
        base_answers_df = pd.read_json(
            f"./autoe_example_data/{base}/{question_set}.json"
        )
        other_answers_df = pd.read_json(
            f"./autoe_example_data/{other}/{question_set}.json"
        )

        result = get_pairwise_scores(
            llm_client=llm_client,
            llm_config=llm_config,
            base_name=base,
            other_name=other,
            base_answers=base_answers_df,
            other_answers=other_answers_df,
            criteria=criteria,
            trials=trials,
        )

        # Tag rows with their question set so they stay distinguishable
        # once all results are concatenated.
        result["question_set"] = question_set
        all_results.append(result)

all_results_df = pd.concat(all_results, ignore_index=True)
p_value = analyze_criteria(all_results_df, alpha=alpha)

# Columns shown in the printed summary, in display order.
summary_columns = [
    "question_set",
    "criteria",
    "base_name",
    "other_name",
    "base_mean",
    "other_mean",
    "formatted_corrected_p_value",
]
print_df(
    cast(pd.DataFrame, p_value[summary_columns]),
    "Pairwise Scores Summary",
)

In [None]:
# Report the usage figures the client has accumulated so far.
rich_print("Model usage statistics:")
usage_stats = llm_client.get_usage()
rich_print(usage_stats)