In [None]:
import os
import json
import time

from datetime import datetime

import pandas as pd
from tqdm.notebook import tqdm

from react_agent.src.util.llm_proxy import LLM_PROXY
from react_agent.src.util.tools_fabric import ToolsFabric

from react_agent.src.config.system_parameters import TriageSettings

from experiments.models.experiment_models import LabeledQAPairFacts, ExperimentResult
from experiments.metrics.fact_score.fact_scorer import FactScorer
from experiments.metrics.bert_score import BertScore
from experiments.metrics.agent_judge import AgentJudgeEvaluator

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Model name handed to LLM_PROXY.set_new_model() and stored in result.model_used.
LLM_TO_USE = "gpt-4.1"
# Model name handed to AgentJudgeEvaluator and stored in result.agent_judge_model.
AGENT_JUDGE_MODEL = "anthropic--claude-3.5-sonnet"

# Single FactScorer instance shared by every run_experiment() call.
fact_scorer = FactScorer()
# Directory of the labeled data set. The trailing slash is required: the file
# name is appended by plain string concatenation when the data set is loaded.
path_to_source_data = "./resources/02_facts/"

# Take ONE timestamp for the whole run. Calling datetime.now() separately for
# the date folder and the time prefix could straddle midnight and produce a
# folder/file-name pair that belongs to two different days.
run_started_at = datetime.now()

# Results are grouped in one folder per day (DDMMYYYY).
date_str = run_started_at.strftime("%d%m%Y")
path_to_results = f"./results/{date_str}"

# exist_ok=True avoids the check-then-create race of exists() + makedirs().
os.makedirs(path_to_results, exist_ok=True)

# Files inside the day folder are prefixed with the start time (HHMMSS).
time_str = run_started_at.strftime("%H%M%S")

result_file_name = f"{path_to_results}/{time_str}_results.json"
failed_experiments_file_name = f"{path_to_results}/{time_str}_failed_experiments.json"

In [None]:
# Absolute location of the labeled fact data set.
file_path = os.path.abspath(f"{path_to_source_data}data_set_facts.json")

# Read the raw JSON records; the file is closed before the models are built.
with open(file_path, encoding="utf8") as source_file:
    data = json.load(source_file)

# Turn every raw record into a LabeledQAPairFacts instance.
data_set = [LabeledQAPairFacts(**item) for item in data]

In [None]:
# Rows the experiment loop iterates over. This is an alias of the full
# data_set (same list object, not a copy).
data_set_for_experiment = data_set

In [None]:
async def run_experiment(data_row):
    """Ask the LLM one question directly and score the generated answer.

    The question is sent to ``LLM_PROXY`` (switched to ``LLM_TO_USE``) and the
    result is recorded with no tool usage. The answer is then scored with
    FactScore, BERTScore and an agent-as-a-judge evaluation.

    Args:
        data_row: Labeled QA item. The attributes read here are ``id``,
            ``question``, ``answer``, ``product``, ``category``, ``persona``,
            ``activity``, ``country`` and ``facts``.

    Returns:
        ExperimentResult: The populated result for ``data_row``.

    Raises:
        Exception: The judge evaluation is retried once; if the retry fails
            as well, its exception propagates. Errors raised by any other
            step propagate unchanged.
    """
    # Metadata copied one-to-one from the data row onto the result.
    copied_fields = (
        "id",
        "question",
        "answer",
        "product",
        "category",
        "persona",
        "activity",
        "country",
    )
    result = ExperimentResult(
        **{field: getattr(data_row, field) for field in copied_fields}
    )
    print(f"Processing question with id: {data_row.id}")

    # NOTE(review): the returned tools are never used below - the question is
    # sent to the LLM without tools. The call is kept in case it is needed for
    # its side effects; confirm and drop it otherwise.
    ToolsFabric.get_tools_for_category(
        use_mcp=False,
        configuration=TriageSettings.Categories.ALL,
    )

    print("\tAsking LLM question...")

    LLM_PROXY.set_new_model(LLM_TO_USE)

    # Time only the LLM invocation itself.
    started_at = time.perf_counter()
    generated_answer = LLM_PROXY.invoke(input=data_row.question)
    elapsed_seconds = time.perf_counter() - started_at

    # No tools take part in this run, so tool usage is recorded as none/zero.
    result.tools_used = None
    result.tool_calls_count = 0
    # (sic) attribute name kept exactly as the original code spells it.
    result.excecution_time_seconds = elapsed_seconds
    result.model_used = LLM_TO_USE
    result.tokens_consumed = LLM_PROXY.get_token_usage()
    result.llm_call_count = LLM_PROXY.get_call_count()
    result.facts = data_row.facts
    result.generated_answer = generated_answer

    print("\tFinished agent execution")

    print("\tCalculating FactScore...")
    result.fact_score = await fact_scorer.get_fact_score(
        facts=data_row.facts,
        knowledge_source=result.generated_answer,
    )

    print("\tCalculating BERTScore...")
    result.bert_score = BertScore.compute_score(
        expected_response=data_row.answer,
        actual_response=result.generated_answer,
    )

    print("\tEvaluating agent response with Agent as a Judge...")
    result.agent_judge_model = AGENT_JUDGE_MODEL
    agent_evaluator = AgentJudgeEvaluator(model=AGENT_JUDGE_MODEL)

    # Both judge attempts receive exactly the same arguments.
    judge_arguments = dict(
        question=data_row.question,
        expert_answer=data_row.answer,
        generated_answer=result.generated_answer,
    )
    try:
        verdict = agent_evaluator.evaluate(**judge_arguments)
    except Exception as first_error:
        # One retry only; a second failure is left to the caller.
        print(f"\tAn error occurred during agent evaluation: {first_error}")
        print("\t Trying to run evaluation again...")
        verdict = agent_evaluator.evaluate(**judge_arguments)

    result.agent_judge_outcome = verdict.answer
    result.agent_judge_reasoning = verdict.reasoning

    result.agent_judge_call_count = agent_evaluator.get_llm_call_count()
    result.agent_judge_tokens_consumed = agent_evaluator.get_token_consumption()

    print(f"\tFinished experiment for id: {data_row.id}\n")

    return result

In [None]:
# Results of the successfully processed rows. The experiment loop skips every
# id already present here, so re-running this cell resets that progress.
experiment_results: list[ExperimentResult] = []

In [None]:
for data_row in tqdm(data_set_for_experiment, desc="Running cases", unit="case"):
    if data_row.id in [result.id for result in experiment_results]:
        print(f"Skipping already processed id: {data_row.id}")
        continue
    try:
        result = await run_experiment(data_row)

        experiment_results.append(result)
    except Exception as e:
        print(f"Error processing id {data_row.id}: {e}")

In [None]:
# Serialize experiment results to list of dicts
# pandas (pd) is imported in the notebook's top import cell, so a fresh
# "Restart & Run All" resolves every dependency up front.

# Serialize each collected ExperimentResult into JSON-compatible primitives.
records = [r.model_dump(mode="json") for r in experiment_results]

# json_normalize flattens nested dict fields into separate columns.
df = pd.json_normalize(records)

df

In [None]:
# Convert to list of dictionaries (JSON objects)
json_list = df.to_dict(orient="records")

with open(result_file_name, "w") as f:
    json.dump(json_list, f, indent=2)