In [1]:
import os
import json
import random
import pandas as pd

from tqdm.notebook import tqdm

from langchain_mcp_adapters.client import MultiServerMCPClient

from react_agent.src.agents.react_agent import ReActAgent
from react_agent.src.agents.triage import Triage

from react_agent.src.util.tools_fabric import ToolsFabric

from experiments.models.experiment_models import LabeledQAPairFacts, ExperimentResult
from experiments.fact_score.fact_scorer import FactScorer
from experiments.metrics.bert_score import BertScore
from experiments.metrics.llm_judge import LLMAsJudgeEvaluator

In [2]:
# --- Run configuration -------------------------------------------------------
# USE_MCP is handed to ToolsFabric and selects the async MCP code path in the
# experiment loop; DEBUG_MODE is forwarded as `debug=` to the agent and scorer.
USE_MCP = False
DEBUG_MODE = False

# Model name passed to LLMAsJudgeEvaluator and recorded on every result.
LLM_JUDGE_MODEL = "gpt-4o-mini"

# Shared scorer instance reused for every experiment row.
fact_scorer = FactScorer()

# Directory holding the labelled fact files (relative to the working directory).
path_to_ressources = "./resources/02_facts/"

# Short data set name -> file name inside `path_to_ressources`.
files = dict(
    extensibility="extensibility_assistance_facts.json",
    malaysia="malaysia_support_facts.json",
    peppol="peppol_support_facts.json",
    all="all_cases_facts.json",
)

In [3]:
# Resolve the selected fact file to an absolute path. os.path.join is used
# instead of string concatenation so the result stays correct whether or not
# `path_to_ressources` ends with a path separator.
file_path = os.path.abspath(os.path.join(path_to_ressources, files["all"]))

# Read the raw JSON; the file handle is only needed for the parse itself.
with open(file_path, encoding="utf8") as f:
    data = json.load(f)

# Turn every raw JSON item into a LabeledQAPairFacts instance (keys are passed
# as keyword arguments), after the file has already been closed.
data_set = [LabeledQAPairFacts(**item) for item in data]

In [4]:
# Draw a small random subset of the labelled cases to keep the run short.
SAMPLE_SIZE = 2
# None keeps the draw non-deterministic (seeded from system entropy);
# set an int to get the same subset on every Restart & Run All.
SAMPLE_SEED = None

# Cap the size at the data set length: sample() raises ValueError when asked
# for more elements than the population contains.
random_data_set = random.Random(SAMPLE_SEED).sample(
    data_set, min(SAMPLE_SIZE, len(data_set))
)

In [5]:
triage_agent = Triage()
execution_trail = ""
experiment_results: list[ExperimentResult] = []

for data_row in tqdm(random_data_set, desc="Running experiments", unit="experiment"):
    result = ExperimentResult(
        id=data_row.id,
        question=data_row.question,
        answer=data_row.answer,
        product=data_row.product,
        category=data_row.category,
        persona=data_row.persona,
        activity=data_row.activity,
        country=data_row.country,
    )
    print(f"Asking agent question with id: {data_row.id}")
    query_categorization = triage_agent.triage_user_message(
        user_message=data_row.question
    )
    if DEBUG_MODE:
        print(f"Query categorization: {query_categorization}")

    tools = ToolsFabric.get_tools_for_category(
        use_mcp=USE_MCP,
        configuration=query_categorization["category"],
    )

    if USE_MCP:
        async with MultiServerMCPClient(tools) as client:
            agent = ReActAgent(tool_list=client.get_tools())
            execution_trail = await agent.arun_agent_with_input(
                user_message=query_categorization["user_query"], debug=DEBUG_MODE
            )
    else:
        agent = ReActAgent(tool_list=tools)
        execution_trail = agent.run_agent_with_input(
            user_message=query_categorization["user_query"], debug=DEBUG_MODE
        )

    run_data = agent.get_execution_data()

    result.tools_used = run_data.tools_used
    result.excecution_time_seconds = run_data.excecution_time_seconds
    result.model_used = run_data.model_used
    result.tokens_consumed = run_data.tokens_consumed
    result.llm_call_count = run_data.llm_call_count
    result.facts = data_row.facts
    result.generated_answer = run_data.final_output

    print("Finished agent execution")

    print("Started FactScore calculation")
    result.fact_score = await fact_scorer.get_fact_score(
        facts=data_row.facts,
        knowledge_source=result.generated_answer,
        debug=DEBUG_MODE,
    )

    print("Started BERTScore calculation")
    result.bert_score = BertScore.compute_score(
        expected_response=data_row.answer, actual_response=result.generated_answer
    )

    print("Stated LLM as a Judge calculation")
    result.llm_judge_model = LLM_JUDGE_MODEL
    llm_evaluator = LLMAsJudgeEvaluator(model=LLM_JUDGE_MODEL)

    result.llm_judge_outcome = llm_evaluator.evaluate(
        question=data_row.question, generated_answer=result.generated_answer
    )

    print(f"Finished experiment for id: {data_row.id}\n")

    experiment_results.append(result)

Running experiments:   0%|          | 0/2 [00:00<?, ?experiment/s]

Asking agent question with id: EA-7
Finished agent execution
Started FactScore calculation
Started BERTScore calculation
Stated LLM as a Judge calculation
Finished experiment for id: EA-7

Asking agent question with id: EA-8
Finished agent execution
Started FactScore calculation
Started BERTScore calculation
Stated LLM as a Judge calculation
Finished experiment for id: EA-8



In [6]:
# Dump every result to JSON-compatible dicts, then flatten nested objects
# (e.g. fact_score.*, tokens_consumed.*) into dotted columns.
records = [
    experiment.model_dump(mode="json")
    for experiment in experiment_results
]
df = pd.json_normalize(data=records)

# Bare expression so the notebook renders the frame as a rich table.
df

  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...)], supporting_facts=[])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...lectronic documents.')])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


Unnamed: 0,id,question,answer,product,category,persona,activity,country,facts,bert_score,...,excecution_time_seconds,llm_call_count,fact_score.direct_fact_score,fact_score.supporting_fact_score,fact_score.combined_fact_score,fact_score.direct_facts,fact_score.supporting_facts,tokens_consumed.input_tokens,tokens_consumed.output_tokens,tokens_consumed.total_tokens
0,EA-7,Can I extend an Action in eDocument Cockpit fo...,"No, Actions can not be extended in Public Cloud",,,,,,[{'fact': 'Actions cannot be extended in Publi...,-0.100867,...,36.183,1,0.0,0.0,0.0,[{'fact': 'Actions cannot be extended in Publi...,[],5247,688,5935
1,EA-8,Can I extend an Action in eDocument Cockpit fo...,"Yes, you can extend an action in the eDocument...",,,,,,[{'fact': 'You can extend an action in the eDo...,0.080772,...,32.804,1,1.0,0.166667,0.285714,[{'fact': 'You can extend an action in the eDo...,[{'fact': 'A function module can be configured...,5247,644,5891
