In [1]:
import os
import json
import random
import pandas as pd

from langchain_mcp_adapters.client import MultiServerMCPClient

from react_agent.src.agents.react_agent import ReActAgent
from react_agent.src.agents.triage import Triage

from react_agent.src.util.tools_fabric import ToolsFabric

from experiments.models.experiment_models import LabeledQAPairFacts, ExperimentResult
from experiments.fact_score.fact_scorer import FactScorer
from experiments.metrics.bert_score import BertScore
from experiments.metrics.rouge_score import RougeScore

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Experiment switches.
# USE_MCP is passed to ToolsFabric and selects the MCP-client branch of the agent run.
# DEBUG_MODE is passed to the agent and the fact scorer and gates an extra print.
DEBUG_MODE = False
USE_MCP = False

# Directory containing the fact files, relative to the current working directory.
path_to_ressources = "./resources/02_facts/"

# Data-set key -> file name inside `path_to_ressources`.
files = dict(
    extensibility="extensibility_assistance_facts.json",
    malaysia="malaysia_support_facts.json",
    peppol="peppol_support_facts.json",
    all="all_cases_facts.json",
)

# One scorer instance, reused for every experiment run in this notebook.
fact_scorer = FactScorer()

In [3]:
# Absolute path of the combined ("all") fact file. os.path.join is used instead
# of string concatenation so the path stays correct even if
# `path_to_ressources` is ever written without a trailing separator.
file_path = os.path.abspath(os.path.join(path_to_ressources, files["all"]))

# Parse the JSON payload; the handle is only needed while reading.
with open(file_path, encoding="utf8") as f:
    data = json.load(f)

# Each item of the parsed payload is unpacked as keyword arguments, so every
# item must be a mapping whose keys match the LabeledQAPairFacts fields.
data_set = [LabeledQAPairFacts(**item) for item in data]

In [4]:
# Number of QA pairs evaluated per run; increase it to cover more cases.
SAMPLE_SIZE = 1

# Draw SAMPLE_SIZE distinct elements at random. random.sample raises
# ValueError if the data set holds fewer than SAMPLE_SIZE items.
# NOTE(review): no random seed is set in the visible cells, so the selection
# differs between runs — seed `random` if reproducible runs are required.
random_data_set = random.sample(data_set, SAMPLE_SIZE)

In [None]:
triage_agent = Triage()
execution_trail = ""
experiment_results: list[ExperimentResult] = []

for data_row in random_data_set:
    result = ExperimentResult(
        id=data_row.id,
        question=data_row.question,
        answer=data_row.answer,
        product=data_row.product,
        category=data_row.category,
        persona=data_row.persona,
        activity=data_row.activity,
        country=data_row.country,
    )
    print(f"Asking agent question with id: {data_row.id}")
    query_categorization = triage_agent.triage_user_message(
        user_message=data_row.question
    )
    if DEBUG_MODE:
        print(f"Query categorization: {query_categorization}")

    tools = ToolsFabric.get_tools_for_category(
        use_mcp=USE_MCP,
        configuration=query_categorization["category"],
    )

    if USE_MCP:
        async with MultiServerMCPClient(tools) as client:
            agent = ReActAgent(tool_list=client.get_tools())
            execution_trail = await agent.arun_agent_with_input(
                user_message=query_categorization["user_query"], debug=DEBUG_MODE
            )
    else:
        agent = ReActAgent(tool_list=tools)
        execution_trail = agent.run_agent_with_input(
            user_message=query_categorization["user_query"], debug=DEBUG_MODE
        )

    run_data = agent.get_execution_data()

    result.tools_used = run_data.tools_used
    result.excecution_time_seconds = run_data.excecution_time_seconds
    result.model_used = run_data.model_used
    result.tokens_consumed = run_data.tokens_consumed
    result.llm_call_count = run_data.llm_call_count
    result.facts = data_row.facts

    print("Finished agent execution")

    print("Started FactScore calculation")
    result.fact_score = await fact_scorer.get_fact_score(
        facts=data_row.facts,
        knowledge_source=run_data.final_output,
        debug=DEBUG_MODE,
    )

    print("Started BERTScore calculation")
    result.bert_score = BertScore.compute_score(
        expected_response=data_row.answer, actual_response=run_data.final_output
    )

    print(f"Finished experiment for id: {data_row.id}\n")

    experiment_results.append(result)

Asking agent question with id: PS-3
Finished agent execution
Started FactScore calculation
Started BertScore calculation
Started Rouge calculation
Finished experiment for id: PS-3



In [6]:
# Serialise every result to a JSON-compatible dict. json_normalize then
# flattens nested entries into dotted columns (e.g. fact_score.direct_fact_score).
# NOTE(review): the recorded output shows a pydantic serializer warning here
# ("Expected `float` but got `FactScoreResult`") — the declared type of
# ExperimentResult.fact_score presumably needs updating; confirm in the model.
records = []
for experiment_result in experiment_results:
    records.append(experiment_result.model_dump(mode="json"))

df = pd.json_normalize(records)

# Bare last expression so the notebook renders the frame as a rich table.
df

  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...rocess specifically.')])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


Unnamed: 0,id,question,answer,product,category,persona,activity,country,facts,bert_score,...,model_used,llm_call_count,fact_score.direct_fact_score,fact_score.supporting_fact_score,fact_score.combined_fact_score,fact_score.direct_facts,fact_score.supporting_facts,tokens_consumed.input_tokens,tokens_consumed.output_tokens,tokens_consumed.total_tokens
0,PS-3,Incoming ZUGFeRD invoices (from suppliers) we ...,Here are some hints that might be helpful. We ...,,,,,,[{'fact': 'Extend EDOPROCFUNCASGV as indicated...,0.076698,...,gpt-4o,10,0.0,0.0,0.0,[{'fact': 'Extend EDOPROCFUNCASGV as indicated...,[{'fact': 'EDOACTIONUIPROCV is indicated in Co...,11688,641,12329
