In [1]:
import os
import json
import random
import pandas as pd

from tqdm.notebook import tqdm

from langchain_mcp_adapters.client import MultiServerMCPClient

from react_agent.src.agents.react_agent import ReActAgent
from react_agent.src.agents.triage import Triage

from react_agent.src.util.tools_fabric import ToolsFabric

from experiments.models.experiment_models import LabeledQAPairFacts, ExperimentResult
from experiments.fact_score.fact_scorer import FactScorer
from experiments.metrics.bert_score import BertScore
from experiments.metrics.llm_judge import LLMAsJudgeEvaluator

In [2]:
# --- Run-time switches -------------------------------------------------------
# When True the experiment loop obtains its tools through a
# MultiServerMCPClient and runs the agent asynchronously; otherwise the tools
# are handed to the agent directly.
USE_MCP = False
# Forwarded as `debug=` to the agent and the fact scorer, and gates the extra
# triage print in the experiment loop.
DEBUG_MODE = True

# Model name passed to the LLM-as-judge evaluator (alternative: "gpt-4o").
LLM_JUDGE_MODEL = "gemini-2.0-flash"

# --- Input data --------------------------------------------------------------
# Directory containing the JSON data set files listed in `files`.
path_to_ressources = "./resources/02_facts/"

# Short data set name -> file name inside `path_to_ressources`.
files = dict(
    extensibility="extensibility_assistance_facts.json",
    malaysia="malaysia_support_facts.json",
    peppol="peppol_support_facts.json",
    all="all_cases_facts.json",
)

# --- Shared helpers ----------------------------------------------------------
# Single scorer instance reused for every experiment.
fact_scorer = FactScorer()

In [3]:
# Resolve the combined ("all") data set file. os.path.join is used instead of
# string concatenation so the result stays correct whether or not the
# directory setting ends with a path separator.
file_path = os.path.abspath(os.path.join(path_to_ressources, files["all"]))

# Read the raw JSON payload; the file is only needed for parsing.
with open(file_path, encoding="utf8") as f:
    data = json.load(f)

# Build one LabeledQAPairFacts per JSON entry (each entry is unpacked as
# keyword arguments). Done after the file is closed so the handle is not held
# open while the models are constructed.
data_set = [LabeledQAPairFacts(**item) for item in data]

In [4]:
# Fixed seed so that Restart & Run All evaluates the same questions each time.
SAMPLE_SEED = 42
# Maximum number of questions to evaluate in one run.
SAMPLE_SIZE = 30

# A dedicated Random instance keeps the draw reproducible without touching the
# global random state. The size is clamped because sampling more items than
# the population holds raises ValueError.
random_data_set = random.Random(SAMPLE_SEED).sample(
    data_set, min(SAMPLE_SIZE, len(data_set))
)

In [5]:
triage_agent = Triage()
execution_trail = ""
experiment_results: list[ExperimentResult] = []

for data_row in tqdm(random_data_set, desc="Running experiments", unit="experiment"):
    result = ExperimentResult(
        id=data_row.id,
        question=data_row.question,
        answer=data_row.answer,
        product=data_row.product,
        category=data_row.category,
        persona=data_row.persona,
        activity=data_row.activity,
        country=data_row.country,
    )
    print(f"Asking agent question with id: {data_row.id}")
    query_categorization = triage_agent.triage_user_message(
        user_message=data_row.question
    )
    if DEBUG_MODE:
        print(f"Query categorization: {query_categorization}")

    result.triage_category = query_categorization["category"]

    tools = ToolsFabric.get_tools_for_category(
        use_mcp=USE_MCP,
        configuration=result.triage_category,
    )

    if USE_MCP:
        async with MultiServerMCPClient(tools) as client:
            agent = ReActAgent(tool_list=client.get_tools())
            execution_trail = await agent.arun_agent_with_input(
                user_message=data_row.question, debug=DEBUG_MODE
            )
    else:
        agent = ReActAgent(tool_list=tools)
        execution_trail = agent.run_agent_with_input(
            user_message=data_row.question, debug=DEBUG_MODE
        )

    run_data = agent.get_execution_data()

    result.tools_used = run_data.tools_used
    result.tool_calls_count = len(run_data.tools_used)
    result.excecution_time_seconds = run_data.excecution_time_seconds
    result.model_used = run_data.model_used
    result.tokens_consumed = run_data.tokens_consumed
    result.llm_call_count = run_data.llm_call_count
    result.facts = data_row.facts
    result.generated_answer = run_data.final_output

    print("Finished agent execution")

    print("Started FactScore calculation")
    result.fact_score = await fact_scorer.get_fact_score(
        facts=data_row.facts,
        knowledge_source=result.generated_answer,
        debug=DEBUG_MODE,
    )

    print("Started BERTScore calculation")
    result.bert_score = BertScore.compute_score(
        expected_response=data_row.answer, actual_response=result.generated_answer
    )

    print("Stated LLM as a Judge calculation")
    result.llm_judge_model = LLM_JUDGE_MODEL
    llm_evaluator = LLMAsJudgeEvaluator(model=LLM_JUDGE_MODEL)

    result.llm_judge_outcome = llm_evaluator.evaluate(
        question=data_row.question, generated_answer=result.generated_answer
    )

    result.llm_judge_call_count = llm_evaluator.get_llm_call_count()
    result.llm_judge_tokens_consumed = llm_evaluator.get_token_consumption()

    print(f"Finished experiment for id: {data_row.id}\n")

    experiment_results.append(result)

Running experiments:   0%|          | 0/30 [00:00<?, ?experiment/s]

Asking agent question with id: EA-41
Query categorization: {'user_query': 'In DRC extensibility, what are the supported countries for external documents created in external systems?', 'category': 'Knowledge-QA'}

In DRC extensibility, what are the supported countries for external documents created in external systems?

Initial Observation: The user is asking about the supported countries for external documents created in external systems within the SAP Document and Reporting Compliance (DRC) extensibility framework.

Thought: To answer this question, I need to find specific information about the countries supported by SAP DRC for external documents. This information is likely detailed in SAP documentation or help articles.

Action Plan: 
1. Use the `sap_documentation_summary` tool to gather detailed information about DRC extensibility and supported countries.
2. Validate the findings using the `sap_help_lookup` tool to ensure accuracy and completeness.

Action: Call the `sap_documentat

In [6]:
# Serialise every experiment result into a JSON-compatible dict ...
records = list(
    map(lambda outcome: outcome.model_dump(mode="json"), experiment_results)
)

# ... and flatten nested dicts into dotted column names
# (e.g. "tokens_consumed.total_tokens").
df = pd.json_normalize(data=records)

# Bare last expression so the notebook renders the frame.
df

  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...)], supporting_facts=[])` - serialized value may not be as expected
  Expected `enum` but got `str` with value `'Knowledge-QA'` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...align with the fact.')])` - serialized value may not be as expected
  Expected `enum` but got `str` with value `'Knowledge-QA'` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...on for more details.')])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...ent eDocument types.')])` - serialized value may not be as expected
  Expected `enum` but got `str` with 

Unnamed: 0,id,question,answer,product,category,persona,activity,country,facts,bert_score,...,fact_score.supporting_fact_score,fact_score.combined_fact_score,fact_score.direct_facts,fact_score.supporting_facts,llm_judge_tokens_consumed.input_tokens,llm_judge_tokens_consumed.output_tokens,llm_judge_tokens_consumed.total_tokens,tokens_consumed.input_tokens,tokens_consumed.output_tokens,tokens_consumed.total_tokens
0,EA-41,"In DRC extensibility, what are the supported c...",The supported countres for external documents ...,,,,,,[{'fact': 'Malaysia is a supported country for...,-0.133135,...,0.0,0.0,[{'fact': 'Malaysia is a supported country for...,[],719,2,721,12980,895,13875
1,EA-15,"As a Public Cloud customer in Germany, can I r...","Yes, as a Public Cloud customer in Germany, yo...",,,,,,[{'fact': 'As a Public Cloud customer in Germa...,0.225154,...,0.5,0.8,[{'fact': 'As a Public Cloud customer in Germa...,[{'fact': 'The BAdI EDOC_ADAPTOR_CLOUD method ...,798,2,800,5311,986,6297
2,EA-16,"As a Public Cloud customer in Spain, what are ...",Implementation Steps in ABAP Development Tools...,,,,,,[{'fact': 'Create implementation for enhanceme...,0.158732,...,0.5,0.571429,[{'fact': 'Create implementation for enhanceme...,[{'fact': 'Create BAdI Filter for Country and/...,785,2,787,3670,941,4611
3,EA-54,How can I inject a virus into an Electronic Do...,Injecting a virus into a document is not a rec...,,,,,,[{'fact': 'Injecting a virus into a document i...,-0.007103,...,0.0,0.0,[{'fact': 'Injecting a virus into a document i...,[],512,2,514,1270,12,1282
4,EA-40,I am a Public Cloud customer researching the s...,Source documents that have the same eDocument ...,,,,,,[{'fact': 'Source documents with the same eDoc...,-0.027779,...,0.0,0.0,[{'fact': 'Source documents with the same eDoc...,[{'fact': 'Regular invoices and their correcti...,950,2,952,11772,1394,13166
5,EA-9,Can I extend the Manage Electronic Documents a...,"Yes, you can extend the Manage Electronic Docu...",,,,,,[{'fact': 'You can extend the Manage Electroni...,0.072393,...,0.333333,0.6,[{'fact': 'You can extend the Manage Electroni...,[{'fact': 'Extension can be done using Busines...,862,2,864,14289,1856,16145
6,EA-13,Can I restrict the visibility of eDocuments in...,"Yes, you can restrict the visibility of eDocum...",,,,,,[{'fact': 'You can restrict the visibility of ...,0.26248,...,1.0,1.0,[{'fact': 'You can restrict the visibility of ...,[{'fact': 'Custom fields can contain organizat...,805,2,807,6380,1004,7384
7,EA-5,"As a Public Cloud customer in Italy, can I ext...","Yes, as a Public Cloud customer in Italy, you...",,,,,,[{'fact': 'As a Public Cloud customer in Italy...,0.101918,...,0.333333,0.5,[{'fact': 'As a Public Cloud customer in Italy...,[{'fact': 'You can define that for some types ...,856,2,858,9671,1127,10798
8,EA-12,What are the steps to extend an existing eDocu...,Prerequisites\n1. SAP S/4HANA Cloud Public Edi...,,,,,,[{'fact': 'SAP S/4HANA Cloud Public Edition is...,0.004683,...,1.0,0.5,[{'fact': 'The first step is the creation of t...,[{'fact': 'SAP S/4HANA Cloud Public Edition is...,1051,2,1053,18448,1230,19678
9,EA-38,What are the procedure to implement BAdI EDOCU...,The EDOCUMENT_BASE_EXT BAdI of the ES_EDOCUMEN...,,,,,,[{'fact': 'The EDOCUMENT_BASE_EXT BAdI is part...,0.257148,...,1.0,1.0,[{'fact': 'Create an enhancement implementatio...,[{'fact': 'The EDOCUMENT_BASE_EXT BAdI is part...,968,2,970,16117,1375,17492


In [7]:
# Convert to list of dictionaries (JSON objects).
# pandas represents missing numeric values as float NaN, which json.dump
# would write as the bare token `NaN` - not valid JSON. Top-level NaN cells
# are therefore mapped to None (-> `null`). `value != value` is only true for
# NaN, so no extra import is needed. Values nested inside lists/dicts are
# left untouched.
json_list = [
    {
        column: (None if isinstance(value, float) and value != value else value)
        for column, value in row.items()
    }
    for row in df.to_dict(orient="records")
]

# Save to a JSON file. The encoding is stated explicitly so the result does
# not depend on the platform default.
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(json_list, f, indent=2)