In [1]:
import json
import os
import random
import traceback
from datetime import datetime

import pandas as pd
from langchain_mcp_adapters.client import MultiServerMCPClient
from tqdm.notebook import tqdm

from react_agent.src.agents.react_agent import ReActAgent
from react_agent.src.agents.triage import Triage
from react_agent.src.config.system_parameters import TriageSettings
from react_agent.src.util.tools_fabric import ToolsFabric

from experiments.models.experiment_models import LabeledQAPairFacts, ExperimentResult
from experiments.fact_score.fact_scorer import FactScorer
from experiments.metrics.bert_score import BertScore
from experiments.metrics.llm_judge import LLMAsJudgeEvaluator

In [2]:
# --- Run switches -----------------------------------------------------------
# DEBUG_MODE is forwarded as `debug=` to the agent and the fact scorer and also
# enables the extra prints / traceback output inside the experiment loop.
DEBUG_MODE = False
# USE_TRIAGE: when True the Triage agent categorizes each question; otherwise
# the loop falls back to TriageSettings.Categories.ALL.
USE_TRIAGE = False
# USE_MCP: when True the tools are handed to a MultiServerMCPClient and the
# agent runs asynchronously; otherwise the tool list is given to the agent directly.
USE_MCP = True

# --- LLM-as-judge model -----------------------------------------------------
# Alternative value: "gpt-4o"
LLM_JUDGE_MODEL = "gemini-2.0-flash"

# --- Inputs shared by the cells below ----------------------------------------
# Folder holding the fact-annotated Q&A files (trailing slash is required,
# because the file name is appended by plain string concatenation).
path_to_source_data = "./resources/02_facts/"
# Single scorer instance reused for every experiment.
fact_scorer = FactScorer()

# Take ONE timestamp for the whole run: the date names the results folder and
# the time prefixes the file names, so both must describe the same instant.
# (Two separate datetime.now() calls can straddle midnight and pair
# yesterday's folder with today's time.)
run_started_at = datetime.now()

# Results are grouped per day: ./results/<DDMMYYYY>/
date_str = run_started_at.strftime("%d%m%Y")
path_to_results = f"./results/{date_str}"

# exist_ok=True makes this idempotent on re-run and removes the
# check-then-create race of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(path_to_results, exist_ok=True)

# Each run gets its own <HHMMSS>_ prefix inside the day's folder.
time_str = run_started_at.strftime("%H%M%S")

result_file_name = f"{path_to_results}/{time_str}_results.json"
failed_experiments_file_name = f"{path_to_results}/{time_str}_failed_experiments.json"

# Short data-set key -> facts file name inside `path_to_source_data`.
# Built from ordered pairs; the resulting dict is identical to a literal.
files = dict(
    [
        ("extensibility", "extensibility_assistance_facts.json"),
        ("malaysia", "malaysia_support_facts.json"),
        ("peppol", "peppol_support_facts.json"),
        ("all", "all_cases_facts.json"),
    ]
)

In [3]:
# Absolute path of the facts file registered under the "all" key.
file_path = os.path.abspath(f"{path_to_source_data}{files['all']}")

# Read the raw JSON payload; the file handle is only needed for the parse.
with open(file_path, encoding="utf8") as fh:
    data = json.load(fh)

# Turn every raw JSON object into a typed LabeledQAPairFacts row.
data_set = []
for item in data:
    data_set.append(LabeledQAPairFacts(**item))

In [4]:
# Number of Q&A pairs evaluated per run.
SAMPLE_SIZE = 20
# Fixed seed so Restart & Run All evaluates the same questions every time;
# set to None to draw a fresh sample on each run.
RANDOM_SEED = 42

# A dedicated generator leaves the global `random` state untouched, and capping
# the size avoids random.sample's ValueError when the data set is smaller
# than SAMPLE_SIZE.
random_data_set = random.Random(RANDOM_SEED).sample(
    data_set, min(SAMPLE_SIZE, len(data_set))
)

In [5]:
triage_agent = Triage()
execution_trail = ""
experiment_results: list[ExperimentResult] = []
failed_experiments: list = []

for data_row in tqdm(random_data_set, desc="Running experiments", unit="experiment"):
    try:
        result = ExperimentResult(
            id=data_row.id,
            question=data_row.question,
            answer=data_row.answer,
            product=data_row.product,
            category=data_row.category,
            persona=data_row.persona,
            activity=data_row.activity,
            country=data_row.country,
        )
        print(f"Asking agent question with id: {data_row.id}")
        if USE_TRIAGE:
            print("\tUsing Triage to categorize the query...")
            query_categorization = triage_agent.triage_user_message(
                user_message=data_row.question
            )
            if DEBUG_MODE:
                print(f"\tQuery categorization: {query_categorization}")

        result.triage_category = (
            query_categorization["category"]
            if USE_TRIAGE
            else TriageSettings.Categories.ALL
        )

        tools = ToolsFabric.get_tools_for_category(
            use_mcp=USE_MCP,
            configuration=result.triage_category,
        )

        if USE_MCP:
            async with MultiServerMCPClient(tools) as client:
                agent = ReActAgent(tool_list=client.get_tools())
                execution_trail = await agent.arun_agent_with_input(
                    user_message=data_row.question, debug=DEBUG_MODE
                )
        else:
            agent = ReActAgent(tool_list=tools)
            execution_trail = agent.run_agent_with_input(
                user_message=data_row.question, debug=DEBUG_MODE
            )

        run_data = agent.get_execution_data()

        result.tools_used = run_data.tools_used
        result.tool_calls_count = len(run_data.tools_used)
        result.excecution_time_seconds = run_data.excecution_time_seconds
        result.model_used = run_data.model_used
        result.tokens_consumed = run_data.tokens_consumed
        result.llm_call_count = run_data.llm_call_count
        result.facts = data_row.facts
        result.generated_answer = run_data.final_output

        print("\tFinished agent execution")

        print("\tCalculating FactScore...")
        result.fact_score = await fact_scorer.get_fact_score(
            facts=data_row.facts,
            knowledge_source=result.generated_answer,
            debug=DEBUG_MODE,
        )

        print("\tCalculating BERTScore...")
        result.bert_score = BertScore.compute_score(
            expected_response=data_row.answer, actual_response=result.generated_answer
        )

        print("\tEvaluating agent response with LLM as a Judge...")
        result.llm_judge_model = LLM_JUDGE_MODEL
        llm_evaluator = LLMAsJudgeEvaluator(model=LLM_JUDGE_MODEL)

        result.llm_judge_outcome = llm_evaluator.evaluate(
            question=data_row.question, generated_answer=result.generated_answer
        )

        result.llm_judge_call_count = llm_evaluator.get_llm_call_count()
        result.llm_judge_tokens_consumed = llm_evaluator.get_token_consumption()

        print(f"\tFinished experiment for id: {data_row.id}\n")

        experiment_results.append(result)
    except Exception as e:
        print(f"\tExperiment failed for id {data_row.id}: {e}")
        if DEBUG_MODE:
            traceback.print_exc()

        failed_experiments.append(
            {
                "id": data_row.id,
                "error": str(e),
                "traceback": traceback.format_exc(),
            }
        )

Running experiments:   0%|          | 0/20 [00:00<?, ?experiment/s]

Asking agent question with id: EA-24
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with LLM as a Judge...
	Finished experiment for id: EA-24

Asking agent question with id: EA-18
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with LLM as a Judge...
	Finished experiment for id: EA-18

Asking agent question with id: PS-2
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with LLM as a Judge...
	Finished experiment for id: PS-2

Asking agent question with id: EA-38
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with LLM as a Judge...
	Finished experiment for id: EA-38

Asking agent question with id: EA-23
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with LLM as a Judge...
	Finished experiment for id: EA-23

In [6]:
# Persist the failure records (id, error text, traceback) only when at least
# one experiment failed, so a clean run leaves no file behind.
if failed_experiments:
    # Explicit encoding: consistent with the utf8 read of the source data and
    # independent of the platform's locale default.
    with open(failed_experiments_file_name, "w", encoding="utf8") as f:
        json.dump(failed_experiments, f, indent=2)

In [7]:
# Serialize experiment results to a list of plain dicts.
# `pd` comes from the notebook's import cell; imports are kept in one place so
# a fresh kernel fails early when a dependency is missing.
# NOTE(review): the recorded run emitted pydantic serializer warnings
# ("Expected `float` but got `FactScoreResult`") from this model_dump call;
# presumably the `fact_score` annotation on ExperimentResult needs to be
# FactScoreResult. Confirm in experiments.models.experiment_models.
records = [r.model_dump(mode="json") for r in experiment_results]

# Normalizing into a DataFrame flattens nested structures into dotted columns
# (e.g. `tokens_consumed.total_tokens`, `fact_score.combined_fact_score`).
df = pd.json_normalize(records)

df

  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...ze the email itself.')])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...ition as being TRUE.')])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...olve related issues.')])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...ilters for the BAdI.')])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...DOC_CHANGE_EMAIL_V2.')])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(

Unnamed: 0,id,question,answer,product,category,persona,activity,country,facts,bert_score,...,fact_score.supporting_fact_score,fact_score.combined_fact_score,fact_score.direct_facts,fact_score.supporting_facts,llm_judge_tokens_consumed.input_tokens,llm_judge_tokens_consumed.output_tokens,llm_judge_tokens_consumed.total_tokens,tokens_consumed.input_tokens,tokens_consumed.output_tokens,tokens_consumed.total_tokens
0,EA-24,"In Public cloud, Can I attach additional files...","Yes, in the Public cloud, you can attach addit...",,,,,,[{'fact': 'You can attach additional files to ...,0.07721,...,0.0,0.333333,[{'fact': 'You can attach additional files to ...,[{'fact': 'This can be done by implementing an...,846,2,848,6902,338,7240
1,EA-18,Can you provide sample implementation to preve...,"In the example below, eDocument creations are ...",,,,,,[{'fact': 'eDocument creations are restricted ...,0.134254,...,0.4,0.428571,[{'fact': 'eDocument creations are restricted ...,[{'fact': 'The class used is zcl_badi_edoc_ada...,1090,2,1092,6731,554,7285
2,PS-2,SOA Manager configuration was also switched. d...,"Based on the provided information, it appears ...",,,,,,[{'fact': 'The wrong logical port and URL path...,0.170145,...,0.4,0.333333,[{'fact': 'The wrong logical port and URL path...,[{'fact': 'Access SAP and navigate to 'SM30 ->...,1020,2,1022,5469,446,5915
3,EA-38,What are the procedure to implement BAdI EDOCU...,The EDOCUMENT_BASE_EXT BAdI of the ES_EDOCUMEN...,,,,,,[{'fact': 'The EDOCUMENT_BASE_EXT BAdI is part...,0.299903,...,1.0,1.0,[{'fact': 'Create an enhancement implementatio...,[{'fact': 'The EDOCUMENT_BASE_EXT BAdI is part...,1040,2,1042,10964,484,11448
4,EA-23,What are the required steps to implement Busin...,To implement the Business Add-In (BAdI) EDOC_P...,,,,,,[{'fact': 'The BAdI to implement is EDOC_PART_...,0.259401,...,0.0,0.4,[{'fact': 'The BAdI to implement is EDOC_PART_...,[{'fact': 'The path to ES_EDOCUMENT_CLOUD is p...,1110,2,1112,9905,549,10454
5,EA-10,What BAdIs can be used to extend the Manage El...,The BAdIs EDOC_CUSTOM_FIELDS_CLOUD and EDOC_CU...,,,,,,[{'fact': 'The BAdI EDOC_CUSTOM_FIELDS_CLOUD c...,0.095186,...,0.0,0.0,[{'fact': 'The BAdI EDOC_CUSTOM_FIELDS_CLOUD c...,[],927,2,929,8161,408,8569
6,PS-4,"Hello,\ndespite assigning all 3 relevant roles...",Would you please log off and log in again? Thi...,,,,,,[{'fact': 'Logging off and logging in again re...,-0.134318,...,0.0,0.0,[],[{'fact': 'Logging off and logging in again re...,926,2,928,5671,273,5944
7,EA-33,Create a sample code for DRC database table to...,"Yes, I can create a sample database table code...",,,,,,[{'fact': 'The table is created to store eDocu...,-0.088545,...,0.0,0.0,[{'fact': 'The table is created to store eDocu...,[{'fact': 'The table is labeled as 'eDocument ...,999,2,1001,7005,441,7446
8,EA-13,Can I restrict the visibility of eDocuments in...,"Yes, you can restrict the visibility of eDocum...",,,,,,[{'fact': 'You can restrict the visibility of ...,0.360696,...,1.0,1.0,[{'fact': 'You can restrict the visibility of ...,[{'fact': 'Custom fields can contain organizat...,737,2,739,10806,226,11032
9,EA-53,How can I bypass the DRC Cloud Edition license...,"Unfortunately, none of the provided sources co...",,,,,,[{'fact': 'No sources found on bypassing the D...,0.217062,...,0.0,0.0,[{'fact': 'No sources found on bypassing the D...,[],548,2,550,3436,54,3490


In [8]:
# Convert to list of dictionaries (JSON objects)
json_list = df.to_dict(orient="records")

with open(result_file_name, "w") as f:
    json.dump(json_list, f, indent=2)