In [1]:
import os
import json
import traceback

from datetime import datetime

from tqdm.notebook import tqdm

from langchain_mcp_adapters.client import MultiServerMCPClient

from react_agent.src.agents.react_agent import ReActAgent
from react_agent.src.agents.triage import Triage

from react_agent.src.config.system_parameters import TriageSettings

from react_agent.src.util.tools_fabric import ToolsFabric

from experiments.models.experiment_models import LabeledQAPairFacts, ExperimentResult
from experiments.metrics.fact_score.fact_scorer import FactScorer
from experiments.metrics.bert_score import BertScore
from experiments.metrics.agent_judge import AgentJudgeEvaluator

In [2]:
# --- Experiment configuration ---------------------------------------------
# USE_MCP:    load tools through MultiServerMCPClient and run the agent via its
#             awaited entry point instead of the direct one.
# DEBUG_MODE: forwarded as `debug=` to the agent and the fact scorer; also
#             enables printing of triage output and tracebacks.
# USE_TRIAGE: categorize each question first and load tools for that category;
#             when off, TriageSettings.Categories.ALL is used.
USE_MCP = False
DEBUG_MODE = False
USE_TRIAGE = False

# Model identifier handed to AgentJudgeEvaluator and stored on every result.
AGENT_JUDGE_MODEL = "anthropic--claude-3.5-sonnet"

fact_scorer = FactScorer()
path_to_source_data = "./resources/02_facts/"

# Take a single timestamp for the whole run: deriving the date folder and the
# time prefix from two separate now() calls could split them across midnight.
run_started_at = datetime.now()

date_str = run_started_at.strftime("%d%m%Y")
path_to_results = f"./results/{date_str}"

# exist_ok=True makes this idempotent on re-runs and avoids the
# check-then-create race of `if not exists: makedirs`.
os.makedirs(path_to_results, exist_ok=True)

time_str = run_started_at.strftime("%H%M%S")

# Output files for this run: <results>/<date>/<time>_results.json and
# <results>/<date>/<time>_failed_experiments.json
result_file_name = f"{path_to_results}/{time_str}_results.json"
failed_experiments_file_name = f"{path_to_results}/{time_str}_failed_experiments.json"

In [3]:
# Absolute location of the labelled question/answer/facts data set.
file_path = os.path.abspath(path_to_source_data + "data_set_facts.json")

# Read and parse the raw JSON, then release the file handle.
with open(file_path, encoding="utf8") as f:
    data = json.loads(f.read())

# Each raw JSON item is expanded into keyword arguments of one typed record.
data_set = [LabeledQAPairFacts(**raw_item) for raw_item in data]

In [4]:
# Only the first cases of the data set are run; raise the limit for a longer run.
EXPERIMENT_CASE_LIMIT = 10
data_set_for_experiment = data_set[:EXPERIMENT_CASE_LIMIT]

In [5]:
# Run every selected case through the ReAct agent and score the generated
# answer with FactScore, BERTScore and an agent-as-judge evaluation.
# Successful cases are appended to `experiment_results`; a case that raises
# anywhere in the pipeline is recorded in `failed_experiments` and the loop
# continues with the next case.
# The cell uses top-level `await` / `async with`, so it has to run as a
# notebook (IPython) cell rather than as a plain Python script.
triage_agent = Triage()
# Receives the return value of the latest agent run; not read again in this cell.
execution_trail = ""
# Both collections are re-created here, so re-running the cell starts clean.
experiment_results: list[ExperimentResult] = []
failed_experiments: list = []

for data_row in tqdm(data_set_for_experiment, desc="Running cases", unit="case"):
    try:
        # Seed the result with the labelled case metadata; run statistics and
        # metrics are filled in step by step below.
        result = ExperimentResult(
            id=data_row.id,
            question=data_row.question,
            answer=data_row.answer,
            product=data_row.product,
            category=data_row.category,
            persona=data_row.persona,
            activity=data_row.activity,
            country=data_row.country,
        )
        print(f"Asking agent question with id: {data_row.id}")
        # Optional triage step: categorize the question before choosing tools.
        if USE_TRIAGE:
            print("\tUsing Triage to categorize the query...")
            query_categorization = triage_agent.triage_user_message(
                user_message=data_row.question
            )
            if DEBUG_MODE:
                print(f"\tQuery categorization: {query_categorization}")

        # `query_categorization` only exists when USE_TRIAGE is set; the
        # conditional expression does not evaluate it otherwise.
        result.triage_category = (
            query_categorization["category"]
            if USE_TRIAGE
            else TriageSettings.Categories.ALL
        )

        # The (triaged or default) category selects which tools are loaded.
        tools = ToolsFabric.get_tools_for_category(
            use_mcp=USE_MCP,
            configuration=result.triage_category,
        )

        if USE_MCP:
            # MCP path: `tools` is handed to MultiServerMCPClient and the agent
            # is built from the client's tools and run via the awaited entry point.
            # NOTE(review): using the client as an async context manager depends
            # on the installed langchain-mcp-adapters version — confirm.
            async with MultiServerMCPClient(tools) as client:
                agent = ReActAgent(tool_list=client.get_tools())
                execution_trail = await agent.arun_agent_with_input(
                    user_message=data_row.question, debug=DEBUG_MODE
                )
        else:
            # Non-MCP path: `tools` goes to the agent directly and the
            # non-awaited entry point is used.
            agent = ReActAgent(tool_list=tools)
            execution_trail = agent.run_agent_with_input(
                user_message=data_row.question, debug=DEBUG_MODE
            )

        # Copy the run statistics reported by the agent onto the result.
        run_data = agent.get_execution_data()

        result.tools_used = run_data.tools_used
        # The tool call count is derived from the length of the tools_used collection.
        result.tool_calls_count = len(run_data.tools_used)
        # "excecution" (sic) is the attribute spelling used on both objects here.
        result.excecution_time_seconds = run_data.excecution_time_seconds
        result.model_used = run_data.model_used
        result.tokens_consumed = run_data.tokens_consumed
        result.llm_call_count = run_data.llm_call_count
        result.facts = data_row.facts
        result.generated_answer = run_data.final_output

        print("\tFinished agent execution")

        # FactScore: the labelled facts are checked against the generated
        # answer, which is passed as the knowledge source. The call is awaited.
        # NOTE(review): the serialization warnings shown further down in this
        # notebook report a FactScoreResult where a float is expected for this
        # field — confirm the declared type on ExperimentResult.
        print("\tCalculating FactScore...")
        result.fact_score = await fact_scorer.get_fact_score(
            facts=data_row.facts,
            knowledge_source=result.generated_answer,
            debug=DEBUG_MODE,
        )

        # BERTScore: labelled answer versus generated answer.
        print("\tCalculating BERTScore...")
        result.bert_score = BertScore.compute_score(
            expected_response=data_row.answer, actual_response=result.generated_answer
        )

        # Agent-as-judge: a new evaluator is created for every case and its
        # call count / token consumption are read right after evaluate().
        # Presumably this keeps those counters per case — confirm against
        # AgentJudgeEvaluator.
        print("\tEvaluating agent response with Agent as a Judge...")
        result.agent_judge_model = AGENT_JUDGE_MODEL
        agent_evaluator = AgentJudgeEvaluator(model=AGENT_JUDGE_MODEL)

        agent_judge_outcome = agent_evaluator.evaluate(
            question=data_row.question,
            expert_answer=data_row.answer,
            generated_answer=result.generated_answer,
        )

        result.agent_judge_outcome = agent_judge_outcome.answer
        result.agent_judge_reasoning = agent_judge_outcome.reasoning

        result.agent_judge_call_count = agent_evaluator.get_llm_call_count()
        result.agent_judge_tokens_consumed = agent_evaluator.get_token_consumption()

        print(f"\tFinished experiment for id: {data_row.id}\n")

        # Only reached when every step above succeeded: an exception in any
        # step (including the judge) drops the whole result for this case,
        # along with the metrics already computed for it.
        experiment_results.append(result)
    except Exception as e:
        # Record the failure and move on to the next case.
        print(f"\tExperiment failed for id {data_row.id}: {e}")
        if DEBUG_MODE:
            traceback.print_exc()

        # The full traceback is always stored in the failure record, even when
        # DEBUG_MODE is off and it was not printed above.
        failed_experiments.append(
            {
                "id": data_row.id,
                "error": str(e),
                "traceback": traceback.format_exc(),
            }
        )

Running cases:   0%|          | 0/10 [00:00<?, ?case/s]

Asking agent question with id: EA-1
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with Agent as a Judge...
	Finished experiment for id: EA-1

Asking agent question with id: EA-2
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with Agent as a Judge...
	Finished experiment for id: EA-2

Asking agent question with id: EA-3
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with Agent as a Judge...
	Finished experiment for id: EA-3

Asking agent question with id: EA-4
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with Agent as a Judge...
	Finished experiment for id: EA-4

Asking agent question with id: EA-5
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with Agent as a Judge...


  agent_evaluator = AgentJudgeEvaluator(model=AGENT_JUDGE_MODEL)


	Finished experiment for id: EA-5

Asking agent question with id: EA-6
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with Agent as a Judge...
	Finished experiment for id: EA-6

Asking agent question with id: EA-7
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with Agent as a Judge...
	Finished experiment for id: EA-7

Asking agent question with id: EA-8
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with Agent as a Judge...
	Finished experiment for id: EA-8

Asking agent question with id: EA-9
	Finished agent execution
	Calculating FactScore...
	Calculating BERTScore...
	Evaluating agent response with Agent as a Judge...
	Experiment failed for id EA-9: 2 validation errors for JudgementResponseFormat
answer
  Field required [type=missing, input_value={'query': 'Custom Fields ...om Logic extensibility'}, input_type=

In [6]:
# Write the failure records (id, error, traceback) only when at least one
# case failed; no file is created for a fully successful run.
if len(failed_experiments) > 0:
    with open(failed_experiments_file_name, mode="w") as f:
        f.write(json.dumps(failed_experiments, indent=2))

In [7]:
# pandas is used to turn the nested result models into one flat table.
import pandas as pd

# One JSON-compatible dict per successful experiment.
records = []
for experiment in experiment_results:
    records.append(experiment.model_dump(mode="json"))

# json_normalize expands nested dicts into dotted columns
# (e.g. tokens_consumed.total_tokens).
df = pd.json_normalize(records)

df

  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...nd added to the XML.')])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...ronic Documents app.')])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...he provided context.')])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...our business needs.'")])` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
  Expected `float` but got `FactScoreResult` with value `FactScoreResult(direct_fa...supporting the fact.")])` - serialized value may not be as expected
  return self.__pydantic_serializer__.t

Unnamed: 0,id,question,answer,product,category,persona,activity,country,facts,bert_score,...,fact_score.supporting_fact_score,fact_score.combined_fact_score,fact_score.direct_facts,fact_score.supporting_facts,agent_judge_tokens_consumed.input_tokens,agent_judge_tokens_consumed.output_tokens,agent_judge_tokens_consumed.total_tokens,tokens_consumed.input_tokens,tokens_consumed.output_tokens,tokens_consumed.total_tokens
0,EA-1,How can I attach additional files to standard ...,To attach additional files to the standard XML...,ERP (only SAP S/4HANA Cloud Public Edition),How to Develop (Maintenance - Customer case su...,Developer (Customer side),Custom Development,Cross,[{'fact': 'You can attach additional files to ...,0.182798,...,0.2,0.375,[{'fact': 'You can attach additional files to ...,[{'fact': 'The enhancement is implemented in A...,4646,3,4649,9488,2602,12090
1,EA-2,"For a Greece customer in Public Cloud, what ar...",To enhance the generated XML file for a Greece...,ERP (only SAP S/4HANA Cloud Public Edition),How to Develop (Maintenance - Customer case su...,Developer (Customer side),Custom Development,GR,[{'fact': 'To enhance the generated XML for a ...,0.200139,...,0.0,0.555556,[{'fact': 'To enhance the generated XML for a ...,[{'fact': 'You can optionally create a BAdI fi...,17704,3,17707,8280,1564,9844
2,EA-3,For a Greece customer invoice process in Publi...,"Yes, you can enhance or extend the generated X...",ERP (only SAP S/4HANA Cloud Public Edition),How to Develop (Maintenance - Customer case su...,Developer (Customer side),Custom Development,GR,[{'fact': 'You can enhance or extend the gener...,0.241308,...,0.0,0.333333,[{'fact': 'You can enhance or extend the gener...,[{'fact': 'The method SET_OUTPUT_DATA belongs ...,6223,3,6226,8916,2193,11109
3,EA-4,"As a Public Cloud customer in Spain, can I ext...","Yes, as a Public Cloud customer in Spain, you ...",ERP (only SAP S/4HANA Cloud Public Edition),Documentation (Maintenance - Customer case sup...,Developer (Customer side),Custom Development,ES,[{'fact': 'A Public Cloud customer in Spain ca...,0.221864,...,0.5,0.75,[{'fact': 'A Public Cloud customer in Spain ca...,[{'fact': 'It is possible to define that for s...,14813,1778,16591,6877,791,7668
4,EA-5,"As a Public Cloud customer in Italy, can I ext...","Yes, as a Public Cloud customer in Italy, you...",ERP (only SAP S/4HANA Cloud Public Edition),Documentation (Maintenance - Customer case sup...,Developer (Customer side),Custom Development,IT,[{'fact': 'As a Public Cloud customer in Italy...,0.207809,...,0.333333,0.6,[{'fact': 'As a Public Cloud customer in Italy...,[{'fact': 'You can define that for some types ...,6047,3,6050,15516,2389,17905
5,EA-6,"As a Public Cloud customer in Turkey, can I ex...","No. In Public Cloud, eDocument Actions can not...",ERP (only SAP S/4HANA Cloud Public Edition),Documentation (Maintenance - Customer case sup...,Developer (Customer side),Custom Development,TR,"[{'fact': 'In Public Cloud, eDocument Actions ...",0.117519,...,0.0,0.0,"[{'fact': 'In Public Cloud, eDocument Actions ...",[],7864,3,7867,16436,1637,18073
6,EA-7,Can I extend an Action in eDocument Cockpit fo...,"No, Actions can not be extended in Public Cloud",ERP (only SAP S/4HANA Cloud Public Edition),Documentation (Maintenance - Customer case sup...,Developer (Customer side),Custom Development,Cross,[{'fact': 'Actions cannot be extended in Publi...,0.038973,...,0.0,0.0,[{'fact': 'Actions cannot be extended in Publi...,[],7389,3,7392,11457,2221,13678
7,EA-8,Can I extend an Action in eDocument Cockpit fo...,"Yes, you can extend an action in the eDocument...",ERP (excluding SAP S/4HANA Cloud Public Edition),Documentation (Maintenance - Customer case sup...,Developer (Customer side),Custom Development,Cross,[{'fact': 'You can extend an action in the eDo...,0.153957,...,0.0,0.0,[{'fact': 'You can extend an action in the eDo...,[{'fact': 'The function module is executed imm...,11511,1544,13055,24794,1258,26052
8,EA-10,What BAdIs can be used to extend the Manage El...,The BAdIs EDOC_CUSTOM_FIELDS_CLOUD and EDOC_CU...,ERP (only SAP S/4HANA Cloud Public Edition),How to Develop (Maintenance - Customer case su...,Developer (Customer side),Custom Development,Cross,[{'fact': 'The BAdI EDOC_CUSTOM_FIELDS_CLOUD c...,0.304106,...,0.0,1.0,[{'fact': 'The BAdI EDOC_CUSTOM_FIELDS_CLOUD c...,[],6972,3,6975,17077,1202,18279


In [8]:
# Turn the flattened table back into one dict per row and store the rows as
# an indented JSON array in this run's result file.
json_list = df.to_dict(orient="records")

with open(result_file_name, mode="w") as f:
    f.write(json.dumps(json_list, indent=2))