In [1]:
import os
import json
import traceback
import asyncio

from datetime import datetime

from tqdm.notebook import tqdm

from langchain_mcp_adapters.client import MultiServerMCPClient

from react_agent.src.agents.react_agent import ReActAgent
from react_agent.src.agents.triage import Triage

from react_agent.src.config.system_parameters import TriageSettings

from react_agent.src.util.tools_fabric import ToolsFabric

from experiments.models.experiment_models import LabeledQAPairFacts, ExperimentResult
from experiments.metrics.fact_score.fact_scorer import FactScorer
from experiments.metrics.bert_score import BertScore
from experiments.metrics.agent_judge import AgentJudgeEvaluator

In [2]:
import warnings

# Silence every warning category for the rest of the kernel session so the
# experiment log below is not flooded. NOTE(review): this also hides
# deprecation warnings from the agent libraries -- narrow the filter if needed.
warnings.filterwarnings("ignore")

In [3]:
# --- Experiment switches -----------------------------------------------------
# USE_MCP: forwarded to ToolsFabric and selects the MultiServerMCPClient branch
# in run_experiment.
USE_MCP = True
# DEBUG_MODE: passed as `debug=` to the agent and the fact scorer; also enables
# extra prints.
DEBUG_MODE = False
# USE_TRIAGE: when True, the triage agent categorises each question; otherwise
# TriageSettings.Categories.ALL is used.
USE_TRIAGE = False

# Model name handed to AgentJudgeEvaluator and stored with every result.
AGENT_JUDGE_MODEL = "anthropic--claude-3.5-sonnet"

fact_scorer = FactScorer()
path_to_source_data = "./resources/02_facts/"

# --- Output locations --------------------------------------------------------
# Take a single timestamp so the date folder and the time prefix always belong
# to the same instant (two separate now() calls could straddle midnight).
run_started_at = datetime.now()

date_str = run_started_at.strftime("%d%m%Y")
path_to_results = f"./results/{date_str}"

# exist_ok=True makes the cell idempotent without a racy exists() check.
os.makedirs(path_to_results, exist_ok=True)

time_str = run_started_at.strftime("%H%M%S")

result_file_name = f"{path_to_results}/{time_str}_results.json"
failed_experiments_file_name = f"{path_to_results}/{time_str}_failed_experiments.json"

In [4]:
# Resolve the labelled data set to an absolute path and read it as UTF-8 JSON.
file_path = os.path.abspath(f"{path_to_source_data}data_set_facts.json")

with open(file_path, encoding="utf8") as f:
    data = json.load(f)

# Each JSON object is expanded into the keyword arguments of LabeledQAPairFacts.
data_set = [LabeledQAPairFacts(**entry) for entry in data]

In [5]:
# Rows the experiment loop iterates over. Currently the full data set;
# presumably this is the place to slice/filter for a partial run.
data_set_for_experiment = data_set

In [6]:
# Created unconditionally; run_experiment only calls it when USE_TRIAGE is True.
triage_agent = Triage()

In [7]:
def _judge_with_single_retry(agent_evaluator, data_row, generated_answer):
    """Run the agent-as-a-judge evaluation, retrying exactly once on any error.

    A failure of the second attempt propagates to the caller.
    """

    def _evaluate():
        return agent_evaluator.evaluate(
            question=data_row.question,
            expert_answer=data_row.answer,
            generated_answer=generated_answer,
        )

    try:
        return _evaluate()
    except Exception as e:
        print(f"\tAn error occurred during agent evaluation: {e}")
        print("\t Trying to run evaluation again...")
        return _evaluate()


async def run_experiment(
    data_row: LabeledQAPairFacts, mcp_timeout_seconds: float = 300
) -> ExperimentResult:
    """Answer one labelled question with the ReAct agent and score the answer.

    Steps: optionally triage the question, fetch the tools for the resulting
    category, run the agent (through an MCP client when USE_MCP is set), copy
    the agent's execution data into the result, then compute FactScore,
    BERTScore and the agent-as-a-judge verdict.

    Args:
        data_row: The labelled question/answer pair (with facts) to evaluate.
        mcp_timeout_seconds: Upper bound, in seconds, for the whole MCP client
            interaction. Only used when USE_MCP is True.

    Returns:
        The fully populated ExperimentResult for this question.

    Raises:
        TimeoutError: The MCP interaction exceeded ``mcp_timeout_seconds``.
            The error is propagated (instead of returning a half-filled
            result) so the caller can record the case as failed and retry it.
        Exception: Any other error raised by the agent, the scorers or the
            second judge attempt is propagated unchanged.
    """
    result = ExperimentResult(
        id=data_row.id,
        question=data_row.question,
        answer=data_row.answer,
        product=data_row.product,
        category=data_row.category,
        persona=data_row.persona,
        activity=data_row.activity,
        country=data_row.country,
    )
    print(f"Processing question with id: {data_row.id}")

    if USE_TRIAGE:
        print("\tUsing Triage to categorize the query...")
        query_categorization = triage_agent.triage_user_message(
            user_message=data_row.question
        )
        if DEBUG_MODE:
            print(f"\tQuery categorization: {query_categorization}")
        result.triage_category = query_categorization["category"]
    else:
        # Without triage the agent gets the tools of every category.
        result.triage_category = TriageSettings.Categories.ALL

    print(f"\tFetching tools for category: {result.triage_category}...")
    tools = ToolsFabric.get_tools_for_category(
        use_mcp=USE_MCP,
        configuration=result.triage_category,
    )

    print("\tAsking agent question...")
    if USE_MCP:
        try:
            # Bound the entire MCP client interaction (connect + agent run).
            async with asyncio.timeout(mcp_timeout_seconds):
                async with MultiServerMCPClient(tools) as client:
                    agent = ReActAgent(tool_list=client.get_tools())
                    await agent.arun_agent_with_input(
                        user_message=data_row.question,
                        debug=DEBUG_MODE,
                    )
        except asyncio.TimeoutError:
            print(
                f"ERROR: MultiServerMCPClient operation timed out for question ID: {data_row.id}"
            )
            # Propagate: a result without an answer or scores must not be
            # stored as if the experiment had succeeded.
            raise
        except Exception as e:
            print(
                f"ERROR: An unexpected error occurred with MultiServerMCPClient for question ID {data_row.id}: {e}"
            )
            raise
    else:
        agent = ReActAgent(tool_list=tools)
        agent.run_agent_with_input(user_message=data_row.question, debug=DEBUG_MODE)

    run_data = agent.get_execution_data()

    result.tools_used = run_data.tools_used
    result.tool_calls_count = len(run_data.tools_used)
    result.excecution_time_seconds = run_data.excecution_time_seconds
    result.model_used = run_data.model_used
    result.tokens_consumed = run_data.tokens_consumed
    result.llm_call_count = run_data.llm_call_count
    result.facts = data_row.facts
    result.generated_answer = run_data.final_output

    print("\tFinished agent execution")

    print("\tCalculating FactScore...")
    result.fact_score = await fact_scorer.get_fact_score(
        facts=data_row.facts,
        knowledge_source=result.generated_answer,
        debug=DEBUG_MODE,
    )

    print("\tCalculating BERTScore...")
    result.bert_score = BertScore.compute_score(
        expected_response=data_row.answer, actual_response=result.generated_answer
    )

    print("\tEvaluating agent response with Agent as a Judge...")
    result.agent_judge_model = AGENT_JUDGE_MODEL
    agent_evaluator = AgentJudgeEvaluator(model=AGENT_JUDGE_MODEL)

    agent_judge_outcome = _judge_with_single_retry(
        agent_evaluator, data_row, result.generated_answer
    )

    result.agent_judge_outcome = agent_judge_outcome.answer
    result.agent_judge_reasoning = agent_judge_outcome.reasoning

    # Read the counters after the (possibly retried) evaluation.
    result.agent_judge_call_count = agent_evaluator.get_llm_call_count()
    result.agent_judge_tokens_consumed = agent_evaluator.get_token_consumption()

    print(f"\tFinished experiment for id: {data_row.id}\n")

    return result

In [8]:
# Accumulators filled by the experiment loop. Re-running this cell discards
# everything collected so far; skip it when resuming an interrupted run.
experiment_results: list[ExperimentResult] = []
# One dict per failed case with the keys "id", "error" and "traceback".
failed_experiments: list[dict] = []

In [13]:
for data_row in tqdm(data_set_for_experiment, desc="Running cases", unit="case"):
    if data_row.id in [result.id for result in experiment_results]:
        print(f"Skipping already processed id: {data_row.id}")
        continue
    try:
        result = await run_experiment(data_row)

        experiment_results.append(result)
    except Exception as e:
        failed_experiments.append(
            {
                "id": data_row.id,
                "error": str(e),
                "traceback": traceback.format_exc(),
            }
        )

Running cases:   0%|          | 0/91 [00:00<?, ?case/s]

Skipping already processed id: EA-1
Skipping already processed id: EA-2
Skipping already processed id: EA-3
Skipping already processed id: EA-4
Skipping already processed id: EA-5
Skipping already processed id: EA-6
Skipping already processed id: EA-7
Skipping already processed id: EA-8
Skipping already processed id: EA-9
Skipping already processed id: EA-10
Skipping already processed id: EA-11
Skipping already processed id: EA-12
Skipping already processed id: EA-13
Skipping already processed id: EA-14
Skipping already processed id: EA-15
Skipping already processed id: EA-16
Skipping already processed id: EA-17
Skipping already processed id: EA-18
Skipping already processed id: EA-19
Skipping already processed id: EA-20
Skipping already processed id: EA-21
Skipping already processed id: EA-22
Skipping already processed id: EA-23
Skipping already processed id: EA-24
Skipping already processed id: EA-25
Skipping already processed id: EA-26
Skipping already processed id: EA-27
Skipping a

In [14]:
# Turn every collected result into a plain dict ...
import pandas as pd

records = list(map(lambda res: res.model_dump(mode="json"), experiment_results))
# ... and let json_normalize flatten nested fields into dotted column names
# (e.g. "fact_score.direct_facts").
df = pd.json_normalize(records)

df

Unnamed: 0,id,question,answer,product,category,persona,activity,country,facts,bert_score,...,fact_score.supporting_fact_score,fact_score.combined_fact_score,fact_score.direct_facts,fact_score.supporting_facts,agent_judge_tokens_consumed.input_tokens,agent_judge_tokens_consumed.output_tokens,agent_judge_tokens_consumed.total_tokens,tokens_consumed.input_tokens,tokens_consumed.output_tokens,tokens_consumed.total_tokens
0,EA-1,How can I attach additional files to standard ...,To attach additional files to the standard XML...,ERP (only SAP S/4HANA Cloud Public Edition),How to Develop (Maintenance - Customer case su...,Developer (Customer side),Custom Development,Cross,[{'fact': 'You can attach additional files to ...,0.184674,...,0.400000,0.500000,[{'fact': 'You can attach additional files to ...,[{'fact': 'The enhancement is implemented in A...,11715,655,12370,9315,2305,11620
1,EA-2,"For a Greece customer in Public Cloud, what ar...",To enhance the generated XML file for a Greece...,ERP (only SAP S/4HANA Cloud Public Edition),How to Develop (Maintenance - Customer case su...,Developer (Customer side),Custom Development,GR,[{'fact': 'To enhance the generated XML for a ...,0.208072,...,0.000000,0.444444,[{'fact': 'To enhance the generated XML for a ...,[{'fact': 'You can optionally create a BAdI fi...,21147,3,21150,4424,793,5217
2,EA-3,For a Greece customer invoice process in Publi...,"Yes, you can enhance or extend the generated X...",ERP (only SAP S/4HANA Cloud Public Edition),How to Develop (Maintenance - Customer case su...,Developer (Customer side),Custom Development,GR,[{'fact': 'You can enhance or extend the gener...,0.231548,...,0.000000,0.333333,[{'fact': 'You can enhance or extend the gener...,[{'fact': 'The method SET_OUTPUT_DATA belongs ...,8673,3,8676,13376,2091,15467
3,EA-4,"As a Public Cloud customer in Spain, can I ext...","Yes, as a Public Cloud customer in Spain, you ...",ERP (only SAP S/4HANA Cloud Public Edition),Documentation (Maintenance - Customer case sup...,Developer (Customer side),Custom Development,ES,[{'fact': 'A Public Cloud customer in Spain ca...,0.257452,...,0.500000,0.750000,[{'fact': 'A Public Cloud customer in Spain ca...,[{'fact': 'It is possible to define that for s...,6903,113,7016,3396,560,3956
4,EA-5,"As a Public Cloud customer in Italy, can I ext...","Yes, as a Public Cloud customer in Italy, you...",ERP (only SAP S/4HANA Cloud Public Edition),Documentation (Maintenance - Customer case sup...,Developer (Customer side),Custom Development,IT,[{'fact': 'As a Public Cloud customer in Italy...,0.222454,...,0.666667,0.600000,[{'fact': 'As a Public Cloud customer in Italy...,[{'fact': 'You can define that for some types ...,16694,500,17194,4253,579,4832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,EA-55,What are the steps to develop new processes fo...,"For new processes in Public Cloud, you develop...",ERP (only SAP S/4HANA Cloud Public Edition),How to Develop (Maintenance - Customer case su...,Developer (Customer side),Custom Development,Cross,[{'fact': 'Development for new electronic docu...,0.157626,...,0.000000,0.400000,[{'fact': 'Development for new electronic docu...,[],12974,3,12977,23081,2410,25491
87,MS-2,eDoc for CN is not generated after the recent ...,You need to assign invoice verification transa...,ERP (excluding SAP S/4HANA Cloud Public Edition),Initial error explanation and cause analysis (...,Functional consultant (Customer side),Maintenance,MY,[{'fact': 'Invoice verification transactions m...,-0.045976,...,0.000000,0.000000,[{'fact': 'Invoice verification transactions m...,[{'fact': 'The eDocument type for Invoice in M...,22803,564,23367,4044,878,4922
88,MS-4,einvoice version 1.0 to 1.1 --- Product/Functi...,Regarding Einvoice version 1.1 Please find bel...,ERP (excluding SAP S/4HANA Cloud Public Edition),How to Configure (Maintenance - Customer case ...,Functional consultant (Customer side),Business Configuration,MY,[{'fact': 'SAP Note 3498572 is available for a...,0.141317,...,0.000000,0.111111,[{'fact': 'SAP Note 3498572 is available for a...,[{'fact': 'SAP Note 3498572 was previously rel...,19713,423,20136,3245,417,3662
89,PS-3,Incoming ZUGFeRD invoices (from suppliers) we ...,Here are some hints that might be helpful. We ...,ERP (excluding SAP S/4HANA Cloud Public Edition),How to Develop (Maintenance - Customer case su...,Developer (Customer side),Custom Development,DE,[{'fact': 'You can extend EDOPROCFUNCASGV as i...,0.068049,...,0.000000,0.000000,[{'fact': 'You can extend EDOPROCFUNCASGV as i...,[{'fact': 'There are screenshots available sho...,15612,606,16218,15032,2122,17154


In [15]:
# Convert the flattened frame to a list of JSON objects (one dict per row).
json_list = df.to_dict(orient="records")

# Explicit encoding, consistent with how the data set is read, so the output
# does not depend on the platform's default locale encoding.
with open(result_file_name, "w", encoding="utf8") as f:
    json.dump(json_list, f, indent=2)