In [None]:
"""
This script runs an evaluation of a LangGraph agent on a dataset using LangSmith's async evaluation API.
It converts dataset examples into the agent's internal state format, invokes the agent graph with unique checkpointing,
and uses a custom evaluator that leverages an LLM judge to compare the agent's output against reference answers.

Key components:
- example_to_state: converts dataset inputs to the agent's state format.
- target_with_config: async wrapper to invoke the agent graph with checkpoint config.
- correct: async evaluator function that prompts an LLM to judge correctness of the agent's answer.
- aevaluate: runs the evaluation over the dataset with concurrency and experiment tracking.
"""

#==============================================================================
# IMPORTS
#==============================================================================
import sys
import os

# Temporarily override MY_AGENT_DEBUG for this notebook execution
original_debug_value = os.environ.get('MY_AGENT_DEBUG')
os.environ['MY_AGENT_DEBUG'] = '0'
print(f"Temporarily setting MY_AGENT_DEBUG=0 for this notebook execution (was: {original_debug_value})")

# Add the parent directory to the Python path so we can import the my_agent module
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import uuid
from langsmith import aevaluate
from my_agent import create_graph
from my_agent.utils.state import DataAnalysisState
from my_agent.utils.models import get_azure_llm

#==============================================================================
# INITIALIZATION
#==============================================================================
# Experiment configuration
EXPERIMENT_CONFIG = {
    "dataset_name": "czsu agent problematic2c",   # Name of the LangSmith dataset to use
    "experiment_prefix": "test1",   # Prefix for the experiment run
    "max_concurrency": 4,            # Maximum number of concurrent evaluations
    "evaluators": ["correctness"],   # List of evaluator functions to use
}

# Get Model
judge_llm = get_azure_llm(temperature=0.0)

# Create the agent graph
graph = create_graph()

#==============================================================================
# EVALUATION FUNCTIONS
#==============================================================================
async def correctness(outputs: dict, reference_outputs: dict) -> bool:
    """
    Evaluator function that uses an LLM to determine if the actual answer contains all information
    from the expected answer.

    Args:
        outputs (dict): The output dictionary from the agent, expected to contain a 'messages' list.
        reference_outputs (dict): The reference outputs from the dataset, expected to have 'answers' key.

    Returns:
        bool: True if the LLM judge determines the answer is correct, False otherwise.
    """
    if not outputs or "messages" not in outputs or not outputs["messages"]:
        return False
        
    actual_answer = outputs["messages"][-1].content
    expected_answer = reference_outputs.get("answers", "[NO EXPECTED ANSWER PROVIDED]")
    
    instructions = (
        "Given an actual answer and an expected answer, determine whether"
        " the actual answer contains all of the information in the"
        " expected answer. Respond with 'CORRECT' if the actual answer"
        " does contain all of the expected information and 'INCORRECT'"
        " otherwise. Do not include anything else in your response."
    )
    user_msg = f"ACTUAL ANSWER: {actual_answer}\n\nEXPECTED ANSWER: {expected_answer}"
    
    response = await judge_llm.ainvoke(
        [
            {"role": "system", "content": instructions},
            {"role": "user", "content": user_msg}
        ]
    )
    return response.content.upper() == "CORRECT"

#==============================================================================
# STATE CONVERSION FUNCTIONS
#==============================================================================
def example_to_state(inputs: dict) -> dict:
    """
    Converts dataset example inputs into the agent's internal DataAnalysisState format.

    Args:
        inputs (dict): Input dictionary from the dataset, expected to have 'question' key.

    Returns:
        DataAnalysisState: Initialized state object for the agent.
    """
    return DataAnalysisState(
        prompt=inputs['question'],
        messages=[],
        result="",
        iteration=0
    )

async def target_with_config(inputs: dict):
    """
    Async wrapper to invoke the agent graph with a unique checkpoint thread_id in the config.

    Args:
        inputs (dict): Input dictionary from the dataset.

    Returns:
        dict: The output from the agent graph invocation, including 'messages'.
    """
    state = example_to_state(inputs)
    config = {"configurable": {"thread_id": str(uuid.uuid4())}}
    return await graph.ainvoke(state, config=config)

#==============================================================================
# MAIN EVALUATION
#==============================================================================
experiment_results = await aevaluate(
    target_with_config,
    data=EXPERIMENT_CONFIG["dataset_name"],
    evaluators=[correctness],
    max_concurrency=EXPERIMENT_CONFIG["max_concurrency"],
    experiment_prefix=EXPERIMENT_CONFIG["experiment_prefix"],
)

print(f"Evaluation results: {experiment_results}")

Temporarily setting MY_AGENT_DEBUG=0 for this notebook execution (was: 1)
View the evaluation results for experiment: 'test1-c80fcf42' at:
https://smith.langchain.com/o/817120e4-bc86-484a-9c30-795e42f5bb9e/datasets/038e0f73-9559-4744-bdb6-a36c24a11f96/compare?selectedSessions=13396ef4-7da5-4727-9ac3-31e6706202ae




0it [00:00, ?it/s]

Routing decision, iteration=0
Routing decision, iteration=0
Routing decision, iteration=0
Routing decision, iteration=1
Routing decision, iteration=1
Routing decision, iteration=2
Routing decision, iteration=2
Routing decision, iteration=0
Routing decision, iteration=0
Routing decision, iteration=1
Routing decision, iteration=1
Routing decision, iteration=0
Routing decision, iteration=2
Routing decision, iteration=0
Routing decision, iteration=1
Routing decision, iteration=1


Error running target function: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-05-01-preview have exceeded token rate limit of your current OpenAI S0 pricing tier. Please retry after 2 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit. For Free Account customers, upgrade to Pay as you Go here: https://aka.ms/429TrialUpgrade.'}}
Traceback (most recent call last):
  File "c:\Users\mmiscanuk\AppData\Local\anaconda3\envs\CZSU_Multi_Agent_Appb\Lib\site-packages\langsmith\evaluation\_arunner.py", line 1239, in _aforward
    await fn(
  File "c:\Users\mmiscanuk\AppData\Local\anaconda3\envs\CZSU_Multi_Agent_Appb\Lib\site-packages\langsmith\run_helpers.py", line 524, in async_wrapper
    function_result = await asyncio.create_task(  # type: ignore[call-arg]
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  Fi

Routing decision, iteration=2
Routing decision, iteration=0
Routing decision, iteration=0
Routing decision, iteration=0
Routing decision, iteration=1
Routing decision, iteration=1
Routing decision, iteration=0
Routing decision, iteration=2
Routing decision, iteration=1
Routing decision, iteration=0
Routing decision, iteration=2
Routing decision, iteration=0
Routing decision, iteration=0
Routing decision, iteration=0
Routing decision, iteration=1
Routing decision, iteration=1
Routing decision, iteration=2
Routing decision, iteration=2
Routing decision, iteration=0
Routing decision, iteration=0
Routing decision, iteration=1
Routing decision, iteration=1
Routing decision, iteration=2


Error running target function: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-05-01-preview have exceeded token rate limit of your current OpenAI S0 pricing tier. Please retry after 3 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit. For Free Account customers, upgrade to Pay as you Go here: https://aka.ms/429TrialUpgrade.'}}
Traceback (most recent call last):
  File "c:\Users\mmiscanuk\AppData\Local\anaconda3\envs\CZSU_Multi_Agent_Appb\Lib\site-packages\langsmith\evaluation\_arunner.py", line 1239, in _aforward
    await fn(
  File "c:\Users\mmiscanuk\AppData\Local\anaconda3\envs\CZSU_Multi_Agent_Appb\Lib\site-packages\langsmith\run_helpers.py", line 524, in async_wrapper
    function_result = await asyncio.create_task(  # type: ignore[call-arg]
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  Fi

Routing decision, iteration=0
Evaluation results: <AsyncExperimentResults test1-c80fcf42>


In [2]:
#==============================================================================
# CLEANUP
#==============================================================================
# Restore original MY_AGENT_DEBUG value
print(f"Finished evaluation. Restoring MY_AGENT_DEBUG environment variable...")
if original_debug_value is not None:
    os.environ['MY_AGENT_DEBUG'] = original_debug_value
    print(f"Restored MY_AGENT_DEBUG to original value: {original_debug_value}")
else:
    # If it was not set before, remove it
    if 'MY_AGENT_DEBUG' in os.environ:
        del os.environ['MY_AGENT_DEBUG']
        print("Removed temporary MY_AGENT_DEBUG environment variable")

Finished evaluation. Restoring MY_AGENT_DEBUG environment variable...
Restored MY_AGENT_DEBUG to original value: 1
