# Agent Output Evaluation Experiment

This notebook sets up and runs a Phoenix experiment that evaluates the agent's output against ground-truth data.

In [7]:
import uuid
import pandas as pd
import nest_asyncio

import phoenix as px
from phoenix.evals import TOOL_CALLING_PROMPT_TEMPLATE, OpenAIModel, llm_classify
from phoenix.experiments import evaluate_experiment, run_experiment
from phoenix.experiments.evaluators import create_evaluator
from phoenix.experiments.types import Example
from phoenix.trace import SpanEvaluations
from phoenix.trace.dsl import SpanQuery
import re
from subprocess import run, PIPE
from pathlib import Path
import os
import time

nest_asyncio.apply()

In [44]:
import pandas as pd
import phoenix as px

# Initialize Phoenix client
px_client = px.Client(warn_if_server_not_running=True)

# Ground truth: each question maps to the answer the agent is expected to give
agent_ground_truth = {
    "What is the amount of men in Prague at the end of Q3 2024?": "676069",
    "What is the amount of women in Prague at the end of Q3 2024?": "716056",
    "What is the amount of women in Zlin region at the end of Q3 2024?": "294996",
}

# One row per question, with the columns the dataset upload expects
new_data_df = pd.DataFrame(
    {
        "question": list(agent_ground_truth.keys()),
        "expected_answer": list(agent_ground_truth.values()),
    }
)
dataset_name = "agent_output_evaluation"

# Function to handle dataset operations
def get_or_create_dataset(client, name: str, data_df: pd.DataFrame):
    """Return the dataset called ``name``, uploading ``data_df`` if the lookup fails.

    Args:
        client: Object exposing ``get_dataset(name=...)`` and
            ``upload_dataset(dataset_name=..., dataframe=..., input_keys=..., output_keys=...)``.
        name: Dataset name to look up, and to create under when the lookup fails.
        data_df: Frame with ``question`` and ``expected_answer`` columns; it is
            uploaded only when the lookup fails.

    Returns:
        Whatever ``client.get_dataset`` or ``client.upload_dataset`` returns.
    """
    try:
        dataset = client.get_dataset(name=name)
    except Exception as exc:
        # Deliberate best effort: any lookup failure (not only "not found") falls
        # back to creating the dataset, but the cause is reported instead of hidden.
        print(f"Dataset lookup failed ({type(exc).__name__}: {exc})")
        print("Creating new dataset")
        return client.upload_dataset(
            dataset_name=name,
            dataframe=data_df,
            input_keys=["question"],
            output_keys=["expected_answer"]
        )

    print("Found existing dataset")
    return dataset

# Resolve the dataset: reuse the stored one or upload the ground truth
dataset = get_or_create_dataset(px_client, dataset_name, new_data_df)

# With a dataset in hand, append any ground-truth questions it does not hold yet
if dataset:
    existing_df = dataset.as_dataframe()
    current_questions = [
        record['input']['question'] for record in existing_df.to_dict('records')
    ]

    # Keep only the rows whose question is not stored yet
    new_records = new_data_df.loc[~new_data_df['question'].isin(current_questions)]

    if len(new_records) == 0:
        print("No new records to append")
    else:
        print(f"Appending {len(new_records)} new records")
        dataset = px_client.append_to_dataset(
            dataset_name=dataset_name,
            dataframe=new_records,
            input_keys=["question"],
            output_keys=["expected_answer"]
        )

# Report the final state of the dataset
final_df = dataset.as_dataframe()
print(f"\nFinal dataset has {len(final_df)} records")
display(final_df)



Creating new dataset
📤 Uploading dataset...
💾 Examples uploaded: https://app.phoenix.arize.com/datasets/RGF0YXNldDoxOA==/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246MTg=
No new records to append

Final dataset has 3 records


Unnamed: 0_level_0,input,output
example_id,Unnamed: 1_level_1,Unnamed: 2_level_1
RGF0YXNldEV4YW1wbGU6MTM1,{'question': 'What is the amount of men in Pra...,{'expected_answer': '676069'}
RGF0YXNldEV4YW1wbGU6MTM2,{'question': 'What is the amount of women in P...,{'expected_answer': '716056'}
RGF0YXNldEV4YW1wbGU6MTM3,{'question': 'What is the amount of women in Z...,{'expected_answer': '294996'}


In [None]:
def run_batch_analysis(example: Example) -> str:
    """Run the batch script for one dataset example and return its recorded result.

    Launches ``safe_crewai.bat flow kickoff`` from the current working directory
    with the example's question passed in the ``ANALYSIS_PROMPT`` environment
    variable, then searches ``analysis_results.txt`` (split on runs of 50 dashes)
    for an entry containing that question.

    Args:
        example: Dataset example; ``example.input["question"]`` is the prompt.

    Returns:
        The stripped text of the most recent entry containing the question, or a
        string starting with ``"ERROR:"`` when the results file is missing, empty,
        unreadable, or has no matching entry.
    """
    print(f"\n[DEBUG] Starting analysis for example: {example.input}")
    question = example.input["question"]

    # Script and results file are both resolved against the current working directory
    root_dir = Path.cwd()
    results_file = root_dir / "analysis_results.txt"
    bat_path = root_dir / "safe_crewai.bat"

    print(f"[DEBUG] Results file path: {results_file}")
    print(f"[DEBUG] Batch script path: {bat_path}")

    # Do NOT clear previous results - we want to append

    # Run the batch processor with the question
    print("[DEBUG] Executing batch processor...")
    result = run(
        [str(bat_path), "flow", "kickoff"],
        # The prompt is listed last so it wins over any ANALYSIS_PROMPT that is
        # already present in the inherited environment.
        env={**os.environ, "ANALYSIS_PROMPT": question},
        shell=True,
        text=True,
        capture_output=True,
        cwd=str(root_dir)
    )

    print(f"[DEBUG] Batch processor stdout: {result.stdout}")
    print(f"[DEBUG] Batch processor stderr: {result.stderr}")
    if result.returncode != 0:
        # Still try to read the results below; the file may hold a usable entry.
        print(f"[ERROR] Batch processor exited with code {result.returncode}")

    # run() has already waited for the script itself; this pause only covers a
    # writer that outlives it.
    time.sleep(5)

    # Read the result from analysis_results.txt
    print("[DEBUG] Reading analysis results file...")
    try:
        if not results_file.exists():
            print(f"[ERROR] Results file does not exist at: {results_file}")
            return "ERROR: No results file found"

        content = results_file.read_text(encoding='utf-8')
        print(f"[DEBUG] Raw file content length: {len(content)}")
        if not content:
            print("[ERROR] Results file is empty")
            return "ERROR: Empty results file"

        # Results are appended, so scan from the end: the newest entry for this
        # prompt must win over one left behind by an earlier run.
        for entry in reversed(content.split("-" * 50)):
            if question in entry:
                print("[DEBUG] Found matching entry for prompt")
                return entry.strip()

        print("[ERROR] No matching entry found for prompt")
        return "ERROR: No matching entry found"

    except Exception as e:
        # Best effort: a read failure is reported as an ERROR result, not raised
        print(f"[ERROR] Failed to read results: {str(e)}")
        return f"ERROR: {str(e)}"

In [None]:
@create_evaluator(name="answer_match", kind="CODE")
def evaluate_answer(output: str, expected: dict) -> float:
    """Score whether the expected answer appears verbatim in the task output.

    Args:
        output: Text produced by the task; non-string values are converted with
            ``str()`` before matching.
        expected: Mapping whose ``"expected_answer"`` entry is the text to find.

    Returns:
        ``1.0`` when the expected answer is a substring of the output, ``0.0``
        when it is not or when the output is a string starting with ``"ERROR:"``.
    """
    expected_answer = str(expected['expected_answer'])
    # Normalise before slicing so a non-string output cannot break the debug print
    output_text = output if isinstance(output, str) else str(output)

    print(f"\n[DEBUG] Evaluating answer:")
    print(f"[DEBUG] Expected: {expected_answer}")
    print(f"[DEBUG] Actual (first 100 chars): {output_text[:100]}...")

    if isinstance(output, str) and output.startswith('ERROR:'):
        print(f"[DEBUG] Error in output: {output}")
        return 0.0

    # Literal containment: the expected answer is data, not a regex, so characters
    # such as "." or "(" must match themselves.
    result = expected_answer in output_text
    print(f"[DEBUG] Match found: {result}")

    # Return float score directly
    return float(result)

In [None]:
# Clear results file before starting experiment, so every entry read back by the
# task comes from this experiment
results_file = Path.cwd() / "analysis_results.txt"
if results_file.exists():
    print("[DEBUG] Clearing previous results file before starting experiment")
    results_file.unlink()

# Run experiment: each dataset example goes through run_batch_analysis and is
# scored by evaluate_answer
experiment = run_experiment(
    dataset,
    run_batch_analysis,
    evaluators=[evaluate_answer],
    experiment_name="test_experiment_v1__agent_answer_evaluation",
    experiment_description="Evaluating if the agent's output contains the correct numerical answer"
)

# Get results and handle DataFrame structure
try:
    # Try to get results with original structure
    results_df = experiment.as_dataframe()
except KeyError as e:
    print(f"[DEBUG] DataFrame structure differs from expected: {e}")
    # Create DataFrame manually from experiment results.
    # The loop variable must not be called `run`: that would rebind the
    # notebook-global `run` imported from subprocess and break later calls to
    # run_batch_analysis in the same kernel.
    results = []
    for run_id, experiment_run in experiment.runs.items():
        results.append({
            'id': run_id,
            'input': experiment_run.input,
            'output': experiment_run.output,
            'expected': experiment_run.expected,
            'example_id': experiment_run.dataset_example_id
        })
    results_df = pd.DataFrame(results)

# Debug info
print("\nDebug Info:")
print(f"DataFrame columns: {results_df.columns}")
print("\nFirst row:")
print(results_df.iloc[0].to_dict() if not results_df.empty else "No results")

# Display full results
print("\nFull Results DataFrame:")
display(results_df)

[DEBUG] Clearing previous results file before starting experiment
🧪 Experiment started.
📺 View dataset experiments: https://app.phoenix.arize.com/datasets/RGF0YXNldDoxNg==/experiments
🔗 View this experiment: https://app.phoenix.arize.com/datasets/RGF0YXNldDoxNg==/compare?experimentId=RXhwZXJpbWVudDoxNg==
🧪 Experiment started.
📺 View dataset experiments: https://app.phoenix.arize.com/datasets/RGF0YXNldDoxNg==/experiments
🔗 View this experiment: https://app.phoenix.arize.com/datasets/RGF0YXNldDoxNg==/compare?experimentId=RXhwZXJpbWVudDoxNg==




running tasks |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s


[DEBUG] Starting analysis for example: {'question': 'What is the amount of men in Prague at the end of Q3 2024?'}
[DEBUG] Results file path: e:\OneDrive\Knowledge Base\0207_GenAI\Code\CrewAI\test_prototypes\prototype3\analysis_results.txt
[DEBUG] Batch script path: e:\OneDrive\Knowledge Base\0207_GenAI\Code\CrewAI\test_prototypes\prototype3\safe_crewai.bat
[DEBUG] Executing batch processor...
[DEBUG] Batch processor stdout: Running flow kickoff safely...
Initializing Phoenix tracing...
OpenTelemetry Tracing Details
|  Phoenix Project: CrewAI_Prototype3
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: https://app.phoenix.arize.com/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {'api_key': '****', 'authorization': '****'}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_trac

running experiment evaluations |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s


[DEBUG] Evaluating answer:
[DEBUG] Expected: 676069
[DEBUG] Actual (first 100 chars): Prompt: What is the amount of men in Prague at the end of Q3 2024?
Result: value: 
  - 676069...
[DEBUG] Match found: True

[DEBUG] Evaluating answer:
[DEBUG] Expected: 716056
[DEBUG] Actual (first 100 chars): Prompt: What is the amount of women in Prague at the end of Q3 2024?
Result: The amount of women in ...
[DEBUG] Match found: True

[DEBUG] Evaluating answer:
[DEBUG] Expected: 716056
[DEBUG] Actual (first 100 chars): Prompt: What is the amount of women in Prague at the end of Q3 2024?
Result: The amount of women in ...
[DEBUG] Match found: True

🔗 View this experiment: https://app.phoenix.arize.com/datasets/RGF0YXNldDoxNg==/compare?experimentId=RXhwZXJpbWVudDoxNg==

Experiment Summary (04/22/25 11:33 PM +0200)
--------------------------------------------
| evaluator    |   n |   n_scores |   avg_score |
|:-------------|----:|-----------:|------------:|
| answer_match |   2 |          2 |       

Unnamed: 0_level_0,output,input,expected,example_id
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RXhwZXJpbWVudFJ1bjoyOQ==,Prompt: What is the amount of men in Prague at...,{'question': 'What is the amount of men in Pra...,{'expected_answer': '676069'},RGF0YXNldEV4YW1wbGU6MTI5
RXhwZXJpbWVudFJ1bjozMA==,Prompt: What is the amount of women in Prague ...,{'question': 'What is the amount of women in P...,{'expected_answer': '716056'},RGF0YXNldEV4YW1wbGU6MTMw
