In [17]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
import phoenix as px
from phoenix.evals import OpenAIModel
from phoenix.experiments import run_experiment, evaluate_experiment
from phoenix.experiments.types import Example
from phoenix.experiments.evaluators import create_evaluator
from phoenix.otel import register
import pandas as pd
from datetime import datetime
import os
import nest_asyncio
nest_asyncio.apply()

In [27]:
from utilsl9 import run_agent, get_phoenix_endpoint

Overriding of current TracerProvider is not allowed
Attempting to instrument while already instrumented


🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: evaluating-agent-path
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: http://localhost:6006/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [28]:
# Phoenix client used further down to upload the question dataset.
# Constructed with no arguments — presumably it picks up Phoenix's default endpoint; confirm if a remote server is used.
px_client = px.Client()

In [29]:
# Differently worded versions of the same request (average quantity per transaction).
# They form the input set for the convergence experiment run later in the notebook.
convergence_questions = [
    "What was the average quantity sold per transaction?",
    "What is the mean number of items per sale?",
    "Calculate the typical quantity per transaction",
    "What's the mean transaction size in terms of quantity?",
    "On average, how many items were purchased per transaction?",
    "What is the average basket size per sale?",
    "Calculate the mean number of products per purchase",
    "What's the typical number of units per order?",
    "What is the average number of products bought per purchase?",
    "Tell me the mean quantity of items in a typical transaction",
    "How many items does a customer buy on average per transaction?",
    "What's the usual number of units in each sale?",
    "What is the typical amount of products per transaction?",
    "Show the mean number of items customers purchase per visit",
    "What's the average quantity of units per shopping trip?",
    "How many products do customers typically buy in one transaction?",
    "What is the standard basket size in terms of quantity?",
]

# Single-column frame: one row per question, column named "question".
convergence_df = pd.DataFrame(convergence_questions, columns=["question"])

# Current local time, formatted to the second, becomes a suffix of the dataset name
# (presumably so that re-running this cell does not collide with an earlier upload).
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Upload the question frame to Phoenix; the "question" column is declared as the example input.
dataset = px_client.upload_dataset(
    dataframe=convergence_df,
    dataset_name=f"convergence_questions-{now}",
    input_keys=["question"],
)

📤 Uploading dataset...
💾 Examples uploaded: http://localhost:6006/datasets/RGF0YXNldDoz/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246Mw==


In [30]:
# Print the Phoenix endpoint reported by the helper imported from utilsl9.
print(get_phoenix_endpoint())

http://localhost:6006/


In [31]:
# helper that turns the message list returned by the task into readable text
def format_message_steps(messages):
    """
    Render a list of chat messages as one human-readable line per step.

    Args:
        messages (list): Dict-like messages read via "role", "content" and,
            for assistant messages, "tool_calls".

    Returns:
        str: The step descriptions joined with newlines. Messages whose role is
        not user/system/assistant/tool contribute nothing.
    """
    def describe(msg):
        # Return the list of step lines contributed by a single message.
        kind = msg.get("role")
        if kind == "user":
            return [f"User: {msg.get('content')}"]
        if kind == "system":
            # The system prompt text itself is deliberately not echoed.
            return ["System: Provided context"]
        if kind == "tool":
            return [f"Tool response: {msg.get('content')}"]
        if kind == "assistant":
            if not msg.get("tool_calls"):
                return [f"Assistant: {msg.get('content')}"]
            # One line per requested tool; the assistant's content is not shown here.
            lines = []
            for call in msg["tool_calls"]:
                called = call["function"]["name"]
                lines.append(f"Assistant: Called tool '{called}'")
            return lines
        return []

    return "\n".join(line for msg in messages for line in describe(msg))

In [32]:
def run_agent_and_track_path(example: Example) -> dict:
    """
    Experiment task: run the agent on one dataset example and record the path taken.

    Args:
        example (Example): Dataset example; its input's "question" value is sent
            to the agent as the single user message.

    Returns:
        dict: "path_length" holds len() of what run_agent returned, and
        "messages" holds that result rendered by format_message_steps.
    """
    messages = [{"role": "user", "content": example.input.get("question")}]
    ret = run_agent(messages)
    return {"path_length": len(ret), "messages": format_message_steps(ret)}

In [33]:
# Run run_agent_and_track_path over the uploaded dataset; the returned experiment
# object is kept in `experiment` for the path-length analysis and evaluation cells below.
experiment = run_experiment(dataset,
                            run_agent_and_track_path,
                            experiment_name="Convergence Eval",
                            experiment_description="Evaluating the convergence of the agent")

🧪 Experiment started.
📺 View dataset experiments: http://localhost:6006/datasets/RGF0YXNldDoz/experiments
🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDoz/compare?experimentId=RXhwZXJpbWVudDo0


running tasks |          | 0/17 (0.0%) | ⏳ 00:00<? | ?it/s

Running agent with messages: [{'role': 'user', 'content': 'What was the average quantity sold per transaction?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'What is the mean number of items per sale?'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'content': 'Calculate the typical quantity per transaction'}]
Added system prompt to messages
Starting router
Received response with tool calls: True
Processing tool calls
Starting router
Received response with tool calls: False
No tool calls, returning final response
Running agent with messages: [{'role': 'user', 'con

In [15]:
# NOTE(review): this cell's execution count (15) is older than the neighbouring cells (33, 34),
# and its saved output shows runs failing with an AttributeError while the final task summary
# reports 0 errors — the saved output looks stale; re-run this cell before relying on it.
experiment.as_dataframe()

Unnamed: 0_level_0,error,input,example_id
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RXhwZXJpbWVudFJ1bjoxOA==,"AttributeError(""'str' object has no attribute ...",{'question': 'What was the average quantity so...,RGF0YXNldEV4YW1wbGU6MTg=
RXhwZXJpbWVudFJ1bjoxOQ==,"AttributeError(""'str' object has no attribute ...",{'question': 'What is the mean number of items...,RGF0YXNldEV4YW1wbGU6MTk=
RXhwZXJpbWVudFJ1bjoyMA==,"AttributeError(""'str' object has no attribute ...",{'question': 'Calculate the typical quantity p...,RGF0YXNldEV4YW1wbGU6MjA=
RXhwZXJpbWVudFJ1bjoyMQ==,"AttributeError(""'str' object has no attribute ...",{'question': 'What's the mean transaction size...,RGF0YXNldEV4YW1wbGU6MjE=
RXhwZXJpbWVudFJ1bjoyMg==,"AttributeError(""'str' object has no attribute ...","{'question': 'On average, how many items were ...",RGF0YXNldEV4YW1wbGU6MjI=
RXhwZXJpbWVudFJ1bjoyMw==,"AttributeError(""'str' object has no attribute ...",{'question': 'What is the average basket size ...,RGF0YXNldEV4YW1wbGU6MjM=
RXhwZXJpbWVudFJ1bjoyNA==,"AttributeError(""'str' object has no attribute ...",{'question': 'Calculate the mean number of pro...,RGF0YXNldEV4YW1wbGU6MjQ=
RXhwZXJpbWVudFJ1bjoyNQ==,"AttributeError(""'str' object has no attribute ...",{'question': 'What's the typical number of uni...,RGF0YXNldEV4YW1wbGU6MjU=
RXhwZXJpbWVudFJ1bjoyNg==,"AttributeError(""'str' object has no attribute ...",{'question': 'What is the average number of pr...,RGF0YXNldEV4YW1wbGU6MjY=
RXhwZXJpbWVudFJ1bjoyNw==,"AttributeError(""'str' object has no attribute ...",{'question': 'Tell me the mean quantity of ite...,RGF0YXNldEV4YW1wbGU6Mjc=


In [34]:
experiment_df = experiment.as_dataframe()

# The dataframe can come back without an "output" column (seen when the task runs
# errored), so fail with an actionable message rather than a bare KeyError: 'output'.
if "output" not in experiment_df:
    raise KeyError(
        "The experiment dataframe has no 'output' column; inspect its 'error' column "
        "and re-run the experiment before computing the optimal path length."
    )
outputs = experiment_df["output"].to_dict().values()

# Will include the user and system messages
path_lengths = [
    output.get('path_length')
    for output in outputs
    if output and output.get('path_length') is not None
]

# min() on an empty sequence raises an opaque "min() arg is an empty sequence";
# explain what is actually missing instead.
if not path_lengths:
    raise ValueError(
        "No experiment run produced a 'path_length'; inspect experiment.as_dataframe() "
        "for task errors before computing the optimal path length."
    )

# The shortest observed path is treated as the optimal one.
optimal_path_length = min(path_lengths)
print(f"The optimal path length is {optimal_path_length}")

The optimal path length is 5


In [35]:
@create_evaluator(name="Convergence Eval", kind="CODE")
def evaluate_path_length(output: dict) -> float:
    """
    Score a run by how close its path length is to the shortest observed one.

    Reads the notebook-level `optimal_path_length`, which must be computed
    before this evaluator runs.

    Args:
        output (dict): The task output; its "path_length" entry is read.

    Returns:
        float: optimal_path_length divided by the run's path length, or 0.0 when
        the output is empty or has no (or a zero) "path_length".
    """
    if output and output.get("path_length"):
        return optimal_path_length / float(output.get("path_length"))
    # Missing output or path length: score as 0.0 so the return stays a float.
    return 0.0

In [36]:
# Apply the path-length evaluator to the experiment's runs; `experiment` is rebound
# to whatever evaluate_experiment returns.
experiment = evaluate_experiment(experiment,
                            evaluators=[evaluate_path_length])

🧠 Evaluation started.


running experiment evaluations |          | 0/17 (0.0%) | ⏳ 00:00<? | ?it/s


🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDoz/compare?experimentId=RXhwZXJpbWVudDo0

Experiment Summary (03/08/25 07:06 PM -0800)
--------------------------------------------
          evaluator   n  n_scores  avg_score
0  Convergence Eval  17        17        1.0

Tasks Summary (03/08/25 07:01 PM -0800)
---------------------------------------
   n_examples  n_runs  n_errors
0          17      17         0
