### Evaluation of planned agent trajectories

- This Jupyter notebook implements a system for evaluating LLM-generated planned trajectories against the ground truth (GT).
- It supports both the OpenAI Evals framework (BLEU score) and a fine-grained custom manual evaluation.

Before launching this notebook, you must first run `tau_eval_gen_planned_trajectories.ipynb`. If you wish to use the OpenAI Evals API, you also need to create a `.env` file containing your OpenAI API key (`OPENAI_API_KEY`).

In [1]:
import json
import logging
import os
import time
from collections import Counter
from typing import Any, Callable, Dict, List, Optional, Tuple

import pandas as pd
import requests
from dotenv import load_dotenv

In [2]:
# Configure root logging for the notebook: INFO and above, each record
# prefixed with a timestamp and its level name.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

In [None]:
# Load variables from a local .env file (if present) into the environment,
# then require the OpenAI API key to be available.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError("OPENAI_API_KEY not set. Please add it to your .env file.")

# File written for the OpenAI eval upload, and the two parquet inputs.
EVAL_FILE_PATH = "eval_data.json"
RETAIL_DATA_PATH = "retail_eval_results.parquet"
AIRLINE_DATA_PATH = "airline_eval_results.parquet"

# Both input files must already exist; fail early with a pointer to the
# notebook that generates them.
for _required_path in (RETAIL_DATA_PATH, AIRLINE_DATA_PATH):
    if not os.path.exists(_required_path):
        raise FileNotFoundError(f"{_required_path} not found. Please generate it first running `tau_eval_gen_planned_trajectories.ipynb`.")


In [4]:
# Selects the evaluation backend. Supported values:
#   "manual" - custom evaluation metrics
#   "openai" - OpenAI's evaluation framework with BLEU score
EVAL_ENGINE = "openai"
if EVAL_ENGINE not in ("manual", "openai"):
    raise ValueError("EVAL_ENGINE must be either 'manual' or 'openai'")

In [None]:
#OpenAI Eval API
def create_eval(timeout: float = 30.0) -> Dict[str, Any]:
    """Create the "Planned Trajectory" eval definition through the OpenAI Evals API.

    The eval declares a custom item schema with an ``instructions`` string and
    two trajectories (``planned_trajectory`` and ``output_planned_trajectory``),
    each an array of ``{"name": str, "arguments": object}`` calls, and a single
    ``text_similarity`` criterion using BLEU with a pass threshold of 0.5.

    Args:
        timeout: Seconds passed to ``requests`` as the request timeout, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    url = "https://api.openai.com/v1/evals"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }

    # Both trajectories share the same shape: a list of tool calls.
    trajectory_schema = {
        "type": "array",
        "items": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "arguments": {"type": "object"}
            },
            "required": ["name", "arguments"]
        }
    }

    data = {
        "name": "Planned Trajectory",
        "data_source_config": {
            "type": "custom",
            "item_schema": {
                "type": "object",
                "properties": {
                    "instructions": {"type": "string"},
                    "planned_trajectory": trajectory_schema,
                    "output_planned_trajectory": trajectory_schema
                },
                "required": ["instructions", "planned_trajectory", "output_planned_trajectory"]
            }
        },
        "testing_criteria": [
            {
                "type": "text_similarity",
                "name": "Match output to human label",
                "evaluation_metric": "bleu",
                # NOTE(review): `input` points at the field that
                # format_eval_data fills with the ground truth, and `reference`
                # at the generated plan. BLEU is not symmetric, so confirm this
                # direction is intended (the grader treats `input` as the text
                # being graded) before changing it.
                "input": "{{ item.planned_trajectory }}",
                "pass_threshold": 0.5,
                "reference": "{{ item.output_planned_trajectory }}"
            }
        ]
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    if not response.ok:
        # Keep the API's error body visible; raise_for_status() alone drops it.
        logging.error("Creating the eval failed (HTTP %s): %s", response.status_code, response.text)
    response.raise_for_status()
    return response.json()

def upload_eval_file(file_path, timeout: float = 120.0):
    """Upload a JSONL eval data file to the OpenAI Files API with purpose ``evals``.

    Args:
        file_path: Path of the file to upload.
        timeout: Seconds passed to ``requests`` as the request timeout.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    url = "https://api.openai.com/v1/files"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}"
    }
    # The context manager guarantees the handle is closed even if the upload
    # fails; the request must therefore be issued inside the `with` block.
    with open(file_path, 'rb') as eval_file:
        files = {
            'file': eval_file,
            'purpose': (None, 'evals')
        }
        response = requests.post(url, headers=headers, files=files, timeout=timeout)
    if not response.ok:
        # Keep the API's error body visible; raise_for_status() alone drops it.
        logging.error("Uploading the eval file failed (HTTP %s): %s", response.status_code, response.text)
    response.raise_for_status()
    return response.json()

def create_eval_run(eval_id, file_id, timeout: float = 30.0):
    """Start a run of an existing eval against a previously uploaded JSONL file.

    Args:
        eval_id: Identifier of the eval to run.
        file_id: Identifier of the uploaded data file used as the data source.
        timeout: Seconds passed to ``requests`` as the request timeout.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    url = f"https://api.openai.com/v1/evals/{eval_id}/runs"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "data_source": {
            "type": "jsonl",
            "source": {
                "type": "file_id",
                "id": file_id
            }
        }
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    if not response.ok:
        # Keep the API's error body visible; raise_for_status() alone drops it.
        logging.error("Creating the eval run failed (HTTP %s): %s", response.status_code, response.text)
    response.raise_for_status()
    return response.json()

def get_eval_run(eval_id, run_id, timeout: float = 30.0):
    """Fetch the current state of an eval run.

    Args:
        eval_id: Identifier of the eval the run belongs to.
        run_id: Identifier of the run to fetch.
        timeout: Seconds passed to ``requests`` as the request timeout.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    url = f"https://api.openai.com/v1/evals/{eval_id}/runs/{run_id}"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if not response.ok:
        # Keep the API's error body visible; raise_for_status() alone drops it.
        logging.error("Fetching the eval run failed (HTTP %s): %s", response.status_code, response.text)
    response.raise_for_status()
    return response.json()

def format_eval_data(retail_df, airline_df, output_file):
    """Write both result frames to ``output_file`` as JSON Lines.

    Every row becomes one line of the form ``{"item": {...}}`` holding the
    row's ``instruction``, its ``ground_truth_calls`` (stored under
    ``planned_trajectory``) and the domain-specific planned calls (stored
    under ``output_planned_trajectory``). Retail rows are written first,
    followed by airline rows.
    """
    # Each frame is paired with the column that holds its generated plan.
    sources = (
        (retail_df, "retail_planned_calls"),
        (airline_df, "airline_planned_calls"),
    )
    with open(output_file, "w") as sink:
        for frame, planned_column in sources:
            for _, record in frame.iterrows():
                payload = {
                    "item": {
                        "instructions": record["instruction"],
                        "planned_trajectory": record["ground_truth_calls"],
                        "output_planned_trajectory": record[planned_column],
                    }
                }
                sink.write(json.dumps(payload) + "\n")



In [6]:
#Manual Eval
def canonical_params(arguments: Any) -> Any:
    """Convert (possibly nested) call arguments into an order-independent form.

    Dicts become tuples of ``(key, value)`` pairs sorted by key, lists become
    tuples, and every other value is returned unchanged. The conversion is
    applied recursively, so two argument dicts that differ only in key order
    compare equal; the result is hashable as long as the leaf values are.
    """
    if isinstance(arguments, dict):
        return tuple(sorted((key, canonical_params(value)) for key, value in arguments.items()))
    if isinstance(arguments, list):
        return tuple(canonical_params(item) for item in arguments)
    return arguments


def compare_function_calls(gt_calls: List[Dict[str, Any]],planned_calls: List[Dict[str, Any]],mode: str = 'strict_no_parameters') -> bool:
    """Decide whether a planned call sequence matches the ground truth.

    Only dict entries that carry a ``'name'`` key are compared; planned
    entries that contain an ``'error'`` key are ignored as well.

    Args:
        gt_calls: Ground-truth calls, each ``{"name": ..., "arguments": ...}``.
        planned_calls: Planned calls in the same format.
        mode: One of
            ``'strict'`` - same names and arguments, in the same order;
            ``'strict_no_parameters'`` - same names, in the same order;
            ``'partial'`` - same (name, arguments) pairs with the same
            multiplicity, in any order;
            ``'partial_no_parameters'`` - same set of names (order and
            duplicates ignored).

    Returns:
        True when the two sequences match under ``mode``. If
        ``planned_calls`` is not a list, the result is True only when it is
        falsy and ``gt_calls`` is an empty list. If only ``gt_calls`` is not a
        list, the result is True only when ``planned_calls`` is empty.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes.
    """
    # Validate the mode first so a typo is reported on every code path,
    # including the early returns below.
    if mode not in ('strict', 'strict_no_parameters', 'partial', 'partial_no_parameters'):
        raise ValueError(f"Invalid comparison mode: {mode}. Choose from 'strict', 'strict_no_parameters', 'partial', 'partial_no_parameters'.")

    if not isinstance(planned_calls, list):
        if planned_calls:
            return False
        return isinstance(gt_calls, list) and not gt_calls
    if not isinstance(gt_calls, list):
        return not planned_calls # Only match if planned_calls is also empty

    # Use one filter per side for both names and arguments, so the two lists
    # zipped together below always stay aligned entry by entry.
    valid_gt_calls = [
        call for call in gt_calls
        if isinstance(call, dict) and 'name' in call
    ]
    valid_planned_calls = [
        call for call in planned_calls
        if isinstance(call, dict) and 'name' in call and 'error' not in call
    ]

    try:
        # Ground Truth
        gt_names = [call['name'] for call in valid_gt_calls]
        gt_params_canonical = [canonical_params(call.get('arguments', {})) for call in valid_gt_calls]
        gt_tuples = list(zip(gt_names, gt_params_canonical))

        # Planned
        planned_names = [call['name'] for call in valid_planned_calls]
        planned_params_canonical = [canonical_params(call.get('arguments', {})) for call in valid_planned_calls]
        planned_tuples = list(zip(planned_names, planned_params_canonical))

    except KeyError as e:
        logging.warning(f"Warning: Missing expected key ({e}) in call structure. Assuming mismatch.")
        return False
    except Exception as e:
        # Best effort: malformed call data counts as a mismatch, not a crash.
        logging.warning(f"Warning: Error extracting call information: {e}. Assuming mismatch.")
        return False

    if mode == 'strict_no_parameters':
        return gt_names == planned_names
    if mode == 'strict':
        return gt_tuples == planned_tuples
    if mode == 'partial_no_parameters':
        return set(gt_names) == set(planned_names)
    # mode == 'partial': multiset comparison, order-insensitive.
    return Counter(gt_tuples) == Counter(planned_tuples)

In [7]:
# Load the planned-trajectory results produced by the generation notebook.
retail_df = pd.read_parquet(RETAIL_DATA_PATH)
airline_df = pd.read_parquet(AIRLINE_DATA_PATH)

if EVAL_ENGINE == "openai":
    # Workflow: write the JSONL file, create the eval, upload the file,
    # start a run, then poll until the run finishes.
    format_eval_data(retail_df, airline_df, EVAL_FILE_PATH)

    # Named `eval_config` rather than `eval` so the builtin is not shadowed.
    eval_config = create_eval()
    logging.info(eval_config)
    eval_id = eval_config["id"]

    eval_file = upload_eval_file(EVAL_FILE_PATH)
    logging.info(eval_file)
    eval_file_id = eval_file["id"]
    run = create_eval_run(eval_id, eval_file_id)
    logging.info(run)
    run_id = run["id"]

    # Stop on any terminal status: waiting only for "completed" would loop
    # forever if the run ends up failed or canceled.
    terminal_statuses = {"completed", "failed", "canceled"}
    result = get_eval_run(eval_id, run_id)
    while result["status"] not in terminal_statuses:
        logging.info("Waiting for evaluation to complete...")
        time.sleep(1)
        result = get_eval_run(eval_id, run_id)

    logging.info(result)
    if result["status"] != "completed":
        raise RuntimeError(f"Evaluation run {run_id} ended with status '{result['status']}'.")

elif EVAL_ENGINE == "manual":
    modes_to_test = ['strict', 'strict_no_parameters', 'partial', 'partial_no_parameters']
    # (label, frame, column holding the planned calls) for each domain.
    datasets = (
        ("retail", retail_df, "retail_planned_calls"),
        ("airline", airline_df, "airline_planned_calls"),
    )

    logging.info("Applying comparison functions...")
    for mode in modes_to_test:
        column_name = f'plan_match_{mode}'
        logging.info(f"Calculating for mode: {mode}")
        for _, df, planned_column in datasets:
            # Adds one boolean column per mode to the frame, in place.
            df[column_name] = df.apply(
                lambda row: compare_function_calls(
                    row.get('ground_truth_calls', []),
                    row.get(planned_column, []),
                    mode=mode
                ),
                axis=1
            )

    # Fraction of matching rows per domain and mode.
    results = {}
    for mode in modes_to_test:
        column_name = f'plan_match_{mode}'
        for label, df, _ in datasets:
            if df[column_name].dtype == 'bool':
                results[f'{label}_{mode}'] = df[column_name].mean()
            else:
                logging.warning(f"Column '{column_name}' in {label}_df does not contain boolean values. Cannot calculate percentage.")
                logging.info(df[column_name].value_counts())

    # Written once, after all modes are aggregated (not once per mode).
    results_df = pd.DataFrame([results])
    results_df.to_csv('evaluation_results_manual.csv', index=False)
else:
    raise ValueError("Invalid EVAL_ENGINE. Please set EVAL_ENGINE to 'openai' or 'manual'.")

2025-04-11 13:17:31,749 - INFO - {'object': 'eval', 'id': 'eval_67f8fa45cb8881908d23e770b00e60f8', 'data_source_config': {'type': 'custom', 'schema': {'type': 'object', 'properties': {'item': {'type': 'object', 'properties': {'instructions': {'type': 'string'}, 'planned_trajectory': {'type': 'array', 'items': {'type': 'object', 'properties': {'name': {'type': 'string'}, 'arguments': {'type': 'object'}}, 'required': ['name', 'arguments']}}, 'output_planned_trajectory': {'type': 'array', 'items': {'type': 'object', 'properties': {'name': {'type': 'string'}, 'arguments': {'type': 'object'}}, 'required': ['name', 'arguments']}}}, 'required': ['instructions', 'planned_trajectory', 'output_planned_trajectory']}}, 'required': ['item']}}, 'testing_criteria': [{'name': 'Match output to human label', 'id': 'Match output to human label-d38ff888-b4d1-4b17-a43c-0c24dc1f2b7c', 'type': 'text_similarity', 'input': '{{ item.planned_trajectory }}', 'reference': '{{ item.output_planned_trajectory }}', 'p