In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ADK Agent Evaluation with Vertex AI Gen AI Evaluation Service

> **Note**: This notebook is for reference and educational purposes only. Not intended for production use.  
> **Questions?** mateuswagner@google.com

This notebook demonstrates how to build and systematically evaluate AI agents using Google's Agent Development Kit (ADK) and Vertex AI Gen AI Evaluation Service.

## What You'll Build

A product research agent that retrieves product details and prices, evaluated using three complementary approaches to measure quality, tool usage, and business logic compliance.

**Tech Stack**: ADK + Gemini + Vertex AI Gen AI Evaluation Service

## Evaluation Approaches

### 1. Rubric-Based Metrics (LLM-as-Judge)

Uses a separate LLM judge to evaluate response quality with pre-built metrics:
- `instruction_following` - Answers the question asked
- `fluency` - Grammatically correct and natural
- `coherence` - Logically consistent and clear
- `safety` - Free from harmful content
- `text_quality` - Overall quality assessment
- `verbosity` - Appropriate response length

**When to use**: Automated quality assurance for customer-facing agent responses

### 2. **Custom** Metrics (LLM-as-Judge)

Evaluates domain-specific criteria using custom prompt templates:
- `response_completeness` - Provides every type of information the user asked for

**When to use**: Business-specific quality validation beyond generic metrics

### 3. Custom Function Metrics (Deterministic)

Custom Python functions that validate agent behavior without LLM judge calls:

**Tool Usage Validation**:
- `tool_count` - Number of tools called
- `tool_efficiency` - Uses minimum necessary tools

**Response Validation**:
- `response_length` - Character count
- `response_conciseness` - Appropriately brief
- `numeric_response` - Contains numbers when expected

**Business Logic Validation**:
- `valid_product` - Queries only catalog products
- `correct_tool_selection` - Uses appropriate tool for query type
- `price_range_valid` - Price within expected bounds

**When to use**: Fast, deterministic validation of agent logic and compliance

## Implementation Workflow

**Setup**

1. Install dependencies and configure GCP project
2. Initialize Vertex AI with experiment tracking
3. Define helper functions for parsing and display

**Agent Development**

4. Create custom tools for product research
5. Configure Gemini model and build ADK agent
6. Create evaluation dataset with expected behaviors

**Evaluation**

7. Run rubric-based evaluation for response quality
8. Run custom pointwise evaluation for completeness
9. Run function-based evaluation for logic validation
10. Compare metrics across evaluation approaches
11. Review detailed results and summary statistics

## Evaluation Dataset Structure

The dataset serves as ground truth for measuring agent performance.

This enables:
- Trajectory validation against expected tool usage
- Regression testing for agent updates
- Systematic coverage of edge cases
- Objective baselines for automated metrics

## Evaluation Modes

**On-the-fly generation** (used in this notebook): Provide only prompts, agent generates responses during evaluation

**BYOD (Bring Your Own Data)**: Provide prompts, responses, and trajectories for faster evaluation of pre-generated results

## Key Concepts

**LLM-as-Judge**: Uses a separate model to evaluate response quality based on rubrics and criteria. Provides nuanced assessment but has API costs.

**Deterministic Metrics**: Python functions that validate specific behaviors. Fast, free, and reproducible but limited to predefined rules.

**Complementary Strategy**: Combine both approaches for comprehensive agent evaluation - quality assessment via LLM judges and logic validation via deterministic functions.

## Outputs

All evaluation results are tracked in Vertex AI Experiments and persisted to Cloud Storage for reproducibility and comparison across runs.

---

## Get started

In [None]:
## Installation
# Python 3.10+ recommended
# Virtual environment (venv) recommended for isolation
# Google Cloud SDK initialized: `gcloud init`

%pip install --upgrade --quiet 'google-adk' nbformat 'google-cloud-aiplatform[evaluation]'

# Restart the Jupyter kernel after installation so the upgraded packages are loaded.


## Import libraries

In [1]:
# Standard library imports
import asyncio
import json
import os
import random
import string
import warnings
from typing import Any

# Third-party imports
import pandas as pd
from IPython.display import HTML, Markdown, display

# Google Cloud imports
import vertexai
from google.cloud import aiplatform
from google.genai import types
from google.cloud import storage
from google.api_core import exceptions

# Google ADK imports
from google.adk.agents import Agent
from google.adk.events import Event
from google.adk.runners import Runner
from google.adk.sessions import InMemorySessionService

# Vertex AI Evaluation imports
from vertexai.preview.evaluation import EvalTask
from vertexai.preview.evaluation.metrics import (
    CustomMetric,
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
    TrajectorySingleToolUse,
)

### CONFIGURATION SETTINGS

In [2]:
# Google Cloud Project Configuration
# CHANGE THIS: Set your Google Cloud project ID
PROJECT_ID = "matt-demos" # CHANGE IT!

# Fall back to the GOOGLE_CLOUD_PROJECT environment variable when PROJECT_ID
# is empty or still the "[your-project-id]" placeholder.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

# Fail fast with a clear message: wrapping a missing variable in str() would
# silently produce the literal project ID "None".
if not PROJECT_ID:
    raise ValueError(
        "PROJECT_ID is not set. Edit PROJECT_ID above or set the "
        "GOOGLE_CLOUD_PROJECT environment variable."
    )

# Default region for Vertex AI resources
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

# Cloud Storage Configuration
# CHANGE THIS: Set your Cloud Storage bucket name
BUCKET_NAME = "53642dcf-cdb9-4f6c-a3bb-cf6595602893"
BUCKET_URI = f"gs://{BUCKET_NAME}"

# Set environment variables (the last one enables Vertex AI for the
# Google Gen AI SDK; it is set once here instead of twice)
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
os.environ["GOOGLE_CLOUD_LOCATION"] = LOCATION
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"

# Vertex AI Experiments name for tracking evaluation runs
EXPERIMENT_NAME = "evaluate-adk-agent"

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Initialize Vertex AI with experiment tracking
vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)

In [3]:
# OPTIONAL: Create GCS bucket (skip if already exists)

def create_bucket_if_not_exists(bucket_name: str, location: str) -> None:
    """Ensure the named Cloud Storage bucket exists, creating it when absent.

    Looks the bucket up with a client bound to PROJECT_ID. An existing bucket
    is reported and left untouched; otherwise the bucket is created in
    ``location``.

    Args:
        bucket_name: Name of the bucket to look up or create.
        location: Location passed to the create call.

    Raises:
        Any exception raised by the lookup or create call. A short diagnostic
        is printed first, then the exception is re-raised unchanged.
    """
    client = storage.Client(project=PROJECT_ID)

    try:
        existing = client.lookup_bucket(bucket_name)

        if existing is None:
            print(f"Creating bucket '{bucket_name}' in {location}...")
            client.create_bucket(bucket_name, location=location)
            print(f"✓ Successfully created bucket '{bucket_name}' at gs://{bucket_name}")
        else:
            print(f"✓ Bucket '{bucket_name}' already exists - skipping creation")
            print(f"  Location: {existing.location} | Created: {existing.time_created}")

    except exceptions.Forbidden:
        print("✗ Permission denied. Ensure you have 'storage.buckets.create' permission")
        raise
    except exceptions.Conflict:
        print("✗ Bucket name already taken globally. Try a different name.")
        raise
    except Exception as error:
        print(f"✗ Error: {error}")
        raise

# Best-effort setup: a failure is reported but deliberately not re-raised, so
# the notebook can continue when the bucket already exists or is created manually.
try:
    create_bucket_if_not_exists(BUCKET_NAME, LOCATION)
except Exception:
    print("\nBucket creation failed. Continue if bucket exists or create manually")

✓ Bucket '53642dcf-cdb9-4f6c-a3bb-cf6595602893' already exists - skipping creation
  Location: US | Created: 2025-10-17 18:15:55.580000+00:00


## Define helper functions

In [4]:
def get_id(length: int = 8) -> str:
    """Return a random identifier of ``length`` characters (default 8).

    Characters are drawn with replacement from lowercase ASCII letters and
    digits using the ``random`` module, so the result is a short random
    suffix, not an RFC 4122 UUID.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


def parse_adk_output_to_dictionary(events: list[Event], *, as_json: bool = False) -> dict[str, Any]:
    """
    Parse ADK event output into a structured dictionary.

    Args:
        events: Events collected from an agent run. Events without content or
            without parts are skipped.
        as_json: When True, the trajectory is returned as a JSON string
            instead of a list.

    Returns:
        Dictionary with two keys:
        - "response": stripped text of the last text part found in an event
          whose content role is "model" ("" when there is none).
        - "predicted_trajectory": the tool calls, each a dict with
          "tool_name" and "tool_input", as a list or as a JSON string.
    """

    final_response = ""
    trajectory = []

    for event in events:
        if not getattr(event, "content", None) or not getattr(event.content, "parts", None):
            continue
        for part in event.content.parts:
            function_call = getattr(part, "function_call", None)
            if function_call:
                info = {
                    "tool_name": function_call.name,
                    # A call may carry no arguments (args is None); treat that
                    # as an empty mapping instead of failing in dict(None).
                    "tool_input": dict(function_call.args or {}),
                }
                # An identical call (same tool, same arguments) is recorded once
                if info not in trajectory:
                    trajectory.append(info)
            # Later model text overwrites earlier text, so the last one wins
            if event.content.role == "model" and getattr(part, "text", None):
                final_response = part.text.strip()

    trajectory_out = json.dumps(trajectory) if as_json else trajectory

    return {"response": final_response, "predicted_trajectory": trajectory_out}


def format_output_as_markdown(output: dict) -> str:
    """Convert the output dictionary to a formatted markdown string.

    Args:
        output: Dictionary with a "response" string and a
            "predicted_trajectory" that is either a list of tool calls
            (dicts with "tool_name" and "tool_input") or the JSON text of
            such a list.

    Returns:
        Markdown with an "AI Response" section, followed by a
        "Function Calls" section when the trajectory is non-empty.

    Raises:
        json.JSONDecodeError: If the trajectory is a non-blank string that is
            not valid JSON.
    """
    trajectory = output["predicted_trajectory"]
    # The trajectory is sometimes serialized to JSON text; decode it so the
    # loop below walks tool calls rather than individual characters.
    if isinstance(trajectory, str):
        trajectory = json.loads(trajectory) if trajectory.strip() else []

    markdown = "### AI Response\n" + output["response"] + "\n\n"
    if trajectory:
        markdown += "### Function Calls\n"
        for call in trajectory:
            markdown += f"- **Function**: `{call['tool_name']}`\n"
            markdown += "  - **Arguments**\n"
            for key, value in call["tool_input"].items():
                markdown += f"    - `{key}`: `{value}`\n"
    return markdown


def display_eval_report(eval_result: Any) -> None:
    """Display the summary and, when present, the row-wise metrics of an evaluation.

    Args:
        eval_result: Evaluation result object (not a DataFrame). It must
            expose a ``summary_metrics`` mapping; when it also exposes a
            non-None ``metrics_table``, the first rows of that table are shown.
    """
    display(Markdown("### Summary Metrics"))
    display(
        pd.DataFrame(
            eval_result.summary_metrics.items(), columns=["metric", "value"]
        )
    )
    # The row-wise table is optional, so only render it when it is available
    if getattr(eval_result, "metrics_table", None) is not None:
        display(Markdown("### Row‑wise Metrics"))
        display(eval_result.metrics_table.head())

## Build ADK agent

Build your application using ADK, including the Gemini model and custom tools that you define.

---


### Set agent tools

To start, set the tools that the product research agent needs to do its job.

In [5]:
def get_product_details(product_name: str):
    """Gathers basic details about a product.

    The lookup ignores letter case and surrounding or repeated whitespace, so
    "Shoes" and " usb  charger " resolve to the catalog entries "shoes" and
    "usb charger".

    Args:
        product_name: Name of the product to look up.

    Returns:
        The product description, or "Product details not found." when the
        product is not in the catalog.
    """
    details = {
        "smartphone": "A cutting-edge smartphone with advanced camera features and lightning-fast processing.",
        "usb charger": "A super fast and light usb charger",
        "shoes": "High-performance running shoes designed for comfort, support, and speed.",
        "headphones": "Wireless headphones with advanced noise cancellation technology for immersive audio.",
        "speaker": "A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.",
    }
    if not isinstance(product_name, str):
        return "Product details not found."
    # Catalog keys are lowercase with single spaces; normalize the input to match
    catalog_key = " ".join(product_name.lower().split())
    return details.get(catalog_key, "Product details not found.")


def get_product_price(product_name: str):
    """Gathers the price of a product.

    The lookup ignores letter case and surrounding or repeated whitespace, so
    "Shoes" and " usb  charger " resolve to the catalog entries "shoes" and
    "usb charger".

    Args:
        product_name: Name of the product to look up.

    Returns:
        The price as a number, or "Product price not found." when the product
        is not in the catalog.
    """
    details = {
        "smartphone": 500,
        "usb charger": 10,
        "shoes": 100,
        "headphones": 50,
        "speaker": 80,
    }
    if not isinstance(product_name, str):
        return "Product price not found."
    # Catalog keys are lowercase with single spaces; normalize the input to match
    catalog_key = " ".join(product_name.lower().split())
    return details.get(catalog_key, "Product price not found.")

### Set the agent model

Configure the Gemini model for your ADK agent. This notebook uses **`gemini-2.5-flash`** for fast, cost-effective function calling.

**Model Selection Guidelines:**
- **`gemini-2.5-flash`**: Fast responses, low cost, ideal for production agents with straightforward tool usage
- **`gemini-2.5-pro`**: Higher reasoning capability, better for complex multi-step workflows and ambiguous queries

See the [Gemini model documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for detailed performance benchmarks and pricing.

In [6]:
# Gemini model ID passed to the ADK Agent built in agent_parsed_outcome
model = "gemini-2.5-flash"

### Assemble the ADK Agent

In [7]:
async def agent_parsed_outcome(query):
    """Run the product research agent on one query and return its parsed output.

    Each call builds a fresh agent, in-memory session service, and runner,
    sends ``query`` as a single user message, collects every emitted event,
    and hands the events to ``parse_adk_output_to_dictionary``.

    Args:
        query: User request text. It is embedded in the agent instruction and
            also sent as the user message.

    Returns:
        The dictionary produced by ``parse_adk_output_to_dictionary`` (keys
        "response" and "predicted_trajectory").
    """
    # Fixed identifiers used for the session and the runner
    app_name, user_id, session_id = "product_research_app", "user_one", "session_one"

    # Instruction text, including its leading whitespace, is kept exactly as
    # the model originally received it.
    instruction = f"""
       Analyze this user request: '{query}'.
       If the request is about price, use get_product_price tool.
       Otherwise, use get_product_details tool to get product information.
       """
    product_research_agent = Agent(
        name="ProductResearchAgent",
        model=model,
        description="An agent that performs product research.",
        instruction=instruction,
        tools=[get_product_details, get_product_price],
    )

    # The session must exist before the runner is asked to use it
    session_service = InMemorySessionService()
    await session_service.create_session(
        app_name=app_name, user_id=user_id, session_id=session_id
    )

    runner = Runner(
        agent=product_research_agent, app_name=app_name, session_service=session_service
    )

    # Wrap the query as a user message and drain the event stream
    user_message = types.Content(role="user", parts=[types.Part(text=query)])
    events = []
    async for event in runner.run_async(
        user_id=user_id, session_id=session_id, new_message=user_message
    ):
        events.append(event)

    return parse_adk_output_to_dictionary(events)


In [9]:
# Agent Wrapper for Vertex AI Evaluation Service
# This function will be passed to EvalTask.evaluate(runnable=agent_parsed_outcome_sync)
# Vertex AI will call it for each prompt in the evaluation dataset, automatically generating
# responses and trajectories on-the-fly for metrics computation.

def agent_parsed_outcome_sync(prompt: str):
    """Synchronous wrapper around ``agent_parsed_outcome``.

    Args:
        prompt: User request passed to the agent.

    Returns:
        The agent's result dictionary, with "predicted_trajectory" serialized
        to a JSON string.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop in this thread: asyncio.run can own one directly
        result = asyncio.run(agent_parsed_outcome(prompt))
    else:
        # asyncio.run refuses to start inside a running loop (for example when
        # this function is called directly from a notebook cell), so run the
        # coroutine on a separate thread that has no loop of its own.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            result = pool.submit(
                lambda: asyncio.run(agent_parsed_outcome(prompt))
            ).result()

    result["predicted_trajectory"] = json.dumps(result["predicted_trajectory"])
    return result

In [10]:
# Test the agent
# Smoke test: send one details query through the async helper and render the
# parsed response and tool calls as markdown. The top-level `await` is valid
# here because the cell runs inside a notebook.

response = await agent_parsed_outcome(query="Get product details for shoes")
display(Markdown(format_output_as_markdown(response)))



### AI Response
High-performance running shoes designed for comfort, support, and speed.

### Function Calls
- **Function**: `get_product_details`
  - **Arguments**
    - `product_name`: `shoes`


---

### Prepare Agent Evaluation Dataset

The evaluation dataset serves as the **ground truth benchmark** for measuring agent performance. It contains:
- **`prompt`**: Test queries that cover diverse agent scenarios (price lookups, detail requests, multi-step tasks)
- **`predicted_trajectory`**: Expected tool call sequences that define correct agent behavior

**Uses:**
- **Trajectory Validation**: Compare actual vs expected tool usage to catch logic errors (wrong tools, missing steps, extra calls)
- **Regression Testing**: Ensure agent improvements don't break existing functionality
- **Coverage Analysis**: Systematically test edge cases and multi-tool workflows
- **Baseline for Metrics**: Powers both automated metrics (tool selection, efficiency) and LLM-based judges (response quality)

Without this dataset, you're evaluating in a vacuum with no objective standard for correctness.

In [11]:
# Evaluation dataset: five prompts, each paired (by position) with the tool
# calls the agent is expected to make for it.
# NOTE(review): the expected trajectories are stored under the column name
# "predicted_trajectory"; the Vertex AI evaluation docs appear to use
# "reference_trajectory" for ground truth - confirm the intended column name.
eval_data = {
    "prompt": [
        "Get price for smartphone",
        "Get product details and price for headphones",
        "Get details for usb charger",
        "Get product details and price for shoes",
        "Get product details for speaker?",
    ],
    "predicted_trajectory": [
        [
            {
                "tool_name": "get_product_price",
                "tool_input": {"product_name": "smartphone"},
            }
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "headphones"},
            },
            {
                "tool_name": "get_product_price",
                "tool_input": {"product_name": "headphones"},
            },
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "usb charger"},
            }
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "shoes"},
            },
            {"tool_name": "get_product_price", "tool_input": {"product_name": "shoes"}},
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "speaker"},
            }
        ],
    ],
}

# One row per prompt, with the prompt and its expected tool calls as columns
eval_sample_dataset = pd.DataFrame(eval_data)

# Eval #1
## Rubric-Based Evaluation (Model-Based Metrics)

Evaluates the **quality of agent text responses** using an LLM as a judge. Unlike trajectory metrics that validate tool usage, these metrics assess communication quality.

### Architecture

A separate judge LLM (Gemini) evaluates each response by analyzing:
- User prompt
- Agent response
- Metric-specific rubric/criteria

### Metrics (6 model-based)

| Metric | Evaluation Criteria | Scale |
|--------|---------------------|-------|
| `instruction_following` | Answers the question asked | 1-5 |
| `fluency` | Grammatically correct and natural | 1-5 |
| `coherence` | Logically consistent and clear | 1-5 |
| `safety` | Free from harmful content | 1 (safe) / 0 (unsafe) |
| `text_quality` | Overall quality | 1-5 |
| `verbosity` | Appropriate response length | -2 to +2 (0 = optimal) |

### Evaluation Modes

1. **On-the-fly (used here)**: Provide only `prompt` column, agent generates responses during evaluation

### Output

- **GCS**: `{BUCKET_URI}/rubric-metric-eval/` (`BUCKET_URI` already includes the `gs://` prefix)
- **Tracking**: Logged to Vertex AI Experiments

In [12]:
# Unique run name so repeated executions do not collide in the experiment
EXPERIMENT_RUN = f"rubric-metric-eval-{get_id()}"
# NOTE(review): GenerativeModel is not referenced anywhere in this cell -
# confirm whether a later cell needs it, otherwise move or drop this import.
from vertexai.generative_models import GenerativeModel

# Prompt-only dataset: the same five prompts as eval_data["prompt"]; responses
# are produced by the runnable during evaluation.
eval_prompt_sample_dataset = pd.DataFrame(
    {"prompt": [
        "Get price for smartphone",
        "Get product details and price for headphones",
        "Get details for usb charger",
        "Get product details and price for shoes",
        "Get product details for speaker?",
    ]
    }
)

# Model-Based Metrics (uses judge LLM to evaluate response quality)
response_quality_metrics = [
    "instruction_following",  # Does the response answer the question?
    "fluency",                # Is the response well-written?
    "coherence",              # Is the response logically structured?
    "safety",                 # Is the response safe/appropriate?
    "text_quality",           # Overall text quality
    "verbosity",              # Is response too long/short?
]

# Build the task, run the agent on every prompt, and score the responses.
# Results are written under BUCKET_URI + "/rubric-metric-eval".
response_quality_result = EvalTask(
    dataset = eval_prompt_sample_dataset,
    metrics = response_quality_metrics,
    experiment = EXPERIMENT_NAME,
    output_uri_prefix = BUCKET_URI + "/rubric-metric-eval"
).evaluate(
    runnable = agent_parsed_outcome_sync, # Generate responses on-the-fly (not BYOD mode)
    experiment_run_name=EXPERIMENT_RUN
)

# Show the summary table and the first rows of the per-prompt table
display_eval_report(response_quality_result)

Associating projects/941046250687/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-rubric-metric-eval-x940vgl7 to Experiment: evaluate-adk-agent


Logging Eval experiment evaluation metadata: {'output_file': 'gs://53642dcf-cdb9-4f6c-a3bb-cf6595602893/rubric-metric-eval/eval_results_2025-10-30-15-58-37-48036.csv'}


100%|██████████| 5/5 [00:02<00:00,  1.94it/s]

All 5 responses are successfully generated from the runnable.
Computing metrics with a total of 30 Vertex Gen AI Evaluation Service API requests.



100%|██████████| 30/30 [00:30<00:00,  1.03s/it]

All 30 metric requests are successfully computed.
Evaluation Took:30.81328302099996 seconds





### Summary Metrics

Unnamed: 0,metric,value
0,row_count,5.0
1,instruction_following/mean,4.0
2,instruction_following/std,1.414214
3,fluency/mean,4.0
4,fluency/std,0.707107
5,coherence/mean,4.2
6,coherence/std,1.788854
7,safety/mean,1.0
8,safety/std,0.0
9,text_quality/mean,4.2


### Row‑wise Metrics

Unnamed: 0,prompt,response,latency_in_seconds,failure,predicted_trajectory,instruction_following/explanation,instruction_following/score,fluency/explanation,fluency/score,coherence/explanation,coherence/score,safety/explanation,safety/score,text_quality/explanation,text_quality/score,verbosity/explanation,verbosity/score
0,Get price for smartphone,The price for smartphone is 500.,2.279498,0,"[{""tool_name"": ""get_product_price"", ""tool_inpu...",The response directly addresses the instructio...,5.0,The response has a minor grammatical error due...,4.0,"The response is a single, clear, and direct st...",5.0,The response is safe as it contains no hate sp...,1.0,"The response is coherent, fluent, and perfectl...",4.0,The response is excessively brief and lacks cr...,-2.0
1,Get product details and price for headphones,Here are the details for headphones: Wireless ...,2.561855,0,"[{""tool_name"": ""get_product_details"", ""tool_in...",The response fully addresses both aspects of t...,5.0,The response is mostly fluent. It clearly answ...,4.0,"The response is completely coherent, with a se...",5.0,The response provides product details and pric...,1.0,"The response is exceptionally clear, coherent,...",5.0,The response is somewhat brief as it provides ...,-1.0
2,Get details for usb charger,A super fast and light usb charger,2.439887,0,"[{""tool_name"": ""get_product_details"", ""tool_in...",The prompt asked for 'details' about a USB cha...,2.0,"The response is a fragment, not a complete sen...",3.0,The response is highly illogical and lacks any...,1.0,The response is a simple description of a USB ...,1.0,The response is fluent and coherent but fails ...,2.0,The response is excessively brief and complete...,-2.0
3,Get product details and price for shoes,High-performance running shoes designed for co...,2.56927,0,"[{""tool_name"": ""get_product_details"", ""tool_in...",The response successfully provides 'product de...,5.0,The response has a minor grammatical error as ...,4.0,The response seamlessly provides product detai...,5.0,The response is safe as it contains no hate sp...,1.0,"The response is exceptionally clear, coherent,...",5.0,The response is slightly brief as it provides ...,-1.0
4,Get product details for speaker?,A voice-controlled smart speaker that plays mu...,1.979517,0,"[{""tool_name"": ""get_product_details"", ""tool_in...",The response provides a general description of...,3.0,The response is a concise and perfectly phrase...,5.0,"The response is a single, perfectly formed sen...",5.0,The response is a neutral description of a sma...,1.0,"The response is exceptionally clear, coherent,...",5.0,The response is slightly brief for a general r...,-1.0


---

# Eval #2
## Define a Custom Metric

According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for a custom model-based metric by setting up criteria and a rating system. In this notebook, the custom metric checks whether the agent's response provides every type of information the user asked for (request completeness).

Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.


In [13]:
# Single criterion handed to the judge: does the response contain every type
# of information (e.g. price, details) that the user's question asked for?
criteria = {
    "Request Completeness": (
        "You are a Quality Assurance Analyst. Your task is to evaluate if an AI agent's response "
        "completely fulfills a user's shopping query based on a single criterion: Request Completeness.\n\n"
        "Assign a score based on whether the agent provided all the categories of information the user asked for.\n\n"
        "Instructions:\n"
        "  - Read the user's question to identify all requested information types (e.g., 'price', 'details').\n"
        "  - Analyze the response to see which information types were provided.\n"
        "  - If all requested types are present, score '1'. Otherwise, score '0'.\n\n"
        "For example, if the user asks for 'price and details,' the response must contain both a price and "
        "some form of product details."
    )
}

# Binary rubric: 1 = complete, 0 = at least one requested item missing
pointwise_rating_rubric = {
    "1": "The response provides all the types of information explicitly requested in the question.",
    "0": "The response is missing at least one type of information explicitly requested in the question.",
}

# Combine criterion and rubric into a prompt template; "prompt" is the dataset
# column made available to the template.
response_completeness_prompt_template = PointwiseMetricPromptTemplate(
    criteria=criteria,
    rating_rubric=pointwise_rating_rubric,
    input_variables=["prompt"],
)

# Metric object passed to EvalTask; its results are reported under the name
# "response_completeness".
response_completeness_metric = PointwiseMetric(
    metric="response_completeness",
    metric_prompt_template=response_completeness_prompt_template,
)

In [None]:
#### Run an evaluation task

# Unique run name so repeated executions do not collide in the experiment
EXPERIMENT_RUN = f"response-over-tools-{get_id()}"

# Score every prompt with the custom response_completeness metric; results are
# written under BUCKET_URI + "/reasoning-metric-eval".
# NOTE(review): eval_sample_dataset already has a "predicted_trajectory" column
# and the runnable also returns one - confirm which value EvalTask keeps.
response_eval_tool_result = EvalTask(
    dataset = eval_sample_dataset,
    metrics = [response_completeness_metric],
    experiment=EXPERIMENT_NAME,
    output_uri_prefix=BUCKET_URI + "/reasoning-metric-eval",
).evaluate(
    runnable = agent_parsed_outcome_sync,
    experiment_run_name=EXPERIMENT_RUN
)

# Show the summary table and the first rows of the per-prompt table
display_eval_report(response_eval_tool_result)

---

# Eval #3
## Custom Function-Based Metrics

Python functions for deterministic agent evaluation without LLM judge calls.

### Properties
- Deterministic output
- Explicit business logic encoding
- Standard Python debugging

### Custom metric functions must:

1. Accept evaluation instance: `def metric_fn(instance: dict) -> dict`
2. Return dictionary with metric name and numeric score
3. Be wrapped in `CustomMetric` for `EvalTask` integration

---

In [None]:
# This cell defines custom evaluation functions/metrics for agent behavior validation.

def tool_count_metric(instance: dict) -> dict:
    """
    Report how many tool calls the predicted trajectory contains.

    The trajectory may be a list of tool calls or the JSON text of such a
    list. Text that is not valid JSON, and any value that is not a list after
    decoding, counts as zero calls.

    Args:
        instance: Dictionary containing 'predicted_trajectory' key

    Returns:
        Dictionary mapping 'tool_count' to the number of calls
    """
    calls = instance.get("predicted_trajectory", "")

    # Decode JSON text first; undecodable text means "no calls"
    if isinstance(calls, str):
        try:
            calls = json.loads(calls)
        except json.JSONDecodeError:
            return {"tool_count": 0}

    if not isinstance(calls, list):
        return {"tool_count": 0}
    return {"tool_count": len(calls)}


def tool_efficiency_metric(instance: dict, max_tools: int = 2) -> dict:
    """
    Checks that the agent stays within a tool-call budget.
    Score of 1 if the number of tool calls is <= max_tools, otherwise 0.

    The trajectory may be a list of tool calls or the JSON text of such a
    list; anything else (including invalid JSON) counts as zero calls.

    Args:
        instance: Dictionary containing 'predicted_trajectory' key
        max_tools: Largest number of tool calls still considered efficient
            (default 2). Bind another value with functools.partial when
            wrapping the function in a CustomMetric.

    Returns:
        Dictionary with metric name and score (0 or 1)
    """
    trajectory = instance.get("predicted_trajectory", "")

    if isinstance(trajectory, str):
        try:
            trajectory = json.loads(trajectory)
        except json.JSONDecodeError:
            trajectory = []

    tool_count = len(trajectory) if isinstance(trajectory, list) else 0

    return {
        "tool_efficiency": 1 if tool_count <= max_tools else 0
    }
def response_length_metric(instance: dict) -> dict:
    """
    Measures the length of the response in characters.

    A response that is JSON-encoded text (starts with a double quote and
    decodes cleanly) is measured after decoding. A missing response (None or
    a float NaN) has length 0.

    Args:
        instance: Dictionary containing 'response' key

    Returns:
        Dictionary with metric name and score
    """
    response = instance.get("response", "")

    # Without this guard str(None) / str(nan) would be measured as 4 / 3 characters
    if response is None or (isinstance(response, float) and response != response):
        response = ""

    # Handle JSON-encoded strings; text that merely starts with a quote is kept as is
    if isinstance(response, str) and response.startswith('"'):
        try:
            response = json.loads(response)
        except json.JSONDecodeError:
            pass

    return {
        "response_length": len(str(response))
    }


def response_conciseness_metric(instance: dict, max_chars: int = 200) -> dict:
    """
    Checks if response is concise (at most max_chars characters).
    Score of 1 if concise, 0 otherwise.

    A response that is JSON-encoded text (starts with a double quote and
    decodes cleanly) is measured after decoding. A missing response (None or
    a float NaN) is treated as empty text.

    Args:
        instance: Dictionary containing 'response' key
        max_chars: Longest response, in characters, still considered concise
            (default 200, inclusive)

    Returns:
        Dictionary with metric name and score (0 or 1)
    """
    response = instance.get("response", "")

    # Measure a missing value as empty text, not as the word "None" / "nan"
    if response is None or (isinstance(response, float) and response != response):
        response = ""

    if isinstance(response, str) and response.startswith('"'):
        try:
            response = json.loads(response)
        except json.JSONDecodeError:
            pass

    return {
        "response_conciseness": 1 if len(str(response)) <= max_chars else 0
    }


def numeric_response_metric(instance: dict) -> dict:
    """
    Validates if response contains numeric values (for price queries).
    Score of 1 if the response contains at least one digit, 0 otherwise.

    A response that is JSON-encoded text (starts with a double quote and
    decodes cleanly) is checked after decoding.

    Args:
        instance: Dictionary containing 'response' key

    Returns:
        Dictionary with metric name and score (0 or 1)
    """
    response = instance.get("response", "")

    if isinstance(response, str) and response.startswith('"'):
        try:
            response = json.loads(response)
        except json.JSONDecodeError:
            pass

    response_str = str(response).strip()

    # Only look for digits. A float() probe would also accept the digit-free
    # spellings "nan", "inf" and "infinity", which are not prices.
    has_number = any(char.isdigit() for char in response_str)

    return {
        "numeric_response": 1 if has_number else 0
    }
def valid_product_metric(instance: dict) -> dict:
    """
    Validates that the agent only queries valid products from the catalog.

    The trajectory may be a list of tool calls or the JSON text of such a
    list. Tool calls that carry no product name (including calls whose
    'tool_input' is missing or not a dictionary) are not counted against the
    agent; an empty or undecodable trajectory therefore scores 1.

    Args:
        instance: Dictionary containing 'predicted_trajectory' key

    Returns:
        Dictionary with metric name and score (0 or 1)
    """
    VALID_PRODUCTS = {"smartphone", "usb charger", "shoes", "headphones", "speaker"}

    trajectory = instance.get("predicted_trajectory", "")

    if isinstance(trajectory, str):
        try:
            trajectory = json.loads(trajectory)
        except json.JSONDecodeError:
            trajectory = []

    # Check all tool inputs for valid product names
    all_valid = True
    if isinstance(trajectory, list):
        for tool_call in trajectory:
            if not isinstance(tool_call, dict):
                continue
            tool_input = tool_call.get("tool_input")
            # Decoded JSON can hold null or a non-object here; there is then
            # no product name to validate.
            if not isinstance(tool_input, dict):
                continue
            product_name = tool_input.get("product_name", "")
            if not product_name:
                continue
            # A non-string name (e.g. a list) can never be a catalog entry and
            # must not reach the set lookup, where an unhashable value raises.
            if not isinstance(product_name, str) or product_name not in VALID_PRODUCTS:
                all_valid = False
                break

    return {
        "valid_product": 1 if all_valid else 0
    }


def correct_tool_selection_metric(instance: dict) -> dict:
    """
    Validates that the agent uses the correct tool based on the prompt.
    - Price queries should use 'get_product_price'
    - Details queries should use 'get_product_details'
    - Queries asking for both should use both tools

    The query type is detected from the words "price" and "details" in the
    lower-cased prompt. A prompt containing neither word, or an empty or
    undecodable trajectory, scores 0.

    Args:
        instance: Dictionary containing 'prompt' and 'predicted_trajectory' keys

    Returns:
        Dictionary with metric name and score (0 or 1)
    """
    # A missing or None prompt is treated as empty text
    prompt = str(instance.get("prompt") or "").lower()
    trajectory = instance.get("predicted_trajectory", "")

    if isinstance(trajectory, str):
        try:
            trajectory = json.loads(trajectory)
        except json.JSONDecodeError:
            return {"correct_tool_selection": 0}

    if not isinstance(trajectory, list) or len(trajectory) == 0:
        return {"correct_tool_selection": 0}

    # Extract tool names used
    tools_used = [call.get("tool_name", "") for call in trajectory if isinstance(call, dict)]

    # Collect every tool the prompt calls for. Checking the requirements
    # together (instead of an if/elif chain that stops at the first match)
    # ensures a "details and price" prompt really needs both tools.
    required_tools = []
    if "price" in prompt:
        required_tools.append("get_product_price")
    if "details" in prompt:
        required_tools.append("get_product_details")

    correct = bool(required_tools) and all(tool in tools_used for tool in required_tools)

    return {
        "correct_tool_selection": 1 if correct else 0
    }


def price_range_validation_metric(instance: dict) -> dict:
    """
    Checks that a bare price response falls inside the accepted $1-$1000 window.

    The response is only judged when, after dropping '$' and ',' characters,
    the whole text parses as a number. Anything else is treated as
    "not applicable" and passes.

    Args:
        instance: Dictionary containing 'response' key

    Returns:
        Dictionary with metric name and score (0 or 1)
    """
    raw = instance.get("response", "")

    # A response wrapped in double quotes may be a JSON-encoded string; unwrap
    # it when it decodes cleanly, otherwise keep the text untouched.
    if isinstance(raw, str) and raw[:1] == '"':
        try:
            raw = json.loads(raw)
        except json.JSONDecodeError:
            pass

    # Strip whitespace and currency formatting before the numeric parse.
    cleaned = str(raw).strip().replace("$", "").replace(",", "")

    try:
        amount = float(cleaned)
    except ValueError:
        # Not a plain price -> the range check does not apply, so it passes.
        return {"price_range_valid": 1}

    in_range = 1 <= amount <= 1000
    return {"price_range_valid": 1 if in_range else 0}

## Define CustomMetric

In [None]:
# Pair each reported metric name with its scoring function, then wrap every
# pair in a CustomMetric. Order is preserved from the table below.
custom_metrics = [
    CustomMetric(name=metric_name, metric_function=metric_fn)
    for metric_name, metric_fn in (
        # Tool usage metrics
        ("tool_count", tool_count_metric),
        ("tool_efficiency", tool_efficiency_metric),
        # Response quality metrics
        ("response_length", response_length_metric),
        ("response_conciseness", response_conciseness_metric),
        ("numeric_response", numeric_response_metric),
        # Business logic metrics
        ("valid_product", valid_product_metric),
        ("correct_tool_selection", correct_tool_selection_metric),
        ("price_range_valid", price_range_validation_metric),
    )
]

print(f"✓ Created {len(custom_metrics)} custom metrics")

In [None]:
# Run evaluation with custom metrics
#
# This cell builds an EvalTask over the prompt-only dataset, runs the agent
# wrapper through it, and renders the resulting report. It depends on names
# created in earlier cells: get_id, EvalTask, eval_prompt_sample_dataset,
# custom_metrics, EXPERIMENT_NAME, BUCKET_URI, agent_parsed_outcome_sync and
# display_eval_report.

# Generate a unique experiment run identifier
# (uniqueness relies on get_id(), defined elsewhere — presumably a random suffix; confirm)
EXPERIMENT_RUN = f"custom-metrics-eval-{get_id()}"

# Execute the evaluation task with custom deterministic metrics
custom_metrics_result = EvalTask(
    dataset=eval_prompt_sample_dataset,  # Dataset with only prompts (agent generates responses during eval)
    metrics=custom_metrics,  # The CustomMetric wrappers built in the "Define CustomMetric" cell
    experiment=EXPERIMENT_NAME,  # Vertex AI experiment name for tracking across runs
    output_uri_prefix=BUCKET_URI + "/custom-metrics-eval",  # GCS path to persist results (assumes BUCKET_URI has no trailing slash)
).evaluate(
    runnable=agent_parsed_outcome_sync,  # Wrapper function that invokes the agent for each prompt
    experiment_run_name=EXPERIMENT_RUN  # Unique name for this specific evaluation run
)

# Display summary statistics and row-level metrics in formatted tables
# (custom_metrics_result is reused by the detailed-results cell that follows)
display_eval_report(custom_metrics_result)

In [None]:
# Per-instance view: one row per evaluated prompt with every metric column.
display(Markdown("### Detailed Results by Instance"))
display(custom_metrics_result.metrics_table)

# Aggregate view: pull a handful of mean scores out of the summary metrics.
display(Markdown("### Key Custom Metrics Summary"))
key_metrics = [
    "tool_efficiency/mean",
    "correct_tool_selection/mean",
    "valid_product/mean",
    "response_conciseness/mean",
]

# Build the table column-wise: a human-readable label next to each score.
# Metrics missing from the summary are shown as "N/A" rather than raising.
summary_df = pd.DataFrame(
    {
        "Metric": [
            key.replace("/mean", "").replace("_", " ").title()
            for key in key_metrics
        ],
        "Score": [
            custom_metrics_result.summary_metrics.get(key, "N/A")
            for key in key_metrics
        ],
    }
)
display(summary_df)

---

## Cleaning up


In [None]:
### Optional: Delete Experiment Artifacts
# Cleanup toggle: leave as True to remove the experiment named EXPERIMENT_NAME,
# or set to False to keep it (e.g. to inspect runs after the notebook finishes).
delete_experiment = True

if delete_experiment:
    try:
        # Look up the experiment used by the evaluation runs above.
        experiment = aiplatform.Experiment(EXPERIMENT_NAME)
        # delete_backing_tensorboard_runs=True asks the SDK to also remove the
        # associated TensorBoard runs (per the argument name — confirm in SDK docs).
        experiment.delete(delete_backing_tensorboard_runs=True)
    except Exception as e:
        # Best-effort cleanup: any failure is printed instead of raised, so the
        # notebook still runs to completion.
        print(e)

# Thank you