## Setup & Environment

In [2]:
import os
import re
from dotenv import load_dotenv

# Load environment variables
load_dotenv(override=True)

from ddtrace.llmobs import LLMObs

# Initialize LLM Observability.
# Credentials and site are read from the environment (loaded from .env by
# load_dotenv above) so that no secrets are hard-coded in the notebook.
# DD_SITE falls back to "datadoghq.com" when it is not set.
LLMObs.enable(
    site=os.getenv("DD_SITE", "datadoghq.com"),
    api_key=os.getenv("DD_API_KEY"),
    app_key=os.getenv("DD_APPLICATION_KEY"),
    project_name="wwktm-kb-agent-new",
    ml_app="strands-agents"
)

print("✅ LLM Observability initialized")

OpenTelemetry configuration OTEL_EXPORTER_OTLP_TRACES_PROTOCOL is not supported by Datadog.


✅ LLM Observability initialized


## Load Dataset

We'll load our KB agent test dataset containing questions about wwktm policies.

In [None]:
# Option A (kept for reference, disabled): create the dataset from the local CSV.
# Uncomment this block instead of the pull below when the dataset needs to be
# (re)created from kb_agent_dataset.csv.
# dataset = LLMObs.create_dataset_from_csv(
#     csv_path="kb_agent_dataset.csv",
#     dataset_name="wwktm-kb-agent-eval",
#     description="Dataset for evaluating wwktm KB Agent performance",
#     input_data_columns=["question", "policy_source", "category"],
#     expected_output_columns=["expected_answer"],
#     csv_delimiter=","
# )

# print(f"📊 Dataset loaded with {len(dataset)} samples")
# dataset.as_dataframe()

# Option B (active): pull the existing dataset if your dataset has not changed.
# The project name matches the one passed to LLMObs.enable() in the setup cell.
# NOTE(review): version=1 presumably pins the pull to the first dataset
# version - confirm this is still the version the experiments should use.
dataset = LLMObs.pull_dataset(
    dataset_name="wwktm-kb-agent-eval",
    project_name="wwktm-kb-agent-new",
    version=1,
)

## Import Agent Configuration

We'll use our existing agent configuration to create agents with different models.

In [3]:
from agent_config import create_bedrock_model, create_agent, SYSTEM_PROMPT
from strands_tools import retrieve, use_agent

# Bedrock model IDs compared by the experiments in this notebook.
# Each experiment passes one of these to the task through config["model_id"].
MODEL_CLAUDE_HAIKU_4_5 = "us.anthropic.claude-haiku-4-5-20251001-v1:0"
MODEL_AMAZON_NOVA_PRO = "us.amazon.nova-pro-v1:0"
MODEL_AMAZON_NOVA_MICRO = "us.amazon.nova-micro-v1:0"

print("✅ Agent configuration imported")

✅ Agent configuration imported


## Define Task Function

The task function runs our KB agent against a question and returns the response.

In [4]:
def kb_agent_task(input_data, config):
    """
    Task function that runs the KB agent on one record with the configured model.

    Args:
        input_data: Mapping for one dataset record. 'question' is required;
            'category' is optional and echoed back in the result. Other keys
            (e.g. 'policy_source') are not read by the task.
        config: Experiment configuration mapping. 'model_id' selects the model;
            when the key is absent, or config is None, the task falls back to
            MODEL_CLAUDE_HAIKU_4_5.

    Returns:
        dict with keys:
            'response': the agent result converted to text with str()
            'model': the model ID that was actually used for this run
            'category': the record's category, or "unknown" when absent

    Raises:
        KeyError: if input_data has no 'question' key.
    """
    # Tolerate a missing config so the task can run with the default model.
    config = config or {}

    # Resolve the model ID once so the reported model always matches the one
    # that really ran (previously the default case reported None).
    model_id = config.get("model_id", MODEL_CLAUDE_HAIKU_4_5)

    # Create model with specified configuration
    model = create_bedrock_model(
        model_id=model_id,
    )

    # Create agent with the model (no console output for experiments)
    agent = create_agent(
        model=model,
        callback_handler=None  # Disable console output
    )

    # Run the agent with the question
    question = input_data["question"]
    result = agent(question)

    # Extract text response
    response_text = str(result)

    return {
        "response": response_text,
        "model": model_id,
        "category": input_data.get("category", "unknown")
    }

## Define Evaluators

We'll create evaluators to assess the agent's responses:
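1. **Key Information Match** (`contains_key_info`): Check if the response contains the key numbers and terms from the expected answer
2. **Non-Empty Response** (`response_not_empty`): Check that the agent returned a substantive response rather than an empty or trivially short one
3. **Hallucination Indicators** (`no_hallucination_indicators`): Scan the response for phrases that signal uncertainty or possible hallucination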

In [5]:
def contains_key_info(input_data, output_data, expected_output):
    """
    Check if the response contains key information from the expected answer.

    Uses simple, case-insensitive keyword matching for efficiency. Two
    components are scored, each as the fraction of expected items found in
    the response:
      * numbers: every run of digits in the expected answer
      * key terms: whitespace-separated words longer than 4 characters, with
        leading/trailing punctuation stripped (matched as substrings)

    Only components that the expected answer actually provides are averaged.
    Scoring an absent component as a perfect match would let any response
    pass whenever the expected answer has no digits.

    Args:
        input_data: Dataset input record (unused).
        output_data: Task output; its 'response' value is evaluated. A missing
            output or response is treated as an empty response.
        expected_output: Mapping whose 'expected_answer' value is the reference.

    Returns:
        True when the average component score is at least 0.5, or when the
        expected answer contains nothing to check; False otherwise.
    """
    # A failed task can yield no output at all; evaluate it as empty text.
    response = str((output_data or {}).get("response") or "").lower()
    expected = str((expected_output or {}).get("expected_answer") or "").lower()

    component_scores = []

    # Component 1: numbers stated in the expected answer
    number_pattern = r'\d+'
    expected_numbers = set(re.findall(number_pattern, expected))
    if expected_numbers:
        response_numbers = set(re.findall(number_pattern, response))
        matched_numbers = expected_numbers.intersection(response_numbers)
        component_scores.append(len(matched_numbers) / len(expected_numbers))

    # Component 2: longer words, stripped of surrounding punctuation so that
    # "characters." still matches a response containing "characters"
    stripped_terms = (re.sub(r'^\W+|\W+$', '', raw) for raw in expected.split())
    key_terms = [term for term in stripped_terms if len(term) > 4]
    if key_terms:
        term_matches = sum(1 for term in key_terms if term in response)
        component_scores.append(term_matches / len(key_terms))

    # Nothing to verify against: keep the original vacuous pass
    if not component_scores:
        return True

    # Combined score - pass if average is at least 0.5
    score = sum(component_scores) / len(component_scores)
    return score >= 0.5


def response_not_empty(input_data, output_data, expected_output):
    """
    Basic check that the agent provided a substantive response.

    Despite the name, a merely non-empty response is not enough: the response
    must be longer than 20 characters after surrounding whitespace is removed.

    Args:
        input_data: Dataset input record (unused).
        output_data: Task output; its 'response' value is checked. A missing
            output or response counts as empty.
        expected_output: Reference answer (unused).

    Returns:
        True if the stripped response has more than 20 characters.
    """
    # A failed task can yield no output at all; that is an empty response.
    response = str((output_data or {}).get("response") or "")
    return len(response.strip()) > 20


def no_hallucination_indicators(input_data, output_data, expected_output):
    """
    Placeholder check for hallucination indicators in the response.

    NOTE: this evaluator currently returns True for every input. It only
    recognizes phrases where the agent admits uncertainty, and those are
    treated as honest answers, so they pass as well. Detection of confident
    but wrong answers is not implemented yet.

    Args:
        input_data: Dataset input record (unused).
        output_data: Task output; its 'response' value is scanned. A missing
            output or response is treated as empty text.
        expected_output: Reference answer (unused).

    Returns:
        True (always, see the note above).
    """
    # A failed task can yield no output at all; scan it as empty text.
    response = str((output_data or {}).get("response") or "").lower()

    # Phrases by which the agent admits it does not know the answer
    uncertainty_patterns = [
        "i don't have information",
        "i cannot find",
        "not specified in the",
        "i'm not sure",
        "i don't know"
    ]

    if any(pattern in response for pattern in uncertainty_patterns):
        # Agent admitted uncertainty - this is honest, not a hallucination
        return True

    # TODO: flag confident wrong answers; until then nothing is ever flagged
    return True


print("✅ Evaluators defined")

✅ Evaluators defined


## Test Single Sample

Before running the full experiment, let's test with a single sample to verify everything works.

In [10]:
# Smoke test: run the task and every evaluator on the first dataset record
first_record = dataset[0]
test_input = first_record["input_data"]
test_expected = first_record["expected_output"]
separator = "-" * 50

print(f"Question: {test_input['question']}")
print(f"Expected: {test_expected['expected_answer']}")
print(f"Category: {test_input['category']}")
print(separator)

# Execute the task once with the Haiku model
test_config = {"model_id": MODEL_CLAUDE_HAIKU_4_5}
test_output = kb_agent_task(test_input, test_config)

# Show at most the first 500 characters of the response
response_text = test_output['response']
if len(response_text) > 500:
    print(f"Response: {response_text[:500]}...")
else:
    print(f"Response: {response_text}")
print(separator)

# Every evaluator receives the same (input, output, expected) triple
evaluator_args = (test_input, test_output, test_expected)
print(f"Contains key info: {contains_key_info(*evaluator_args)}")
print(f"Response not empty: {response_not_empty(*evaluator_args)}")
print(f"No hallucination indicators: {no_hallucination_indicators(*evaluator_args)}")

Question: What is the minimum password length requirement at wwktm?
Expected: Passwords must be at least 12 characters
Category: password
--------------------------------------------------


failed to send, dropping 1 traces to intake at http://localhost:8126/v0.5/traces: client error (Connect)


Response: According to wwktm's **Password & Authentication Policy**, the minimum password length requirement is **12 characters**.

The policy also specifies that passwords should avoid common words and reused credentials, with passphrases being preferred. Password managers are encouraged for employees, and credential sharing is strictly prohibited.

--------------------------------------------------
Contains key info: True
Response not empty: True
No hallucination indicators: True


## Experiment 1: Claude Haiku 4.5

Run the experiment with Claude Haiku 4.5 (faster, more cost-effective model).

In [None]:
# Settings for the Claude Haiku 4.5 run; kb_agent_task reads "model_id"
haiku_config = {
    "model_id": MODEL_CLAUDE_HAIKU_4_5,
    "model_name": "Claude Haiku 4.5",
}
haiku_evaluators = [
    contains_key_info,
    response_not_empty,
    no_hallucination_indicators,
]

# Define the experiment over the shared dataset and task
experiment_haiku = LLMObs.experiment(
    name="kb-agent-claude-haiku-4-5",
    dataset=dataset,
    task=kb_agent_task,
    evaluators=haiku_evaluators,
    config=haiku_config,
)

print("Starting experiment with Claude Haiku 4.5...")

# Execute the experiment using 5 parallel jobs
results_haiku = experiment_haiku.run(jobs=5)

print("\nHaiku experiment complete!")
print(f"View results: {experiment_haiku.url}")

🚀 Starting experiment with Claude Haiku 4.5...


## Experiment 2: Amazon Nova Pro 

Run the experiment with Amazon Nova Pro (even cheaper than Claude Haiku 4.5).

In [None]:
# Settings for the Amazon Nova Pro run; kb_agent_task reads "model_id"
nova_pro_config = {
    "model_id": MODEL_AMAZON_NOVA_PRO,
    "model_name": "Amazon Nova Pro",
}
nova_pro_evaluators = [
    contains_key_info,
    response_not_empty,
    no_hallucination_indicators,
]

# Define the experiment over the shared dataset and task
experiment_nova_pro = LLMObs.experiment(
    name="kb-agent-amazon-nova-pro",
    dataset=dataset,
    task=kb_agent_task,
    evaluators=nova_pro_evaluators,
    config=nova_pro_config,
)

print("Starting experiment with Amazon Nova Pro...")

# Execute the experiment using 5 parallel jobs
results_nova_pro = experiment_nova_pro.run(jobs=5)

print("\nAmazon Nova Pro experiment complete!")
print(f"View results: {experiment_nova_pro.url}")

🚀 Starting experiment with Amazon Nova Pro...


## Experiment 3: Amazon Nova Micro

Run the experiment with Amazon Nova Micro (even cheaper than Amazon Nova Pro).

In [8]:
# Settings for the Amazon Nova Micro run; kb_agent_task reads "model_id"
nova_micro_config = {
    "model_id": MODEL_AMAZON_NOVA_MICRO,
    "model_name": "Amazon Nova Micro",
}
nova_micro_evaluators = [
    contains_key_info,
    response_not_empty,
    no_hallucination_indicators,
]

# Define the experiment over the shared dataset and task
experiment_nova_micro = LLMObs.experiment(
    name="kb-agent-amazon-nova-micro",
    dataset=dataset,
    task=kb_agent_task,
    evaluators=nova_micro_evaluators,
    config=nova_micro_config,
)

print("Starting experiment with Amazon Nova Micro...")

# Execute the Nova Micro experiment using 5 parallel jobs
results_nova_micro = experiment_nova_micro.run(jobs=5)

print("\nAmazon Nova Micro experiment complete!")
print(f"View results: {experiment_nova_micro.url}")

Starting experiment with Amazon Nova Micro...


failed to send, dropping 4 traces to intake at http://localhost:8126/v0.5/traces: client error (Connect)



Amazon Nova Micro experiment complete!
View results: https://app.datadoghq.com/llm/experiments/2c79f8ba-47f0-46f3-85bb-268cddf36e48


## Results Summary

Compare the experiment results and view them in Datadog.

In [None]:
# Print one Datadog link per experiment, framed by divider lines
divider = "=" * 50

print("Experiment Results Summary", divider, sep="\n")
print(f"\nClaude Haiku 4.5: {experiment_haiku.url}")
print(f"\nAmazon Nova Pro: {experiment_nova_pro.url}")
print(f"\nAmazon Nova Micro: {experiment_nova_micro.url}")
print("\n" + divider)
print("\nView detailed metrics, traces and comparisons in Datadog LLM Observability!")

Experiment Results Summary

Claude Haiku 4.5: https://app.datadoghq.com/llm/experiments/35be1ff9-44d7-4128-bd55-8e59e7e9caf2

Amazon Nova Pro: https://app.datadoghq.com/llm/experiments/6dcc72e8-c5d7-4462-973a-ad801b7e2819


View detailed metrics and comparisons in Datadog LLM Observability!
