# Pydantic Evals Framework

A structured approach to evals using the pydantic-evals library.
Core abstractions: Case (test scenario), Evaluator (what "good" means), Dataset (collection of cases).

In [None]:
from typing import Any
from dataclasses import dataclass

from pydantic import BaseModel
from pydantic_ai import format_as_xml
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext, IsInstance, LLMJudge

from agentic_patterns.core.agents import get_agent, run_agent

## Core Concepts: Case, Evaluator, Dataset

A Case defines inputs, expected outputs, and metadata for a single test scenario.
An Evaluator checks whether the output meets specific criteria.
A Dataset combines cases with evaluators and runs them against a task function.

In [None]:
# One test scenario: the question is the input handed to the task, and the
# expected answer is what evaluators can compare the task's output against.
case1 = Case(
    name="simple_case",
    inputs="What is the capital of France?",
    expected_output="Paris",
    metadata=dict(difficulty="easy"),
)

In [None]:
@dataclass
class MyEvaluator(Evaluator):
    """Score a string answer against the case's expected output.

    Scores returned by ``evaluate``:
        1.0 -- the output equals the expected output exactly.
        0.8 -- both are strings and the expected output occurs in the
               output, compared case-insensitively.
        0.0 -- anything else, including cases whose expected output is
               missing or is not a string.
    """

    async def evaluate(self, ctx: EvaluatorContext[str, str]) -> float:
        """Return the score for the output held in ``ctx``."""
        output = ctx.output
        expected = ctx.expected_output

        if output == expected:
            return 1.0

        # A case may carry no expected output (None). Guard both sides so the
        # substring comparison below never calls .lower() on a non-string,
        # which would otherwise raise AttributeError mid-evaluation.
        if not (isinstance(output, str) and isinstance(expected, str)):
            return 0.0

        if expected.lower() in output.lower():
            return 0.8
        return 0.0

In [None]:
# Combine the case with two evaluators: a type check on the output and the
# custom MyEvaluator scorer.
dataset = Dataset(
    evaluators=[
        IsInstance(type_name="str"),
        MyEvaluator(),
    ],
    cases=[case1],
)

In [None]:
async def guess_city(question: str) -> str:
    """Stand-in task under evaluation: answers 'Paris' whatever the question."""
    answer = 'Paris'
    return answer


In [None]:
report = await dataset.evaluate(guess_city)  
report.print(include_input=True, include_output=True, include_durations=False)

## LLM Judge with Real Agent

LLMJudge evaluates open-ended outputs against a rubric.
Here we test a recipe agent: per-case evaluators check dietary constraints,
while dataset-level evaluators check general quality.

In [None]:
# Input for the recipe task: the dish to cook plus an optional dietary
# restriction.
class CustomerOrder(BaseModel):  
    # Name of the dish the customer wants.
    dish_name: str
    # Defaults to None; presumably None means "no restriction" -- confirm.
    dietary_restriction: str | None = None


# Structured result of the recipe task; also passed to get_agent as the
# agent's output_type.
class Recipe(BaseModel):
    # Ingredients, one string per entry.
    ingredients: list[str]
    # Preparation steps, one string per entry.
    steps: list[str]

In [None]:
# Agent under evaluation, configured with Recipe as its output type.
recipe_agent = get_agent(
    system_prompt="Generate a recipe to cook the dish that meets the dietary restrictions.",
    output_type=Recipe,
)

In [None]:
async def transform_recipe(customer_order: CustomerOrder) -> Recipe:
    """Task under evaluation: ask the recipe agent to handle one order.

    The order is serialized with ``format_as_xml`` and sent to
    ``recipe_agent`` as the prompt.
    """
    prompt = format_as_xml(customer_order)
    # run_agent returns a pair; only its first element is used here.
    # NOTE(review): the ``type: ignore`` below suggests that element is not
    # typed as Recipe -- confirm it is the parsed output, not a run wrapper.
    result, _ = await run_agent(recipe_agent, prompt, verbose=True)
    return result  # type: ignore

In [None]:
# The agent's own model is reused as the judge model for every LLMJudge below.
model = recipe_agent.model

recipe_dataset = Dataset[CustomerOrder, Recipe, Any](
    cases=[
        Case(
            name="vegetarian_recipe",
            inputs=CustomerOrder(
                dish_name="Spaghetti Bolognese",
                dietary_restriction="vegetarian",
            ),
            # No reference recipe; this case is scored by rubric instead.
            expected_output=None,
            metadata={"focus": "vegetarian"},
            # Case-specific evaluator: judges only the vegetarian constraint.
            evaluators=(
                LLMJudge(
                    model=model,
                    rubric="Recipe should not contain meat or fish",
                ),
            ),
        ),
        Case(
            name="gluten_free_recipe",
            inputs=CustomerOrder(
                dish_name="Chocolate Cake",
                dietary_restriction="gluten-free",
            ),
            expected_output=None,
            metadata={"focus": "gluten-free"},
            # Case-specific evaluator: judges only the gluten-free constraint.
            evaluators=(
                LLMJudge(
                    model=model,
                    rubric="Recipe should not contain gluten or wheat products",
                ),
            ),
        ),
    ],
    # Dataset-level evaluators: an output type check and a general quality
    # rubric that is also shown the input order.
    evaluators=[
        IsInstance(type_name="Recipe"),
        LLMJudge(
            model=model,
            include_input=True,
            rubric="Recipe should have clear steps and relevant ingredients",
        ),
    ],
)

In [None]:
# Run transform_recipe against the recipe dataset and print the resulting
# report object.
report = await recipe_dataset.evaluate(transform_recipe)
print(report)