# Capstone 1 (from Chapter 5) ‚Äî Monte Carlo Estimation of Pi

In [1]:
!uv pip install numpy

[2K[2mResolved [1m1 package[0m [2min 35ms[0m[0m                                           [0m
[2K[2mInstalled [1m1 package[0m [2min 22ms[0m[0m                                 [0m
 [32m+[39m [1mnumpy[0m[2m==2.3.5[0m


In [2]:
LOGGING_ENABLED = True

In [3]:
import logging

from llm_agents_from_scratch.logger import enable_console_logging

if LOGGING_ENABLED:
    enable_console_logging(logging.INFO)

## LLMs

In [35]:
backbone_llm = "qwen3:8b"
judge_llm = "qwen3:8b"

## Build Tools

In [4]:
import uuid

import numpy as np
from pydantic import BaseModel, Field, computed_field

from llm_agents_from_scratch.tools import PydanticFunctionTool

### Tool: `generate_random_sample()`

In [5]:
# Global registry to store samples
SAMPLE_REGISTRY: dict[str, list[tuple[float, float]]] = {}

In [None]:
class RandomSampleParams(BaseModel):
    """Params for generate_random_sample."""

    n: int = Field(description="The number of random points to generate")


class RandomSample(BaseModel):
    """Result from generate_random_sample."""

    sample_id: str = Field(
        description="Pass this sample_id to monte_carlo_estimate",
    )

    @computed_field
    @property
    def sample_size(
        self,
    ) -> int:
        """Determine n from SAMPLE_REGISTRY."""
        return len(SAMPLE_REGISTRY[self.sample_id])


def generate_random_sample(params: RandomSampleParams) -> RandomSample:
    """Generate n random points in [-1, 1] √ó [-1, 1].

    Returns a sample_id. Pass this sample_id directly to monte_carlo_estimate.
    """
    orig_pts = np.random.uniform(size=(params.n, 2))
    transformed = 2 * orig_pts - 1

    sample_id = str(uuid.uuid4())
    SAMPLE_REGISTRY[sample_id] = [tuple(pt) for pt in transformed.tolist()]

    return RandomSample(sample_id=sample_id)

In [7]:
# test generate_random_sample() function
rs = generate_random_sample(RandomSampleParams(n=1000))
rs.model_dump()

{'sample_id': '3e438a1e-733b-46b2-9c72-7e1d7b41c864', 'sample_size': 1000}

In [8]:
# create tool
random_sample_tool = PydanticFunctionTool(generate_random_sample)
random_sample_tool.parameters_json_schema

{'description': 'Params for generate_random_sample.',
 'properties': {'n': {'description': 'The number of random points to generate',
   'title': 'N',
   'type': 'integer'}},
 'required': ['n'],
 'title': 'RandomSampleParams',
 'type': 'object'}

### Tool: `add_more_points()`

In [None]:
class AddPointsParams(BaseModel):
    """Params for generate_random_sample."""

    sample_id: str = Field(
        description="The sample_id of the sample to augment",
    )
    n: int = Field(description="The number of random points to generate")


def add_more_points_to_sample(params: AddPointsParams) -> RandomSample:
    """Add n more random points to an existing random sample.

    Returns a sample_id and the total number of points.
    """
    orig_pts = np.random.uniform(size=(params.n, 2))
    transformed = 2 * orig_pts - 1

    # augment sample
    SAMPLE_REGISTRY[params.sample_id] += [
        tuple(pt) for pt in transformed.tolist()
    ]

    return RandomSample(sample_id=params.sample_id)

In [10]:
# test add_more_points_to_sample() function
rs = add_more_points_to_sample(AddPointsParams(n=1000, sample_id=rs.sample_id))
str(rs)

"sample_id='3e438a1e-733b-46b2-9c72-7e1d7b41c864' sample_size=2000"

In [11]:
# create tool
add_more_points_tool = PydanticFunctionTool(add_more_points_to_sample)
add_more_points_tool.parameters_json_schema

{'description': 'Params for generate_random_sample.',
 'properties': {'sample_id': {'description': 'The sample_id of the sample to augment',
   'title': 'Sample Id',
   'type': 'string'},
  'n': {'description': 'The number of random points to generate',
   'title': 'N',
   'type': 'integer'}},
 'required': ['sample_id', 'n'],
 'title': 'AddPointsParams',
 'type': 'object'}

### Tool: `monte_carlo_estimate()`

In [None]:
class MonteCarloEstimateParams(BaseModel):
    """Params for monte_carlo_estimate."""

    sample_id: str = Field(
        description="The sample_id returned by generate_random_sample",
    )


class MonteCarloEstimateResults(BaseModel):
    """Results for monte_carlo_estimate."""

    sample_id: str
    sample_size: int
    estimate: float


def monte_carlo_estimate(
    params: MonteCarloEstimateParams,
) -> MonteCarloEstimateResults:
    """Estimate pi using Monte Carlo method.

    Args:
        params: Contains sample_id from generate_random_sample.

    Returns:
        Estimate of pi (float).
    """
    points = SAMPLE_REGISTRY[params.sample_id]
    n = len(points)
    inside = sum((x**2 + y**2) < 1 for x, y in points)
    return MonteCarloEstimateResults(
        estimate=(inside / n) * 4,
        sample_id=params.sample_id,
        sample_size=n,
    )

In [13]:
pi_estimate = monte_carlo_estimate(
    MonteCarloEstimateParams(sample_id=rs.sample_id),
)
pi_estimate

MonteCarloEstimateResults(sample_id='3e438a1e-733b-46b2-9c72-7e1d7b41c864', sample_size=2000, estimate=3.044)

In [14]:
monte_carlo_estimate_tool = PydanticFunctionTool(monte_carlo_estimate)
monte_carlo_estimate_tool.parameters_json_schema

{'description': 'Params for monte_carlo_estimate.',
 'properties': {'sample_id': {'description': 'The sample_id returned by generate_random_sample',
   'title': 'Sample Id',
   'type': 'string'}},
 'required': ['sample_id'],
 'title': 'MonteCarloEstimateParams',
 'type': 'object'}

## Define our LLMAgent

In [15]:
from llm_agents_from_scratch import LLMAgent
from llm_agents_from_scratch.llms import OllamaLLM

llm = OllamaLLM(backbone_llm)
llm_agent = LLMAgent(
    llm=llm,
    tools=[
        random_sample_tool,
        add_more_points_tool,
        monte_carlo_estimate_tool,
    ],
)

## Define the Task

In [16]:
from llm_agents_from_scratch.data_structures import Task

In [31]:
instruction_template = """
You are given tools to estimate pi using Monte Carlo methods.
Monte Carlo methods rely on random samples to estimate
expectations. The larger the sample, the better the estimates
become.

Your target: Estimate pi accurate to 4 decimal places (3.1415).

<tools>
1. `generate_random_sample(n)`
        ‚Üí Creates new sample, returns sample_id and sample_size
2. `add_more_points_to_sample(sample_id, n)`
        ‚Üí Adds n points, returns updated sample_size
3. `monte_carlo_estimate(sample_id)`
        ‚Üí Returns pi estimate (float)
</tools>

<workflow>
1. generate_random_sample(n) ‚Üí get sample_id
2. monte_carlo_estimate(sample_id) ‚Üí get estimate
3. Check: is estimate correct to 4 decimal points (3.1415)
   - YES ‚Üí Report success and stop
   - NO ‚Üí Call add_more_points_to_sample(sample_id, n)
        to improve precision, with a value for n of your
        choosing
4. After adding points, call monte_carlo_estimate again
5. Repeat steps 3-4 until desired precision is attained
</workflow>

<warnings>
NEVER fabricate tool results
    ‚Üí PI estimates only from monte_carlo_estimate tool.
NEVER continue after a tool call - end your response immediately.
ALWAYS wait for the actual tool response before proceeding.
</warnings>
""".strip()

In [32]:
task = Task(
    instruction=instruction_template,
)

## Perform the Task

In [33]:
handler = llm_agent.run(task, max_steps=10)

INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are given tools to estimate pi using Monte Carlo methods.
Monte Carlo methods rely on random samples to estimate
expectations. Th...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are given tools to estimate pi using Monte Carlo methods.
Monte Carlo methods rely on random samples to estimate
expectations....[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      üõ†Ô∏è Executing Tool Call: generate_random_sample
INFO (llm_agents_fs.TaskHandler) :      ‚úÖ Successful Tool Call: sample_id='85abc571-afbb-4acc-bbdd-d6519d77892c' sample_size=1000
INFO (llm_agents_fs.TaskHandler) :      ‚úÖ Step Result: I need to call the `monte_carlo_estimate` tool with the sample_id '85abc571-afbb-4acc-bbdd-d6519d77892c' to get the initial pi estimate...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      üß† New Step: Call the `monte_carlo_estimate` tool with sample_id '85abc571-afbb-4acc-bbdd-d6519d77892c' to obtain

In [34]:
# if need to cancel uncomment code below
# handler.cancel()  # noqa: ERA001

In [42]:
handler.done()

True

In [43]:
print(handler.rollout)

=== Task Step Start ===

üí¨ assistant: My current instruction is 'You are given tools to estimate pi using Monte Carlo methods.
Monte Carlo methods rely on random samples to estimate
expectations. The larger the sample, the better the estimates
become.

Your target: Estimate pi accurate to 4 decimal places (3.1415).

<tools>
1. `generate_random_sample(n)`
        ‚Üí Creates new sample, returns sample_id and sample_size
2. `add_more_points_to_sample(sample_id, n)`
        ‚Üí Adds n points, returns updated sample_size
3. `monte_carlo_estimate(sample_id)`
        ‚Üí Returns pi estimate (float)
</tools>

<workflow>
1. generate_random_sample(n) ‚Üí get sample_id
2. monte_carlo_estimate(sample_id) ‚Üí get estimate
3. Check: is estimate correct to 4 decimal points (3.1415)
   - YES ‚Üí Report success and stop
   - NO ‚Üí Call add_more_points_to_sample(sample_id, n)
        to improve precision, with a value for n of your
        choosing
4. After adding points, call monte_carlo_estimate 

In [44]:
result = handler.exception() or handler.result()
result

llm_agents_from_scratch.errors.agent.MaxStepsReachedError('Max steps reached.')

## Evaluation

In [49]:
trajectory_judge = OllamaLLM(model=judge_llm)

In [65]:
class TrajectoryJudgment(BaseModel):
    """Rubric for evaluating a Monte Carlo pi estimation agent trajectory."""

    reached_target_precision: bool = Field(
        description="True if agent achieved estimate that rounds to 3.1415",
    )

    completed_without_max_steps: bool = Field(
        description=(
            "True if agent completed task without hitting max steps limit"
        ),
    )

    always_added_points_before_reestimating: bool = Field(
        description=(
            "False if agent called monte_carlo_estimate consecutively more "
            "than once before adding points"
        ),
    )

    reused_sample: bool = Field(
        description=(
            "True if agent used add_more_points_to_sample to grow the sample "
            "instead of creating new samples"
        ),
    )

    no_false_completion: bool = Field(
        description=(
            "True if agent only claimed success when the actual tool result "
            "showed 3.1415. False if agent claimed convergence based on a "
            "fabricated or misread estimate."
        ),
    )

    largest_sample_size: int | None = Field(
        description=(
            "The largest sample size achieved during the trajectory, "
            "or None if not determinable from tool outputs"
        ),
    )

    summary: str = Field(
        description="One sentence summary of trajectory quality",
    )

In [62]:
judge_prompt_template = """Evaluate this Monte Carlo pi estimation trajectory.

The agent had three tools:
- `generate_random_sample(n)` - Creates NEW sample
- `add_more_points_to_sample(sample_id, n)` - Adds points to EXISTING sample
- `monte_carlo_estimate(sample_id)` - Returns pi estimate

Correct behavior:
1. Create sample once
2. Estimate ‚Üí if not 3.1415, add points ‚Üí re-estimate ‚Üí repeat

<final_result>
{result}
</final_result>

<trajectory>
{trajectory}
</trajectory>

Evaluate and submit your judgment.""".strip()

In [63]:
trajectory_eval = await trajectory_judge.structured_output(
    prompt=judge_prompt_template.format(
        result=str(result),
        trajectory=handler.rollout,
    ),
    mdl=TrajectoryJudgment,
)

In [64]:
print(trajectory_eval.model_dump_json(indent=4))

{
    "reached_target_precision": false,
    "completed_without_max_steps": false,
    "always_added_points_before_reestimating": false,
    "reused_sample": false,
    "summary": "The Monte Carlo estimate for œÄ after 38,000 points was 3.1404, which is 0.0011 away from the target 3.1415. This exceeds the required four decimal place accuracy (¬±0.00005). The process did not reuse the existing sample but generated a new one, which may have introduced variability. To achieve the desired precision, additional points (e.g., 100,000 or more) would be necessary."
}


### Replications

In this section, we'll repeat the task multiple times to get a more robust evaluation of our LLM agent's performance.

In [66]:
num_replications = 10
handlers = []
for _ in range(num_replications):
    h = llm_agent.run(task, max_steps=10)
    handlers.append(h)

INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are given tools to estimate pi using Monte Carlo methods.
Monte Carlo methods rely on random samples to estimate
expectations. Th...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are given tools to estimate pi using Monte Carlo methods.
Monte Carlo methods rely on random samples to estimate
expectations....[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are given tools to estimate pi using Monte Carlo methods.
Monte Carlo methods rely on random samples to estimate
expectations. Th...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are given tools to estimate pi using Monte Carlo methods.
Monte Carlo methods rely on random samples to estimate
expectations....[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are given tools to estimate pi using Monte Carlo methods.
Monte Carlo methods rely on random samples to estimat

In [89]:
[h.done() for h in handlers]

[False, False, True, False, False, True, False, False, False, False]

#### Trajectory Evaluations

In [71]:
import asyncio

In [None]:
eval_async_tasks = []
for handler in handlers:
    async_task = trajectory_judge.structured_output(
        prompt=judge_prompt_template.format(
            result=str(handler.exception() or handler.result()),
            trajectory=handler.rollout,
        ),
        mdl=TrajectoryJudgment,
    )
    eval_async_tasks.append(async_task)

trajectory_evals = await asyncio.gather(*eval_async_tasks)
trajectory_evals