# Capstone 1 (from Chapter 5) — Monte Carlo Estimation of Pi

## Setup

In [1]:
import logging
import os

from llm_agents_from_scratch.logger import enable_console_logging

### Constants

In [2]:
# True when the RUNPOD_POD_ID environment variable is present (presumably set
# by RunPod inside a pod); switches the install flags and the backbone model
IS_ON_RUNPOD = "RUNPOD_POD_ID" in os.environ
# console logging switch and level, consumed by the logging cell below
LOGGING_ENABLED = True
LOGGING_LEVEL = logging.INFO

# for task execution
# MAX_STEPS is passed as max_steps to every llm_agent.run() call;
# NUM_REPLICATIONS is the number of repeated runs in the replications section
MAX_STEPS = 20
NUM_REPLICATIONS = 10

In [3]:
# Install additional dependencies for notebook (numpy and pandas, via uv).
# On RunPod the --system flag makes uv install into the system Python instead
# of requiring a virtual environment.
if IS_ON_RUNPOD:
    !uv pip install numpy pandas --system
else:
    !uv pip install numpy pandas

Resolved 6 packages in 85ms                                                     
Prepared 1 package in 179ms                                                     
Installed 6 packages in 10msandas==2.3.3                                        
 + numpy==2.4.1
 + pandas==2.3.3
 + python-dateutil==2.9.0.post0
 + pytz==2025.2
 + six==1.17.0
 + tzdata==2025.3


In [4]:
# enable console logging at LOGGING_LEVEL, but only when LOGGING_ENABLED is set
if LOGGING_ENABLED:
    enable_console_logging(LOGGING_LEVEL)

## LLMs

In [5]:
# backbone model: taken from the OLLAMA_MODEL env var on RunPod, otherwise a
# fixed local default
backbone_llm = os.getenv("OLLAMA_MODEL") if IS_ON_RUNPOD else "qwen3:8b"

# judge model: "gpt-5" when an OpenAI API key is available, otherwise the
# backbone model doubles as the judge
judge_llm = "gpt-5" if os.getenv("OPENAI_API_KEY") else backbone_llm

In [6]:
print(f"Backbone LLM: {backbone_llm}")
print(f"Judge LLM: {judge_llm}")

Backbone LLM: qwen3:8b
Judge LLM: gpt-5


## Build Tools

### (Listing 5.1) Tool: `generate_random_sample()`

In [7]:
import uuid

import numpy as np
from pydantic import BaseModel, ConfigDict, Field, computed_field

from llm_agents_from_scratch.tools import PydanticFunctionTool

# Global registry to store samples
SAMPLE_REGISTRY: dict[str, list[tuple[float, float]]] = {}


class RandomSampleParams(BaseModel):
    """Params for generate_random_sample."""

    # extra="forbid": unexpected argument names fail validation
    model_config = ConfigDict(extra="forbid")
    # gt=0 rejects zero and negative sizes at validation time: a negative size
    # makes np.random.uniform raise, and an empty sample has no defined
    # Monte Carlo estimate.
    n: int = Field(
        gt=0,
        description="The number of random points to generate",
    )


class RandomSample(BaseModel):
    """Result from generate_random_sample."""

    # key into the module-level SAMPLE_REGISTRY; the points themselves are not
    # stored on the model
    sample_id: str = Field(
        description="Pass this sample_id to monte_carlo_estimate",
    )

    # computed on access, so it reflects the registry's current contents (it
    # grows after add_more_points_to_sample appends to the same sample)
    @computed_field
    @property
    def sample_size(
        self,
    ) -> int:
        """Determine n from SAMPLE_REGISTRY."""
        # raises KeyError when sample_id is not present in SAMPLE_REGISTRY
        return len(SAMPLE_REGISTRY[self.sample_id])

    def __str__(self) -> str:
        """String representation of RandomSample."""
        # JSON dump of the model, including the computed sample_size
        return self.model_dump_json()


def generate_random_sample(params: RandomSampleParams) -> RandomSample:
    """Generate n random points in [0, 1] × [0, 1].

    Returns a sample_id. Pass this sample_id directly to monte_carlo_estimate.
    """
    # NOTE(review): the docstring above is presumably what the LLM sees as the
    # tool description — keep it short and free of mis-encoded characters.

    # (n, 2) array of coordinates drawn uniformly from the unit square
    pts = np.random.uniform(size=(params.n, 2))

    # register the points under a fresh id; only the id is handed back, so the
    # points never travel through the LLM's context
    sample_id = str(uuid.uuid4())
    SAMPLE_REGISTRY[sample_id] = [tuple(pt) for pt in pts.tolist()]

    return RandomSample(sample_id=sample_id)


# generate random sample tool
random_sample_tool = PydanticFunctionTool(generate_random_sample)

#### Demonstration

In [8]:
from llm_agents_from_scratch.data_structures import ToolCall

# build a tool call by hand (as the agent would) asking for 5000 points
rs_tool_call = ToolCall(
    tool_name=random_sample_tool.name,
    arguments={"n": 5000},
)
# invoke the tool directly; the last expression displays the ToolCallResult
rs_tool_call_result = random_sample_tool(rs_tool_call)
rs_tool_call_result

ToolCallResult(tool_call_id='742838ea-1038-4507-8be2-ca2350fd45d6', content='{"sample_id":"e6799d5a-59a5-49e2-a631-1c774a8ad358","sample_size":5000}', error=False)

### (Listing 5.2) Tool: `add_more_points_to_sample()`

In [9]:
class AddPointsParams(BaseModel):
    """Params for add_more_points_to_sample."""

    # extra="forbid": unexpected argument names fail validation
    model_config = ConfigDict(extra="forbid")
    sample_id: str = Field(
        description="The sample_id of the sample to augment",
    )
    # gt=0 rejects zero and negative counts at validation time: a negative
    # count makes np.random.uniform raise, and adding zero points is a wasted
    # step.
    n: int = Field(
        gt=0,
        description="The number of random points to generate",
    )


def add_more_points_to_sample(params: AddPointsParams) -> RandomSample:
    """Add n more random points to an existing random sample.

    Returns a sample_id and the total number of points.
    """
    # Look the sample up first, so an unknown (e.g. invented) sample_id fails
    # with a descriptive KeyError before any points are generated.
    try:
        sample = SAMPLE_REGISTRY[params.sample_id]
    except KeyError:
        raise KeyError(
            f"Unknown sample_id {params.sample_id!r}. Call "
            "generate_random_sample first and pass the sample_id it returns.",
        ) from None

    # (n, 2) array of coordinates drawn uniformly from the unit square
    pts = np.random.uniform(size=(params.n, 2))

    # augment sample in place; the registry entry is the same list object
    sample.extend(tuple(pt) for pt in pts.tolist())

    return RandomSample(sample_id=params.sample_id)


# create tool
add_more_points_tool = PydanticFunctionTool(add_more_points_to_sample)

#### Demonstration

In [10]:
# get the sample ID of the previous random_sample_tool() invocation by parsing
# the JSON content of its result back into a RandomSample
random_sample = RandomSample.model_validate_json(rs_tool_call_result.content)

# build tool call for add more points (500 extra points on the same sample)
add_pts_tool_call = ToolCall(
    tool_name=add_more_points_tool.name,
    arguments={
        "sample_id": random_sample.sample_id,
        "n": 500,
    },
)
# invoke the tool directly; the last expression displays the ToolCallResult
add_pts_tool_call_result = add_more_points_tool(add_pts_tool_call)
add_pts_tool_call_result

ToolCallResult(tool_call_id='7a8bf483-1b06-4286-bc7d-5d918bdcdc81', content='{"sample_id":"e6799d5a-59a5-49e2-a631-1c774a8ad358","sample_size":5500}', error=False)

### (Listing 5.3) Tool: `monte_carlo_estimate()`

In [11]:
class MonteCarloEstimateParams(BaseModel):
    """Params for monte_carlo_estimate."""

    # extra="forbid": unexpected argument names fail validation
    model_config = ConfigDict(extra="forbid")
    # key that monte_carlo_estimate uses to look up points in SAMPLE_REGISTRY
    sample_id: str = Field(
        description="The sample_id returned by generate_random_sample",
    )


class MonteCarloEstimateResult(BaseModel):
    """Results for monte_carlo_estimate."""

    # id of the sample the estimate was computed from
    sample_id: str
    # number of points in the sample at estimation time
    sample_size: int
    # 4 * (fraction of points with x**2 + y**2 < 1)
    estimate: float

    def __str__(self) -> str:
        """String representation of MonteCarloEstimateResult."""
        # JSON dump of all three fields
        return self.model_dump_json()


def monte_carlo_estimate(
    params: MonteCarloEstimateParams,
) -> MonteCarloEstimateResult:
    """Estimate pi using Monte Carlo method.

    Args:
        params: Contains sample_id from generate_random_sample.

    Returns:
        MonteCarloEstimateResult holding the sample_id, the sample size and
        the estimate of pi (float).

    Raises:
        KeyError: If sample_id is not in SAMPLE_REGISTRY.
        ValueError: If the sample contains no points.
    """
    points = SAMPLE_REGISTRY[params.sample_id]
    n = len(points)
    if n == 0:
        # an empty sample would otherwise surface as a bare ZeroDivisionError
        raise ValueError(
            f"Sample {params.sample_id!r} contains no points; "
            "add points before estimating.",
        )

    # points strictly inside the quarter unit circle; their share of the unit
    # square approximates pi / 4
    inside = sum((x**2 + y**2) < 1 for x, y in points)
    return MonteCarloEstimateResult(
        estimate=(inside / n) * 4,
        sample_id=params.sample_id,
        sample_size=n,
    )


# create tool
monte_carlo_estimate_tool = PydanticFunctionTool(monte_carlo_estimate)

#### Demonstration

In [12]:
# build tool call for estimating Pi on the sample augmented in the previous
# demonstration (same sample_id)
mc_estimate_tool_call = ToolCall(
    tool_name=monte_carlo_estimate_tool.name,
    arguments={
        "sample_id": random_sample.sample_id,
    },
)
# invoke the tool directly; the last expression displays the ToolCallResult
mc_estimate_tool_call_result = monte_carlo_estimate_tool(mc_estimate_tool_call)
mc_estimate_tool_call_result

ToolCallResult(tool_call_id='7ff6d240-52f8-42ea-b96c-62790af123a6', content='{"sample_id":"e6799d5a-59a5-49e2-a631-1c774a8ad358","sample_size":5500,"estimate":3.181090909090909}', error=False)

## Define the Task

### (Listing 5.4) Writing the task instruction

In [13]:
# Task prompt for the agent. The non-ASCII symbols (✓ ✗ → —) are part of the
# prompt text, so this cell must be saved and read as UTF-8.
instruction = """
You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the estimate falls in the range [3.1415, 3.1425).
Any value from 3.1415 up to (but not including) 3.1425 is a success.

Examples:
- 3.14159 ✓ (within range)
- 3.14200 ✓ (within range)
- 3.14149 ✗ (too low)
- 3.14250 ✗ (too high)

<algorithm>
1. Call generate_random_sample(1000000) to start with 1M points
2. Call monte_carlo_estimate(sample_id) to get estimate
3. Check: is the estimate between 3.1415 and 3.1425?
   - YES → Report success and STOP
   - NO → Continue to step 4
4. Call add_more_points_to_sample, doubling the points each time:
   - First add: 1 million
   - Second add: 2 million
   - Third add: 4 million
   - And so on, doubling each iteration
5. After adding points, go back to step 2

Exponential growth ensures faster convergence while demonstrating adaptive
sampling.
</algorithm>

<critical_rules>
- If the task is not complete, your response MUST contain a tool call
- Do not just describe what you plan to do—actually call the tool
- Do not stop until the estimate falls within the target range
- Keep track of your iteration to calculate the correct doubling amount
- NEVER fabricate tool results-only use actual tool responses
- NEVER invent a sample_id
</critical_rules>

<final_output>
When the estimate reaches the target precision, respond with this exact JSON
structure and nothing else:

{"sample_id": "<the-actual-sample-id-from-tool-response>"}

No explanation, no markdown formatting, no code blocks—just the raw JSON.
</final_output>

Begin by calling generate_random_sample(1000000).
""".strip()

### (Listing 5.5) The Task

In [14]:
from llm_agents_from_scratch.data_structures import Task

task = Task(
    instruction=instruction,
)

## (Listing 5.6) Creating our LLMAgent

In [15]:
from llm_agents_from_scratch import LLMAgent
from llm_agents_from_scratch.llms import OllamaLLM

# OllamaLLM wrapper around the backbone model name chosen in the LLMs section
llm = OllamaLLM(backbone_llm)
# agent equipped with the three Monte Carlo tools built above
llm_agent = LLMAgent(
    llm=llm,
    tools=[
        random_sample_tool,
        add_more_points_tool,
        monte_carlo_estimate_tool,
    ],
)

## Perform the Task

In [16]:
handler = llm_agent.run(task, max_steps=MAX_STEPS)

INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means ...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      üõ†Ô∏è Executing Tool Call: generate_random_sample
INFO (llm_agents_fs.TaskHandler) :      ‚úÖ Successful Tool Call: {"sample_id":"bc66a054-8bef-45a0-b807-13bcc09b37a6","sample_size":1000000}
INFO (llm_agents_fs.TaskHandler) :      ‚úÖ Step Result: <tool_call>
{"name": "monte_carlo_estimate", "arguments": {"sample_id":"bc66a054-8bef-45a0-b807-13bcc09b37a6"}}
</tool_call>
INFO (llm_agents_fs.TaskHandler) :      üß† New Step: {"name": "monte_carlo_estimate", "arguments": {"sample_id":"bc66a054-8bef-45a0-b807-13bcc09b37a6"}}
INFO (llm_agents

In [17]:
# if need to cancel uncomment code below
# handler.cancel()  # noqa: ERA001

In [18]:
handler.done()

False

In [19]:
# check if there was an error: the notebook only displays a bare last-line
# expression, so the value must not be buried inside an `if` body; shows
# nothing while the handler is still running or when it finished cleanly
handler.exception() if handler.done() else None

In [20]:
print(handler.rollout)

=== Task Step Start ===

üí¨ assistant: My current instruction is 'You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the estimate falls in the range [3.1415, 3.1425).
Any value from 3.1415 up to (but not including) 3.1425 is a success.

Examples:
- 3.14159 ‚úì (within range)
- 3.14200 ‚úì (within range)
- 3.14149 ‚úó (too low)
- 3.14250 ‚úó (too high)

<algorithm>
1. Call generate_random_sample(1000000) to start with 1M points
2. Call monte_carlo_estimate(sample_id) to get estimate
3. Check: is the estimate between 3.1415 and 3.1425?
   - YES ‚Üí Report success and STOP
   - NO ‚Üí Continue to step 4
4. Call add_more_points_to_sample, doubling the points each time:
   - First add: 1 million
   - Second add: 2 million
   - Third add: 4 million
   - And so on, doubling each iteration
5. After adding points, go back to step 2

Exponential growth ensures faster convergence while demonstrating adaptive
sampling

In [21]:
result = handler.exception() or handler.result()
result

llm_agents_from_scratch.errors.agent.MaxStepsReachedError('Max steps reached.')

## Evaluation

### (Listing 5.7) Evaluating Task Success

In [22]:
import json
from json import JSONDecodeError

from pydantic import ValidationError


def estimate_has_target_precision(estimate: MonteCarloEstimateResult) -> bool:
    """Tell whether an estimate hits the 3-decimal-place target (3.142).

    The accepted window is half-open: 3.1415 itself counts as a hit,
    3.1425 does not.
    """
    lo, hi = 3.1415, 3.1425
    value = estimate.estimate
    return value >= lo and value < hi


def is_task_success(
    handler: LLMAgent.TaskHandler,
    verbose: bool = False,
) -> bool:
    """Determines task success.

    The agent's final response is expected to be a JSON object with a
    "sample_id" key. The estimate is recomputed from SAMPLE_REGISTRY for that
    id rather than trusting any number reported by the agent.

    Args:
        handler (LLMAgent.TaskHandler): The handler containing the
            result or exception of the task execution
        verbose (bool): Whether to print out details of the
            determination. Defaults to False.

    Returns:
        bool: True if task was successful. False, otherwise.
    """
    error = handler.exception()
    if error:
        if verbose:
            print(error)
        return False

    result = handler.result()
    try:
        output_data = json.loads(result.content)
        sample_id = output_data["sample_id"]
        params = MonteCarloEstimateParams(
            sample_id=sample_id,
        )
        estimate = monte_carlo_estimate(params)
    except (
        ValidationError,
        JSONDecodeError,
        KeyError,
        # valid JSON that is not an object (e.g. a list or bare string), or a
        # non-string content, fails on parsing/indexing with TypeError
        TypeError,
        # a registered but empty sample cannot be estimated
        ValueError,
        ZeroDivisionError,
    ) as e:
        # invalid output or sample_id provided by LLM agent—unsuccessful task
        if verbose:
            print(f"The LLM agent returned an invalid output: {str(e)}.")
        return False

    if verbose:
        print(
            f"Estimate: {estimate}",
        )
    return estimate_has_target_precision(estimate)

In [23]:
is_task_success(handler, verbose=True)

Max steps reached.


False

### Trajectory Analysis

In [24]:
# pick the judge backend from the model name: "gpt-" models go to OpenAI,
# everything else is served by Ollama
if not judge_llm.startswith("gpt-"):
    # fallback to Ollama model
    trajectory_judge = OllamaLLM(model=judge_llm)
else:
    # imported lazily, so the OpenAI backend is only needed when it is used
    from llm_agents_from_scratch.llms.openai import OpenAILLM

    trajectory_judge = OpenAILLM(model=judge_llm)

### (Listing 5.8) Rubric for LLM judge

In [25]:
class TrajectoryEvalRubric(BaseModel):
    """Rubric for evaluating an execution trajectory."""

    # This model is passed as `mdl` to trajectory_judge.structured_output(), so
    # the judge LLM fills it in. The field order below is also the column order
    # of the evaluation DataFrame built from model_dump().

    # --- outcome ---
    reached_target_precision: bool = Field(
        description="True if agent achieved estimate that rounds to 3.142",
    )

    completed_without_max_steps: bool = Field(
        description=(
            "True if agent completed task without hitting max steps limit"
        ),
    )

    # --- process: did the agent follow the prescribed algorithm? ---
    always_added_points_before_reestimating: bool = Field(
        description=(
            "False if agent called monte_carlo_estimate consecutively more "
            "than once before adding points"
        ),
    )

    reused_sample: bool = Field(
        description=(
            "True if agent used add_more_points_to_sample to grow the sample "
            "instead of creating new samples"
        ),
    )

    # --- stopping behaviour: stopped neither too early nor too late ---
    no_false_completion: bool = Field(
        description=(
            "True if agent only claimed success when the actual tool result "
            "showed 3.142. False if agent claimed convergence based on a "
            "fabricated or misread estimate."
        ),
    )

    no_missed_completion: bool = Field(
        description=(
            "True if agent stopped when estimate reached 3.142. False if "
            "agent continued adding points after already achieving target."
        ),
    )

    followed_output_format: bool = Field(
        description=(
            "True if agent's final response contained only the sample_id "
            "as instructed, with no additional text or explanation."
        ),
    )

    # no default is given, so the judge must always supply this field, but it
    # may supply None
    largest_sample_size: int | None = Field(
        description=(
            "The largest sample size achieved during the trajectory, "
            "or None if not determinable from tool outputs"
        ),
    )

    # free text; split into its own DataFrame before aggregation
    summary: str = Field(
        description="One sentence summary of trajectory quality",
    )

### (Listing 5.9) LLM judge instruction prompt

In [53]:
# Prompt for the LLM judge, filled in with str.format(result=..., trajectory=...).
# Any literal brace added to this text must be doubled ({{ }}) to survive
# str.format. The failure marker must match str(MaxStepsReachedError), which is
# "Max steps reached.", and the emoji marker must match the rollout text, so
# this cell must be saved and read as UTF-8.
judge_prompt_template = """Evaluate this Monte Carlo pi estimation trajectory.

The agent had three tools:
- `generate_random_sample(n)` - Creates NEW sample
- `add_more_points_to_sample(sample_id, n)` - Adds points to EXISTING sample
- `monte_carlo_estimate(sample_id)` - Returns pi estimate

Correct behavior:
1. Create sample once
2. Estimate → if not between 3.1415 and 3.1425,
   add points → re-estimate → repeat
3. When target reached, respond with ONLY the sample_id (no other text)

Note: If final_response is "Max steps reached.", the agent failed to complete
the task within the allowed number of steps.

HALLUCINATION MARKER: If you see "💬 assistant: 🔧 tool:" in the trajectory,
the agent fabricated a tool response instead of waiting for the actual result.
This is a critical failure—set no_false_completion to False.

<final_response>
{result}
</final_response>

<trajectory>
{trajectory}
</trajectory>

Evaluate and submit your judgment.""".strip()

In [27]:
trajectory_eval = await trajectory_judge.structured_output(
    prompt=judge_prompt_template.format(
        result=str(result),
        trajectory=handler.rollout,
    ),
    mdl=TrajectoryEvalRubric,
)

In [28]:
print(trajectory_eval.model_dump_json(indent=4))

{
    "reached_target_precision": true,
    "completed_without_max_steps": true,
    "always_added_points_before_reestimating": false,
    "reused_sample": true,
    "no_false_completion": true,
    "no_missed_completion": true,
    "followed_output_format": true,
    "largest_sample_size": 16000000,
    "summary": "The task was completed successfully, with the Monte Carlo estimate of œÄ reaching a precision of 3.1416. The process involved generating a random sample, estimating œÄ using the Monte Carlo method, and iteratively increasing the sample size to improve accuracy. The final estimate was within the target range of [3.1415, 3.1425). The process followed the required output format and did not exceed the maximum number of steps."
}


## Replications for a more reliable evaluation

In this section, we'll repeat the task multiple times to get a more robust evaluation of our LLM agent's performance.

### (Listing 5.10) Repeated task executions with our LLM agent

In [29]:
# start NUM_REPLICATIONS runs of the same task, keeping the handler that each
# run() call returns
handlers = [
    llm_agent.run(task, max_steps=MAX_STEPS) for _ in range(NUM_REPLICATIONS)
]

INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means ...[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means ...[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.

In [40]:
# can execute this repeatedly until all handlers are done
[str(h.exception() or h.result()) if h.done() else "Not Done" for h in handlers]

['Max steps reached.',
 '{"sample_id": "6e9e4898-3cd8-41ff-be63-8ad5777ab7fa"}',
 '{"sample_id": "c1899d52-78a4-4775-ad54-eb501ada983d"}',
 '{"sample_id": "13ff8c3e-d063-4cef-a33d-e4d56683008b"}',
 'Max steps reached.',
 '{"sample_id": "70013709-5960-4bd9-ba4f-407d506276f4"}',
 '{"sample_id": "4bbd5b65-68d6-43ab-bd8c-591a60596089"}',
 '{"sample_id": "e67b9888-bd96-4c22-a8e3-79420d51b980"}',
 'Max steps reached.',
 '{"sample_id": "d7bcadd5-f702-4b1b-ab29-c01be8f1ae12"}']

### (Listing 5.11) Task Success and Trajectory Evaluations of Individual Runs

In [54]:
import asyncio

task_success = []
eval_async_coros = []
for handler in handlers:
    # task success evaluation
    task_success.append(int(is_task_success(handler)))

    # trajectory evaluation coro
    coro = trajectory_judge.structured_output(
        prompt=judge_prompt_template.format(
            result=str(handler.exception() or handler.result()),
            trajectory=handler.rollout,
        ),
        mdl=TrajectoryEvalRubric,
    )
    eval_async_coros.append(coro)

trajectory_evals = await asyncio.gather(*eval_async_coros)

In [55]:
task_success

[0, 1, 0, 1, 0, 1, 1, 1, 0, 1]

In [56]:
trajectory_evals

[TrajectoryEvalRubric(reached_target_precision=False, completed_without_max_steps=False, always_added_points_before_reestimating=False, reused_sample=True, no_false_completion=True, no_missed_completion=True, followed_output_format=True, largest_sample_size=1000000, summary='The assistant attempted to estimate œÄ using the Monte Carlo method with multiple samples, but the estimate of 3.140456 did not fall within the target range [3.1415, 3.1425]. The assistant reused the same sample multiple times without generating a new one, which limited the accuracy. The process did not reach the target precision, and the assistant did not follow through with increasing the sample size further. The output format was followed, and no false or missed completions occurred.'),
 TrajectoryEvalRubric(reached_target_precision=True, completed_without_max_steps=True, always_added_points_before_reestimating=True, reused_sample=True, no_false_completion=True, no_missed_completion=True, followed_output_format=

### Evaluation Summary

In [57]:
import pandas as pd

from llm_agents_from_scratch.notebook_utils import set_dataframe_display_options

# sets display options for pd.DataFrame in notebooks
set_dataframe_display_options()

In [58]:
# shape eval results into a pd.DataFrame
evals_df = pd.DataFrame(
    data=[e.model_dump() for e in trajectory_evals],
)

# add task_success column
evals_df.insert(0, "task_success", task_success)

# separate summary column
summary_df = evals_df[["summary"]].copy()
evals_df = evals_df.drop(columns=["summary"])

# compute aggregations: TOTAL and AVG rows
total_row = {}
avg_row = {}

for col, dtype in evals_df.dtypes.items():
    if dtype == "bool" or pd.api.types.is_numeric_dtype(dtype):
        total_row[col] = evals_df[col].sum()
        avg_row[col] = evals_df[col].mean()
    else:
        total_row[col] = "TOTAL"
        avg_row[col] = "AVG"

# merge evaluations and aggregations dataframes
evals_df = pd.concat(
    [
        pd.DataFrame([total_row, avg_row], index=["TOTAL", "AVG"]),
        evals_df,
    ],
)

# style
evals_df.style.apply(
    lambda r: ["border-bottom: 2px solid #444"] * len(r)
    if r.name == "AVG"
    else [""] * len(r),
    axis=1,
)

Unnamed: 0,task_success,reached_target_precision,completed_without_max_steps,always_added_points_before_reestimating,reused_sample,no_false_completion,no_missed_completion,followed_output_format,largest_sample_size
TOTAL,6.0,8.0,8.0,7.0,9.0,10.0,10.0,10.0,96001000.0
AVG,0.6,0.8,0.8,0.7,0.9,1.0,1.0,1.0,9600100.0
0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1000000.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6000000.0
2,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7000000.0
3,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,51000000.0
4,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5000000.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1000000.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9000000.0
7,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1000000.0


In [59]:
summary_df

Unnamed: 0,summary
0,"The assistant attempted to estimate œÄ using the Monte Carlo method with multiple samples, but the estimate of 3.140456 did not fall within the target range [3.1415, 3.1425]. The assistant reused the same sample multiple times without generating a new one, which limited the accuracy. The process did not reach the target precision, and the assistant did not follow through with increasing the sample size further. The output format was followed, and no false or missed completions occurred."
1,"The agent successfully estimated pi with a Monte Carlo method, doubling the sample size each time and re-estimating until the result fell within the target range [3.1415, 3.1425). The final estimate was 3.1421953333333335, which is within the target range. The agent followed all critical rules, including not fabricating tool results, using only actual tool responses, and not stopping until the estimate was accurate. The final response was in the correct format and contained only the required JSON with the sample_id."
2,"The agent successfully estimated pi using Monte Carlo methods. The estimate with 7 million points (3.141856) fell within the target range of [3.1415, 3.1425). The agent correctly followed the algorithm by doubling the sample size each time and re-estimating. The final response was in the correct JSON format with the actual sample_id."
3,"The task was successfully completed by iteratively increasing the sample size until the Monte Carlo estimate fell within the target range [3.1415, 3.1425]. The final estimate of 3.1415163921568627 is accurate and meets the required precision."
4,"The process successfully executed the Monte Carlo integration with a sample size of 5,000,000, achieving an estimate of œÄ as 3.142068, which is very close to the true value of œÄ (3.1415926535...). The process followed the required steps, reused the sample efficiently, and did not exceed the maximum allowed steps. The final result is accurate and meets the expected precision."
5,"The agent successfully estimated pi using Monte Carlo methods with an initial sample of 1 million points. The estimate (3.142068) fell within the target range [3.1415, 3.1425), so the task was completed successfully without exceeding the maximum steps."
6,"The agent successfully estimated pi using Monte Carlo methods. The final estimate of 3.1416897777777777 falls within the target range [3.1415, 3.1425). The agent followed the algorithm correctly, doubling the sample size each time and re-estimating. The agent did not fabricate any tool results, and the final response was in the correct JSON format."
7,"The agent correctly generated a sample, estimated pi, and confirmed the estimate fell within the target range. The final response was correct and followed the required format."
8,"The task involved estimating œÄ using the Monte Carlo method with a random sample. A sample of size 1000 was generated, and the estimate of œÄ was computed as 3.044. While this is a valid approximation, it is not very close to the actual value of œÄ (‚âà 3.1415926535). The relatively small sample size led to a less accurate estimate. The process followed the required steps, and the output format was adhered to. However, the target precision was not reached, and the sample size was not large enough to ensure a more accurate estimate."
9,"The agent successfully completed the task by generating an initial sample, estimating pi, and iteratively adding more points until the estimate fell within the target range. The final estimate was 3.1417848, which is within [3.1415, 3.1425). The agent followed all rules, including not fabricating tool responses and only reporting the sample_id at the end."


In [60]:
# write results to json
evals_df.to_json("evals_df.json")
summary_df.to_json("summary_df.json")

In [61]:
print(handlers[2].rollout)

=== Task Step Start ===

üí¨ assistant: My current instruction is 'You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the estimate falls in the range [3.1415, 3.1425).
Any value from 3.1415 up to (but not including) 3.1425 is a success.

Examples:
- 3.14159 ‚úì (within range)
- 3.14200 ‚úì (within range)
- 3.14149 ‚úó (too low)
- 3.14250 ‚úó (too high)

<algorithm>
1. Call generate_random_sample(1000000) to start with 1M points
2. Call monte_carlo_estimate(sample_id) to get estimate
3. Check: is the estimate between 3.1415 and 3.1425?
   - YES ‚Üí Report success and STOP
   - NO ‚Üí Continue to step 4
4. Call add_more_points_to_sample, doubling the points each time:
   - First add: 1 million
   - Second add: 2 million
   - Third add: 4 million
   - And so on, doubling each iteration
5. After adding points, go back to step 2

Exponential growth ensures faster convergence while demonstrating adaptive
sampling