# Capstone 1 (from Chapter 5) — Monte Carlo Estimation of Pi

## Setup

In [1]:
import logging
import os

from llm_agents_from_scratch.logger import enable_console_logging

### Constants

In [2]:
# Treated as "on RunPod" whenever the RUNPOD_POD_ID environment variable exists.
IS_ON_RUNPOD = os.environ.get("RUNPOD_POD_ID") is not None

# console logging switches (read by the logging cell further down)
LOGGING_LEVEL = logging.INFO
LOGGING_ENABLED = True

# for task execution
NUM_REPLICATIONS = 10  # how many times the task is repeated in "Replications"
MAX_STEPS = 20  # passed as max_steps to every llm_agent.run(...) call

In [3]:
# Install additional dependencies for notebook (numpy and pandas, via uv).
# NOTE(review): versions are not pinned, so a re-run may resolve newer releases
# than the ones recorded in this cell's output.
if IS_ON_RUNPOD:
    # `--system` is only passed on RunPod — presumably there is no virtualenv
    # for uv to target there (TODO confirm).
    !uv pip install numpy pandas --system
else:
    !uv pip install numpy pandas

Resolved 6 packages in 47ms                                                     
Installed 6 packages in 40msandas==2.3.3                                        
 + numpy==2.4.0
 + pandas==2.3.3
 + python-dateutil==2.9.0.post0
 + pytz==2025.2
 + six==1.17.0
 + tzdata==2025.3


In [4]:
# Turn on console logging at LOGGING_LEVEL, but only when LOGGING_ENABLED is set.
if LOGGING_ENABLED:
    enable_console_logging(LOGGING_LEVEL)

## LLMs

In [5]:
# Backbone model: taken from OLLAMA_MODEL on RunPod, a fixed default otherwise.
if IS_ON_RUNPOD:
    backbone_llm = os.getenv("OLLAMA_MODEL")
    if not backbone_llm:
        # Fail here with a clear message instead of passing None further down,
        # where it would only surface later as an unrelated-looking error.
        msg = "OLLAMA_MODEL must be set when running on RunPod."
        raise RuntimeError(msg)
else:
    backbone_llm = "qwen3:8b"

# Judge model: "gpt-5" when an OpenAI API key is available, else the backbone.
# This choice does not depend on the environment, so it is made once here.
judge_llm = "gpt-5" if os.getenv("OPENAI_API_KEY") else backbone_llm

In [6]:
# Show which models were selected for the agent (backbone) and for the judge.
print(f"Backbone LLM: {backbone_llm}")
print(f"Judge LLM: {judge_llm}")

Backbone LLM: qwen3:8b
Judge LLM: qwen3:8b


## Build Tools

### (Listing 5.1) Tool: `generate_random_sample()`

In [7]:
import uuid

import numpy as np
from pydantic import BaseModel, ConfigDict, Field, computed_field

from llm_agents_from_scratch.tools import PydanticFunctionTool

# Global registry to store samples: maps a sample_id to its list of (x, y)
# points. The tools in this notebook exchange only the sample_id and look the
# points up here.
SAMPLE_REGISTRY: dict[str, list[tuple[float, float]]] = {}


class RandomSampleParams(BaseModel):
    """Params for generate_random_sample."""

    # unknown arguments are rejected rather than ignored
    model_config = ConfigDict(extra="forbid")
    # n must be positive: a zero-point sample cannot be used to compute an
    # estimate (division by the sample size), and a negative n is rejected by
    # numpy only after the tool has already been entered.
    n: int = Field(
        gt=0,
        description="The number of random points to generate",
    )


class RandomSample(BaseModel):
    """Result from generate_random_sample."""

    # The only stored field; its description tells the reader of the schema
    # what to do with the id next.
    sample_id: str = Field(
        description="Pass this sample_id to monte_carlo_estimate",
    )

    # Not stored on the model: looked up in SAMPLE_REGISTRY on every access, so
    # it reflects points added to the sample after this object was created.
    # Raises KeyError when sample_id is not a key of SAMPLE_REGISTRY.
    @computed_field
    @property
    def sample_size(
        self,
    ) -> int:
        """Determine n from SAMPLE_REGISTRY."""
        return len(SAMPLE_REGISTRY[self.sample_id])

    def __str__(self) -> str:
        """String representation of RandomSample."""
        # JSON dump of the model; the demonstration output below shows both
        # sample_id and sample_size in it.
        return self.model_dump_json()


def generate_random_sample(params: RandomSampleParams) -> RandomSample:
    """Generate n random points in [0, 1] × [0, 1].

    Returns a sample_id. Pass this sample_id directly to monte_carlo_estimate.
    """
    # array of shape (n, 2): one row per point, columns are x and y
    pts = np.random.uniform(size=(params.n, 2))

    # register the points under a fresh id; only the id is handed back
    sample_id = str(uuid.uuid4())
    SAMPLE_REGISTRY[sample_id] = [tuple(pt) for pt in pts.tolist()]

    return RandomSample(sample_id=sample_id)


# Wrap generate_random_sample as a tool; it is later passed to the LLMAgent.
random_sample_tool = PydanticFunctionTool(generate_random_sample)

#### Demonstration

In [8]:
from llm_agents_from_scratch.data_structures import ToolCall

# Invoke the tool directly with a hand-built ToolCall (no LLM involved) to
# create a 5000-point sample.
rs_tool_call = ToolCall(
    tool_name=random_sample_tool.name,
    arguments={"n": 5000},
)
rs_tool_call_result = random_sample_tool(rs_tool_call)
rs_tool_call_result

ToolCallResult(tool_call_id='a01e66f7-d25f-4e3b-b381-dac816202417', content='{"sample_id":"27ef0e58-da50-4401-acdf-f01a4b77ed87","sample_size":5000}', error=False)

### (Listing 5.2) Tool: `add_more_points_to_sample()`

In [9]:
class AddPointsParams(BaseModel):
    """Params for add_more_points_to_sample."""

    # unknown arguments are rejected rather than ignored
    model_config = ConfigDict(extra="forbid")
    sample_id: str = Field(
        description="The sample_id of the sample to augment",
    )
    # n must be positive: a negative n is otherwise rejected by numpy only
    # after the tool has been entered, and n == 0 would add nothing.
    n: int = Field(
        gt=0,
        description="The number of random points to generate",
    )


def add_more_points_to_sample(params: AddPointsParams) -> RandomSample:
    """Add n more random points to an existing random sample.

    Returns a sample_id and the total number of points.
    """
    # Look the sample up first, so an unknown (e.g. invented) sample_id fails
    # with an actionable message instead of a bare KeyError, and before any
    # points are generated. The exception type stays KeyError.
    try:
        sample = SAMPLE_REGISTRY[params.sample_id]
    except KeyError:
        msg = (
            f"Unknown sample_id {params.sample_id!r}. Use a sample_id "
            "returned by generate_random_sample."
        )
        raise KeyError(msg) from None

    # array of shape (n, 2): one row per point, columns are x and y
    pts = np.random.uniform(size=(params.n, 2))

    # augment sample in place (this is the list object the registry holds)
    sample.extend(tuple(pt) for pt in pts.tolist())

    return RandomSample(sample_id=params.sample_id)


# Wrap add_more_points_to_sample as a tool; it is later passed to the LLMAgent.
add_more_points_tool = PydanticFunctionTool(add_more_points_to_sample)

#### Demonstration

In [10]:
# get the sample ID of the previous random_sample_tool() invocation
# (the tool result's content is the JSON produced for the RandomSample)
random_sample = RandomSample.model_validate_json(rs_tool_call_result.content)

# build tool call for add more points
add_pts_tool_call = ToolCall(
    tool_name=add_more_points_tool.name,
    arguments={
        "sample_id": random_sample.sample_id,
        "n": 500,
    },
)
# same sample_id comes back; sample_size grows by the 500 added points
add_pts_tool_call_result = add_more_points_tool(add_pts_tool_call)
add_pts_tool_call_result

ToolCallResult(tool_call_id='263e198f-3033-4d3e-ac6b-544af8f5764c', content='{"sample_id":"27ef0e58-da50-4401-acdf-f01a4b77ed87","sample_size":5500}', error=False)

### (Listing 5.3) Tool: `monte_carlo_estimate()`

In [55]:
class MonteCarloEstimateParams(BaseModel):
    """Params for monte_carlo_estimate."""

    # unknown arguments are rejected rather than ignored
    model_config = ConfigDict(extra="forbid")
    # key into SAMPLE_REGISTRY identifying the sample to estimate from
    sample_id: str = Field(
        description="The sample_id returned by generate_random_sample",
    )


class MonteCarloEstimateResult(BaseModel):
    """Results for monte_carlo_estimate."""

    sample_id: str  # id of the sample the estimate was computed from
    sample_size: int  # number of points in that sample when it was estimated
    estimate: float  # the resulting estimate of pi

    def __str__(self) -> str:
        """String representation of MonteCarloEstimateResult."""
        # JSON dump of all three fields
        return self.model_dump_json()


def monte_carlo_estimate(
    params: MonteCarloEstimateParams,
) -> MonteCarloEstimateResult:
    """Estimate pi using Monte Carlo method.

    Args:
        params: Contains sample_id from generate_random_sample.

    Returns:
        The sample_id, the sample size and the estimate of pi (float).

    Raises:
        KeyError: If sample_id is not a key of SAMPLE_REGISTRY.
        ValueError: If the sample contains no points.
    """
    # Unknown (e.g. invented) ids keep raising KeyError, but with a message
    # that says what to do instead.
    try:
        points = SAMPLE_REGISTRY[params.sample_id]
    except KeyError:
        msg = (
            f"Unknown sample_id {params.sample_id!r}. Use a sample_id "
            "returned by generate_random_sample."
        )
        raise KeyError(msg) from None

    n = len(points)
    if n == 0:
        # avoid an opaque ZeroDivisionError in the ratio below
        msg = (
            f"Sample {params.sample_id!r} has no points. Add points with "
            "add_more_points_to_sample before estimating."
        )
        raise ValueError(msg)

    # a point counts as inside the quarter circle when x^2 + y^2 < 1
    inside = sum((x**2 + y**2) < 1 for x, y in points)
    # inside / n approximates the area ratio pi / 4, hence the factor 4
    return MonteCarloEstimateResult(
        estimate=(inside / n) * 4,
        sample_id=params.sample_id,
        sample_size=n,
    )


# Wrap monte_carlo_estimate as a tool; it is later passed to the LLMAgent.
monte_carlo_estimate_tool = PydanticFunctionTool(monte_carlo_estimate)

#### Demonstration

In [56]:
# build tool call for estimating Pi
# (reuses the sample from the demonstrations above via random_sample.sample_id)
mc_estimate_tool_call = ToolCall(
    tool_name=monte_carlo_estimate_tool.name,
    arguments={
        "sample_id": random_sample.sample_id,
    },
)
mc_estimate_tool_call_result = monte_carlo_estimate_tool(mc_estimate_tool_call)
mc_estimate_tool_call_result

ToolCallResult(tool_call_id='051195db-fbae-4e3b-a9d4-8e264e7d8d8d', content='{"sample_id":"27ef0e58-da50-4401-acdf-f01a4b77ed87","sample_size":5500,"estimate":3.1665454545454543}', error=False)

## Define the Task

### (Listing 5.4) Writing the task instruction

In [14]:
# Task prompt for the agent. The check marks, crosses, arrows and dashes are
# written as the intended Unicode characters (they were mis-encoded before).
instruction = """
You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the estimate falls in the range [3.1415, 3.1425).
Any value from 3.1415 up to (but not including) 3.1425 is a success.

Examples:
- 3.14159 ✓ (within range)
- 3.14200 ✓ (within range)
- 3.14149 ✗ (too low)
- 3.14250 ✗ (too high)

<algorithm>
1. Call generate_random_sample(1000000) to start with 1M points
2. Call monte_carlo_estimate(sample_id) to get estimate
3. Check: is the estimate between 3.1415 and 3.1425?
   - YES → Report success and STOP
   - NO → Continue to step 4
4. Call add_more_points_to_sample, doubling the points each time:
   - First add: 1 million
   - Second add: 2 million
   - Third add: 4 million
   - And so on, doubling each iteration
5. After adding points, go back to step 2

Exponential growth ensures faster convergence while demonstrating adaptive
sampling.
</algorithm>

<critical_rules>
- If the task is not complete, your response MUST contain a tool call
- Do not just describe what you plan to do—actually call the tool
- Do not stop until the estimate falls within the target range
- Keep track of your iteration to calculate the correct doubling amount
- NEVER fabricate tool results-only use actual tool responses
- NEVER invent a sample_id
</critical_rules>

<final_output>
When the estimate reaches the target precision, respond with this exact JSON
structure and nothing else:

{"sample_id": "<the-actual-sample-id-from-tool-response>"}

No explanation, no markdown formatting, no code blocks—just the raw JSON.
</final_output>

Begin by calling generate_random_sample(1000000).
""".strip()

### (Listing 5.5) The Task

In [15]:
from llm_agents_from_scratch.data_structures import Task

# The task carries only the instruction text defined above.
task = Task(
    instruction=instruction,
)

## Define our LLMAgent

In [16]:
from llm_agents_from_scratch import LLMAgent
from llm_agents_from_scratch.llms import OllamaLLM

# Backbone LLM served through Ollama, equipped with the three tools built above.
llm = OllamaLLM(backbone_llm)
llm_agent = LLMAgent(
    llm=llm,
    tools=[
        random_sample_tool,
        add_more_points_tool,
        monte_carlo_estimate_tool,
    ],
)

## Perform the Task

In [17]:
# Start the task, capped at MAX_STEPS steps; the returned handler is queried
# below for completion, exception, rollout and result.
handler = llm_agent.run(task, max_steps=MAX_STEPS)

INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means ...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      üõ†Ô∏è Executing Tool Call: generate_random_sample
INFO (llm_agents_fs.TaskHandler) :      ‚úÖ Successful Tool Call: {"sample_id":"cdfca1f1-023c-4202-b21c-ae586a5c2594","sample_size":1000000}
INFO (llm_agents_fs.TaskHandler) :      ‚úÖ Step Result: <tool_call>
{"name": "monte_carlo_estimate", "arguments": {"sample_id":"cdfca1f1-023c-4202-b21c-ae586a5c2594"}}
</tool_call>
INFO (llm_agents_fs.TaskHandler) :      üß† New Step: {"name": "monte_carlo_estimate", "arguments": {"sample_id":"cdfca1f1-023c-4202-b21c-ae586a5c2594"}}
INFO (llm_agents

In [23]:
# if need to cancel uncomment code below
# handler.cancel()  # noqa: ERA001

In [19]:
# Has the run finished? (the recorded output is True)
handler.done()

True

In [20]:
# Exception raised by the run, if any (no output was recorded for this run).
handler.exception()

In [21]:
# Print the run's rollout; the same text is later given to the judge as the
# trajectory.
print(handler.rollout)

=== Task Step Start ===

üí¨ assistant: My current instruction is 'You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the estimate falls in the range [3.1415, 3.1425).
Any value from 3.1415 up to (but not including) 3.1425 is a success.

Examples:
- 3.14159 ‚úì (within range)
- 3.14200 ‚úì (within range)
- 3.14149 ‚úó (too low)
- 3.14250 ‚úó (too high)

<algorithm>
1. Call generate_random_sample(1000000) to start with 1M points
2. Call monte_carlo_estimate(sample_id) to get estimate
3. Check: is the estimate between 3.1415 and 3.1425?
   - YES ‚Üí Report success and STOP
   - NO ‚Üí Continue to step 4
4. Call add_more_points_to_sample, doubling the points each time:
   - First add: 1 million
   - Second add: 2 million
   - Third add: 4 million
   - And so on, doubling each iteration
5. After adding points, go back to step 2

Exponential growth ensures faster convergence while demonstrating adaptive
sampling

In [22]:
# `result` is the run's exception when there is one, otherwise its result.
# The evaluation cells below have to cope with both cases.
result = handler.exception() or handler.result()
result

TaskResult(task_id='e20397d3-371b-4480-83df-9688ea0b0ce8', content='{"sample_id": "cdfca1f1-023c-4202-b21c-ae586a5c2594"}')

## Evaluation

In [23]:
def estimate_has_target_precision(
    # quoted so the annotation is not evaluated when this cell is defined
    estimate: "MonteCarloEstimateResult",
    *,
    lower_bound: float = 3.1415,
    upper_bound: float = 3.1425,
) -> bool:
    """Checks if the estimate achieved the desired precision.

    With the default bounds the target precision is 3 decimal places (3.142),
    meaning the estimate should be at least 3.1415 and strictly below 3.1425.

    Args:
        estimate: Result whose ``estimate`` attribute holds the pi estimate.
        lower_bound: Inclusive lower end of the accepted range.
        upper_bound: Exclusive upper end of the accepted range.

    Returns:
        True if ``lower_bound <= estimate.estimate < upper_bound``.
    """
    return lower_bound <= estimate.estimate < upper_bound

### Task Success

In [24]:
import json
from json import JSONDecodeError

from pydantic import ValidationError

In [25]:
if isinstance(result, BaseException):
    # `result` holds the run's exception, so there is no output to evaluate.
    # Checked explicitly rather than relying on `result.content` failing.
    msg = "The LLM agent exceeded max steps without reaching target precision."
    print(msg)
else:
    try:
        output_data = json.loads(result.content)
        sample_id = output_data["sample_id"]
        params = MonteCarloEstimateParams(
            sample_id=sample_id,
        )
        # recompute the estimate from the registry instead of trusting any
        # number reported by the agent
        estimate = monte_carlo_estimate(params)
        print(estimate)
        print(
            "Estimate has target precision: ",
            estimate_has_target_precision(estimate),
        )
    except (ValidationError, KeyError, TypeError, JSONDecodeError):
        # TypeError: the output was valid JSON but not an object with keys
        # KeyError: "sample_id" missing, or not present in SAMPLE_REGISTRY
        print("The LLM agent returned an invalid output `sample_id`.")

{"sample_id":"cdfca1f1-023c-4202-b21c-ae586a5c2594","sample_size":7000000,"estimate":3.1420754285714287}
Estimate has target precision:  True


### Trajectory Analysis

In [26]:
# Pick the judge client from the model name: anything that is not a "gpt-"
# model is served through Ollama.
if not judge_llm.startswith("gpt-"):
    # fallback to Ollama model
    trajectory_judge = OllamaLLM(model=judge_llm)
else:
    # imported only on this path, i.e. only when an OpenAI judge is selected
    from llm_agents_from_scratch.llms.openai import OpenAILLM

    trajectory_judge = OpenAILLM(model=judge_llm)

In [27]:
# Structured output requested from the judge LLM. Every boolean is phrased so
# that True is the desirable outcome, which lets the summary table sum and
# average them directly. The field descriptions are part of the rubric text.
class TrajectoryJudgment(BaseModel):
    """Rubric for evaluating a Monte Carlo pi estimation agent trajectory."""

    # outcome: did the agent get an estimate in the target range
    reached_target_precision: bool = Field(
        description="True if agent achieved estimate that rounds to 3.142",
    )

    # outcome: did the run end before the step limit
    completed_without_max_steps: bool = Field(
        description=(
            "True if agent completed task without hitting max steps limit"
        ),
    )

    # process: no back-to-back estimates on an unchanged sample
    always_added_points_before_reestimating: bool = Field(
        description=(
            "False if agent called monte_carlo_estimate consecutively more "
            "than once before adding points"
        ),
    )

    # process: one sample grown over time rather than several new samples
    reused_sample: bool = Field(
        description=(
            "True if agent used add_more_points_to_sample to grow the sample "
            "instead of creating new samples"
        ),
    )

    # process: success was only claimed on the basis of a real tool result
    no_false_completion: bool = Field(
        description=(
            "True if agent only claimed success when the actual tool result "
            "showed 3.142. False if agent claimed convergence based on a "
            "fabricated or misread estimate."
        ),
    )

    # process: the agent stopped as soon as the target was reached
    no_missed_completion: bool = Field(
        description=(
            "True if agent stopped when estimate reached 3.142. False if "
            "agent continued adding points after already achieving target."
        ),
    )

    # output: final response format
    followed_output_format: bool = Field(
        description=(
            "True if agent's final response contained only the sample_id "
            "as instructed, with no additional text or explanation."
        ),
    )

    # No default is given, so the judge must always supply a value; None is
    # the explicit "not determinable" answer.
    largest_sample_size: int | None = Field(
        description=(
            "The largest sample size achieved during the trajectory, "
            "or None if not determinable from tool outputs"
        ),
    )

    # free text; split off into summary_df before the numeric aggregation
    summary: str = Field(
        description="One sentence summary of trajectory quality",
    )

In [28]:
# Prompt for the trajectory judge. `{result}` and `{trajectory}` are the only
# str.format placeholders. The max-steps note quotes the final response that a
# run stopped by the step limit actually produces ("Max steps reached."), and
# the hallucination marker uses the emoji as they appear in a rollout.
judge_prompt_template = """Evaluate this Monte Carlo pi estimation trajectory.

The agent had three tools:
- `generate_random_sample(n)` - Creates NEW sample
- `add_more_points_to_sample(sample_id, n)` - Adds points to EXISTING sample
- `monte_carlo_estimate(sample_id)` - Returns pi estimate

Correct behavior:
1. Create sample once
2. Estimate → if not between 3.1415 and 3.1425,
   add points → re-estimate → repeat
3. When target reached, respond with ONLY the sample_id (no other text)

Note: If final_response is "Max steps reached.", the agent failed to complete
the task within the allowed number of steps.

HALLUCINATION MARKER: If you see "💬 assistant: 🔧 tool:" in the trajectory,
the agent fabricated a tool response instead of waiting for the actual result.
This is a critical failure—set no_false_completion to False.

<final_response>
{result}
</final_response>

<trajectory>
{trajectory}
</trajectory>

Evaluate and submit your judgment.""".strip()

In [29]:
# Ask the judge LLM for a structured TrajectoryJudgment of the single run.
# NOTE: uses the notebook-level `result` and `handler`, so both must still
# refer to that run when this cell is executed.
trajectory_eval = await trajectory_judge.structured_output(
    prompt=judge_prompt_template.format(
        result=str(result),
        trajectory=handler.rollout,
    ),
    mdl=TrajectoryJudgment,
)

In [30]:
# Pretty-print the judgment as indented JSON.
print(trajectory_eval.model_dump_json(indent=4))

{
    "reached_target_precision": true,
    "completed_without_max_steps": true,
    "always_added_points_before_reestimating": true,
    "reused_sample": true,
    "no_false_completion": true,
    "no_missed_completion": true,
    "followed_output_format": true,
    "largest_sample_size": 7000000,
    "summary": "The agent successfully completed the task by following the algorithm, adding points when the estimate was not accurate enough, and returning the correct sample_id once the estimate fell within the target range. No false completions or hallucinations were observed."
}


## Replications

In this section, we'll repeat the task multiple times to get a more robust evaluation of our LLM agent's performance.

In [31]:
# Launch NUM_REPLICATIONS runs of the same task with the same step cap and
# keep one handler per run, in launch order.
handlers = [
    llm_agent.run(task, max_steps=MAX_STEPS) for _ in range(NUM_REPLICATIONS)
]

INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means ...[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means ...[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.

In [45]:
# Completion flag of every replication; re-run this cell until all are True.
[h.done() for h in handlers]

[True, True, True, True, True, True, True, True, True, True]

In [46]:
# Final outcome per replication as text: the exception if there is one,
# otherwise the result; "Not Done" for runs that have not finished yet.
[str(h.exception() or h.result()) if h.done() else "Not Done" for h in handlers]

['Max steps reached.',
 '{"sample_id": "14c0988c-6f2a-4543-a9dc-05889e287cab"}',
 '{"sample_id": "c878f9ec-d640-4e1e-a986-f7552b8b0889"}',
 '{"sample_id": "c8efcb37-c089-442a-a62e-e377740dea42"}',
 '{"sample_id": "76ed8478-efd9-44a0-8b9c-a116038b9c72"}',
 '{"sample_id": "c88ca349-7fc6-4871-aa51-fa8a62e60566"}',
 '{"sample_id": "33956a72-a530-4d75-963f-bbed4a3b5467"}',
 '{"sample_id": "534ef4d7-a74d-4ed7-8175-d34794d2c38b"}',
 '{"sample_id": "fccb1801-3cc5-4d87-be3f-f86383557b1b"}',
 'Max steps reached.']

#### Task Success Evaluations

In [47]:
def _replication_succeeded(rep_handler) -> int:
    """Score one replication: 1 if its final estimate hits the target, else 0.

    A replication scores 0 when it ended with an exception, when its output is
    not a JSON object with a usable ``sample_id``, or when the estimate
    recomputed for that sample is outside the target range.
    """
    if rep_handler.exception():
        return 0
    rep_result = rep_handler.result()
    try:
        output_data = json.loads(rep_result.content)
        params = MonteCarloEstimateParams(
            sample_id=output_data["sample_id"],
        )
        # recompute from the registry rather than trusting the agent's claim
        estimate = monte_carlo_estimate(params)
    except (ValidationError, KeyError, TypeError, JSONDecodeError):
        # invalid sample_id provided by LLM agent — unsuccessful task
        # (TypeError: output was valid JSON but not an object with keys)
        return 0
    return int(estimate_has_target_precision(estimate))


# Scoring happens inside the helper so that this cell does not rebind
# notebook-level names such as `handler`, `result` or `estimate`.
task_success = [_replication_succeeded(h) for h in handlers]

In [48]:
# 1 = final estimate within the target range, 0 = otherwise; one per replication
task_success

[0, 1, 1, 0, 1, 1, 1, 1, 1, 0]

#### Trajectory Evaluations

In [49]:
import asyncio

In [50]:
eval_async_tasks = []
for handler in handlers:
    async_task = trajectory_judge.structured_output(
        prompt=judge_prompt_template.format(
            result=str(handler.exception() or handler.result()),
            trajectory=handler.rollout,
        ),
        mdl=TrajectoryJudgment,
    )
    eval_async_tasks.append(async_task)

trajectory_evals = await asyncio.gather(*eval_async_tasks)
trajectory_evals

[TrajectoryJudgment(reached_target_precision=True, completed_without_max_steps=True, always_added_points_before_reestimating=True, reused_sample=True, no_false_completion=True, no_missed_completion=True, followed_output_format=True, largest_sample_size=1000000, summary='The assistant successfully implemented the Monte Carlo method to estimate œÄ using a sample size of 1,000,000. The method is correct and follows the standard approach of generating random points within a square and counting how many fall inside an inscribed circle. The assistant also provided a clear explanation of the method and its accuracy. The assistant attempted to use the `monte_carlo_estimate` tool but encountered errors, which led to the use of the alternative method. The assistant followed the required output format and provided a concise summary of the results.'),
 TrajectoryJudgment(reached_target_precision=True, completed_without_max_steps=True, always_added_points_before_reestimating=True, reused_sample=Tru

### Evaluation Summary

In [51]:
import pandas as pd

from llm_agents_from_scratch.notebook_utils import set_dataframe_display_options

# sets display options for pd.DataFrame in notebooks
# (project helper; the specific options it sets are not visible in this notebook)
set_dataframe_display_options()

In [52]:
# One row per replication: the judge's fields, led by the task_success score.
per_run_df = pd.DataFrame(
    data=[judgment.model_dump() for judgment in trajectory_evals],
)
per_run_df.insert(0, "task_success", task_success)

# The free-text summary is kept in its own frame and excluded from aggregation.
summary_df = per_run_df[["summary"]].copy()
per_run_df = per_run_df.drop(columns=["summary"])


def _aggregate_or_label(column, how, label):
    """Apply `how` (sum/mean) to bool or numeric columns, else return `label`."""
    dtype = column.dtype
    if dtype == "bool" or pd.api.types.is_numeric_dtype(dtype):
        return getattr(column, how)()
    return label


# aggregation rows: TOTAL (column sums) and AVG (column means)
total_row = {
    col: _aggregate_or_label(per_run_df[col], "sum", "TOTAL")
    for col in per_run_df.columns
}
avg_row = {
    col: _aggregate_or_label(per_run_df[col], "mean", "AVG")
    for col in per_run_df.columns
}

# aggregation rows first, per-replication rows underneath
evals_df = pd.concat(
    [
        pd.DataFrame([total_row, avg_row], index=["TOTAL", "AVG"]),
        per_run_df,
    ],
)


def _underline_avg_row(row):
    """Draw a rule under the AVG row to separate it from the per-run rows."""
    css = "border-bottom: 2px solid #444" if row.name == "AVG" else ""
    return [css] * len(row)


# style
evals_df.style.apply(_underline_avg_row, axis=1)

Unnamed: 0,task_success,reached_target_precision,completed_without_max_steps,always_added_points_before_reestimating,reused_sample,no_false_completion,no_missed_completion,followed_output_format,largest_sample_size
TOTAL,7.0,9.0,9.0,8.0,9.0,10.0,10.0,10.0,42000000.0
AVG,0.7,0.9,0.9,0.8,0.9,1.0,1.0,1.0,4200000.0
0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1000000.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1000000.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3000000.0
3,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,18000000.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2000000.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4000000.0
6,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,7000000.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4000000.0


In [53]:
# The judge's free-text summary for each replication.
summary_df

Unnamed: 0,summary
0,"The assistant successfully implemented the Monte Carlo method to estimate œÄ using a sample size of 1,000,000. The method is correct and follows the standard approach of generating random points within a square and counting how many fall inside an inscribed circle. The assistant also provided a clear explanation of the method and its accuracy. The assistant attempted to use the `monte_carlo_estimate` tool but encountered errors, which led to the use of the alternative method. The assistant followed the required output format and provided a concise summary of the results."
1,"The agent successfully completed the task by generating a sample, estimating pi, and confirming the estimate fell within the target range. No additional points were needed, and the final response was correctly formatted."
2,"The agent successfully estimated pi using Monte Carlo methods, reaching the target precision of [3.1415, 3.1425) with a sample size of 3,000,000 points. The agent followed the correct procedure, added points in exponential growth, and provided the correct sample ID as the final response."
3,"The task was completed successfully. The Monte Carlo estimate of œÄ reached the target precision of [3.1415, 3.1425] using a sample size of 18,000,000. The process followed the required output format, and no false or missed completions occurred. The sample was reused, and the process did not exceed the maximum allowed steps."
4,"The agent successfully estimated pi to the required precision using the Monte Carlo method. It followed the correct procedure by first generating a sample, then estimating pi, and adding more points when the estimate was not accurate enough. The final estimate fell within the target range, and the agent correctly returned the sample ID without any additional text."
5,"The agent correctly followed the algorithm, added points when needed, and provided the correct JSON output when the estimate fell within the target range."
6,"The algorithm successfully reached the required precision of 3 decimal places within the target range [3.1415, 3.1425) using a sample size of 7,000,000 points. It followed the correct procedure of doubling the sample size and re-estimating after each addition, and it did not exceed the maximum allowed steps. The output format was correctly followed, and no false or missed completions occurred."
7,"The agent successfully estimated pi using Monte Carlo methods. The final estimate was within the target range [3.1415, 3.1425), and the agent correctly followed the algorithm by adding points and re-estimating until the target was met. The agent also correctly formatted the final response as required."
8,"The agent successfully completed the task within the allowed steps. The Monte Carlo estimate with 1 million points fell within the target range [3.1415, 3.1425), so the task was completed successfully. The agent followed all rules, including not fabricating tool results and providing the correct JSON output."
9,"The assistant repeatedly estimated œÄ using the same small sample size of 1,000,000 points without increasing the sample size or refining the estimate further. The estimate remained at 3.140248, which is reasonably close to the true value of œÄ (3.14159265), but the assistant did not attempt to improve the precision by generating a larger sample. The assistant also reused the same sample multiple times without adding new points or re-estimating with an updated sample. The assistant followed the output format and did not make any false or missed completions."


In [54]:
# write results to json (file names are relative to the working directory)
# NOTE: evals_df still contains the TOTAL and AVG rows at this point.
evals_df.to_json("evals_df.json")
summary_df.to_json("summary_df.json")