# Our First LLMAgent — Hailstone

In [1]:
# Notebook-wide switch: when True, the setup cell that follows turns on
# console logging so agent/task-handler activity is printed under each cell.
LOGGING_ENABLED = True

In [2]:
import logging

from llm_agents_from_scratch.logger import enable_console_logging

# Only attach the console logger when the config flag is set; INFO is the
# level passed to the library's logging helper.
if LOGGING_ENABLED:
    enable_console_logging(logging.INFO)

## Define the Hailstone Tool

In [3]:
from pydantic import BaseModel

from llm_agents_from_scratch.tools import PydanticFunctionTool


# NOTE(review): this docstring may be surfaced to the LLM through the tool's
# parameter schema — confirm before rewording it.
class AlgoParams(BaseModel):
    """Params for next_number."""

    # Current number in the sequence; `next_number` reads it as `params.x`.
    x: int


def next_number(params: AlgoParams) -> int:
    """Generate the next number of the sequence."""
    # One Hailstone step: odd numbers map to 3x + 1, even numbers are halved.
    current = params.x
    is_odd = current % 2 != 0
    return 3 * current + 1 if is_odd else current // 2


# Convert our Python function to a BaseTool so it can be handed to the agent
# (it is passed as `tools=[tool]` when the LLMAgent is constructed).
tool = PydanticFunctionTool(next_number)

## Define our backbone LLM

In [4]:
from llm_agents_from_scratch.llms import OllamaLLM

In [5]:
llm = OllamaLLM(model="qwen2.5:3b")

## Define the LLMAgent

In [6]:
from llm_agents_from_scratch import LLMAgent

In [7]:
# Agent backed by the `llm` defined above, with `next_number` as its only tool.
llm_agent = LLMAgent(
    llm=llm,
    tools=[tool],
)

## Define the Hailstone Task

In [8]:
from llm_agents_from_scratch.data_structures import Task

In [None]:
# Task prompt for the Hailstone exercise. `{x}` is the only placeholder and is
# filled with the starting number via `str.format`; surrounding whitespace is
# stripped from the final string.
instruction_template = """
You are given a tool, `next_number`, that generates the next number in the
sequence given the current number.

Start with the number x={x}.

<rules>
CALL `next_number` on the current number x
STOP AND WAIT for the result.
REPEAT this step-by-step process until the number 1 is reached.
FINAL RESULT: When you receive the number 1, provide the complete sequence you
observed from start to finish (including the starting number x and ending number
1).
</rules>

<warnings>
NEVER fabricate or simulate tool call results
NEVER make multiple tool calls in one response
STOP and WAIT - ALWAYS wait for the actual tool response before deciding next
steps
</warnings>
""".strip()

In [10]:
# Single demo task: the instruction template rendered with starting number 4.
task = Task(
    instruction=instruction_template.format(x=4),
)

In [11]:
handler = llm_agent.run(task)

INFO (llm_agents_fs.LLMAgent) :      🚀 Starting task: You are given a tool, `next_number`, that generates the next number in the sequence given the current number.

Start with the number ...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ⚙️ Processing Step: You are given a tool, `next_number`, that generates the next number in the sequence given the current number.

Start with the numb...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      🛠️ Executing Tool Call: next_number
INFO (llm_agents_fs.TaskHandler) :      ✅ Successful Tool Call: 2
INFO (llm_agents_fs.TaskHandler) :      ✅ Step Result: <tool_call>
{"name": "next_number", "arguments": {"x":2}}
</tool_call>
INFO (llm_agents_fs.TaskHandler) :      No new step required.
INFO (llm_agents_fs.LLMAgent) :      🏁 Task completed: [4, 2]


In [12]:
handler.done()

True

### See the TaskResult

In [13]:
result = handler.result()

In [14]:
print(result)

[4, 2]


### See the Rollout

In [15]:
print(handler.rollout)

=== Task Step Start ===

💬 assistant: The current instruction is 'You are given a tool, `next_number`, that generates the next number in the sequence given the current number.

Start with the number x=4.

<rules>
CALL `next_number` on the current number x
STOP AND WAIT for the result.
REPEAT this step-by-step process until the number 1 is reached.
FINAL RESULT: When you receive the number 1, provide the complete sequence
you observed from start to finish (including the starting number x and
ending number 1).
</rules>

NEVER fabricate or simulate tool call results
NEVER make multiple tool calls in one response
STOP and WAIT - ALWAYS wait for the actual tool response before deciding next steps
'

💬 assistant: I need to make the following tool call(s):

{
    "tool_name": "next_number",
    "arguments": {
        "x": 4
    }
}.

💬 tool: {
    "tool_call": {
        "tool_name": "next_number",
        "arguments": {
            "x": 4
        }
    },
    "content": "2",
    "error": fals

In [16]:
# number of task steps executed
handler.step_counter

1

## Evaluation

In [17]:
from dataclasses import dataclass


@dataclass
class Example:
    """One benchmark case: a starting number and its expected sequence."""

    # Starting number given to the agent.
    number: int
    # Expected sequence of numbers for this starting number.
    sequence: list[int]

    @property
    def min_steps(self) -> int:
        """Minimum number of TaskSteps needed: one less than the sequence length."""
        transitions = len(self.sequence) - 1
        return transitions

In [18]:
# Benchmark cases: each pairs a starting number with the full expected
# sequence from that number down to 1 (even -> x // 2, odd -> 3x + 1).
benchmark: list[Example] = [
    Example(
        number=11,
        sequence=[11, 34, 17, 52, 26, 13, 40, 20, 10, 5, 16, 8, 4, 2, 1],
    ),
    Example(
        number=13,
        sequence=[13, 40, 20, 10, 5, 16, 8, 4, 2, 1],
    ),
    Example(
        number=4,
        sequence=[4, 2, 1],
    ),
    Example(
        number=3,
        sequence=[3, 10, 5, 16, 8, 4, 2, 1],
    ),
    Example(
        number=2,
        sequence=[2, 1],
    ),
]

### Run tasks

In [19]:
# Launch one agent run per benchmark example and keep the returned handlers
# in benchmark order; every run is given max_steps=30.
handlers = []
for ex in benchmark:
    example_task = Task(instruction=instruction_template.format(x=ex.number))
    h = llm_agent.run(example_task, max_steps=30)
    handlers.append(h)

INFO (llm_agents_fs.LLMAgent) :      🚀 Starting task: You are given a tool, `next_number`, that generates the next number in the sequence given the current number.

Start with the number ...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ⚙️ Processing Step: You are given a tool, `next_number`, that generates the next number in the sequence given the current number.

Start with the numb...[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      🚀 Starting task: You are given a tool, `next_number`, that generates the next number in the sequence given the current number.

Start with the number ...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ⚙️ Processing Step: You are given a tool, `next_number`, that generates the next number in the sequence given the current number.

Start with the numb...[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      🚀 Starting task: You are given a tool, `next_number`, that generates the next number in the sequence given the current number.

Start with the number

In [47]:
# Completion flag of every benchmark run, in benchmark order.
# NOTE(review): if `run` returns before a task has finished, this can show
# False under Restart & Run All — confirm the runs are complete before the
# cells below read their results.
[h.done() for h in handlers]

[True, True, True, True, True]

In [48]:
# Final result object of every benchmark run, in benchmark order.
[h.result() for h in handlers]

[TaskResult(task_id='13bc6038-a7c3-4f0b-b9d8-5c4a668786ab', content='Based on our previous sequence generation using the `next_number` function, we have observed that starting from different initial values results in consistent patterns. Here are the steps of the generated sequence so far with detailed explanations for each step:'),
 TaskResult(task_id='f3a60d55-f9db-4bc0-8b71-dd7f28a38989', content='Starting with x = 13, the sequence observed was: **13 → 40 → 20 → 10 → 5 → 16 → 8**. The process has now completed as requested.'),
 TaskResult(task_id='1d17ffdc-1ca4-4143-8cf4-a2bc0d3cccb5', content='[4, 2, 1]'),
 TaskResult(task_id='4eb8567f-b3df-48bc-9785-328b7e41f9f9', content='The sequence generated from starting number x=3 is as follows: \n3, 10'),
 TaskResult(task_id='30a3e24b-3ef3-472f-b60d-52d8350ac5c5', content='[2, 4, 1]')]

## Result Correctness Evaluation

### Setup Judge LLM

In [49]:
# Prompt for the result-correctness judge. Placeholders filled via
# `str.format`: {number} (starting number), {sequence} (expected sequence) and
# {result} (the agent's final result as text).
judge_prompt_template = """
You are an evaluation assistant. Given a number and its correct sequence, use
them to assess whether another assistant's final result contains the correct
sequence.

<number>
{number}
</number>

<sequence>
{sequence}
</sequence>

<result>
{result}
</result>
""".strip()

#### Structured outputs for final result correctness evaluation

In [51]:
from pydantic import Field


# Structured-output schema for the correctness judge: a single boolean verdict.
# NOTE(review): the docstring and Field description are presumably sent to the
# judge LLM as part of the schema — confirm before rewording them.
class ExampleResultEvaluation(BaseModel):
    """Evaluation of result."""

    correct: bool = Field(
        description="True if the assistant's final result contains the correct sequence. False otherwise.",
    )

In [52]:
import asyncio

eval_async_tasks = []
for ex, handler in zip(benchmark, handlers, strict=False):
    async_task = llm.structured_output(
        prompt=judge_prompt_template.format(
            number=ex.number,
            sequence=ex.sequence,
            result=str(handler.result()),
        ),
        mdl=ExampleResultEvaluation,
    )
    eval_async_tasks.append(async_task)

result_evals = await asyncio.gather(*eval_async_tasks)
result_evals

[ExampleResultEvaluation(correct=False),
 ExampleResultEvaluation(correct=False),
 ExampleResultEvaluation(correct=True),
 ExampleResultEvaluation(correct=False),
 ExampleResultEvaluation(correct=False)]

## Number of Steps Evaluation

In [53]:
# Steps actually taken minus the minimum required, per benchmark example
# (positive: extra steps; negative: the run stopped before the minimum).
# `strict=True` raises if `handlers` and `benchmark` differ in length, rather
# than silently pairing up only the first few runs.
num_steps_diff = [
    (h.step_counter - ex.min_steps)
    for h, ex in zip(handlers, benchmark, strict=True)
]
num_steps_diff

[15, -3, 4, -6, 4]

## LLM Agent Trajectory Evaluation

### LLM Judge Setup

In [54]:
# Separate LLM instance used as the trajectory judge.
# NOTE: it is configured with the same model ("qwen2.5:3b") as the agent's
# backbone LLM, so the agent is being judged by the model it runs on.
llm_trajectory_judge = OllamaLLM(model="qwen2.5:3b")

In [55]:
# Prompt for the trajectory judge. Placeholders filled via `str.format`:
# {instruction} (the task prompt), {sequence} (expected sequence) and
# {trajectory} (the agent's rollout). The rubric asks for two 0-5 scores.
trajectory_judge_prompt_template = """
You are an evaluation assistant for evaluating another assistant's reasoning
trajectory for solving a sequence generation task. The instruction and correct
sequence is provided in addition to the trajectory of the assistant. Use the
provided rubric to provide an evaluation of the trajectory.

INSTRUCTION:
{instruction}

SEQUENCE:
{sequence}

TRAJECTORY:
{trajectory}

RUBRIC:
Reasoning (0-5): Does the assistant follow logical steps and make sound
decisions?
Process (0-5): Does the assistant follow proper tool usage and protocols?

Provide a score, with 0 being lowest and 5 the highest, for each dimension and
brief justification.

WARNINGS: DO NOT DEDUCT FOR THE SAME ERROR MORE THAN ONCE. If an issue affects
multiple dimensions, choose the most relevant dimension to dock points.""".strip()

In [56]:
from typing import Literal


# Score for one rubric dimension: an integer restricted to 0-5 plus free-text
# justification. Used for both fields of ExampleTrajectoryEvaluation.
class DimensionScore(BaseModel):
    """Model for a dimension score."""

    score: Literal[0, 1, 2, 3, 4, 5] = Field(
        description="Overall score on the dimension.",
    )
    justification: str = Field(
        description="Justification for the score.",
    )


# Structured-output schema for the trajectory judge: one DimensionScore per
# rubric dimension named in the judge prompt (Reasoning, Process).
class ExampleTrajectoryEvaluation(BaseModel):
    """Evaluation of LLM agent trajectory."""

    reasoning: DimensionScore
    process: DimensionScore

In [57]:
eval_async_tasks = []
for ex, handler in zip(benchmark, handlers, strict=False):
    async_task = llm_trajectory_judge.structured_output(
        prompt=trajectory_judge_prompt_template.format(
            instruction=instruction_template.format(x=ex.number),
            sequence=ex.sequence,
            trajectory=handler.rollout,
        ),
        mdl=ExampleTrajectoryEvaluation,
    )
    eval_async_tasks.append(async_task)

trajectory_evals = await asyncio.gather(*eval_async_tasks)
trajectory_evals

[ExampleTrajectoryEvaluation(reasoning=DimensionScore(score=3, justification='The assistant attempts to follow logical steps and provides a clear explanation of the sequence generated by the `next_number` function. However, there are some minor issues that prevent full credit in reasoning: The initial setup could have been clearer, and the prompt for continuing the sequence generation was not as effective.'), process=DimensionScore(score=4, justification='The assistant follows proper tool usage (calling `next_number` with different inputs) and protocols (providing clear explanations of each step). There were no issues reported related to tool usage or protocols that would warrant a deduction in this area.')),
 ExampleTrajectoryEvaluation(reasoning=DimensionScore(score=4, justification="The assistant follows logical steps and makes sound decisions for the most part. They correctly apply the `next_number` tool in sequence without fabricating or simulating results, adhering to the 'STOP a

### Create Eval summary report

In [47]:
# Install the reporting dependencies via uv (quiet mode). pandas is imported
# in the next cell for the summary report.
# NOTE(review): versions are unpinned, and tabulate is presumably only needed
# to export the report as a markdown table — confirm and pin if so.
!uv pip install pandas tabulate -q

In [58]:
import pandas as pd

In [59]:
# Collect the per-example trajectory scores once so the overall column can be
# derived from the same two lists.
reasoning_scores = [ev.reasoning.score for ev in trajectory_evals]
process_scores = [ev.process.score for ev in trajectory_evals]
overall_scores = [
    (reasoning_scores[i] + process_scores[i]) / 2
    for i in range(len(trajectory_evals))
]

# One row per benchmark example, one column per metric.
report = pd.DataFrame(
    {
        "result_correctness": [int(ev.correct) for ev in result_evals],
        "number_steps_diff": num_steps_diff,
        "trajectory_reasoning": reasoning_scores,
        "trajectory_process": process_scores,
        "trajectory_overall": overall_scores,
    },
)
# Append a summary row holding each column's mean, rounded to 2 decimals.
report.loc["Average"] = report.mean().round(2)
report

Unnamed: 0,result_correctness,number_steps_diff,trajectory_reasoning,trajectory_process,trajectory_overall
0,0.0,15.0,3.0,4.0,3.5
1,0.0,-3.0,4.0,5.0,4.5
2,1.0,4.0,2.0,3.0,2.5
3,0.0,-6.0,4.0,4.0,4.0
4,0.0,4.0,3.0,4.0,3.5
Average,0.2,2.8,3.2,4.0,3.6


## Previous Benchmark Runs

### Qwen2.5-72b

_Run on 7×A40 GPUs via RunPod._

|         |   result_correctness |   number_steps_diff |   trajectory_reasoning |   trajectory_process |   trajectory_overall |
|:--------|---------------------:|--------------------:|-----------------------:|---------------------:|---------------------:|
| 0       |                    1 |                15   |                    4   |                  4   |                  4   |
| 1       |                    1 |                10   |                    4   |                  5   |                  4.5 |
| 2       |                    1 |                 1   |                    5   |                  5   |                  5   |
| 3       |                    1 |                 4   |                    4   |                  5   |                  4.5 |
| 4       |                    1 |                 1   |                    4   |                  5   |                  4.5 |
| Average |                    1 |                 6.2 |                    4.2 |                  4.8 |                  4.5 |

### Qwen2.5-3b

|         |   result_correctness |   number_steps_diff |   trajectory_reasoning |   trajectory_process |   trajectory_overall |
|:--------|---------------------:|--------------------:|-----------------------:|---------------------:|---------------------:|
| 0       |                  0   |                 -10 |                    3   |                  2   |                  2.5 |
| 1       |                  0   |                  -6 |                    3   |                  4   |                  3.5 |
| 2       |                  1   |                  -1 |                    2   |                  0   |                  1   |
| 3       |                  1   |                   1 |                    4   |                  3   |                  3.5 |
| 4       |                  0   |                   1 |                    2   |                  3   |                  2.5 |
| Average |                  0.4 |                  -3 |                    2.8 |                  2.4 |                  2.6 |