In [1]:
import os
import re
import sys

# Ensure the current working directory is on sys.path before attempting the
# project imports below.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

try:
    from agentic_planning_eval.green_agent import tools_backend, val_wrapper
except ImportError:
    # Handle case where directory structure is nested differently:
    # add ./agentic-planning-eval to sys.path and import green_agent as a
    # top-level package instead.
    sys.path.append(os.path.join(os.getcwd(), 'agentic-planning-eval'))
    from green_agent import tools_backend, val_wrapper

# Domain name and problem index used by the tool calls in the cells below.
DOMAIN = "blocks"
INDEX = 1

def header(title):
    """Print ``title`` framed above and below by an 80-character ``=`` rule.

    A blank line is emitted before the top rule and the title is indented by
    one space. Returns ``None``.
    """
    rule = "=" * 80
    # Three printed lines joined by newlines; the leading "\n" yields the
    # blank line that separates the banner from any preceding output.
    print("\n" + rule, f" {title}", rule, sep="\n")

def run_tool(func, *args, **kwargs):
    """Call a tool function, print its result, and return it.

    Any exception raised while calling the tool or printing its result is
    caught and reported on stdout, so a single failing call does not abort
    the rest of the notebook.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returned, or ``None`` if an exception was caught.
    """
    try:
        res = func(*args, **kwargs)
        print(res)
        return res
    except Exception as e:
        # Deliberate best-effort: show the failure inline and signal it to the
        # caller with None instead of propagating.
        print(f"❌ ERROR: {e}")
        return None

# Ask val_wrapper to guess the validator binary location; presumably None
# means "no explicit path supplied" — TODO confirm against val_wrapper.
# NOTE(review): binary_path is not referenced again in the visible cells —
# confirm it is still needed.
binary_path = val_wrapper.guess_val_binary(None)

## Stateless Tools

In [2]:
# 1. Overview
# Fetch the natural-language task overview for (DOMAIN, INDEX); run_tool
# prints it and the text is kept in `overview`.
overview = run_tool(tools_backend.get_task_overview_nl, DOMAIN, INDEX)

Task description:
Rearrange blocks to satisfy the goal stacking configuration.

Initial situation:
A, B, C are each on the table and clear; the hand is empty.

Goal:
A is on B and B is on C (tower top->bottom: A, B, C).


In [3]:
# 2. List objects
# The printed listing is kept in `objects_text`; the next cell parses an
# object name out of it.
objects_text = run_tool(tools_backend.list_objects_nl, DOMAIN, INDEX)

- A (type: block) — Block A.
- B (type: block) — Block B.
- C (type: block) — Block C.


In [4]:
# 3. Describe specific object
# Take the first object name from the bullet list produced in step 2 (lines
# shaped like "- A (type: block) — ..."). run_tool returns None when the
# listing call failed, so search an empty string in that case and fall back
# to "A" instead of raising TypeError.
match = re.search(r"- ([\w-]+) \(type:", objects_text or "")
TARGET_BLOCK = match.group(1) if match else "A"
print(f"Target Block Selected: {TARGET_BLOCK}")

run_tool(tools_backend.describe_object_nl, DOMAIN, INDEX, TARGET_BLOCK)

Target Block Selected: A
Name: A
Type: block
Summary: Block A.


'Name: A\nType: block\nSummary: Block A.'

In [5]:
# 4. List actions
# Lists the action types for DOMAIN only (no problem index is passed).
# NOTE(review): `actions_text` is not used again in the visible cells.
actions_text = run_tool(tools_backend.list_action_types_nl, DOMAIN)

Action: pick-up
Parameters (in order): X
Allowed when:
- The hand is empty.
- Block X is on the table.
- Block X is clear (nothing on top of it).
Effects:
- The hand is holding block X.
- Block X is no longer on the table.
- Block X is no longer clear (because it is being held).
- The hand is no longer empty.

Action: put-down
Parameters (in order): X
Allowed when:
- The hand is holding block X.
Effects:
- The hand is empty.
- The hand is no longer holding block X.
- Block X is on the table.
- Block X is clear.

Action: stack
Parameters (in order): X, Y
Allowed when:
- The hand is holding block X.
- Block Y is clear (nothing on top of it).
Effects:
- The hand is empty.
- The hand is no longer holding block X.
- Block X is on top of block Y.
- Block X is clear.
- Block Y is no longer clear.

Action: unstack
Parameters (in order): X, Y
Allowed when:
- The hand is empty.
- Block X is on top of block Y.
- Block X is clear (nothing on top of it).
Effects:
- The hand is holding block X.
- Th

In [6]:
# 5. Get specific action details
# The call is the cell's last expression and is not assigned, so the notebook
# shows the text twice: once printed by run_tool and once as the cell's repr.
run_tool(tools_backend.get_action_type_nl, DOMAIN, "pick-up")

Action: pick-up
Parameters (in order): X
Allowed when:
- The hand is empty.
- Block X is on the table.
- Block X is clear (nothing on top of it).
Effects:
- The hand is holding block X.
- Block X is no longer on the table.
- Block X is no longer clear (because it is being held).
- The hand is no longer empty.


'Action: pick-up\nParameters (in order): X\nAllowed when:\n- The hand is empty.\n- Block X is on the table.\n- Block X is clear (nothing on top of it).\nEffects:\n- The hand is holding block X.\n- Block X is no longer on the table.\n- Block X is no longer clear (because it is being held).\n- The hand is no longer empty.'

## Stateless Plan Submission

In [7]:
# 6. Submit full plan text (goal not fulfilled case)
# A one-step plan: picking up a single block cannot build the goal tower.
# NOTE(review): the recorded output shows "Unsatisfied conditions count: 0"
# together with "Failure category: goal_not_satisfied" — confirm with the
# backend whether a zero count is expected here.
plan_text = f"pick-up {TARGET_BLOCK}"
run_tool(tools_backend.submit_plan_nl, DOMAIN, INDEX, plan_text)

Accepted: NO
Plan length: 1
Failure category: goal_not_satisfied
Unsatisfied conditions count: 0


'Accepted: NO\nPlan length: 1\nFailure category: goal_not_satisfied\nUnsatisfied conditions count: 0'

In [8]:
# 7. Submit full plan text (preconditions not satisfied case)
# The same pick-up is issued twice, one action per line; the second one is
# rejected at step 2 in the recorded output.
# NOTE(review): the recorded "Details" line lists three failed conditions
# while "Unsatisfied conditions count" reports 2 — confirm with the backend.
plan_text = f"pick-up {TARGET_BLOCK}\npick-up {TARGET_BLOCK}"
run_tool(tools_backend.submit_plan_nl, DOMAIN, INDEX, plan_text)

Accepted: NO
Plan length: 2
Failure category: precondition_unsatisfied
First failing step: 2
First failed action: (pick-up a)
Details: Unsatisfied preconditions at step 2 for (pick-up a): clear a = true (but was false); on-table a = true (but was false); arm-empty = true (but was false).
Unsatisfied conditions count: 2


'Accepted: NO\nPlan length: 2\nFailure category: precondition_unsatisfied\nFirst failing step: 2\nFirst failed action: (pick-up a)\nDetails: Unsatisfied preconditions at step 2 for (pick-up a): clear a = true (but was false); on-table a = true (but was false); arm-empty = true (but was false).\nUnsatisfied conditions count: 2'

In [9]:
# 8. Submit full plan text (correct plan!)
# Builds the tower bottom-up: B onto C first, then A onto B. Block names are
# hard-coded here rather than derived from TARGET_BLOCK.
plan_text = "pick-up B\nstack B C\npick-up A\nstack A B"
run_tool(tools_backend.submit_plan_nl, DOMAIN, INDEX, plan_text)

Accepted: YES
Plan length: 4
Plan cost/value: 4.0


'Accepted: YES\nPlan length: 4\nPlan cost/value: 4.0'

## Stateful Execution

In [10]:
# 9. Reset episode
# Starts the stateful part of the demo; the recorded output reports the step
# counter back at 0 for (DOMAIN, INDEX).
run_tool(tools_backend.reset_episode_nl, DOMAIN, INDEX)

State reset for domain 'blocks', problem 1. Step counter = 0.


"State reset for domain 'blocks', problem 1. Step counter = 0."

In [11]:
# 10. Check initial state
# get_state_nl is called with no arguments here; it reports on the episode
# started by the reset in the previous cell.
run_tool(tools_backend.get_state_nl)

Episode: domain=blocks, problem=1, steps=0
Facts:
- arm-empty is true
- a is clear
- b is clear
- c is clear
- a is on-table
- b is on-table
- c is on-table


'Episode: domain=blocks, problem=1, steps=0\nFacts:\n- arm-empty is true\n- a is clear\n- b is clear\n- c is clear\n- a is on-table\n- b is on-table\n- c is on-table'

In [12]:
# 11. Execute action (valid)
# Apply one action to the episode, then print the resulting state.
# `action_cmd` is reused by the next cell to trigger a precondition failure.
action_cmd = f"pick-up {TARGET_BLOCK}"
run_tool(tools_backend.act_nl, action_cmd)
run_tool(tools_backend.get_state_nl)

Executed: YES
Step counter now: 1
Episode: domain=blocks, problem=1, steps=1
Facts:
- b is clear
- c is clear
- a is holding
- b is on-table
- c is on-table


'Episode: domain=blocks, problem=1, steps=1\nFacts:\n- b is clear\n- c is clear\n- a is holding\n- b is on-table\n- c is on-table'

In [13]:
# 12. Execute action (invalid: preconditions fail)
# Repeats `action_cmd` from the previous cell, so this cell depends on that
# cell having run first.
# NOTE(review): `res` is not used again in the visible cells.
res = run_tool(tools_backend.act_nl, action_cmd)

Executed: NO
At step: 2
Detail: Block a should be clear (nothing on top).


In [14]:
# 13. Get history
# In the recorded output only the successful pick-up is listed; the rejected
# action from step 12 does not appear.
run_tool(tools_backend.get_history_nl)

1) pick-up A


'1) pick-up A'

In [15]:
# 14. Submit current episode (check goal)
# Submits the actions executed so far in the stateful episode; no arguments
# are passed, unlike the stateless submit_plan_nl calls above.
run_tool(tools_backend.submit_episode_nl)

Accepted: NO
Plan length: 1
Failure category: goal_not_satisfied
Unsatisfied conditions count: 0


'Accepted: NO\nPlan length: 1\nFailure category: goal_not_satisfied\nUnsatisfied conditions count: 0'

In [16]:
# 15. Undo
# Revert the episode and confirm via the history that no actions remain.
# The argument 0 appears to be the step to revert to (the recorded output
# says "Reverted to step 0.") — TODO confirm against undo_nl.
run_tool(tools_backend.undo_nl, 0)
run_tool(tools_backend.get_history_nl)

Reverted to step 0.
No actions executed yet.


'No actions executed yet.'