In [1]:
import sys
import time
from pathlib import Path

from PIL import Image
from smolagents import GradioUI, ActionStep, VLLMModel, LogLevel

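# Project-local modules: `agent` and `sandbox` must already be importable from the kernel's working directory (this cell does not modify sys.path)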
from agent import SandboxCodeAgent
from sandbox.configs import SandboxVMConfig

# ───────────────────────────── Agent Configuration ─────────────────────────────
# MODEL
# Checkpoint served through vLLM; change MODEL_ID to try a different one.
# NOTE(review): the agent attaches screenshots as observation images, so this is
# presumably expected to be a vision-capable checkpoint — confirm before swapping.
MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

# Alternative backends kept for reference. They are NOT runnable as-is: this cell
# imports neither `os` nor `InferenceClientModel` / `TransformersModel`.
#
# model_id = "meta-llama/Llama-3.3-70B-Instruct"
# model = InferenceClientModel(
#     token=os.getenv("HF_TOKEN"),  # model_id=model_id,
# )  # You can choose to not pass any model_id to InferenceClientModel to use a default model
#
# model = TransformersModel(model_id="Qwen/Qwen2.5-Coder-7B-Instruct", max_new_tokens=4096, device_map="auto")
#
# model = VLLMModel(model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct")
model = VLLMModel(model_id=MODEL_ID)


# ───────────────────────────── Helpers & Utils ─────────────────────────
def take_initial_screenshot(agent: SandboxCodeAgent, label: str = "initial"):
    """Capture the sandbox screen once and save a local copy as ``{label}.png``.

    The sandbox client reports where it stored the capture under the
    ``"screenshot_path"`` key of its result. That image is opened, printed, and
    written to ``{label}.png`` relative to the current working directory, so the
    success message reflects what actually happened. This is best effort: any
    failure is reported with a printed warning instead of being raised.

    Args:
        agent: Agent whose ``python_executor.vm.sandbox_client`` takes the screenshot.
        label: File stem used for the saved copy.

    Returns:
        None.
    """
    client = agent.python_executor.vm.sandbox_client
    result = client.take_screenshot()
    if "screenshot_path" not in result:
        # Previously this case was silent, which made a missing capture hard to notice.
        print("⚠️ Sandbox client returned no screenshot path; nothing was saved")
        return

    output_path = f"{label}.png"
    try:
        # The context manager releases the underlying file handle once we are done.
        with Image.open(result["screenshot_path"]) as image:
            print(image)
            image.save(output_path)
    except Exception as e:
        print(f"⚠️ Failed to save initial screenshot: {e}")
        return
    print(f"📸 Saved initial screenshot: {label}.png")


def save_screenshot_callback(memory_step: ActionStep, agent: SandboxCodeAgent):
    """Step callback: attach a fresh sandbox screenshot and context to ``memory_step``.

    What it does, in order:
      1. Sleeps 3 seconds so pending UI updates can settle before the capture.
      2. Drops ``observations_images`` from every ``ActionStep`` in
         ``agent.memory.steps`` that is at least two steps older than the current
         one, so only the most recent screenshots stay in memory.
      3. Takes a screenshot through ``agent.python_executor.vm.sandbox_client`` and
         stores a detached copy in ``memory_step.observations_images``.
      4. Appends mouse position, screen resolution and active window to
         ``memory_step.observations``. Existing observation text is preserved
         rather than overwritten.

    Args:
        memory_step: The step that just finished; mutated in place.
        agent: The running agent (provides memory and the sandbox client).

    Returns:
        None. Failures are reported via printed warnings / appended observation
        text instead of being raised.
    """
    # Wait for any animations or UI updates to complete
    time.sleep(3.0)

    # Clean up previous screenshots to save memory
    current_step = memory_step.step_number
    for previous_memory_step in agent.memory.steps:
        if (
            isinstance(previous_memory_step, ActionStep)
            and previous_memory_step.observations_images is not None
            and previous_memory_step.step_number <= current_step - 2
        ):
            previous_memory_step.observations_images = None

    # Take the screenshot using the sandbox client
    client = agent.python_executor.vm.sandbox_client
    result = client.take_screenshot()
    if "screenshot_path" not in result:
        print("⚠️ Sandbox client returned no screenshot path; step has no screenshot")
        return

    try:
        # Copy the pixels, then let the context manager close the file handle.
        with Image.open(result["screenshot_path"]) as image:
            screenshot = image.copy()
    except Exception as e:
        _append_observation(memory_step, f"⚠️ Failed to load screenshot: {e}")
        return

    memory_step.observations_images = [screenshot]
    width, height = screenshot.size

    # Optional VM metadata: a failure here must not be reported as a screenshot failure.
    vm_state = {}
    if hasattr(client, "get_vm_state"):
        try:
            vm_state = client.get_vm_state() or {}
        except Exception as e:
            print(f"⚠️ Could not read VM state: {e}")
    active_window = vm_state.get("active_window", "Unknown")

    observations = [
        f"🖼️ Screenshot captured at step {current_step}",
        # .get(): a result without this key should not discard the screenshot info.
        f"Mouse position: {result.get('mouse_position', 'Unknown')}",
        f"Screen resolution: {width}x{height} pixels",
        f"Active window: {active_window}",
    ]
    _append_observation(memory_step, "\n".join(observations))
    print(f"Captured a VM screenshot: {width}x{height} pixels")


def _append_observation(memory_step, text: str) -> None:
    """Add ``text`` to ``memory_step.observations`` without discarding existing text."""
    existing = memory_step.observations
    memory_step.observations = text if not existing else f"{existing}\n{text}"


# Sandbox VM settings. The server directory is a relative path, so it is resolved
# against the kernel's current working directory.
sandbox_server_dir = Path("sandbox/server/")
config = SandboxVMConfig(host_server_dir=sandbox_server_dir)

# Options forwarded to the sandbox executor.
# NOTE(review): `preserve_on_exit=False` presumably tears the VM down when the
# executor exits — confirm against the executor implementation.
sandbox_executor_kwargs = {"config": config, "preserve_on_exit": False}

# The agent starts with no tools, may import pyautogui inside the sandbox, and runs
# save_screenshot_callback as a step callback.
agent = SandboxCodeAgent(
    tools=[],
    model=model,
    executor_type="sandbox",
    executor_kwargs=sandbox_executor_kwargs,
    additional_authorized_imports=["pyautogui"],
    step_callbacks=[save_screenshot_callback],
    # verbosity_level=LogLevel.INFO,
)

# Capture the screen once before any agent step has run.
take_initial_screenshot(agent, label="before_run")

# Serve the agent through the Gradio chat UI.
ui = GradioUI(agent)
ui.launch()


INFO 05-06 10:57:37 [__init__.py:239] Automatically detected platform cuda.
INFO 05-06 10:57:44 [config.py:717] This model supports multiple tasks: {'reward', 'score', 'embed', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 05-06 10:57:44 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-06 10:57:45 [core.py:58] Initializing a V1 LLM engine (v0.8.5) with config: model='HuggingFaceTB/SmolVLM2-2.2B-Instruct', speculative_config=None, tokenizer='HuggingFaceTB/SmolVLM2-2.2B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_b

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 05-06 10:57:51 [loader.py:458] Loading weights took 3.26 seconds
INFO 05-06 10:57:52 [gpu_model_runner.py:1347] Model loading took 4.1963 GiB and 4.055252 seconds
INFO 05-06 10:57:52 [gpu_model_runner.py:1620] Encoder cache will be initialized with a budget of 8192 tokens, and profiled with 6 image items of the maximum feature size.
INFO 05-06 10:57:59 [backends.py:420] Using cache directory: /home/smwhering/.cache/vllm/torch_compile_cache/3b61b7abf2/rank_0_0 for vLLM's torch.compile
INFO 05-06 10:57:59 [backends.py:430] Dynamo bytecode transform time: 4.30 s
INFO 05-06 10:58:00 [backends.py:118] Directly load the compiled graph(s) for shape None from the cache, took 1.643 s
INFO 05-06 10:58:01 [monitor.py:33] torch.compile takes 4.30 s in total
INFO 05-06 10:58:02 [kv_cache_utils.py:634] GPU KV cache size: 34,800 tokens
INFO 05-06 10:58:02 [kv_cache_utils.py:637] Maximum concurrency for 8,192 tokens per request: 4.25x
INFO 05-06 10:58:22 [gpu_model_runner.py:1686] Graph capturing

[2025-05-06 10:59:13] INFO transport.py:1944: Connected (version 2.0, client OpenSSH_9.6p1)
[2025-05-06 10:59:13] INFO transport.py:1944: Authentication (publickey) failed.
[2025-05-06 10:59:13] INFO transport.py:1944: Authentication (password) successful!


[2025-05-06 10:59:28] INFO transport.py:1944: Connected (version 2.0, client OpenSSH_9.6p1)
[2025-05-06 10:59:28] INFO transport.py:1944: Authentication (publickey) failed.
[2025-05-06 10:59:28] INFO transport.py:1944: Authentication (password) successful!


[2025-05-06 10:59:29] INFO sftp.py:169: [chan 4] Opened sftp connection (server version 3)


HTTPError: 403 Client Error: Forbidden for url: http://localhost:8888/api/kernels

In [None]:
# Smoke test: ask the agent to execute a fixed pyautogui snippet in the sandbox and
# report whether the step screenshot shows the expected result.
# The Markdown fence is closed right after the code so that the natural-language
# instruction is not part of the ```python block the model is told to run.
run = agent.run(
    """Run the following code:
```python
import pyautogui

# Move the mouse to the center of the screen
screen_width, screen_height = pyautogui.size()
center_x, center_y = screen_width // 2, screen_height // 2
pyautogui.moveTo(center_x, center_y)
print('Mouse moved to the center of the screen')
```

If you can see the mouse moved to the center of the screen and the screenshot saved, return "success" else return "failure"
""",
    max_steps=4,
    stream=False,
)