# Imports

In [1]:
import json
import os

from autoboltagent import verbose_prompts, grammars
from autoboltagent import prompts
from autoboltagent.agents import LowFidelityAgent
from autoboltagent.tools.logger import AgentLogger
from autoboltagent.verbose_agents import VerboseLowFidelityAgent
from autoboltagent.VLLMModelCustom import VLLMModelCustom

  from .autonotebook import tqdm as notebook_tqdm
  import pkg_resources


In [2]:
from vllm import SamplingParams
from vllm.sampling_params import StructuredOutputsParams
import smolagents

# Setup

### Set up logger and database

In [3]:
# Database URL handed to the agent logger. The path after "sqlite:///" is
# relative ("../src/..."), so it resolves against the kernel's working
# directory. NOTE(review): presumably a SQLAlchemy-style URL — confirm against
# AgentLogger.
db_url = "sqlite:///../src/agent_logs_grammar_prod.db"
# Shared logger instance; it is passed to every agent constructed below.
logger = AgentLogger(db_url)

### Parameters

In [4]:
# Bolted-joint design inputs shared by every agent run in this notebook.
# NOTE(review): units are not stated anywhere in the notebook — confirm them
# against the autoboltagent documentation before changing any value.
joint_configuration = {
    "load": 60000,
    "desired_safety_factor": 3.0,
    "bolt_yield_strength": 940,
    "plate_yield_strength": 250,
    "preload": 150000,
    "pitch": 1.5,
    "plate_thickness": 10,
    "bolt_elastic_modulus": 210,
    "plate_elastic_modulus": 210,
}

# JSON text of the same configuration, substituted into the task prompt by the
# agent cells. It is serialized from `joint_configuration` instead of being
# typed out a second time, so the prompt can never drift from the dict that is
# handed to the agent.
# NOTE: the name shadows the built-in `input()`; it is kept because the agent
# cells further down reference it by this name.
input = json.dumps(joint_configuration, indent=4)

# Structured-output settings: generation is constrained by the debug grammar
# defined in autoboltagent.grammars.
grammar_sop = StructuredOutputsParams(grammar=grammars.low_fidelity_agent_grammar_debug)

# Sampling settings for the grammar-constrained local model: at most 200 new
# tokens per call, temperature 0.0, and the grammar constraint from above.
sampling_params = SamplingParams(
    structured_outputs=grammar_sop,
    temperature=0.0,
    max_tokens=200,
)

# Models

### Local

In [5]:
# Stock smolagents vLLM wrapper around the quantized Qwen2.5-3B-Instruct
# checkpoint, with no grammar constraint. Several cells in this notebook
# assign `model`; whichever one ran last is the model the agent cells use.
model = smolagents.VLLMModel(
    model_kwargs={"gpu_memory_utilization": 0.85},
    model_id="RedHatAI/Qwen2.5-3B-Instruct-quantized.w8a8",
)

INFO 01-29 21:24:13 [utils.py:263] non-default args: {'gpu_memory_utilization': 0.85, 'disable_log_stats': True, 'model': 'RedHatAI/Qwen2.5-3B-Instruct-quantized.w8a8'}
INFO 01-29 21:24:14 [model.py:530] Resolved architecture: Qwen2ForCausalLM
INFO 01-29 21:24:14 [model.py:1545] Using max model len 32768


2026-01-29 21:24:15,764	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 01-29 21:24:15 [scheduler.py:229] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 01-29 21:24:15 [vllm.py:630] Asynchronous scheduling is enabled.
INFO 01-29 21:24:15 [vllm.py:637] Disabling NCCL for DP synchronization when using async scheduling.
[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:21 [core.py:97] Initializing a V1 LLM engine (v0.14.1) with config: model='RedHatAI/Qwen2.5-3B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='RedHatAI/Qwen2.5-3B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutput

[0;36m(EngineCore_DP0 pid=40850)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`


[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:25 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')
[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:25 [weight_utils.py:550] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.82it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  1.82it/s]
[0;36m(EngineCore_DP0 pid=40850)[0;0m 


[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:26 [default_loader.py:291] Loading weights took 0.65 seconds
[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:26 [gpu_model_runner.py:3905] Model loading took 3.23 GiB memory and 2.647831 seconds
[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:31 [backends.py:644] Using cache directory: /home/matthewli125/.cache/vllm/torch_compile_cache/fec4a759af/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:31 [backends.py:704] Dynamo bytecode transform time: 4.86 s
[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:35 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 1.314 s
[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:35 [monitor.py:34] torch.compile takes 6.18 s in total
[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:36 [gpu_worker.py:358] Available KV cache memory: 2.12 GiB
[0;36m(EngineCor

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:02<00:00, 19.89it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 27.43it/s]


[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:40 [gpu_model_runner.py:4856] Graph capturing finished in 4 secs, took 0.61 GiB
[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:40 [core.py:273] init engine (profile, create kv cache, warmup model) took 14.00 seconds
[0;36m(EngineCore_DP0 pid=40850)[0;0m INFO 01-29 21:24:42 [vllm.py:630] Asynchronous scheduling is enabled.
INFO 01-29 21:24:42 [llm.py:347] Supported tasks: ['generate']


In [None]:
# Custom local vLLM model that applies the grammar-constrained sampling
# parameters defined in the params cell. Rebinds `model`, replacing whichever
# model an earlier cell assigned.
# NOTE(review): this requests the same 0.85 GPU memory fraction as the stock
# local model cell — presumably only one of the two should be run per kernel.
model = VLLMModelCustom(
    sampling_params=sampling_params,
    model_kwargs={"gpu_memory_utilization": 0.85},
    apply_chat_template_kwargs=None,
    model_id="RedHatAI/Qwen2.5-3B-Instruct-quantized.w8a8",
)

### Cloud models

In [None]:
# Fireworks API token used by the cloud model cells below. It is read from the
# FIREWORKS_API_KEY environment variable so the secret never has to be typed
# into (and committed with) the notebook. Falls back to the empty string, the
# previous placeholder value, when the variable is not set.
FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY", "")

In [None]:
# smolagents InferenceClientModel using the "fireworks-ai" provider.
# Rebinds `model`, replacing whichever model an earlier cell assigned.
model = smolagents.InferenceClientModel(  # type: ignore
    token=FIREWORKS_API_KEY,
    model_id="openai/gpt-oss-20b",
    provider="fireworks-ai",
)

In [None]:
# gpt-oss-20b addressed through the Fireworks inference endpoint using the
# smolagents OpenAI-server client. Rebinds `model`, replacing whichever model
# an earlier cell assigned.
model = smolagents.OpenAIServerModel(
    api_key=FIREWORKS_API_KEY,
    api_base="https://api.fireworks.ai/inference/v1",
    model_id="accounts/fireworks/models/gpt-oss-20b",
    # Disabled: grammar-constrained output requested through response_format.
    # response_format={
    #     "type": "grammar",
    #     "grammar": grammars.low_fidelity_agent_grammar_debug
    # }
)

# Run agent

### Single agent run

In [6]:
# One-off run: fill the example task template with the configuration text,
# build a verbose low-fidelity agent on the currently bound `model`, and run it.
# NOTE(review): the positional 3.0 presumably mirrors
# joint_configuration["desired_safety_factor"] — confirm against the
# VerboseLowFidelityAgent signature.
instruction = verbose_prompts.EXAMPLE_TASK_INSTRUCTIONS.format(input)
agent = VerboseLowFidelityAgent(
    model,
    joint_configuration,
    "verbose low fidelity agent",
    "verbose prompts + minimized + reason 512 + gpt 11",
    3.0,
    logger,
    max_steps=100,
)
agent.run(instruction)

Adding requests: 100%|██████████| 1/1 [00:00<00:00, 134.97it/s]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

KeyboardInterrupt: 

ERROR 01-29 21:29:41 [core_client.py:610] Engine core proc EngineCore_DP0 died unexpectedly, shutting down client.




### Loop agent run

In [None]:
# Repeat the run 25 times. Each iteration builds a fresh agent whose fourth
# constructor argument carries the iteration index, then runs it on the same
# example task.
for run_idx in range(25):
    agent = VerboseLowFidelityAgent(
        model,
        joint_configuration,
        "verbose low fidelity agent",
        f"verbose prompts + minimized + reason 512 + gpt {run_idx}",
        3.0,
        logger,
        max_steps=100,
    )
    instruction = verbose_prompts.EXAMPLE_TASK_INSTRUCTIONS.format(input)
    agent.run(instruction)
