Import libs and deps

In [1]:
import json
from typing import Any, Dict, List, Optional

from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams

from evoproc_procedures.models import Procedure
from evoproc_procedures.prompts import create_procedure_prompt

  from .autonotebook import tqdm as notebook_tqdm


Initialize vLLM

In [2]:
# Just an easy to use function for initializing the vLLM model instance
def init_vllm(model: str) -> LLM:
    # NOTE: keep this LLM instance alive for the whole GA run (don’t re-init each generation).
    # vLLM supports batched offline inference via llm.generate(list_of_prompts, sampling_params). :contentReference[oaicite:1]{index=1}
    return LLM(
        model=model,
        enable_prefix_caching=True,
        gpu_memory_utilization=0.90,   # 0.3 is likely too low for 120B :contentReference[oaicite:3]{index=3}
        max_model_len=8192,            # avoid default 131072 for GA :contentReference[oaicite:4]{index=4}
        max_num_seqs=32,
        max_num_batched_tokens=4096,
        disable_log_stats=True,
    )

Start the LLM (should only happen once per kernel start)

Note: gpt-oss:120b model takes about 10 minutes to start up

In [3]:
# Model name (pulls directly from Hugging Face Hub)
MODEL = "openai/gpt-oss-120b"

In [4]:
# Required for openAI models
%env TIKTOKEN_ENCODINGS_BASE=$HOME/tiktoken_encodings
%env TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas

env: TIKTOKEN_ENCODINGS_BASE=$HOME/tiktoken_encodings
env: TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas


In [5]:
if "llm" not in globals():
    llm = init_vllm(MODEL)
    print("LLM loaded")
else:
    print("Reusing existing LLM")

INFO 02-05 13:43:59 [utils.py:261] non-default args: {'max_model_len': 8192, 'enable_prefix_caching': True, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'disable_log_stats': True, 'model': 'openai/gpt-oss-120b'}
INFO 02-05 13:43:59 [model.py:541] Resolved architecture: GptOssForCausalLM


Parse safetensors files: 100%|██████████| 15/15 [00:00<00:00, 19.61it/s]

INFO 02-05 13:44:00 [model.py:1561] Using max model len 8192



2026-02-05 13:44:01,978	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 02-05 13:44:02 [scheduler.py:226] Chunked prefill is enabled with max_num_batched_tokens=4096.
INFO 02-05 13:44:02 [config.py:314] Overriding max cuda graph capture size to 1024 for performance.
INFO 02-05 13:44:02 [vllm.py:624] Asynchronous scheduling is enabled.
[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:44:03 [core.py:96] Initializing a V1 LLM engine (v0.15.0) with config: model='openai/gpt-oss-120b', speculative_config=None, tokenizer='openai/gpt-oss-120b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whi

[0;36m(EngineCore_DP0 pid=33710)[0;0m     Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
[0;36m(EngineCore_DP0 pid=33710)[0;0m     Minimum and Maximum cuda capability supported by this version of PyTorch is
[0;36m(EngineCore_DP0 pid=33710)[0;0m     (8.0) - (12.0)
[0;36m(EngineCore_DP0 pid=33710)[0;0m     


[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:44:03 [parallel_state.py:1212] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.29.3.105:46505 backend=nccl
[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:44:03 [parallel_state.py:1423] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[0;36

Loading safetensors checkpoint shards:   0% Completed | 0/15 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   7% Completed | 1/15 [00:17<04:01, 17.26s/it]
Loading safetensors checkpoint shards:  13% Completed | 2/15 [00:39<04:25, 20.44s/it]
Loading safetensors checkpoint shards:  20% Completed | 3/15 [01:02<04:18, 21.51s/it]
Loading safetensors checkpoint shards:  27% Completed | 4/15 [01:21<03:43, 20.33s/it]
Loading safetensors checkpoint shards:  33% Completed | 5/15 [01:46<03:42, 22.27s/it]
Loading safetensors checkpoint shards:  40% Completed | 6/15 [02:12<03:31, 23.47s/it]
Loading safetensors checkpoint shards:  47% Completed | 7/15 [02:34<03:04, 23.06s/it]
Loading safetensors checkpoint shards:  53% Completed | 8/15 [02:57<02:40, 22.91s/it]
Loading safetensors checkpoint shards:  60% Completed | 9/15 [03:22<02:21, 23.54s/it]
Loading safetensors checkpoint shards:  67% Completed | 10/15 [03:44<01:55, 23.07s/it]
Loading safetensors checkpoint shards:  73% Completed | 11/15

[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:49:54 [default_loader.py:291] Loading weights took 346.66 seconds
[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:49:59 [gpu_model_runner.py:4118] Model loading took 65.97 GiB memory and 354.209042 seconds
[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:50:03 [backends.py:805] Using cache directory: /home/student/.cache/vllm/torch_compile_cache/75e5945fce/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:50:03 [backends.py:865] Dynamo bytecode transform time: 4.01 s
[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:50:06 [backends.py:267] Directly load the compiled graph(s) for compile range (1, 4096) from the cache, took 1.501 s
[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:50:06 [monitor.py:34] torch.compile takes 5.51 s in total
[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:50:07 [gpu_worker.py:356] Available KV cache memory: 41.63 GiB
[0;36m(EngineCo

[0;36m(EngineCore_DP0 pid=33710)[0;0m 2026-02-05 13:50:13,122 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
[0;36m(EngineCore_DP0 pid=33710)[0;0m 2026-02-05 13:50:13,365 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 83/83 [00:12<00:00,  6.61it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 7/7 [00:00<00:00, 10.55it/s]


[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:50:29 [gpu_model_runner.py:5051] Graph capturing finished in 16 secs, took 0.21 GiB
[0;36m(EngineCore_DP0 pid=33710)[0;0m INFO 02-05 13:50:29 [core.py:272] init engine (profile, create kv cache, warmup model) took 30.70 seconds
INFO 02-05 13:50:31 [llm.py:343] Supported tasks: ['generate']
LLM loaded


Initial population generation function

In [6]:
def generate_initial_population(
    llm: LLM,
    problem_text: str,
    json_schema: Dict[str, Any],
    pop_size: int,
    *,
    temperature: float = 0.8,
    top_p: float = 0.95,
    max_tokens: int = 600,
    base_seed: Optional[int] = None,
) -> List[Dict[str, Any]]:
    prompt = create_procedure_prompt(problem_text)
    structured_outputs_params_json = StructuredOutputsParams(json=json_schema)

    # One prompt, many samples:
    sampling = SamplingParams(
        n=pop_size,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        seed=base_seed,  # if you want reproducibility across runs (or vary seeds per run) :contentReference[oaicite:3]{index=3}
        stop=None,
        structured_outputs=structured_outputs_params_json,
    )

    outputs = llm.generate([prompt], sampling_params=sampling)
    # outputs is a list of RequestOutput objects; each has .prompt and .outputs[i].text :contentReference[oaicite:4]{index=4}
    out = outputs[0]

    print(out)

    procedures: List[Dict[str, Any]] = []
    for cand in out.outputs:
        text = cand.text.strip()
        print(f"Raw output: {text}")
        # If your model sometimes wraps JSON in backticks, you can add cleanup here.
        try:
            procedures.append(json.loads(text))
        except json.JSONDecodeError:
            # Keep raw text for repair loop
            procedures.append({"__invalid_json__": text})

    return procedures

Generate the initial population

In [7]:
example_q = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"

In [8]:
json_schema = Procedure.model_json_schema()
pop = generate_initial_population(llm, example_q, json_schema, pop_size=4)
print(json.dumps(pop, indent=2))

Adding requests: 100%|██████████| 1/1 [00:00<00:00,  4.39it/s]
Processed prompts: 100%|██████████| 4/4 [00:30<00:00,  7.66s/it, est. speed input: 134.68 toks/s, output: 78.38 toks/s]

RequestOutput(request_id=0, prompt='Create a short, executable procedure to solve the problem, but do NOT solve it.\n\n# Task\nNatalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?\n\n\n# Output Contract\nReturn exactly one JSON object that validates against this schema (verbatim):\n{"$defs": {"Step": {"description": "One atomic instruction within a global-state procedure.\\n\\nNotes\\n-----\\n    - Steps should be **single-action** and declarative.\\n    - Use prior outputs as inputs by variable **name**.\\n    - Keep variable names stable and snake_case.\\n\\nAttributes\\n----------\\nid:\\n    1-based step identifier (contiguous in execution order).\\ninputs:\\n    List of required input variables for this step.\\nstep_description:\\n    Natural-language instruction describing exactly what the step does.\\noutputs:\\n    List of variables produced by this step.", "properties": {"id"




In [11]:
pop

[{'__invalid_json__': 'We optimal\xa0…\xa0.\n\nWe needverages \n\nWe need ... \n\nDeleting...\n\nWeMaybe diferente կմ\n\narek...\n\nThe classmates...\n\nWe… \nriterien\n\n\n\nThe prompt asks: "Create/t veremos". The QAction: "Create aข้อ Osm".\n\nWe must produce a JSON-indigo JSONuscht.\n\nWe need produce a JSON object Anh. Must coherent with schema.\n\nWe need Step definitions.\n\n mpiitten. Problem textptic: " gatos".\n\nLet\'s parse problem_text: compost.\n\n સપ.\n\nProblem:大会.\n\nWe need to geladen.\n\nProblem text:mates: "Natalia entwickelte".\n\nApparently problem: "Natalia sold injector  lediglich".\n\n ýok.\n\n inhabitants ".\n\nLet\'s read problemожете.\n\n ತಪ್ಪು: "Natalia sold clips to 48స్ట్".\n\n_signal".\n\n lohnt".\n\nProblem modd.\n\nProblem text: "Natal&&(N) sold detaine".\n\nOk.\n\nInterpret: "Natal nominees".\n\nWe need to produce variables for each numeric fact Varمیم".\n\nNumbers:  દી".\n\nNumbers: SAV".\n\nﾟ".\n\nNow we have:\n\n- previsto.\n\nوادث.\n\n- גוט".\n\nL

In [9]:
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

In [None]:
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

Adding requests: 100%|██████████| 4/4 [00:00<00:00, 1181.24it/s]
Processed prompts: 100%|██████████| 4/4 [00:00<00:00, 40.09it/s, est. speed input: 263.71 toks/s, output: 649.06 toks/s]

Prompt: 'Hello, my name is', Generated text: ' Paul G. Floyd and I am here for the day, to offer a little'
Prompt: 'The president of the United States is', Generated text: ' proposing a $2.4 trillion tax reform bill for the middle class that would'
Prompt: 'The capital of France is', Generated text: ' also the capital of the French Republic.\nSo is Paris, but you would'
Prompt: 'The future of AI is', Generated text: ' safe in the hands of hackers\nThe world’s most famous hacker is'





In [None]:
def query(prompts, fmt=None, seed=None) -> str:
    return llm.query_batch(prompts, fmt=fmt, seed=seed)

In [None]:
json_schema = Procedure.model_json_schema()
structured_outputs_params_json = StructuredOutputsParams(json=json_schema)

[33m(raylet)[0m [2026-02-02 17:29:46,115 E 169247 169247] (raylet) node_manager.cc:3256: 137 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 1e8bdda6cc8c0df906c5746d83e31dbb98a0a76d46becc93b00c7346, IP: 10.29.3.105) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 10.29.3.105`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.
[33m(raylet)[0m 
[33m(raylet)[0m [2026-02-02 17:30:46,176 E 169247 169247] (raylet) 

In [None]:
sampling_params_json = SamplingParams(
    structured_outputs=structured_outputs_params_json, max_tokens=1e6
)
prompt_json = (
    "Generate a JSON with the brand, model and car_type of "
    "the most iconic car from the 90's"
)

In [None]:
def format_output(title: str, output: str):
    print(f"{'-' * 50}\n{title}: {output}\n{'-' * 50}")


def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
    outputs = llm.generate(prompt, sampling_params=sampling_params)
    return outputs[0].outputs[0].text

In [None]:
json_output = generate_output(prompt_json, sampling_params_json, llm)
format_output("Structured outputs by JSON", json_output)


Adding requests: 100%|██████████| 1/1 [00:00<00:00, 247.26it/s]
Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][0;36m(EngineCore_DP0 pid=168364)[0;0m Exception in thread Thread-6 (process_input_sockets):
[0;36m(EngineCore_DP0 pid=168364)[0;0m Traceback (most recent call last):
[0;36m(EngineCore_DP0 pid=168364)[0;0m   File "/home/student/.local/share/uv/python/cpython-3.12.0-linux-aarch64-gnu/lib/python3.12/threading.py", line 1052, in _bootstrap_inner
[0;36m(EngineCore_DP0 pid=168364)[0;0m     self.run()
[0;36m(EngineCore_DP0 pid=168364)[0;0m   File "/home/student/.local/share/uv/python/cpython-3.12.0-linux-aarch64-gnu/lib/python3.12/threading.py", line 989, in run
[0;36m(EngineCore_DP0 pid=168364)[0;0m     self._target(*self._args, **self._kwargs)
[0;36m(EngineCore_DP0 pid=168364)[0;0m   File "/home/student/Desktop/malia/evoproc_tests/.venv/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 1140, in pro

KeyboardInterrupt: 



ERROR 02-02 17:35:36 [core_client.py:605] Engine core proc EngineCore_DP0 died unexpectedly, shutting down client.


[33m(raylet)[0m [2026-02-02 17:35:46,283 E 169247 169247] (raylet) node_manager.cc:3256: 126 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: 1e8bdda6cc8c0df906c5746d83e31dbb98a0a76d46becc93b00c7346, IP: 10.29.3.105) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 10.29.3.105`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.
[36m(AutoscalingRequester pid=230729)[0m [2026-02-02 17:36:06,942 E 230729 230760] cor