## Import libraries/packages

**Before starting**

Run the command ``OLLAMA_NUM_PARALLEL=8 OLLAMA_MAX_LOADED_MODELS=1 ollama serve``

This will get OLLama up and running

In [10]:
import time
import json

import pandas as pd

from __future__ import annotations

from evoproc_procedures.runners import run_steps_stateful_minimal
from evoproc_procedures.schemas import get_schema

from step_run_and_mp import MPBatchOllamaEngine, make_batched_query_fn

In [11]:
UNITS = 4
MODEL = "gpt-oss:120b"
SEED = 1234
FINAL_SCHEMA = get_schema("gsm")
PRINT_BOOL = False

PROC_RESULTS_FILEPATH = "runs/gsm8k_train_v6.jsonl"
BASELINE_RESULTS_FILEPATH = "runs/gsm8k_train_v6_baseline.jsonl"

## Reading in the result files

In [12]:
read_results = None
with open(PROC_RESULTS_FILEPATH, "r") as f:
    items = [json.loads(line) for line in f if line.startswith("//") is False]
    read_results = items
    f.close()
proc_results_df = pd.DataFrame(read_results)

In [13]:
read_results = None
with open(BASELINE_RESULTS_FILEPATH, "r") as f:
    items = [json.loads(line) for line in f if line.startswith("//") is False]
    read_results = items
    f.close()
baseline_results_df = pd.DataFrame(read_results)

## For each procedure:
- Grab the procedure
- For each step 
    - Run the step prompt N times
    - Choose the most consistent final answer

In [14]:
# Conditions and their descriptions are included in data_exploration.ipynb file
condition1_final = [22, 48, 81, 136, 173]
condition2_final = [31, 52, 59, 65, 183, 193, 196, 200, 210, 240, 243, 245, 264]
condition3_final = [13, 63, 95, 103, 113, 142, 181, 187, 190, 220, 230, 237, 242, 267, 274]
condition4_final = [144, 234]
condition5_final = [108, 167, 218, 228, 229, 235, 270]

repairable = condition1_final + condition2_final + condition4_final
repairable_df = proc_results_df.iloc[repairable]
irreparable = condition3_final
irreparable_df = proc_results_df.iloc[irreparable]
# Note: condition 5 are actually correct, but the DB answers are incorrect themselves

In [15]:
engine = MPBatchOllamaEngine(model=MODEL, units=UNITS)
QUERY_FN = make_batched_query_fn(
    engine,
    n=4,                 # samples per step
    strategy="consensus" # or "pick_best"
)

GPT_OSS_LOCAL_BUGGY_MODELS = {"gpt-oss:20b", "gpt-oss:120b"}
def query_fn_no_format_for_gptoss(prompt, model, fmt, seed):
    # kill structured-output / format only for gpt-oss models
    # Note, this will only be with the gpt-oss NON-CLOUD models (gpt-oss:120b & gpt-oss:20b)
    # Cloud models do not have this bug, so I want to bypass this if it is a cloud model
    if isinstance(model, str) and model in GPT_OSS_LOCAL_BUGGY_MODELS:
        fmt = None
    return QUERY_FN(prompt, model, fmt, seed)   # call your existing function

In [16]:
for i, row in repairable_df.iterrows():
    try:
        start_time = time.time()
        state = run_steps_stateful_minimal(
            proc=row["procedure"],
            problem_text=row["question"],
            answer_schema=FINAL_SCHEMA,
            model=MODEL,
            query_fn=query_fn_no_format_for_gptoss,
            seed=SEED,
            print_bool=PRINT_BOOL,
            strict_missing=True,
        )
        end_time = time.time()
        correct = state["final_answer_numerical"] == row["gold_num"]
        this_print_bool = True
        if this_print_bool:
            print(f"Finished {i+1}/{len(repairable)}")
            print(f"Time taken: {end_time - start_time:.2f} seconds")
            correct = state["final_answer_numerical"] == row["gold_num"]
            if row["correct"] == correct:
                print("NO CHANGE. Correctness matches the expected value.")
            if not row["correct"] and correct:
                print("Model got it right this time, was marked wrong in the results file.")
                print(f"State: {state}")
                print(f"Is final answer correct? {state['final_answer_numerical'] == row['gold_num']}")
            if row["correct"] and not correct:
                print("Model got it wrong this time, was marked correct in the results file.")
                print(f"State: {state}")
                print(f"Is final answer correct? {state['final_answer_numerical'] == row['gold_num']}")
            if not row["correct"] and not correct:
                print("Model got it wrong this time, was marked wrong in the results file.")
                print(f"State: {state}")
                print(f"Is final answer correct? {state['final_answer_numerical'] == row['gold_num']}")
    except Exception as e:
        print(f"An error occurred: {e}")

Finished 23/20
Time taken: 93.01 seconds
NO CHANGE. Correctness matches the expected value.
Model got it wrong this time, was marked wrong in the results file.
State: {'problem_text': 'Samantha’s last name has three fewer letters than Bobbie’s last name. If Bobbie took two letters off her last name, she would have a last name twice the length of Jamie’s. Jamie’s full name is Jamie Grey. How many letters are in Samantha’s last name?', 'diff_sam_bob': 3, 'bob_removed': 2, 'factor': 2, 'jamie_last_len': 4, 'bob_length_expr': '2 * jamie_last_len + 2', 'samantha_length_expr': '2 * jamie_last_len - 1', 'final_answer': "Samantha's last name length is 9.", 'final_answer_numerical': 9, 'confidence': 0.4}
Is final answer correct? False
Finished 49/20
Time taken: 58.49 seconds
NO CHANGE. Correctness matches the expected value.
Model got it wrong this time, was marked wrong in the results file.
State: {'problem_text': 'Sam memorized six more digits of pi than Carlos memorized. Mina memorized six t

Questions that were corrected with SC runs: 
- 23, 32, 49

Questions that are still incorrect: 
- 14, 53, 60, 64, 66, 82, 96, 104

In [None]:
import multiprocess as mp
import numpy as np

units=4

import platform

if platform.system() == "Darwin": # Check if the OS is macOS
    mp.set_start_method("spawn", force=True)

In [None]:
def ola(x):
    model1 = "gpt-oss:20b"
    import ollama
    l=[]
    for z in x:
        l.append(ollama.generate(model=model1,prompt=z,stream=False)['response']) 
    return l

def generate(list):
    vec=np.array(list)
    li=[]
    amount=int(vec.shape[0]/units)
    j=0
    for i in range(units-1):
        li.append(vec[j*amount:(j+1)*amount])
        j=j+1
    li.append(vec[j*amount:vec.shape[0]])
    print(li)
    with mp.Pool(units) as pool:
        x=pool.map(ola, li)
    lf=[]
    for t in x:
        lf.extend(t)
    return lf


In [None]:
generate(["What is 2+2?", "What is the capital of France?", "Who wrote 'To Kill a Mockingbird'?", "What is the boiling point of water?"])

[array(['What is 2+2?'], dtype='<U35'), array(['What is the capital of France?'], dtype='<U35'), array(["Who wrote 'To Kill a Mockingbird'?"], dtype='<U35'), array(['What is the boiling point of water?'], dtype='<U35')]


['4',
 'The capital of France is **Paris**.',
 '"To Kill a Mockingbird" was written by **Harper Lee**.',
 'The normal boiling point of water—meaning the temperature at which its vapor pressure equals the surrounding atmospheric pressure of 1\u202fatmosphere—is:\n\n- **100\u202f°C** (\u202f212\u202f°F\u202f)  \n- **373.15\u202fK**  \n\n**A few extra points to keep in mind**\n\n| Factor | Effect on Boiling Point | Typical Change |\n|--------|------------------------|----------------|\n| **Altitude / pressure** | Lower atmospheric pressure lowers the boiling point. | At 2\u202f000\u202fm (≈\u202f65\u202fkPa), water boils at ≈\u202f93\u202f°C. |\n| **Impurities / solutes** | Dissolved solutes (like salt) raise the boiling point slightly (boiling‑point elevation). | A 3\u202f% NaCl solution boils at ≈\u202f100.9\u202f°C. |\n| **Pressure** | Increasing external pressure raises the boiling point. | At 2\u202fatm (≈\u202f202\u202fkPa), water boils at ≈\u202f120\u202f°C. |\n| **Pseudocode (if y