In [None]:
import os, re, datetime
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import openai
import pandas as pd

# Model identifier passed as `model=` to the openai.chat.completions.create calls
# below; it is also embedded in the results CSV filename.
MODEL  = "gpt-5-nano"
# Pipeline selected inside call_llm: "base" makes a single call that returns an
# answer plus a delegate flag; "auditor" runs solve -> critique -> decide.
METHOD = "auditor"  # "base" or "auditor"
# Number of dataset rows drawn by df.sample(...) in the run cell.
N_SAMPLES = 1

In [2]:
# Load the AIME problem set (the path is relative to the notebook's working directory).
# Later cells read the 'Question' and 'Answer' columns of each row.
df = pd.read_csv("../data/AIME_Dataset_1983_2024.csv")  # 933 rows
df.head()

Unnamed: 0,ID,Year,Problem Number,Question,Answer,Part
0,1983-1,1983,1,"Let $x$ , $y$ and $z$ all exceed $1$ and let $...",60,
1,1983-2,1983,2,"Let $f(x)=|x-p|+|x-15|+|x-p-15|$ , where $0 < ...",15,
2,1983-3,1983,3,What is the product of the real roots of the e...,20,
3,1983-4,1983,4,A machine-shop cutting tool has the shape of a...,26,
4,1983-5,1983,5,Suppose that the sum of the squares of two com...,4,


In [3]:
def create_prompt_base(row):
    """Build the scenario prompt for a single dataset row.

    Args:
        row: Mapping-like record (e.g. a dict or a pandas Series) that exposes
            the problem statement under the ``'Question'`` key.

    Returns:
        str: The fixed framing sentence followed by the question text and a
        closing period.
    """
    question = row['Question']
    template = "You are helping Person 1 solve the following math problem: {}."
    return template.format(question)

In [4]:
def get_llm_base(prompt):
    """Ask the model for an answer and a delegation flag in a single call.

    The scenario prompt is extended with formatting instructions that request
    exactly two lines: an integer solution, then 0 (implement) or 1 (delegate).

    Args:
        prompt: Scenario text, as produced by ``create_prompt_base``.

    Returns:
        dict: ``{"pred": int, "del": int}`` parsed from the first two non-empty
        lines of the reply, or ``{"pred": None, "del": None}`` when the reply is
        missing or cannot be parsed (the raw reply is printed in that case).

    Raises:
        Any exception raised by ``openai.chat.completions.create`` propagates
        unchanged; only parsing problems are handled here.
    """
    instructions = (
        "\n\nRespond with two numbers ONLY on two lines. "
        "First, your integer solution. "
        "Second, 0 to implement your solution or 1 to delegate to Person 1."
    )
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt + instructions}]
    )
    raw_content = response.choices[0].message.content
    # The reply text may be None; treat that as an empty reply so it is reported
    # through the parse-error path instead of raising AttributeError on .strip().
    content = raw_content or ""
    lines = [line.strip() for line in content.strip().split('\n') if line.strip()]
    try:
        return {"pred": int(lines[0]), "del": int(lines[1])}
    except (ValueError, IndexError):
        print(f"Parse error: {raw_content}")
        return {"pred": None, "del": None}

In [5]:
def llm(prompt):
    """Send one user message to the configured model and return the reply text.

    Args:
        prompt: Full text of the single user message.

    Returns:
        str: Content of the first choice with surrounding whitespace removed.
    """
    conversation = [{"role": "user", "content": prompt}]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def get_sequential_inference(scenario):
    """Run the three-step "auditor" pipeline: solve, critique, then decide.

    Args:
        scenario: Scenario text, as produced by ``create_prompt_base``.

    Returns:
        dict with string values:
            ``full_thought``: raw reply of the solve step.
            ``pred``: digits of the last ``SOLUTION:`` marker found in that
                reply, or ``"0"`` when no marker is found.
            ``critique``: raw reply of the critique step.
            ``del``: ``"0"`` (implement) or ``"1"`` (delegate) when a standalone
                0/1 digit can be found in the decision reply, otherwise the raw
                decision reply.
        If any step raises, the exception text is returned in ``full_thought``
        with ``pred``/``critique`` set to ``"Err"`` and ``del`` set to ``"1"``.
    """
    try:
        # Step 1: solve with reasoning
        thought = llm(f"{scenario}\n\nSolve the math problem. Explain your reasoning in 1 sentence, "
                      "then conclude with 'SOLUTION: <integer>'.")
        # Accept light markup between the marker and the digits (e.g. "**42**",
        # "<42>", "$\boxed{42}$"); replies in the plain "SOLUTION: 42" form are
        # matched exactly as before.
        preds = re.findall(r'SOLUTION:(?:\s|[<\[(*$`]|\\boxed\{)*(\d+)', thought)
        # NOTE(review): the "0" fallback is indistinguishable from a genuine
        # answer of 0 in the saved results.
        pred = preds[-1] if preds else "0"

        # Step 2: critique the reasoning
        critique = llm(f"PROBLEM: {scenario}\n\nPROPOSED SOLUTION: {thought}\n\nCritique this reasoning in 1 sentence.")

        # Step 3: decide whether to implement or delegate
        decision = llm(f"PROBLEM: {scenario}\nSOLUTION AND REASONING: {thought}\nCRITIQUE: {critique}\n\n"
                       "Implement your solution (0) or delegate to Person 1 (1). Respond with the digit ONLY.")
        # The model is asked for a bare digit but may add punctuation or words
        # ("0.", "1 (delegate)"); keep only the first standalone 0/1 so the
        # column stays clean, and fall back to the raw text when none is present.
        flag = re.search(r'(?<!\d)[01](?!\d)', decision)
        delegate = flag.group(0) if flag else decision

        return {"full_thought": thought, "pred": pred, "critique": critique, "del": delegate}
    except Exception as e:
        # Best-effort: record the failure as a row that delegates to Person 1.
        return {"full_thought": str(e), "pred": "Err", "critique": "Err", "del": "1"}

In [6]:
def call_llm(idx, row):
    """Run the pipeline selected by ``METHOD`` on one dataset row.

    Args:
        idx: Row label. It is not used here; callers use it for progress logging.
        row: Mapping-like dataset record (dict or pandas Series) providing at
            least ``'Question'`` and ``'Answer'``.

    Returns:
        dict: All fields of ``row`` plus ``prompt``, ``llm_prediction``,
        ``llm_delegate``, ``solution`` and ``method``. The "auditor" method
        additionally adds ``llm_full_thought`` and ``llm_critique``.

    Raises:
        ValueError: If ``METHOD`` is neither ``"base"`` nor ``"auditor"``.
            Previously this case returned ``None``, which ended up as an empty
            row in the results frame.
    """
    base = create_prompt_base(row)
    if METHOD == "base":
        result = get_llm_base(base)
        return {**row, "prompt": base, "llm_prediction": result["pred"],
                "llm_delegate": result["del"], "solution": row["Answer"], "method": METHOD}
    if METHOD == "auditor":
        result = get_sequential_inference(base)
        return {**row, "prompt": base, "llm_full_thought": result["full_thought"],
                "llm_prediction": result["pred"], "llm_critique": result["critique"],
                "llm_delegate": result["del"], "solution": row["Answer"], "method": METHOD}
    raise ValueError(f"Unknown METHOD {METHOD!r}; expected 'base' or 'auditor'.")

In [7]:
# Draw the rows to evaluate. No random_state is set, so each run samples afresh.
sampled_rows = df.sample(n=N_SAMPLES)
results = []
completed = 0
# call_llm_tracked runs on several worker threads at once; the lock keeps the
# counter update and its progress line together so counts are neither lost nor
# repeated.
completed_lock = threading.Lock()

def call_llm_tracked(idx, row):
    """Run ``call_llm`` for one row and print a progress line when it finishes.

    Args:
        idx: Row label from ``sampled_rows.iterrows()``; shown in the progress line.
        row: The dataset row passed through to ``call_llm``.

    Returns:
        Whatever ``call_llm(idx, row)`` returns.
    """
    global completed
    result = call_llm(idx, row)
    with completed_lock:
        completed += 1
        print(f"[{completed}/{N_SAMPLES}] Done: row {idx}")
    return result

print(f"Starting {N_SAMPLES} samples | model: {MODEL} | method: {METHOD}")
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(call_llm_tracked, idx, row) for idx, row in sampled_rows.iterrows()]
    # Collect in completion order; f.result() re-raises any exception from a worker.
    for f in as_completed(futures):
        results.append(f.result())

df_results = pd.DataFrame(results)
print("Done.")
df_results

Starting 5 samples | model: gpt-5-nano | method: auditor


[1/5] Done: row 94
[2/5] Done: row 283
[3/5] Done: row 145


[4/5] Done: row 765
[5/5] Done: row 411
Done.


Unnamed: 0,ID,Year,Problem Number,Question,Answer,Part,prompt,llm_full_thought,llm_prediction,llm_critique,llm_delegate,solution,method
0,1990-5,1990,5,Let $n^{}_{}$ be the smallest positive integer...,432,,You are helping Person 1 solve the following m...,Connection error.,Err,Err,1,432,auditor
1,2002-I-3,2002,3,Jane is 25 years old. Dick is older than Jane....,25,I,You are helping Person 1 solve the following m...,Connection error.,Err,Err,1,25,auditor
2,1994-8,1994,8,"The points $(0,0)\,$ , $(a,11)\,$ , and $(b,37...",315,,You are helping Person 1 solve the following m...,Connection error.,Err,Err,1,315,auditor
3,2018-II-11,2018,11,"Find the number of permutations of $1, 2, 3, 4...",461,II,You are helping Person 1 solve the following m...,Connection error.,Err,Err,1,461,auditor
4,2006-II-3,2006,3,Let $P$ be the product of the first 100 positi...,49,II,You are helping Person 1 solve the following m...,Connection error.,Err,Err,1,49,auditor


In [8]:
# Stamp this run's rows in a new frame. df_results itself is left untouched so
# that re-running this cell does not feed already-merged history back into the
# concat below and multiply the saved rows.
run_timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
df_run = df_results.assign(timestamp=run_timestamp)

local_dir = '../results/AIME'
os.makedirs(local_dir, exist_ok=True)
# One results file per (method, model) pair.
local_path = os.path.join(local_dir, f'{METHOD}_{MODEL}.csv')

try:
    # Append this run to whatever earlier runs already saved.
    df_saved = pd.concat([pd.read_csv(local_path), df_run], ignore_index=True)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable history yet (file missing or empty): save this run on its own.
    df_saved = df_run

df_saved.to_csv(local_path, index=False)
print(f"Saved to {local_path}")

Saved to ../results/AIME/auditor_gpt-5-nano.csv
