In [1]:
import sys
import time
import string
import numpy as np
import pandas as pd
import os
from pydantic import BaseModel, Field
import matplotlib.pyplot as plt
from google import genai
from google.genai import types
import openai
from openai import OpenAI

# from anthropic import Anthropic

# key from delegation OpenAI project


# Model identifier used by the LLM calls in this notebook.
# Exactly one MODEL line should be active; the others are kept as alternatives.
# MODEL = "gpt-4o-2024-05-13"
# MODEL = "gpt-4o-mini-2024-07-18"
MODEL = "gpt-5-nano" # cheaper than 4o-mini!
# MODEL = "o1-2024-12-17"
# MODEL = "o3-mini-2025-01-31"
# MODEL = "meta-llama/Llama-3.2-3B-Instruct-Turbo"
# MODEL = "gemini-2.5-flash"

# list of claude models here: https://docs.anthropic.com/en/docs/about-claude/models/overview
# MODEL = "claude-3-5-haiku@20241022"
# MODEL = "claude-sonnet-4@20250514"
# MODEL = "claude-opus-4-20250514"

# list of llama models here: https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama/llama4-scout?utm_source=chatgpt.com
# MODEL = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
# MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

# initialize client (google-genai SDK in Vertex AI mode)
# NOTE(review): the active code paths in this notebook call
# `openai.chat.completions.create`; this `client` is only referenced by the
# commented-out Gemini variants of those calls.
# NOTE(review): no `project` is passed, so the SDK presumably falls back to the
# environment's default project -- TODO confirm.
client = genai.Client(
    vertexai=True,
    # project="accuracy-obsession",  # this links to my CC! Although credits might be in there before. Exceptions project is exceptions-467800
    # project = "exceptions-467800",
    location="us-central1"
)


In [2]:
# load the AIME problem dataset (file name indicates years 1983-2024) from the GCS bucket
import pandas as pd
df = pd.read_csv("gs://exceptions-data/LLM Delegation/AIME/data/AIME_Dataset_1983_2024.csv") # entire dataset has 933 rows

# preview the first rows
df.head()

Unnamed: 0,ID,Year,Problem Number,Question,Answer,Part
0,1983-1,1983,1,"Let $x$ , $y$ and $z$ all exceed $1$ and let $...",60,
1,1983-2,1983,2,"Let $f(x)=|x-p|+|x-15|+|x-p-15|$ , where $0 < ...",15,
2,1983-3,1983,3,What is the product of the real roots of the e...,20,
3,1983-4,1983,4,A machine-shop cutting tool has the shape of a...,26,
4,1983-5,1983,5,Suppose that the sum of the squares of two com...,4,


In [3]:
import random

# scenario builder
def create_prompt_base(row):
    """Build the scenario text that presents one math problem to the LLM.

    Args:
        row: Mapping-like record (e.g. a DataFrame row or a dict) that has a
            'Question' entry.

    Returns:
        str: the fixed lead-in sentence followed by the question text and a
        trailing period.
    """
    # NOTE(review): the lead-in reads "helping Person 1 the following" (no
    # "with"). It is reproduced verbatim because changing it changes the prompt
    # sent to the model -- confirm before rewording.
    template = "You are helping Person 1 the following math problem: {}."
    question_text = row['Question']
    return template.format(question_text)


In [4]:
from google import genai


# base LLM prompt
def get_llm_base(base_prompt, model_id=MODEL):
    """Ask the model for an answer and a delegation decision in a single call.

    Appends fixed answer-format instructions to ``base_prompt``, sends the
    result as one user message via ``openai.chat.completions.create`` and
    parses the first two non-empty lines of the reply as integers.

    Args:
        base_prompt: Scenario text, e.g. the output of ``create_prompt_base``.
        model_id: Model name forwarded to the API. Defaults to ``MODEL``.

    Returns:
        dict: ``{"pred": ..., "del": ...}``. Both values are ints when the
        first two non-empty reply lines both parse as integers; otherwise both
        are ``None`` (all-or-nothing, so a row never carries half a reply).
    """
    # 1. Construct the one-shot prompt
    instructions = (
        "\n\nRespond with two numbers ONLY on two lines. "
        "First, respond with your integer solution for the math problem. \n"
        "Second, respond 0 to implement your solution, "
        "or 1 if you would like to delegate to Person 1 so they can implement their solution."
    )
    full_prompt = base_prompt + instructions

    # 2. Call the LLM

    # openAI version
    messages = [{"role": "user", "content": full_prompt}]
    response = openai.chat.completions.create(model=model_id, messages=messages)
    # The message content can be None; treat that as an empty reply so it is
    # reported as a parse failure instead of crashing on .strip().
    raw_text = response.choices[0].message.content or ""

    # gemini version
    # response = client.models.generate_content(
    #    model=model_id,
    #    contents=full_prompt
    # )
    # raw_text = response.text or ""

    # 3. Parse the response (extract numbers from the first two non-empty lines)
    clean_lines = [line.strip() for line in raw_text.strip().split('\n') if line.strip()]

    try:
        prediction = int(clean_lines[0])
        delegation = int(clean_lines[1])
    except (ValueError, TypeError, IndexError):
        # Always return the same dict shape: the caller indexes result['pred']
        # and result['del'], so a tuple or None here would break it.
        print(f"Error parsing LLM response: {raw_text!r}")
        prediction = None
        delegation = None

    return {
        "pred": prediction,
        "del": delegation
    }

In [5]:
# auditor: three independent single-turn calls (generate -> critique -> decide).
# All three calls are sent to the same `model_name`.
def _chat_once(model_name, prompt):
    """Send ``prompt`` as a single user message and return the stripped reply text."""
    # OpenAI version
    reply = openai.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
    )
    return reply.choices[0].message.content.strip()

    # gemini version
    # reply = client.models.generate_content(model=model_name, contents=prompt)
    # return reply.text.strip()


def get_sequential_inference(base_scenario, model_name):
    """Run the three-step generate / critique / decide pipeline for one problem.

    Each step is a fresh single-message chat (no shared history):
      1. Generator: solve the problem and end with ``SOLUTION: <integer>``.
      2. Auditor: critique the generator's output in one sentence.
      3. Judge: given problem, proposed solution and critique, answer 0
         (implement the solution) or 1 (delegate to Person 1).

    Args:
        base_scenario: Scenario text, e.g. the output of ``create_prompt_base``.
        model_name: Model name forwarded to every API call.

    Returns:
        dict with keys:
          - ``"full_thought"``: raw generator reply (or the error text on failure).
          - ``"pred"``: digits of the last ``SOLUTION:`` match as a string,
            ``None`` when the reply has no such match, or ``"Err"`` on failure.
          - ``"critique"``: auditor reply, or ``"Err"`` on failure.
          - ``"del"``: judge reply as returned by the model (stripped), or
            ``"1"`` on failure.
    """
    # Local import so the function does not depend on a notebook-level `import re`.
    import re

    try:
        # --- STEP 1: PREDICT & EXPLAIN (The Generator) ---
        # We ask for the "thought process" first. This improves the prediction.
        p1 = (f"{base_scenario}\n\n"
              "TASK: Solve the math problem. First, explain your reasoning in 1 sentence. "
              "Then, conclude with 'SOLUTION: your integer solution'")
        full_output = _chat_once(model_name, p1)

        # Take the LAST match in case the reasoning mentions "SOLUTION:" earlier.
        # No match is reported as None, not "0": 0 is itself a possible
        # integer answer, so a default of "0" would make an unparsed reply look
        # like a real prediction.
        preds = re.findall(r'SOLUTION:\s*(\d+)', full_output)
        pred = preds[-1] if preds else None

        # --- STEP 2: CRITIQUE (The Auditor) ---
        # NEW CHAT: A fresh perspective to find flaws in the logic above.
        p2 = (f"PROBLEM: {base_scenario}\n\n"
              f"PROPOSED SOLUTION: {full_output}\n\n"
              "TASK: Critique this reasoning (1 sentence).")
        critique = _chat_once(model_name, p2)

        # --- STEP 3: FINAL DECISION (The Judge) ---
        # NEW CHAT: The final decision-maker weighs the prediction vs. the critique.
        # NOTE(review): "PPROPOSED" below looks like a typo; it is kept verbatim
        # because changing it changes the prompt sent to the model -- confirm
        # before editing.
        p3 = (f"PROBLEM: {base_scenario}\n"
              f"PPROPOSED SOLUTION AND REASONING: {full_output}\n"
              f"CRITIQUE: {critique}\n\n"
              "TASK: Implement your solution (0) or delegate to Person 1 (1).\n"
              "Respond with the digit ONLY.")
        delegate = _chat_once(model_name, p3)

        return {
            "full_thought": full_output,
            "pred": pred,
            "critique": critique,
            "del": delegate
        }

    except Exception as e:
        # Best-effort: one failed row must not abort a whole batch. Failed rows
        # carry pred == "Err" and del == "1"; filter on "Err" before computing
        # delegation rates so failures are not counted as delegations.
        return {"full_thought": str(e), "pred": "Err", "critique": "Err", "del": "1"}

In [10]:
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor, as_completed



# THE MULTI-THREADED RUNNER
def call_llm(idx, row, method="auditor"):
    """Run one delegation method on one dataset row and return the annotated row.

    Args:
        idx: Index label of the row. Accepted so the function can be submitted
            with ``(idx, row)`` pairs; it is not used in the body.
        row: Record with 'Question' and 'Answer' entries that supports
            ``.copy()`` and item assignment (a DataFrame row or a dict).
        method: Which pipeline to run: ``"base"`` (single call via
            ``get_llm_base``) or ``"auditor"`` (three-step pipeline via
            ``get_sequential_inference``). Defaults to ``"auditor"``, the
            value that used to be hard-coded here.

    Returns:
        A copy of ``row`` with these entries added: 'prompt',
        'llm_prediction', 'llm_delegate', 'solution' (copied from 'Answer')
        and 'method'; the auditor method also adds 'llm_full_thought' and
        'llm_critique'. ``row`` itself is not modified.

    Raises:
        ValueError: if ``method`` is not one of the supported names. An
            unknown method must fail here; silently returning None would
            leave None entries in the results list.
    """
    if method not in ("base", "auditor"):
        raise ValueError(f"Unknown method {method!r}; expected 'base' or 'auditor'")

    base = create_prompt_base(row)

    # Save everything back to a copy of the row
    row_copy = row.copy()
    row_copy['prompt'] = base  # the prompt

    if method == "base":
        # Run the base LLM (standard single call)
        result = get_llm_base(base, MODEL)
        row_copy['llm_prediction'] = result['pred']
        row_copy['llm_delegate'] = result['del']
    else:
        # Run the 3-step loop; returns pred, full_thought, critique, del.
        # The same MODEL is used for every step (the LLM is its own auditor).
        result = get_sequential_inference(base, MODEL)
        row_copy['llm_full_thought'] = result['full_thought']  # The Reasoning + Prediction
        row_copy['llm_prediction'] = result['pred']            # Extracted from Step 1
        row_copy['llm_critique'] = result['critique']          # The Auditor's critique
        row_copy['llm_delegate'] = result['del']               # The Judge's final decision

    row_copy['solution'] = row['Answer']  # Ground truth for accuracy
    row_copy['method'] = method           # avoid mixing up methods

    return row_copy





In [11]:
# collected per-row outputs, filled in as worker threads finish
results = []

# draw a fresh random sample each run
# (deliberately unseeded: a fixed random state would re-sample the same rows every time)
N_SAMPLES = 50
sampled_rows = df.sample(n=N_SAMPLES)

# fan the rows out over a small thread pool; rows are gathered in completion order,
# so df_results is not in the same order as sampled_rows
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [
        executor.submit(call_llm, row_idx, row_data)
        for row_idx, row_data in sampled_rows.iterrows()
    ]
    for f in as_completed(futures):
        row_result = f.result()
        results.append(row_result)

# one DataFrame row per processed problem
df_results = pd.DataFrame(results)


In [12]:
# show the per-row results collected by the threaded run
df_results

Unnamed: 0,ID,Year,Problem Number,Question,Answer,Part,prompt,llm_full_thought,llm_prediction,llm_critique,llm_delegate,solution,method
249,2000-II-13,2000,13,The equation $2000x^6+100x^5+10x^3+x-2=0$ has ...,200,II,You are helping Person 1 the following math pr...,Set y = 10x to obtain Q(y) = 2y^6 + y^5 + 10y^...,200,Correct: the substitution y=10x yields the val...,0,200,auditor
261,2001-I-11,2001,11,"In a rectangular array of points, with 5 rows ...",149,I,You are helping Person 1 the following math pr...,Reasoning in one sentence: Relating x_i=(i−1)N...,125,The solution leaps from the five equalities to...,0,149,auditor
218,1999-10,1999,10,"Ten points in the plane are given, with no thr...",489,,You are helping Person 1 the following math pr...,One-sentence reasoning: There are 120 possible...,489,This reasoning is correct; it uses that no 4-e...,0,489,auditor
921,2024-II-3,2024,3,Find the number of ways to place a digit in ea...,45,II,You are helping Person 1 the following math pr...,Reasoning: Analyzing the column-wise sums with...,45,This reasoning is correct: it rightly conclude...,0,45,auditor
181,1997-2,1997,2,The nine horizontal and nine vertical lines on...,125,,You are helping Person 1 the following math pr...,Reasoning (in one sentence): On an 8×8 grid fo...,125,This reasoning is correct and complete: it cou...,0,125,auditor
443,2007-II-7,2007,7,"Given a real number $x,$ let $\lfloor x \rfloo...",553,II,You are helping Person 1 the following math pr...,The number of multiples of k in the interval [...,553,The reasoning is correct: the number of multip...,0,553,auditor
338,2004-I-2,2004,2,Set $A$ consists of $m$ consecutive integers w...,201,I,You are helping Person 1 the following math pr...,"Let A be m consecutive integers {a, a+1, ..., ...",201,"The algebra is correct, but the solution omits...",0,201,auditor
7,1983-8,1983,8,What is the largest $2$ -digit prime factor of...,61,,You are helping Person 1 the following math pr...,"By Kummer's theorem, v_p(C(200,100)) equals th...",61,"The reasoning is correct: by Kummer’s theorem,...",0,61,auditor
267,2001-II-2,2001,2,Each of the 2001 students at a high school stu...,298,II,You are helping Person 1 the following math pr...,Let x be the number who study both; with N=200...,298,Correct: using inclusion–exclusion with S∈{160...,0,298,auditor
585,2012-II-10,2012,10,Find the number of positive integers $n$ less ...,496,II,You are helping Person 1 the following math pr...,"Let k = floor(x) ≥ 1; then x ∈ [k, k+1) implie...",496,"Correct: for each k ≥ 1 with k^2 ≤ 999, x ∈ [k...",0,496,auditor


In [None]:
import datetime
import os

# write file; add timestamp
df_results['timestamp'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Derive the file prefix from the rows themselves so the output file cannot
# disagree with the method that produced them.
methods_in_batch = df_results['method'].unique()
if len(methods_in_batch) != 1:
    raise ValueError(
        f"Expected exactly one method in df_results, found: {list(methods_in_batch)}"
    )
results_filename = f"{methods_in_batch[0]}_{MODEL}.csv"


def _append_results(new_rows, target_path):
    """Append ``new_rows`` to the CSV at ``target_path`` (creating it if absent).

    Returns the combined frame that was written.
    """
    try:
        combined = pd.concat([pd.read_csv(target_path), new_rows], ignore_index=True)
    except FileNotFoundError:
        # First write for this method/model: nothing to append to yet.
        combined = new_rows
    combined.to_csv(target_path, index=False)
    return combined


# Each destination gets ONLY this batch appended to its own history.
# `df_results` is deliberately not reassigned to a combined frame: doing so
# before the second save would append the GCS history on top of the local
# history and duplicate earlier rows in the local file.
# NOTE: re-running this cell appends the same batch again.
path = 'gs://exceptions-data/LLM Delegation/AIME/Results/' + results_filename
_append_results(df_results, path)

# also save locally to results/AIME/
local_dir = '../results/AIME'
os.makedirs(local_dir, exist_ok=True)
local_path = os.path.join(local_dir, results_filename)
_append_results(df_results, local_path)
print(f"Saved locally to {local_path}")

In [None]:
# one-off check of the three-step pipeline on the first sampled row (makes live LLM calls)
get_sequential_inference(create_prompt_base(sampled_rows.iloc[0]), MODEL)

{'full_thought': 'Equating greens from the a-layers and the b-layers gives a = 5t and b = 3t; since each a-layer has yellow bc − 21 and each b-layer red ac − 45, the total red across b-layers equals b(ac − 45) and must equal a·9, which with a = 5t, b = 3t yields t c = 12; hence c = 12/t and the volume V = abc = (5t)(3t)(12/t) = 180t, minimized at t = 1 to a = 5, b = 3, c = 12, giving V = 180.\n\nSOLUTION: 180',
 'pred': '180',
 'critique': 'The method is essentially correct, but you must also require c = 12/t to be an integer (so t must be a positive divisor of 12); among those, the smallest volume is achieved at t = 1, giving V = 180.',
 'del': '0'}