In [1]:
import sys
import time
import string
import numpy as np
import pandas as pd
import os
from pydantic import BaseModel, Field
import matplotlib.pyplot as plt
from google import genai
from google.genai import types
import openai
from openai import OpenAI

# from anthropic import Anthropic

# key from delegation OpenAI project


# --- model selection ---------------------------------------------------------
# Exactly one MODEL assignment is active; the others are kept as commented-out
# alternatives, grouped by provider.

# OpenAI
# MODEL = "gpt-4o-2024-05-13"
# MODEL = "gpt-4o-mini-2024-07-18"
# MODEL = "o1-2024-12-17"
# MODEL = "o3-mini-2025-01-31"
MODEL = "gpt-5-nano"  # cheaper than 4o-mini!

# Gemini
# MODEL = "gemini-2.5-flash"

# Claude -- model list: https://docs.anthropic.com/en/docs/about-claude/models/overview
# MODEL = "claude-3-5-haiku@20241022"
# MODEL = "claude-sonnet-4@20250514"
# MODEL = "claude-opus-4-20250514"

# Llama -- model list: https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama/llama4-scout?utm_source=chatgpt.com
# MODEL = "meta-llama/Llama-3.2-3B-Instruct-Turbo"
# MODEL = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
# MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

# --- initialize client -------------------------------------------------------
# `project` is intentionally not passed; the candidate values are kept for reference:
#   project="accuracy-obsession"  # this links to my CC! Although credits might be in there before. Exceptions project is exceptions-467800
#   project="exceptions-467800"
client = genai.Client(vertexai=True, location="us-central1")



In [2]:
# load and clean data
# One chain: read the JSONL file, treat empty strings as missing, drop rows that
# are missing everywhere, keep only claims with a binary verdict, and add a 0/1
# `supports` column derived from `label`.
df = (
    pd.read_json(
        path_or_buf="gs://exceptions-data/LLM Delegation/FEVEROUS/data/feverous_train_challenges.jsonl",
        lines=True,
    )
    .replace('', pd.NA)
    .dropna(how='all')
    .loc[lambda frame: frame["label"] != "NOT ENOUGH INFO"]
    .assign(supports=lambda frame: frame["label"].map({"SUPPORTS": 1, "REFUTES": 0}))
)
df.head()

Unnamed: 0,id,claim,label,evidence,annotator_operations,challenge,supports
1,24435,Michael Folivi competed with ten teams from 20...,REFUTES,"[{'content': ['Michael Folivi_cell_1_2_0', 'Mi...","[{'operation': 'start', 'value': 'start', 'tim...",Numerical Reasoning,0
2,14802,Asiatic Society of Bangladesh(housed in Nimtal...,SUPPORTS,[{'content': ['Asiatic Society of Bangladesh_s...,"[{'operation': 'start', 'value': 'start', 'tim...",Other,1
3,28540,"Lindfield railway station has 3 bus routes, in...",SUPPORTS,[{'content': ['Lindfield railway station_sente...,"[{'operation': 'start', 'value': 'start', 'tim...",Other,1
4,71874,Mukaradeeb('Wolf's Den') is a city in Iraq nea...,SUPPORTS,"[{'content': ['Mukaradeeb_sentence_1', 'Mukara...","[{'operation': 'start', 'value': 'start', 'tim...",Combining Tables and Text,1
5,70296,Herbivore men was coined by Maki Fukasawa and ...,SUPPORTS,"[{'content': ['Herbivore men_sentence_1', 'Her...","[{'operation': 'start', 'value': 'start', 'tim...",Multi-hop Reasoning,1


In [3]:
import random

# scenario builder
def create_prompt_base(row):
    """Build the base fact-checking prompt for one claim.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'claim' entry.

    Returns:
        The prompt string, ending with the claim text. A closing period is
        appended only when the claim does not already end in sentence
        punctuation, so claims that end in "." no longer render as "..".
    """
    claim = str(row['claim']).strip()
    if not claim.endswith(('.', '!', '?')):
        claim += '.'
    return f"You are helping Person 1 with fact-checking. This statement needs to be checked: {claim}"


In [4]:
# sanity check: render the base prompt for the first row of the cleaned data
create_prompt_base(df.iloc[0])

'You are helping Person 1 with fact-checking. This statement needs to be checked: Michael Folivi competed with ten teams from 2016 to 2021, appearing in 54 games and making seven goals in total..'

In [5]:
from google import genai


# base LLM prompt
def get_llm_base(base_prompt, model_id=MODEL):
    """Ask the model for a prediction and a delegation decision in a single call.

    The answer-format instructions are appended to `base_prompt`, the result is
    sent as one user message through the OpenAI chat-completions API, and the
    first two non-empty lines of the reply are parsed as integers.

    Args:
        base_prompt: Scenario text (see `create_prompt_base`).
        model_id: Model name passed to the API; defaults to the notebook-level MODEL.

    Returns:
        dict with keys:
            "pred": int parsed from the first non-empty reply line, or None.
            "del":  int parsed from the second non-empty reply line, or None.
        Both values are None when either line is missing or is not an integer.
        A dict is returned in every case, so callers can always index
        result['pred'] / result['del'].
    """
    # 1. Append the answer-format instructions to the scenario
    instructions = (
        "\n\nRespond with two numbers ONLY on two lines. "
        "First, respond 1 (support) or 0 (refute) based on your prediction of Person 1's true decision. \n"
        "Second, respond 0 to implement your prediction of Person 1's true decision, "
        "or 1 if you would like to delegate to Person 1 so they can implement their true decision. \n"
        "A decision is considered 'correct' if it is equal to Person 1's true decision."
    )
    full_prompt = base_prompt + instructions

    # 2. Call the LLM (OpenAI version)
    messages = [{"role": "user", "content": full_prompt}]
    response = openai.chat.completions.create(model=model_id, messages=messages)

    # The reply content may be None; treat that as an empty reply
    content = response.choices[0].message.content or ""

    # 3. Parse the reply, skipping blank lines the model may have added
    clean_lines = [line.strip() for line in content.splitlines() if line.strip()]
    try:
        prediction = int(clean_lines[0])
        delegation = int(clean_lines[1])
    except (ValueError, IndexError):
        # Report the unparseable reply instead of failing the whole batch
        print(f"Error parsing LLM response: {content!r}")
        prediction = None
        delegation = None

    return {
        "pred": prediction,
        "del": delegation
    }

In [6]:
def _single_turn_chat(prompt, model_name):
    """Send `prompt` as the only user message of a fresh chat and return the stripped reply text.

    OpenAI version. The Gemini equivalent previously kept inline as commented-out code was:
        client.models.generate_content(model=MODEL, contents=prompt).text.strip()
    """
    reply = openai.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
    )
    return reply.choices[0].message.content.strip()


# auditor: three independent single-turn chats (generator -> auditor -> judge).
# All three steps are sent to the same `model_name`; they are "separate" only in
# that no chat history is shared between them.
def get_sequential_inference(base_scenario, model_name):
    """Run the predict -> critique -> decide chain for one scenario.

    Args:
        base_scenario: Scenario text (see `create_prompt_base`).
        model_name: Model name used for all three API calls.

    Returns:
        dict with string values:
            "full_thought": the generator's full reply (reasoning + prediction).
            "pred": the last 'PREDICTION: <0|1>' digit found in that reply,
                    or "0" if none was found.
            "critique": the auditor's reply.
            "del": the judge's raw reply (expected to be "0" or "1").
        If any step raises, the exception text is returned in "full_thought",
        "pred" and "critique" are "Err", and "del" is "1" (delegate).
    """
    import re  # local import so this cell does not depend on a later cell's imports

    try:
        # --- STEP 1: PREDICT & EXPLAIN (The Generator) ---
        # Reasoning is requested before the prediction.
        p1 = (f"{base_scenario}\n\n"
              "TASK: Predict Person 1's decision. First, explain your reasoning in 1 sentence. "
              "Then, conclude with 'PREDICTION: 1' (support) or 'PREDICTION: 0' (refute).")
        full_output = _single_turn_chat(p1, model_name)

        # Take the last PREDICTION marker in case the model states more than one.
        # NOTE(review): a reply without any marker falls back to "0", which is
        # indistinguishable from a genuine "refute" downstream -- confirm this is intended.
        preds = re.findall(r'PREDICTION:\s*([01])', full_output)
        pred = preds[-1] if preds else "0"

        # --- STEP 2: CRITIQUE (The Auditor) ---
        # New chat: only the scenario and the proposed logic are shown.
        p2 = (f"SCENARIO: {base_scenario}\n\n"
              f"PROPOSED LOGIC: {full_output}\n\n"
              "TASK: Critique this reasoning (1 sentence).")
        critique = _single_turn_chat(p2, model_name)

        # --- STEP 3: FINAL DECISION (The Judge) ---
        # New chat: weighs the prediction against the critique.
        p3 = (f"SCENARIO: {base_scenario}\n"
              f"PREDICTION & REASONING: {full_output}\n"
              f"CRITIQUE: {critique}\n\n"
              "TASK: Implement the prediction (0) or delegate to Person 1 (1). The ground truth is Person 1's preference.\n"
              "Respond with the digit ONLY.")
        delegate = _single_turn_chat(p3, model_name)

        return {
            "full_thought": full_output,
            "pred": pred,
            "critique": critique,
            "del": delegate
        }

    except Exception as e:
        # Best effort: record the failure in the row and delegate, rather than abort the batch
        return {"full_thought": str(e), "pred": "Err", "critique": "Err", "del": "1"}

In [7]:
# smoke test: run the generator -> auditor -> judge chain on the first row (makes live API calls)
get_sequential_inference(create_prompt_base(df.iloc[0]), MODEL)

{'full_thought': 'Reasoning: Given Folivi’s career history up to 2021, having ten different teams with 54 appearances and seven goals across five years seems implausible and unlikely. \nPREDICTION: 0',
 'pred': '0',
 'critique': 'That reasoning relies on subjective plausibility without citing sources and ignores what "ten teams" could include (loans, youth/non-league stints), so it cannot reliably judge the statement without checking actual records.',
 'del': '1'}

In [11]:
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor, as_completed



# THE MULTI-THREADED RUNNER
def call_llm(idx, row, method="base"):
    """Run one inference method on one data row and return an annotated copy of the row.

    ########################
    # DECIDE method to run #
    ########################
    The method used to be a hard-coded TEMP_METHOD inside this function; it is
    now the `method` argument. Change the default below or pass `method=...`
    (e.g. as an extra argument to `executor.submit`).

    Args:
        idx: Row index. Not used here; kept so existing callers keep working.
        row: Mapping-like row (e.g. a pandas Series) with 'claim' and 'supports'.
        method: "base" (single call via `get_llm_base`) or
                "auditor" (three-step chain via `get_sequential_inference`).

    Returns:
        A copy of `row` with these entries added, in this order:
            prompt, [llm_full_thought], llm_prediction, [llm_critique],
            llm_delegate, human_response, method
        (bracketed entries only for "auditor"). The input row is not modified.

    Raises:
        ValueError: if `method` is not one of the implemented methods
            (e.g. "sft", which has no implementation in this notebook).
            Previously such a value made the function silently return None.
    """
    if method not in ("base", "auditor"):
        raise ValueError(f"Unknown method {method!r}; expected 'base' or 'auditor'")

    base = create_prompt_base(row)

    if method == "base":
        result = get_llm_base(base, MODEL)  # standard call
        new_fields = {
            'prompt': base,                       # the prompt
            'llm_prediction': result['pred'],
            'llm_delegate': result['del'],
        }
    else:
        # 3-step loop; the same MODEL acts as generator, auditor and judge
        result = get_sequential_inference(base, MODEL)
        new_fields = {
            'prompt': base,                              # the prompt
            'llm_full_thought': result['full_thought'],  # reasoning + prediction
            'llm_prediction': result['pred'],            # digit extracted in step 1
            'llm_critique': result['critique'],          # the auditor's critique
            'llm_delegate': result['del'],               # the judge's final decision
        }

    new_fields['human_response'] = row['supports']  # ground truth for accuracy
    new_fields['method'] = method                   # avoid mixing up methods

    # Write onto a copy so the sampled frame itself is left untouched
    row_copy = row.copy()
    for column, value in new_fields.items():
        row_copy[column] = value
    return row_copy






In [12]:
# get sample df
# NOTE(review): the original comment said this sample "wasn't used to train the
# random forest"; nothing in this cell enforces that -- confirm or drop the claim.
N_SAMPLES = 200
# Deliberately unseeded: a fixed random_state would draw the same rows on every run.
# Capped at len(df) so a small frame does not make sample() raise.
sampled_rows = df.sample(n=min(N_SAMPLES, len(df)))

# initialize results
results = []
failed = []  # (row index, exception or None) for rows that produced no result

# make call
with ThreadPoolExecutor(max_workers=5) as executor:
    # Map each future back to its row index so failures can be reported
    futures = {executor.submit(call_llm, idx, row): idx for idx, row in sampled_rows.iterrows()}
    for f in as_completed(futures):
        row_idx = futures[f]
        try:
            row_result = f.result()
        except Exception as exc:
            # One failed call must not discard the results already collected
            failed.append((row_idx, exc))
            print(f"Row {row_idx}: LLM call failed: {exc!r}")
            continue
        if row_result is None:
            # call_llm returned nothing (e.g. a method without an implementation)
            failed.append((row_idx, None))
            print(f"Row {row_idx}: call_llm returned no result")
            continue
        results.append(row_result)

if failed:
    print(f"{len(failed)} of {len(sampled_rows)} rows failed; see `failed` for details")

# save
df_results = pd.DataFrame(results)


In [13]:
import datetime

# write file; add timestamp

# Guard against re-running this cell: after a save, `df_results` already holds the
# merged, timestamped history, and appending it again would duplicate the file's
# contents and overwrite the old timestamps.
if 'timestamp' in df_results.columns:
    raise RuntimeError("df_results was already timestamped and saved; re-run the sampling cell first")
if df_results.empty:
    raise ValueError("df_results is empty; nothing to save")

# Derive the file prefix from the `method` column written by call_llm, so the
# file always matches the method that produced the rows (previously the path
# had to be edited by hand to match TEMP_METHOD).
methods = df_results['method'].dropna().unique()
if len(methods) != 1:
    raise ValueError(f"Expected exactly one method in df_results, found: {list(methods)}")
path = 'gs://exceptions-data/LLM Delegation/FEVEROUS/Results/' + str(methods[0]) + '_' + MODEL + '.csv'

df_results['timestamp'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Load, append, and re-save
try:
    df_existing = pd.read_csv(path)
except FileNotFoundError:
    # First run for this method/model: there is nothing to append to
    print(f"No existing results at {path}; creating a new file")
else:
    df_results = pd.concat([df_existing, df_results], ignore_index=True)

df_results.to_csv(path, index=False)