In [20]:
import sys
import time
import string
import numpy as np
import pandas as pd
import os
from pydantic import BaseModel, Field
import matplotlib.pyplot as plt
from google import genai
from google.genai import types
import openai
from openai import OpenAI

# from anthropic import Anthropic

# OpenAI credentials: no key is assigned anywhere in this cell.
# NOTE(review): presumably the key (from the delegation OpenAI project) is supplied
# through the OPENAI_API_KEY environment variable — confirm before running.


# Model identifier handed to the LLM calls below. Exactly one line should be active.
# MODEL = "gpt-4o-2024-05-13"
# MODEL = "gpt-4o-mini-2024-07-18"
MODEL = "gpt-5-nano" # cheaper than 4o-mini!
# MODEL = "o1-2024-12-17"
# MODEL = "o3-mini-2025-01-31"
# MODEL = "meta-llama/Llama-3.2-3B-Instruct-Turbo"
# MODEL = "gemini-2.5-flash"

# Claude model ids are listed at: https://docs.anthropic.com/en/docs/about-claude/models/overview
# MODEL = "claude-3-5-haiku@20241022"
# MODEL = "claude-sonnet-4@20250514"
# MODEL = "claude-opus-4-20250514"

# Llama model ids are listed at: https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama/llama4-scout
# MODEL = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
# MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

# Vertex AI client for the Gemini code paths. The inference functions in this notebook
# call `openai` directly and only reference `client` in commented-out code, so this
# object is unused unless those Gemini lines are re-enabled.
# No `project` is passed (both candidates are commented out).
# NOTE(review): presumably the project is resolved from the environment — confirm.
client = genai.Client(
    vertexai=True,
    # project="accuracy-obsession",  # this links to my CC! Although credits might be in there before. Exceptions project is exceptions-467800
    # project = "exceptions-467800",
    location="us-central1"
)


In [13]:
import pandas as pd

# One-off preparation (kept for provenance, not meant to be re-run):
# download the JFLEG test and validation splits, concatenate them, and store the
# combined table in our bucket.
# data here: https://huggingface.co/datasets/jhu-clsp/jfleg
# splits = {'validation': 'data/validation-00000-of-00001.parquet', 'test': 'data/test-00000-of-00001.parquet'}
# df1 = pd.read_parquet("hf://datasets/jhu-clsp/jfleg/" + splits["test"])
# df2 = pd.read_parquet("hf://datasets/jhu-clsp/jfleg/" + splits["validation"])
# df = pd.concat([df1, df2])
# df.to_csv("gs://exceptions-data/LLM Delegation/JFLEG/data/data.csv")

# Load the combined table saved by the step above.
# NOTE(review): the preview of this frame shows `Unnamed: 0.1` and `Unnamed: 0`
# columns next to `sentence` and `corrections`; they look like index columns written
# by to_csv without index=False — confirm before relying on them.
df = pd.read_csv("gs://exceptions-data/LLM Delegation/JFLEG/data/data.csv")

In [14]:
# Preview the first rows to check that the `sentence` and `corrections` columns loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,sentence,corrections
0,0,New and new technology has been introduced to ...,['New technology has been introduced to societ...
1,1,One possible outcome is that an environmentall...,['One possible outcome is that an environmenta...
2,2,"Every person needs to know a bit about math , ...",['Every person needs to know a bit about math ...
3,3,While the travel company will most likely show...,['While the travel company will most likely sh...
4,4,Disadvantage is parking their car is very diff...,['A disadvantage is that parking their cars is...


In [17]:
import random

# scenario builder
def create_prompt_base(row):
    """Build the scenario text shared by every prompting method.

    Parameters
    ----------
    row : mapping-like (for example a pandas Series)
        Must support ``row['sentence']``; that value is embedded verbatim
        between single quotes at the end of the prompt.

    Returns
    -------
    str
        The scenario sentence that introduces Person 1 and the statement to check.
    """
    # The wording is part of the experiment and must stay stable across runs.
    template = "You are helping Person 1 with grammatical error correct. This statement needs to be checked: '{}'"
    statement = row['sentence']
    return template.format(statement)


"You are helping Person 1 with grammatical error correct. This statement needs to be checked: 'New and new technology has been introduced to the society .'"

In [37]:
from google import genai


# base LLM prompt
def get_llm_base(base_prompt, model_id=MODEL):
    """Ask the model, in a single call, for a correction and a delegate/implement decision.

    Parameters
    ----------
    base_prompt : str
        Scenario text; the two-line answer instructions are appended to it.
    model_id : str, optional
        Model name passed to the chat-completions call. Defaults to ``MODEL``.

    Returns
    -------
    dict
        ``{"pred": str | None, "del": int | None}``. ``pred`` is the first
        non-empty line of the reply and ``del`` is the second non-empty line
        converted with ``int``. Both are ``None`` when the reply has fewer than
        two non-empty lines or the second one is not an integer.

    Notes
    -----
    Errors raised by the API call itself are not caught here and propagate to
    the caller.
    """
    # 1. Construct the one-shot prompt
    instructions = (
        "\n\nRespond with two lines ONLY. "
        "First, provide your grammatical correction of the statement based on your prediction of one of Person 1's true grammatical corrections. \n"
        "Second, respond 0 to implement your prediction of one of Person 1's true grammatical corrections, "
        "or 1 if you would like to delegate to Person 1 so they can implement one of their true grammatical corrections. \n"
        "A grammatical correction is considered 'correct' if it is equal to one of Person 1's true grammatical corrections."
    )
    full_prompt = base_prompt + instructions

    # 2. Call the LLM

    # openAI version
    messages = [{"role": "user", "content": full_prompt}]
    response = openai.chat.completions.create(model=model_id, messages=messages)
    # `content` can be None, so fall back to an empty string and let the
    # parser below report it as an unparseable reply instead of crashing.
    raw_text = response.choices[0].message.content or ""

    # gemini version
    # response = client.models.generate_content(
    #    model=model_id,
    #    contents=full_prompt
    # )
    # raw_text = response.text or ""

    # 3. Parse the response: drop blank lines the model may have added, then
    #    read the correction (line 1) and the 0/1 decision (line 2).
    clean_lines = [line.strip() for line in raw_text.split('\n') if line.strip()]

    try:
        prediction = clean_lines[0]
        delegation = int(clean_lines[1])
    except (ValueError, IndexError):
        # Always return the same dict shape so callers can index the result;
        # the raw reply is printed to make the failure visible.
        print(f"Error parsing LLM response: {raw_text!r}")
        prediction = None
        delegation = None

    return {
        "pred": prediction,
        "del": delegation
    }

In [46]:
# auditor (three separate single-turn chats: one predicts, a fresh chat critiques, and another fresh chat decides; every step is sent to the same `model_name`)
def get_sequential_inference(base_scenario, model_name):
    """Run the generate -> critique -> judge pipeline for one scenario.

    Each stage is an independent single-turn chat sent to ``model_name``:
    the first produces reasoning plus a ``CORRECTION:`` line, the second
    critiques that output, and the third answers with the implement (0) or
    delegate (1) decision.

    Parameters
    ----------
    base_scenario : str
        Scenario text describing the statement to correct.
    model_name : str
        Model name passed to every chat-completions call.

    Returns
    -------
    dict
        Keys ``full_thought`` (stripped stage-1 reply), ``pred`` (text after
        the last ``CORRECTION:`` marker, or ``"0"`` when there is none),
        ``critique`` (stripped stage-2 reply) and ``del`` (stripped stage-3
        reply, kept as a string). If anything raises, the dict instead holds
        the exception text in ``full_thought``, ``"Err"`` for ``pred`` and
        ``critique``, and ``"1"`` for ``del``.
    """

    def ask(prompt):
        # One fresh, single-message conversation per stage (OpenAI version).
        # Gemini equivalent, currently disabled:
        #   client.models.generate_content(model=MODEL, contents=prompt).text.strip()
        reply = openai.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
        )
        return reply.choices[0].message.content.strip()

    try:
        import re

        # --- STAGE 1: the generator predicts and explains ---
        # Asking for the reasoning first is meant to improve the prediction.
        generator_prompt = (
            f"{base_scenario}\n\n"
            "TASK: Predict one of Person 1's grammatical correction. First, explain your reasoning in 1 sentence. "
            "Then, conclude with 'CORRECTION: your grammatical correction here."
        )
        full_output = ask(generator_prompt)

        # Keep only the text that follows the last CORRECTION: marker.
        found = re.findall(r'CORRECTION:\s*(.*)', full_output)
        if found:
            pred = found[-1].strip()
        else:
            pred = "0"

        # --- STAGE 2: the auditor critiques the stage-1 output in a new chat ---
        auditor_prompt = (
            f"SCENARIO: {base_scenario}\n\n"
            f"PROPOSED CORRECTION: {full_output}\n\n"
            "TASK: Critique this reasoning (1 sentence)."
        )
        critique = ask(auditor_prompt)

        # --- STAGE 3: the judge weighs prediction against critique in a new chat ---
        judge_prompt = (
            f"SCENARIO: {base_scenario}\n"
            f"CORRECTION & REASONING: {full_output}\n"
            f"CRITIQUE: {critique}\n\n"
            "TASK: Implement the grammatical correct (0) or delegate to Person 1 (1). The ground truth is one of Person 1's true grammatical correction.\n"
            "Respond with the digit ONLY."
        )
        delegate = ask(judge_prompt)

        outcome = {}
        outcome["full_thought"] = full_output
        outcome["pred"] = pred
        outcome["critique"] = critique
        outcome["del"] = delegate
        return outcome

    except Exception as e:
        # Best effort: report the failure in-band and fall back to delegating.
        return {"full_thought": str(e), "pred": "Err", "critique": "Err", "del": "1"}

In [47]:
# Smoke test: run the three-step auditor pipeline on the first dataset row with MODEL.
get_sequential_inference(create_prompt_base(df.iloc[0]), MODEL)

{'full_thought': 'Reasoning: Here "new" is repeated and redundant, "technology" is typically treated as an uncountable noun, and the standard phrase is "to society" rather than "to the society."\n\nCORRECTION: New technology has been introduced to society.',
 'pred': 'New technology has been introduced to society.',
 'critique': 'The reasoning is mostly sound: “new” is indeed redundant before “new technology,” “technology” is generally uncountable in this context (though you can say “technologies” when referring to multiple kinds), and “to society” is the idiomatic phrasing rather than “to the society,” so the proposed correction is appropriate.',
 'del': '0'}

In [63]:
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor, as_completed



# THE MULTI-THREADED RUNNER
def call_llm(idx, row, method=None):
    """Run one prompting method on a single dataset row and return an annotated copy.

    Parameters
    ----------
    idx : hashable
        Row label as produced by ``iterrows()``. Accepted so the function can be
        submitted with ``(idx, row)`` pairs; it is not used in the body.
    row : pandas.Series
        Dataset row. ``row['corrections']`` is copied to ``human_response`` and
        the row is passed to ``create_prompt_base`` to build the prompt.
    method : {"base", "auditor"}, optional
        Which pipeline to run. When omitted, the ``TEMP_METHOD`` value set at
        the top of the body is used.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with ``prompt``, ``llm_prediction``, ``llm_delegate``,
        ``human_response`` and ``method`` added. The auditor method also adds
        ``llm_full_thought`` and ``llm_critique``. The input row is not modified.

    Raises
    ------
    ValueError
        If the selected method has no implementation here. Previously such a
        value made the function return ``None`` silently.
    """
    ########################
    # DECIDE method to run #
    ########################
    # Default method, used when the caller does not pass `method`.
    # TEMP_METHOD = "base"
    # TEMP_METHOD = "sft"  # no branch below implements this; selecting it raises ValueError
    TEMP_METHOD = "auditor"
    if method is None:
        method = TEMP_METHOD

    # Fail before any (paid) LLM call is made for an unknown method.
    if method not in ("base", "auditor"):
        raise ValueError(
            f"Unsupported method {method!r}; expected 'base' or 'auditor'."
        )

    base = create_prompt_base(row)

    # Work on a copy so the sampled frame is left untouched.
    row_copy = row.copy()
    row_copy['prompt'] = base  # the prompt

    if method == "base":
        # Single call returning the correction and the 0/1 decision together.
        result = get_llm_base(base, MODEL)
        row_copy['llm_prediction'] = result['pred']
        row_copy['llm_delegate'] = result['del']
    else:
        # Three-step loop (generate, critique, judge); the same model plays every role.
        result = get_sequential_inference(base, MODEL)
        row_copy['llm_full_thought'] = result['full_thought']  # reasoning + prediction
        row_copy['llm_prediction'] = result['pred']            # text after the CORRECTION: marker
        row_copy['llm_critique'] = result['critique']          # the auditor's critique
        row_copy['llm_delegate'] = result['del']               # the judge's final decision

    row_copy['human_response'] = row['corrections']  # ground truth for accuracy
    row_copy['method'] = method  # avoid mixing up methods

    return row_copy





In [64]:
# Draw a fresh random sample of rows to send to the LLM.
N_SAMPLES = 50
# Deliberately unseeded: a fixed random_state would resample the same rows on every run.
# The sample size is capped at the table size so an oversized N_SAMPLES does not raise.
sampled_rows = df.sample(n=min(N_SAMPLES, len(df)))

# initialize results
results = []

# Fan the rows out over a small thread pool (the work is network-bound).
with ThreadPoolExecutor(max_workers=5) as executor:
    # Map each future back to its row label so failures can be reported.
    futures = {
        executor.submit(call_llm, idx, row): idx
        for idx, row in sampled_rows.iterrows()
    }
    for f in as_completed(futures):
        try:
            row_result = f.result()
        except Exception as exc:
            # One failed row must not throw away the calls that already completed.
            print(f"Row {futures[f]} failed and was skipped: {exc!r}")
            continue
        if row_result is None:
            # A worker that returns nothing would otherwise become an empty row.
            print(f"Row {futures[f]} returned no result and was skipped.")
            continue
        results.append(row_result)

# Collect the annotated rows; order follows completion, not the sample order.
df_results = pd.DataFrame(results)


In [65]:
# Display the annotated rows collected from the worker threads.
df_results

Unnamed: 0.1,Unnamed: 0,sentence,corrections,prompt,llm_full_thought,llm_prediction,llm_critique,llm_delegate,human_response,method
1178,430,"Then , define your goals and work hard to achi...",['Then define your goals and work hard to achi...,You are helping Person 1 with grammatical erro...,The sentence has a stray space before the comm...,"Then, define your goals and work hard to achie...",That reasoning correctly identifies the stray ...,0,['Then define your goals and work hard to achi...,auditor
426,426,and new gerrthion berfer to uesing the transpo...,['New Gerrthion prefers to use public transpor...,You are helping Person 1 with grammatical erro...,Reason: The sentence has multiple misspellings...,The new generation prefers to use public trans...,Preferring public transport could reduce car u...,0,['New Gerrthion prefers to use public transpor...,auditor
1283,535,So the governments and the scientists are also...,['So governments and scientists are also promo...,You are helping Person 1 with grammatical erro...,"Reasoning: The sentence has several issues—""th...",So governments and scientists are also promoti...,The reasoning accurately identifies the key er...,0,['So governments and scientists are also promo...,auditor
217,217,Maybe for the ecologistics in twenty years the...,"['Maybe , for the ecologists , in twenty years...",You are helping Person 1 with grammatical erro...,"Reasoning: The sentence uses an unusual term, ...","Maybe, for ecological reasons, in twenty years...",Mostly correct in identifying the nonstandard ...,0,"['Maybe , for the ecologists , in twenty years...",auditor
838,90,It kept Anasazi from living in American Southw...,['It kept Anasazi from living in the American ...,You are helping Person 1 with grammatical erro...,"The sentence needs fixes for article use, plur...",It prevented the Anasazi from living in the Am...,That fix improves grammar and punctuation and ...,0,['It kept Anasazi from living in the American ...,auditor
311,311,It is not worth to jump in pool without knowin...,['It is not worth it to jump in a pool without...,You are helping Person 1 with grammatical erro...,"The sentence should use the gerund after ""wort...",It's not worth jumping into a pool if you don'...,"Overall, the reasoning is sound in recognizing...",0,['It is not worth it to jump in a pool without...,auditor
954,206,so measures should be taken to check the fish ...,['Measures should be taken to check the fish p...,You are helping Person 1 with grammatical erro...,Reasoning: The sentence has capitalization at ...,So measures should be taken to check the fish ...,The reasoning correctly notes the capitalizati...,0,['Measures should be taken to check the fish p...,auditor
1437,689,On one hand you have the general practitioners...,['On one hand you have the general practitione...,You are helping Person 1 with grammatical erro...,Reasoning: The sentence needs parallel structu...,"On the one hand, you have general practitioner...",Reasoning correctly flags the need for paralle...,0,['On one hand you have the general practitione...,auditor
614,614,They do not need to know about what facts abou...,['They do not need to know about the facts for...,You are helping Person 1 with grammatical erro...,The sentence is jumbled with an extra 'about' ...,They do not need to know what facts there are ...,The reasoning is sound: it identifies the doub...,0,['They do not need to know about the facts for...,auditor
1245,497,"Of course , things are new and very risky .","['Of course , things are new and very risky . ...",You are helping Person 1 with grammatical erro...,Reasoning: The comma should follow the introdu...,"Of course, things are new and very risky.",Mostly correct: it identifies the comma after ...,0,"['Of course , things are new and very risky . ...",auditor


In [67]:
import datetime

# Append this batch to the results file for its method and model.
#
# - The batch is stamped on a copy, and the combined table gets its own name, so
#   `df_results` always stays "the rows produced by the last run". Previously the
#   cell overwrote `df_results` with existing + new rows, so running it a second
#   time wrote the whole history back into the file again.
# - The file prefix is taken from the `method` column written by call_llm
#   (it holds the TEMP_METHOD that was used), so the path can no longer drift
#   out of sync with the method that actually ran.
# - Run this cell once per batch: a second run still appends the batch again.
if df_results.empty:
    print("No results to save; skipping write.")
else:
    df_batch = df_results.assign(
        timestamp=datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    )

    batch_methods = df_batch['method'].dropna().unique()
    if len(batch_methods) != 1:
        raise ValueError(
            f"Expected exactly one method in this batch, found: {list(batch_methods)}"
        )

    # e.g. .../Results/auditor_<MODEL>.csv or .../Results/base_<MODEL>.csv
    path = (
        'gs://exceptions-data/LLM Delegation/JFLEG/Results/'
        + str(batch_methods[0]) + '_' + MODEL + '.csv'
    )

    # Load, append, and re-save
    try:
        df_existing = pd.read_csv(path)
    except FileNotFoundError:
        df_existing = None  # first batch for this method/model: nothing to append to

    if df_existing is None:
        df_combined = df_batch
    else:
        df_combined = pd.concat([df_existing, df_batch], ignore_index=True)

    df_combined.to_csv(path, index=False)