In [2]:
import random
import json
import asyncio

from datasets import load_dataset
from tqdm.asyncio import tqdm_asyncio

from predictors import *
from llm import *
from prompts import *
from tools import *

# Model identifiers for the three model tiers used in the experiments.
BIG_MODEL = "gpt-4.1"
MEDIUM_MODEL = "gpt-4.1-mini"
SMALL_MODEL = "gpt-4.1-nano"

In [3]:
# Load the "main" configuration of the gsm8k dataset via datasets.load_dataset.
dataset = load_dataset("gsm8k", "main")
# NOTE(review): build_datasets comes from a star import; presumably 1000 is the
# number of sampled cases and 0.7 the training fraction — confirm against its definition.
training, validation = build_datasets(dataset, 1000, 0.7)

In [16]:
# Template that renders one case as an XML-like block.
# Placeholders: {question}, {thoughts}, {answer}.
case_prompt = """
<problem>
    <question>
        {question}
    </question>
    
    <thoughts>
        {thoughts}
    </thoughts>
    
    <answer>
        {answer}
    </answer>
</problem>
"""


def make_prompt_case(case):
    """Render ``case`` into the ``case_prompt`` template.

    Args:
        case: Mapping that provides at least the keys named by the template
            placeholders (``question``, ``thoughts``, ``answer``). Any extra
            keys are ignored.

    Returns:
        The template text with every placeholder substituted.

    Raises:
        KeyError: If ``case`` lacks a key required by the template.
    """
    # format_map looks up only the fields the template references, which is
    # what the previous string.Formatter().parse() key filtering achieved,
    # without depending on the `string` module being in scope.
    return case_prompt.format_map(case)

# Instruction text for the "teacher" model; {problem} receives a rendered case.
teacher_template = """
You are an expert at solving math word problems. 
Given a math question, provide a step-by-step method to solve this type of problem without being too specific to the problem itself.

Your instructions will be provided to a LLM student agent who will follow the method you create on a slightly different problem.
You can assume that we will only change the quantities and the name of the people.

Finally, solve the problem using your method so I can include an example of applying the technique.

{problem}
"""


def make_prompt_teacher(case):
    """Build the teacher prompt for ``case``.

    The case is first rendered with ``make_prompt_case`` and the resulting
    text is inserted into the ``{problem}`` slot of ``teacher_template``.
    """
    rendered_case = make_prompt_case(case)
    return teacher_template.format(problem=rendered_case)


# Preview the teacher prompt built from the first training case.
print(make_prompt_teacher(training[0]))


You are an expert at solving math word problems. 
Given a math question, provide a step-by-step method to solve this type of problem without being too specific to the problem itself.

Your instructions will be provided to a LLM student agent who will follow the method you create on a slightly different problem.
You can assume that we will only change the quantities and the name of the people.

Finally, solve the problem using your method so I can include an example of applying the technique.


<problem>
    <question>
        Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
    </question>
    
    <thoughts>
        Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
    </thoughts>
    
    <answer>
        72
    </answer>
</problem>




You are an expert at solving math word problems. 

<problem> 

    <question> 
    Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips   did Natalia sell altogether in April and May? 
    </question> 

</problem> 
1. The goal is to identify the relevant quantities of the problem. We don't need to calculate them now.
1.a. Rewrite the statement, thinking step-by-step about which quantities are relevant.
1.b. Give a list [(description: str, quantity_name: str)] with the description of the quantity and the quantity name (a variable name using snake case). 
2. The goal is to identify the functional relationship as equations between the quantities.
2.a.  Rewrite the pieces of text that express a relationship between two or more quantities.
2.b. For each relationship, write a math equation between the respective variable names of the quantities defined in (1.b)
3. The goal is to identify the constants in the statement and their relationship with the quantities.
3.a. Rewrite the pieces of text where any constant appears.
3.b. For each constant, write math equations between it and the quantities. Use the quantity names defined in (1.b)
4. The goal is to parse the preceding trace of messages and summarize everything in a single JSON output with the following format:
{
'quantities': [ name_of_quantity: str ]
'equations': [ math_equation: str ]
''
}


In [2]:
# Smoke-test predict_cot on a trivial question with the big model and display the returned value.
r = await predict_cot("How much is 2+2?", model=BIG_MODEL)
r

 'response': '{"thoughts":"STEP 1 Extract Data\\n• Number 1 = 2\\n• Number 2 = 2\\nSTEP 2 Set Up\\n• Let sum = Number 1 + Number 2\\nSTEP 3 Compute Explicitly (MANDATORY)\\n2 + 2 = 4\\nSTEP 4 Convert / Adjust\\n• No unit conversion needed\\nSTEP 5 Sanity Check\\n• 2 plus 2 is 4, which matches basic arithmetic\\nSTEP 6 Produce Output","value":4}',
 'predicted': 4,
 'predicted_thoughts': 'STEP 1 Extract Data\n• Number 1 = 2\n• Number 2 = 2\nSTEP 2 Set Up\n• Let sum = Number 1 + Number 2\nSTEP 3 Compute Explicitly (MANDATORY)\n2 + 2 = 4\nSTEP 4 Convert / Adjust\n• No unit conversion needed\nSTEP 5 Sanity Check\n• 2 plus 2 is 4, which matches basic arithmetic\nSTEP 6 Produce Output',
 'model': 'gpt-4.1'}

In [None]:
async def evaluate(test_set, predict_fn, tag, max_concurrent_requests: int = 10):
    """Run ``predict_fn`` over ``test_set`` concurrently and score every item.

    Args:
        test_set: Iterable of items; each item must support ``item["question"]``.
        predict_fn: Async callable that takes the question and returns a prediction.
        tag: Label stored alongside the results.
        max_concurrent_requests: Maximum number of ``predict_fn`` calls allowed
            in flight at once. Must be at least 1.

    Returns:
        ``{'tag': tag, 'results': [...]}``, where each entry is the value
        returned by ``evaluate_item_result``. Entries are appended as tasks
        finish, so their order follows completion, not the order of ``test_set``.

    Raises:
        ValueError: If ``max_concurrent_requests`` is smaller than 1.
    """
    if max_concurrent_requests < 1:
        # A semaphore created with 0 permits never admits a task, so the
        # evaluation would hang forever instead of failing.
        raise ValueError("max_concurrent_requests must be >= 1")
    semaphore = asyncio.Semaphore(max_concurrent_requests)

    async def evaluate_item(item):
        # The semaphore caps how many predictions run at the same time.
        async with semaphore:
            result = await predict_fn(item["question"])
            return evaluate_item_result(item, result)

    tasks = [evaluate_item(item) for item in test_set]
    results = []
    for finished in tqdm_asyncio.as_completed(tasks):
        results.append(await finished)
    return {
        'tag': tag,
        'results': results,
    }



model = SMALL_MODEL
fn_name = 'predict_cot'
predict_fn = partial(eval(fn_name), model=model)
tag = fn_name + '-' + model

results_examples = await evaluate(validation[:3], predict_fn, tag, max_concurrent_requests=10)

100%|█████████████████████████████████████████████| 300/300 [00:57<00:00,  5.23it/s]


In [3]:
# Scratch check: a sample structured output (presumably pasted from a model
# response — confirm) used to inspect which top-level keys it contains.
z={'thoughts': 'Step 1: Extract Data\n- Total days in April = 30\n- Sundays in April = 4\n- Total days John walks the dog = 30 - 4 = 26\n- Cost per day for walking the dog = $10\n- Total money spent on dog walking = 26 * 10 = $260\n- Money spent on books = $50\n- Money given to Kaylee = $50\n- Total money spent on books and Kaylee = 50 + 50 = $100\n- Total money spent = 260 + 100 = $360\nSanity check: John spends $360, so initial money must be at least $360 to have any left.', 'plan': 'Calculate total expenditure and subtract from initial amount to find remaining money.', 'execute': 'Total dog walking cost = 26 * 10 = $260\nTotal spent on books and giving to Kaylee = 50 + 50 = $100\nTotal expenditure = 260 + 100 = $360\nRemaining money = initial amount - total expenditure\nSince the initial amount is not given, assume initial amount = x\nRemaining money = x - 360', 'sanity_check': "The total expenditure is $360. The remaining money depends on John's initial amount. Without initial amount, we cannot compute the exact remaining money.", 'produce': 'The remaining money is initial amount minus $360.'}
list(z.keys())

['thoughts', 'plan', 'execute', 'sanity_check', 'produce']

In [2]:
import pickle
# Reload previously saved evaluation results from disk.
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
with open("results_examples_cot_small_model.pkl", "rb") as f:
    results_examples = pickle.load(f)

In [4]:
def calculate_pass_rate(results):
    """Summarise evaluation results as percentages.

    Args:
        results: Sequence of per-item result mappings, each providing the
            truthy/falsy flags ``"pass"``, ``"format_error"`` and ``"api_error"``.

    Returns:
        Dict with ``pass_rate``, ``format_error_rate`` and ``api_error_rate``,
        each the percentage (0-100) of items whose flag is truthy. An empty
        ``results`` yields 0.0 for every rate instead of dividing by zero.
    """
    total = len(results)
    if total == 0:
        # Nothing was evaluated; report zeros rather than raising ZeroDivisionError.
        return {
            'pass_rate': 0.0,
            'format_error_rate': 0.0,
            'api_error_rate': 0.0,
        }

    def rate(flag):
        # Percentage of items whose `flag` entry is truthy.
        flagged = sum(1 for result in results if result[flag])
        return (flagged / total) * 100

    return {
        'pass_rate': rate("pass"),
        'format_error_rate': rate("format_error"),
        'api_error_rate': rate("api_error"),
    }
# Summarise the evaluation run currently held in results_examples.
calculate_pass_rate(results_examples['results'])

{'pass_rate': 88.66666666666667,
 'format_error_rate': 1.0,
 'api_error_rate': 0.0}

In [9]:
import pickle
# Persist the evaluation results so a later session can reload them without re-running the evaluation.
# Requires results_examples to already exist in the kernel.
with open("results_examples_cot_small_model.pkl", "wb") as f:
    pickle.dump(results_examples, f)

BIG_MODEL naive

{'pass_rate': 59.333333333333336,
 'format_error_rate': 2.3333333333333335,
 'api_error_rate': 0.0}

BIG_MODEL CoT

{'pass_rate': 88.0, 'format_error_rate': 0.0, 'api_error_rate': 0.0}

{'pass_rate': 94.0,
 'format_error_rate': 0.0,
 'api_error_rate': 0.33333333333333337}


BIG_MODEL CoT fixed examples

{'pass_rate': 95.33333333333334, 'format_error_rate': 0.0, 'api_error_rate': 0.33333333333333337}

NANO naive

{'pass_rate': 27.0, 'format_error_rate': 6.0, 'api_error_rate': 0.0}

NANO CoT

{'pass_rate': 66.33333333333333, 'format_error_rate': 0.0, 'api_error_rate': 1.0}
{'pass_rate': 67.33333333333333, 'format_error_rate': 0.0, 'api_error_rate': 0.6666666666666667}
{'pass_rate': 66.66666666666666, 'format_error_rate': 0.0, 'api_error_rate': 0.0}

NANO CoT (improved by meta-prompting 1 iteration using BIG_MODEL)
{'pass_rate': 84.33333333333334, 'format_error_rate': 0.0,'api_error_rate': 0.0}
NANO CoT (improved by meta-prompting 2 iterations using BIG_MODEL)
(similar result as before)

NANO CoT (improved by meta-prompting 1 iteration using o3 reasoning)
{'pass_rate': 88.66666666666667, 'format_error_rate': 1.0, 'api_error_rate': 0.0}

NANO CoT fixed examples
3 examples
{'pass_rate': 76.66666666666667, 'format_error_rate': 0.0, 'api_error_rate': 1.3333333333333335}
6 examples
{'pass_rate': 89.0, 'format_error_rate': 0.0, 'api_error_rate': 0.0}
12 examples
{'pass_rate': 82.0, 'format_error_rate': 0.0, 'api_error_rate': 0.0}

NANO CoT learned examples
3 examples
{'pass_rate': 77.0, 'format_error_rate': 0.0, 'api_error_rate': 0.0}
6 examples
{'pass_rate': 89.0, 'format_error_rate': 0.0, 'api_error_rate': 0.0}
12 examples

50 examples

100 examples


In [14]:
# Scratch check of f-string escaping: "{{" renders as a literal "{", while "%%" is left untouched.
f"""
%%a{{
"""

'\n%%a{\n'

In [15]:
# Fixed prompt that requests a long answer, so each timed request produces many output tokens.
DUMMY_PROMPT = "Explain quantum entanglement in simple terms, using metaphors and examples. Make the explanation about 2000 words long."
import time
import tiktoken
from openai import OpenAI

# Synchronous OpenAI client; no credentials are passed explicitly here.
client = OpenAI()
# Tokenizer used by count_tokens to measure response length.
# NOTE(review): this is the "gpt-4" encoding, while the benchmark targets
# gpt-4.1 models — confirm that they share a tokenizer.
encoding = tiktoken.encoding_for_model("gpt-4")

def count_tokens(text):
    """Return how many tokens ``text`` occupies under the module-level ``encoding``."""
    token_ids = encoding.encode(text)
    return len(token_ids)

def measure_speed(client: OpenAI, model_name: str):
    """Time one chat completion and report the output-token throughput.

    Sends ``DUMMY_PROMPT`` as a single user message at temperature 0 and
    measures the wall-clock time of the (non-streaming) request.

    Args:
        client: Client used to issue the chat completion request.
        model_name: Name of the model to benchmark.

    Returns:
        Dict with ``model``, ``output_tokens``, ``elapsed_sec`` and
        ``speed_tokens_per_sec`` (output tokens divided by elapsed seconds).
    """
    messages = [{"role": "user", "content": DUMMY_PROMPT}]

    start = time.monotonic()
    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0,
    )
    elapsed = time.monotonic() - start

    # Prefer the token count reported by the API: it reflects the tokenizer of
    # the model that actually answered, whereas count_tokens uses one fixed
    # local encoding for every model.
    usage = getattr(response, "usage", None)
    reported_tokens = getattr(usage, "completion_tokens", None)
    if reported_tokens is not None:
        output_tokens = reported_tokens
    else:
        # Fall back to local counting; content may be None when the reply
        # carries no text, so treat that as an empty string.
        content = response.choices[0].message.content or ""
        output_tokens = count_tokens(content)
    speed = output_tokens / elapsed

    return {
        "model": model_name,
        "output_tokens": output_tokens,
        "elapsed_sec": elapsed,
        "speed_tokens_per_sec": speed,
    }

In [16]:
def run_benchmark(models=None):
    """Benchmark generation speed for several models, one after another.

    Args:
        models: Optional iterable of model names to benchmark. Defaults to
            ``["gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano"]``.

    Returns:
        List of the dicts returned by ``measure_speed``, in the order the
        models were benchmarked. A short report is printed for each model.
    """
    if models is None:
        models = ["gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano"]
    results = []

    for model in models:
        print(f"Benchmarking {model}...")
        result = measure_speed(client, model)
        print(f"Model: {result['model']}")
        print(f"Output Tokens: {result['output_tokens']}")
        print(f"Elapsed Time: {result['elapsed_sec']:.2f} sec")
        print(f"Speed: {result['speed_tokens_per_sec']:.2f} tokens/sec\n")
        results.append(result)

    return results


In [17]:
# Run the benchmark once and keep the per-model results for later comparison.
res = run_benchmark()
res

Benchmarking gpt-4.1...
Model: gpt-4.1
Output Tokens: 2635
Elapsed Time: 58.71 sec
Speed: 44.88 tokens/sec

Benchmarking gpt-4.1-mini...
Model: gpt-4.1-mini
Output Tokens: 1642
Elapsed Time: 17.53 sec
Speed: 93.67 tokens/sec

Benchmarking gpt-4.1-nano...
Model: gpt-4.1-nano
Output Tokens: 2102
Elapsed Time: 9.23 sec
Speed: 227.68 tokens/sec



[{'model': 'gpt-4.1',
  'output_tokens': 2635,
  'elapsed_sec': 58.7119711250125,
  'speed_tokens_per_sec': 44.88011472804115},
 {'model': 'gpt-4.1-mini',
  'output_tokens': 1642,
  'elapsed_sec': 17.52961220900761,
  'speed_tokens_per_sec': 93.6700698465113},
 {'model': 'gpt-4.1-nano',
  'output_tokens': 2102,
  'elapsed_sec': 9.232372332990053,
  'speed_tokens_per_sec': 227.67712611512857}]

In [18]:
# Throughput (tokens/sec) of each benchmarked model, in the order returned by run_benchmark.
b=res[0]['speed_tokens_per_sec']
m=res[1]['speed_tokens_per_sec']
s=res[2]['speed_tokens_per_sec']
# Speed ratios between consecutive entries: first vs second, second vs third.
b/m,m/s

(0.47912972416463606, 0.4114162517983714)