In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import random

from datasets import load_dataset
from llmplus import GenerationConfig, LLMClient, Provider

  from .autonotebook import tqdm as notebook_tqdm


In [49]:
from collabmem.compose_q import (
    BaselinePrediction,
    ComposedExample,
    build_baseline_prompts,
    evaluate_baseline_accuracy,
    extract_final_numeric_from_model_output,
    subsample_qa_examples,
)
from collabmem.execute_code import run_snippet

from collabmem.constants import REPO_ROOT

In [29]:
# setup OAI client
client = LLMClient(
    provider=Provider.OPENAI,
    dotenv_path=(REPO_ROOT / ".env"),
    cache_dir=(REPO_ROOT / ".llm_cache"),
)
GPT5_MINI = "gpt-5-mini-2025-08-07"

In [5]:
gsm8k = load_dataset("gsm8k", "main", split="train")

In [6]:
gsm8k

Dataset({
    features: ['question', 'answer'],
    num_rows: 7473
})

In [12]:
gsm8k[0]["question"]

'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'

## subsample some questions


In [15]:
# def subsample_qa_examples(
#     questions: list[str],
#     answers: list[str],
#     n: int,
#     seed: int = 0,
# ) -> list[QAExample]:
qa_examples = subsample_qa_examples(
    questions=list(gsm8k["question"]),
    answers=list(gsm8k["answer"]),
    n=5,
    seed=42,
)

In [16]:
qa_examples[0]

QAExample(id=1458, question='Stefan goes to a restaurant to eat dinner with his family. They order an appetizer that costs $10 and 4 entrees that are $20 each. If they tip 20% of the total for the waiter, what is the total amount of money that they spend at the restaurant?', answer_text='The total cost of the entrees is 4 * $20 = $<<4*20=80>>80.\nThe total cost of the dinner is $80 + $10 = $<<80+10=90>>90.\nThe tip is $90 * 0.20 = $<<90*0.20=18>>18\nThe total cost with tip is $90 + $18 = $<<90+18=108>>108\n#### 108', answer_value=108.0)

In [19]:
init_prompts = build_baseline_prompts(qa_examples)
print(init_prompts[0])

You are a math word problem solver.
Solve the following problem step by step, then give the final numeric answer.
Output your final numeric answer inside <answer> </answer> tags.

Problem:
Stefan goes to a restaurant to eat dinner with his family. They order an appetizer that costs $10 and 4 entrees that are $20 each. If they tip 20% of the total for the waiter, what is the total amount of money that they spend at the restaurant?



In [23]:
dummy_text = """\
THis is some dummy text.
<answer>-1,599.43</answer>
"""
extracted_num = extract_final_numeric_from_model_output(dummy_text)
print(f"Extracted number: {extracted_num}, type: {type(extracted_num)}")

Extracted number: -1599.43, type: <class 'float'>


In [None]:
preds, score = evaluate_baseline_accuracy(
    client=client,
    gen_cfg=GenerationConfig(max_tokens=2048),
    examples=qa_examples,
    model=GPT5_MINI,
)

RuntimeError: asyncio.run() cannot be called from a running event loop

In [None]:
# res = [BaselinePrediction(example_id=1458, gold_answer=108.0, pred_answer=108.0, raw_model_output='Step 1: Cost of appetizer = $10.\n\nStep 2: Cost of 4 entrees = 4 × $20 = $80.\n\nStep 3: Subtotal (food) = $10 + $80 = $90.\n\nStep 4: Tip at 20% of subtotal = 0.20 × $90 = $18.\n\nStep 5: Total spent = subtotal + tip = $90 + $18 = $108.\n\n<answer>108</answer>'), BaselinePrediction(example_id=3620, gold_answer=24.0, pred_answer=24.0, raw_model_output="Let the tank's full capacity be T gallons.\n\nInitially it contains (1/3)T. After adding 16 gallons it is full, so\n(1/3)T + 16 = T.\n\nSubtract (1/3)T from both sides:\n16 = T - (1/3)T = (2/3)T.\n\nSolve for T:\nT = 16 * (3/2) = 24.\n\n<answer>24</answer>"), BaselinePrediction(example_id=6312, gold_answer=18.0, pred_answer=18.0, raw_model_output='Step 1: Phillip has 40 apples.\nStep 2: Ben has 8 more than Phillip, so Ben = 40 + 8 = 48.\nStep 3: Tom has three-eighths as many apples as Ben, so Tom = (3/8) × 48 = 18.\n\n<answer>18</answer>'), BaselinePrediction(example_id=4833, gold_answer=114.0, pred_answer=114.0, raw_model_output='Step 1: Snakes = 15.\n\nStep 2: Monkeys = twice as many as snakes = 2 × 15 = 30.\n\nStep 3: Lions = 5 fewer than monkeys = 30 − 5 = 25.\n\nStep 4: Pandas = 8 more than lions = 25 + 8 = 33.\n\nStep 5: Dogs = 1/3 as many as pandas = (1/3) × 33 = 11.\n\nStep 6: Total animals = snakes + monkeys + lions + pandas + dogs\n= 15 + 30 + 25 + 33 + 11 = 114.\n\n<answer>114</answer>'), BaselinePrediction(example_id=5032, gold_answer=66.0, pred_answer=66.0, raw_model_output='Let the charger\'s output be a constant "power" unit. The phone requires 26 such minutes to fill, so its full charge = 26 units. The tablet requires 53 minutes, so its full charge = 53 units. Ana charged the tablet fully (53 units) and the phone halfway (0.5 × 26 = 13 units). Total energy = 53 + 13 = 66 units, so time = 66 minutes.\n\n<answer>66</answer>'), BaselinePrediction(example_id=1914, gold_answer=1350.0, pred_answer=1350.0, raw_model_output='Step 1: Cost for electricity = $4/watt × 300 watts = $1,200.\nStep 2: Add the $150 late fee: $1,200 + $150 = $1,350.\n\n<answer>1350</answer>'), BaselinePrediction(example_id=1258, gold_answer=1350.0, pred_answer=30.0, raw_model_output='Step 1: Total puzzles = 15 books × 30 puzzles/book = 450 puzzles.\nStep 2: Time = 450 puzzles × 3 minutes/puzzle = 1350 minutes.\nStep 3: Convert minutes to hours: 1350 minutes = 1350 ÷ 60 = 22.5 hours = 22 hours 30 minutes.\n\n<answer>1350 minutes (22 hours 30 minutes)</answer>'), BaselinePrediction(example_id=5173, gold_answer=40.0, pred_answer=40.0, raw_model_output='Step 1: Compute total number of songs: 6 + 4 = 10 songs.\nStep 2: Each song is 4 minutes, so total time = 10 × 4 = 40 minutes.\n\n<answer>40 minutes</answer>'), BaselinePrediction(example_id=3457, gold_answer=1000.0, pred_answer=1000.0, raw_model_output='Step 1: Gold coins per chest = 3500 gold ÷ 5 chests = 700 gold per chest.\n\nStep 2: Silver coins total = 500. Silver per chest = 500 ÷ 5 = 100 silver per chest.\n\nStep 3: Bronze coins total = twice as many as silver = 2 × 500 = 1000 bronze. Bronze per chest = 1000 ÷ 5 = 200 bronze per chest.\n\nStep 4: Total coins per chest = 700 (gold) + 100 (silver) + 200 (bronze) = 1000 coins.\n\n<answer>1000</answer>'), BaselinePrediction(example_id=3341, gold_answer=9500.0, pred_answer=12000.0, raw_model_output='Step 1: First week = 5000 cases.\n\nStep 2: Second week = half of first = 5000 ÷ 2 = 2500 cases.\n\nStep 3: Third week = 2000 more than the second = 2500 + 2000 = 4500 cases.\n\nStep 4: Total after three weeks = 5000 + 2500 + 4500 = 12000.\n\n<answer>12000</answer>')]
res = [
    BaselinePrediction(
        example_id=1458,
        gold_answer=108.0,
        pred_answer=108.0,
        raw_model_output="Step 1: Appetizer cost = $10.\nStep 2: Four entrees at $20 each: 4 × 20 = $80.\nStep 3: Subtotal = 10 + 80 = $90.\nStep 4: Tip = 20% of $90 = 0.20 × 90 = $18.\nStep 5: Total = 90 + 18 = $108.\n\n<answer>108</answer>",
    ),
    BaselinePrediction(
        example_id=3620,
        gold_answer=24.0,
        pred_answer=24.0,
        raw_model_output="Let T be the tank's full capacity in gallons.\nInitially the tank has T/3 gallons. After adding 16 gallons it is full, so:\nT/3 + 16 = T\n16 = T - T/3 = (2/3)T\nT = 16 * (3/2) = 24\n\n<answer>24</answer>",
    ),
    BaselinePrediction(
        example_id=6312,
        gold_answer=18.0,
        pred_answer=18.0,
        raw_model_output="Phillip has 40 apples.\nBen has 8 more than Phillip, so Ben = 40 + 8 = 48 apples.\nTom has three eighths as many as Ben, so Tom = (3/8) × 48 = 18.\n\n<answer>18</answer>",
    ),
    BaselinePrediction(
        example_id=4833,
        gold_answer=114.0,
        pred_answer=114.0,
        raw_model_output="Snakes = 15.\nMonkeys = 2 × 15 = 30.\nLions = 30 − 5 = 25.\nPandas = 25 + 8 = 33.\nDogs = (1/3) × 33 = 11.\nTotal = 15 + 30 + 25 + 33 + 11 = 114.\n\n<answer>114</answer>",
    ),
    BaselinePrediction(
        example_id=5032,
        gold_answer=66.0,
        pred_answer=66.0,
        raw_model_output="Step 1: Time to charge the phone halfway = 0.5 × 26 min = 13 min.\nStep 2: Time to charge the tablet fully = 53 min.\nStep 3: Total time = 53 + 13 = 66 minutes.\n\n<answer>66</answer>",
    ),
    BaselinePrediction(
        example_id=1914,
        gold_answer=1350.0,
        pred_answer=1350.0,
        raw_model_output="Cost for electricity: $4/watt × 300 watts = $1200.\nAdd late fee: $1200 + $150 = $1350.\n\n<answer>1350</answer>",
    ),
    BaselinePrediction(
        example_id=1258,
        gold_answer=1350.0,
        pred_answer=22.5,
        raw_model_output="Step 1: Total puzzles = 15 books × 30 puzzles/book = 450 puzzles.\nStep 2: Time per puzzle = 3 minutes, so total time = 450 × 3 = 1350 minutes.\nStep 3: Convert to hours: 1350 minutes ÷ 60 = 22.5 hours = 22 hours 30 minutes.\n\n<answer>22.5</answer>",
    ),
    BaselinePrediction(
        example_id=5173,
        gold_answer=40.0,
        pred_answer=40.0,
        raw_model_output="Step 1: Compute time for the first side: 6 songs × 4 minutes/song = 24 minutes.\nStep 2: Compute time for the second side: 4 songs × 4 minutes/song = 16 minutes.\nStep 3: Add both sides: 24 minutes + 16 minutes = 40 minutes.\n\n<answer>40</answer>",
    ),
    BaselinePrediction(
        example_id=3457,
        gold_answer=1000.0,
        pred_answer=1000.0,
        raw_model_output="Step 1: Gold per chest = 3500 gold ÷ 5 chests = 700 gold per chest.\n\nStep 2: Silver total = 500 silver. Silver per chest = 500 ÷ 5 = 100 silver per chest.\n\nStep 3: Bronze total = twice the silver = 2 × 500 = 1000 bronze. Bronze per chest = 1000 ÷ 5 = 200 bronze per chest.\n\nStep 4: Total coins per chest = gold + silver + bronze = 700 + 100 + 200 = 1000.\n\n<answer>1000</answer>",
    ),
    BaselinePrediction(
        example_id=3341,
        gold_answer=9500.0,
        pred_answer=12000.0,
        raw_model_output="Week 1: 5000 cases.\nWeek 2: half of 5000 = 2500 cases.\nWeek 3: 2000 more than week 2 = 2500 + 2000 = 4500 cases.\nTotal = 5000 + 2500 + 4500 = 12000.\n\n<answer>12000</answer>",
    ),
]

In [43]:
gsm8k[1258]

{'question': 'There have been 15 "Where\'s Waldo?" books published. Each book has 30 puzzles to find Waldo. The average person takes 3 minutes to find Waldo in a puzzle. How long would it take to find every Waldo?',
 'answer': "There have been 450 puzzles because 15 x 30 = <<450=450>>450\nIt would take 1,350 minutes to find all the Waldo's because 450 x 3 = <<450*3=1350>>1,350\n#### 1350"}

In [42]:
for r in res:
    if r.gold_answer != r.pred_answer:
        print(r)
        print()

BaselinePrediction(example_id=1258, gold_answer=1350.0, pred_answer=22.5, raw_model_output='Step 1: Total puzzles = 15 books × 30 puzzles/book = 450 puzzles.\nStep 2: Time per puzzle = 3 minutes, so total time = 450 × 3 = 1350 minutes.\nStep 3: Convert to hours: 1350 minutes ÷ 60 = 22.5 hours = 22 hours 30 minutes.\n\n<answer>22.5</answer>')

BaselinePrediction(example_id=3341, gold_answer=9500.0, pred_answer=12000.0, raw_model_output='Week 1: 5000 cases.\nWeek 2: half of 5000 = 2500 cases.\nWeek 3: 2000 more than week 2 = 2500 + 2000 = 4500 cases.\nTotal = 5000 + 2500 + 4500 = 12000.\n\n<answer>12000</answer>')



In [44]:
print(gsm8k["question"][3341])

New York recorded 5000 new coronavirus cases on a particular week. In the second week, half as many new coronaviruses cases as the first week was recorded by the state. In the third week, 2000 more cases were recorded in the state. What is the total number of recorded new coronaviruses in the state after the three weeks?


In [46]:
dummy_snippet = """\
def solve():
    return 10

result = solve()
print(result)
"""
res = run_snippet(dummy_snippet)

In [47]:
res

RunResult(stdout='10\n', stderr='', returncode=0)

In [None]:
ces = [
    ComposedExample(
        id1=1458,
        id2=6312,
        question1="Stefan goes to a restaurant to eat dinner with his family. They order an appetizer that costs $10 and 4 entrees that are $20 each. If they tip 20% of the total for the waiter, what is the total amount of money that they spend at the restaurant?",
        question2="Ben has 8 apples more than Phillip does. Tom has three eighths as many apples at Ben has. If Phillip has 40 apples, how many apples does Tom have?",
        answer1_value=108.0,
        answer2_value_original=18.0,
        composed_question="Let X be the answer to Q1:\nQ1: Stefan goes to a restaurant to eat dinner with his family. They order an appetizer that costs $10 and 4 entrees that are $20 each. If they tip 20% of the total for the waiter, what is the total amount of money that they spend at the restaurant?\nSolve it and use the value of X to solve Q2. Explain your answer step by step\nQ2: Ben has 8 apples more than Phillip does. Tom has three eighths as many apples as Ben has. If Phillip has X apples, how many apples does Tom have?",
        composed_answer_value=43.5,
        composed_code="def solve_revised():\n    phillip = 108.0\n    ben = phillip + 8\n    tom = 3/8 * ben\n    return tom",
        meta={},
    ),
    ComposedExample(
        id1=3620,
        id2=4833,
        question1="The gauge on a water tank shows that the tank is 1/3 full of water. To fill the tank, 16 gallons of water are added. How many gallons of water does the tank hold when full?",
        question2="John wants to start a zoo.  He has 15 snakes.  He has twice as many monkeys as he does snakes.  He has 5 fewer lions than he does monkeys.   John has 8 more pandas than he does lions.  John has 1/3 as many dogs as he does pandas.  How many total animals does John have?",
        answer1_value=24.0,
        answer2_value_original=114.0,
        composed_question="Let X be the answer to Q1:\nQ1: The gauge on a water tank shows that the tank is 1/3 full of water. To fill the tank, 16 gallons of water are added. How many gallons of water does the tank hold when full?\nSolve it and use the value of X to solve Q2. Explain your answer step by step\nQ2: John wants to start a zoo. He has X snakes. He has twice as many monkeys as he does snakes. He has 5 fewer lions than he does monkeys. John has 8 more pandas than he does lions. John has 1/3 as many dogs as he does pandas. How many total animals does John have?",
        composed_answer_value=183.0,
        composed_code="def solve_revised():\n    snakes = 24.0\n    monkeys = 2 * snakes\n    lions = monkeys - 5\n    pandas = lions + 8\n    dogs = pandas / 3\n    answer = snakes + monkeys + lions + pandas + dogs\n    return answer",
        meta={},
    ),
]

In [54]:
dummy = ces[1]
print(dummy.composed_question)
print(dummy.composed_answer_value)

Let X be the answer to Q1:
Q1: The gauge on a water tank shows that the tank is 1/3 full of water. To fill the tank, 16 gallons of water are added. How many gallons of water does the tank hold when full?
Solve it and use the value of X to solve Q2. Explain your answer step by step
Q2: John wants to start a zoo. He has X snakes. He has twice as many monkeys as he does snakes. He has 5 fewer lions than he does monkeys. John has 8 more pandas than he does lions. John has 1/3 as many dogs as he does pandas. How many total animals does John have?
183.0
