In [1]:
%load_ext autoreload
%autoreload 2

import random
import json
import asyncio

import pandas as pd
from datasets import load_dataset
from tqdm.asyncio import tqdm_asyncio

from predictors import *
from llm import *
from prompts import *
from tools import *
from evaluations import *

BIG_MODEL = "gpt-4.1"
MEDIUM_MODEL = "gpt-4.1-mini"
SMALL_MODEL = "gpt-4.1-nano"

In [2]:
validation2 = load_dataset("MU-NLPC/Calc-mawps", "default", split='validation')
validation2

Dataset({
    features: ['id', 'question', 'chain', 'result', 'result_float', 'equation', 'expression'],
    num_rows: 1040
})

In [26]:
v = load_dataset("mwpt5/MAWPS", split='train')
v

Dataset({
    features: ['Question', 'Equation', 'Answer', 'Numbers'],
    num_rows: 1772
})

In [31]:
import math


def is_integer(x, tol=1e-7):
    """Return True when ``x`` lies within ``tol`` of a whole number.

    The distance is measured to the *nearest* integer, so the check is
    symmetric: both 3.00000001 and 2.99999999 count as integral, and the
    sign of ``x`` does not matter.  (``math.modf`` returns a fractional part
    carrying the sign of its input, so comparing it directly against a
    positive tolerance accepts every negative non-integer such as -2.5.)

    Args:
        x: A real number (int or float).
        tol: Maximum allowed absolute distance from the nearest integer.

    Returns:
        bool: True if ``x`` is finite and within ``tol`` of an integer,
        False otherwise (including for NaN and +/-infinity, which cannot be
        rounded to an integer).
    """
    if not math.isfinite(x):
        return False
    return abs(x - round(x)) < tol

In [30]:
v = v.filter(lambda x: is_integer(x['Answer']) )
len(v)

Filter:   0%|          | 0/1772 [00:00<?, ? examples/s]

NameError: name 'math' is not defined

In [32]:
def _all_values_integral(example):
    """Return whether the answer and every operand of ``example`` pass ``is_integer``.

    ``example['Numbers']`` is a whitespace-separated string of numeric
    tokens; each token is converted with ``float`` before being checked.
    """
    if not is_integer(example['Answer']):
        return False
    operands = (float(token) for token in example['Numbers'].split())
    return all(is_integer(value) for value in operands)


# Keep only the problems whose answer and operands are all whole numbers.
v = v.filter(_all_values_integral)
len(v)

Filter:   0%|          | 0/1772 [00:00<?, ? examples/s]

1574

In [28]:
v[0]

{'Question': 'Mary is baking a cake . The recipe wants N_00 cups of flour . She already put in N_01 cups . How many cups does she need to add ?',
 'Equation': 'N_00 - N_01',
 'Answer': 6.0,
 'Numbers': '8.0 2.0'}

In [34]:
# Step 1: parse numbers into a list
from datasets import Features, Value

import re
# Step 2: replace N_00, N_01, ... with actual values
def instantiate_equation(equation, numbers):
    """Substitute ``N_<digits>`` placeholders in ``equation`` with concrete values.

    Each placeholder's digits are parsed as an index into ``numbers`` and the
    placeholder is replaced by ``str(numbers[index])``.

    Args:
        equation: Text containing zero or more ``N_<digits>`` placeholders.
        numbers: Indexable sequence supplying the replacement values.

    Returns:
        str: ``equation`` with every placeholder replaced.

    Raises:
        IndexError: If a placeholder refers to an index outside ``numbers``.
    """
    return re.sub(
        r'N_(\d+)',
        lambda placeholder: str(numbers[int(placeholder.group(1))]),
        equation,
    )
    
def make_question(question, numbers):
    """Fill the ``N_<digits>`` placeholders of ``question`` with integer operands.

    Args:
        question: Question template containing ``N_<digits>`` placeholders.
        numbers: Whitespace-separated string of numeric tokens; each token is
            parsed as a float and rounded to the nearest int before being
            substituted.

    Returns:
        str: The question text produced by ``instantiate_equation``.
    """
    values = [int(round(float(token))) for token in numbers.split()]
    return instantiate_equation(question, values)
    

# Add an 'answer' column: the numeric 'Answer' rounded and rendered as a string.
v = v.map(
    lambda row: {"answer": str(round(row["Answer"]))}
)
# Add a 'question' column: the 'Question' template with its placeholders
# replaced by the values listed in 'Numbers'.
v = v.map(
    lambda row: {"question": make_question(row["Question"], row["Numbers"])}
)
# Pin the 'answer' column to the string type explicitly.
v = v.cast_column("answer", Value("string"))
v[0]

Map:   0%|          | 0/1574 [00:00<?, ? examples/s]

Map:   0%|          | 0/1574 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1574 [00:00<?, ? examples/s]

{'Question': 'Mary is baking a cake . The recipe wants N_00 cups of flour . She already put in N_01 cups . How many cups does she need to add ?',
 'Equation': 'N_00 - N_01',
 'Answer': 6.0,
 'Numbers': '8.0 2.0',
 'answer': '6',
 'question': 'Mary is baking a cake . The recipe wants 8 cups of flour . She already put in 2 cups . How many cups does she need to add ?'}

In [3]:
import math

# Keep only the problems whose numeric result is a whole number.
# math.modf returns a fractional part with the same sign as its input, so the
# magnitude must be taken before comparing against the tolerance; without
# abs() every negative non-integer result would slip through the filter.
validation2 = validation2.filter(
    lambda x: abs(math.modf(x['result_float'])[0]) < 1e-8
)
len(validation2)

731

In [39]:
1040-309

731

In [None]:
# Diagnostic: print and count the results whose fractional part exceeds the
# tolerance.  The magnitude of the fractional part is used because math.modf
# keeps the sign of its input, so negative non-integers would otherwise never
# be counted.
# NOTE(review): if this runs after validation2 has already been filtered down
# to integer results, the count is expected to be 0 — run it on the unfiltered
# dataset to see what the filter removes.
non_integer = 0
for x in validation2:
    c = abs(math.modf(x['result_float'])[0])
    if c > 1e-8:
        print(x['result_float'], c)
        non_integer += 1
non_integer, len(validation2)

In [2]:
dataset = load_dataset("gsm8k", "main")

In [3]:
training, validation = build_datasets(dataset, 1000, 0.7)

In [3]:
training, validation = build_datasets(dataset, len(dataset['train']), 0.7)

In [82]:
training, validation = build_datasets(dataset, 1500, 0.7)
len(training)+len(validation)

1500

In [4]:
from datasets import Features, Value

# Add an 'answer' column holding 'result_float' rounded and rendered as a
# string, then pin that column to the string type explicitly.
validation2 = validation2.map(
    lambda row: {"answer": str(round(row["result_float"]))}
)
validation2 = validation2.cast_column("answer", Value("string"))

validation2[0]

Map:   0%|          | 0/731 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/731 [00:00<?, ? examples/s]

{'id': 'mawps__qA0gWJatQEeMzvOw',
 'question': 'A painter needed to paint 12 rooms in a building. Each room takes 7 hours to paint. If he already painted 5 rooms, how much longer will he take to paint the rest?',
 'chain': '<gadget id="calculator">12 - 5</gadget>\n<output>7</output>\n\n<gadget id="calculator">7 * 7</gadget>\n<output>49</output>\n\n<result>49</result>',
 'result': '49',
 'result_float': 49.0,
 'equation': 'x=(7.0*(12.0-5.0))',
 'expression': '(7.0*(12.0-5.0))',
 'answer': '49'}

In [None]:
len(training)

In [65]:
len(new_validation)

600

In [70]:
cfg = configure(BIG_MODEL, 'predict_naive')
results = await evaluate(list(v)[:300], *cfg, max_concurrent_requests=10)
calculate_pass_rate(results['results'])

100%|████████████████████████████████████████| 300/300 [00:21<00:00, 13.64it/s]


{'pass_rate': 98.0, 'format_error_rate': 0.0, 'api_error_rate': 0.0}

In [68]:
cfg = configure(BIG_MODEL, 'predict_naive')
results = await evaluate(validation, *cfg, max_concurrent_requests=10)
calculate_pass_rate(results['results'])

100%|████████████████████████████████████████| 300/300 [00:23<00:00, 13.01it/s]


{'pass_rate': 59.333333333333336,
 'format_error_rate': 0.0,
 'api_error_rate': 0.0}

In [35]:

import os

from openai import AsyncOpenAI

# Async OpenAI-style client pointed at the OpenRouter base URL.
# The API key is read from the OPENROUTER_API_KEY environment variable and is
# never stored in the notebook; os.environ[...] raises KeyError when the
# variable is unset, so a missing key fails here rather than on first request.
# SECURITY: a literal key was previously committed in this cell — treat it as
# leaked and revoke/rotate it.
ooclient = AsyncOpenAI(
    api_key=os.environ["OPENROUTER_API_KEY"],
    base_url="https://openrouter.ai/api/v1",
)

async def llama_llm(messages, model="deepseek/deepseek-r1-distill-llama-8b", temperature=0):
    """Request a chat completion through ``ooclient`` and return the reply text.

    Args:
        messages: Chat messages forwarded unchanged as the ``messages``
            argument of ``ooclient.chat.completions.create``.
        model: Model identifier sent with the request.  Defaults to the model
            this notebook was originally hard-wired to.
        temperature: Sampling temperature sent with the request (default 0).

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = await ooclient.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response.choices[0].message.content
    
async def predict_naive_llama(question):
    """Answer ``question`` with the naive prompt, using ``llama_llm`` as backend.

    Builds the prompt with ``make_naive_prompt``, sends it through
    ``call_llm`` with ``llama_llm`` as the model callable, and returns
    whatever ``post_processing`` produces from the prompt and the raw reply.
    """
    naive_prompt = make_naive_prompt(question)
    raw_reply = await call_llm(naive_prompt, llama_llm)
    return post_processing(naive_prompt, raw_reply)


results = await evaluate(list(v)[:300], predict_naive_llama, 'llama', max_concurrent_requests=10)

100%|█████████████████████████████████████████| 300/300 [00:27<00:00, 10.78it/s]


In [36]:
await llama_llm([{'role':'user', 'content':'hola'}])

'¡Hola! ¿En qué puedo ayudarte?'

In [37]:
calculate_pass_rate(results['results'])

{'pass_rate': 97.0, 'format_error_rate': 0.0, 'api_error_rate': 0.0}

In [11]:
results['results'][4]

{'api_error': False,
 'format_error': False,
 'pass': False,
 'prompt': '\nYou are an expert at solving math word problems. Given a question, provide a solution as a numerical value only. \nReturn the numerical value only.\n\n<question>\nBob is tilling a plot of his garden. The plot is 110 feet wide by 120 feet long. His tiller digs a swath two feet wide, and he can till 1 foot of ground in about 2 seconds. How long will it take him to till this plot of land, in minutes?\n</question>\n\nReturn the numerical value only. No characters, no reasoning. Only an integer.\n',
 'response': '660',
 'predicted': 660,
 'predicted_thoughts': None,
 'question': 'Bob is tilling a plot of his garden. The plot is 110 feet wide by 120 feet long. His tiller digs a swath two feet wide, and he can till 1 foot of ground in about 2 seconds. How long will it take him to till this plot of land, in minutes?',
 'thoughts': "If Bob goes along the side that's 120 feet long, he will till 110 / 2 = 55 rows.\nEach of

MU-NLPC/Calc-mawps
Small model: 95.07
Medium model: 89.60
Big model: 87.96

mwpt5/MAWPS
Small model: 84.56
Medium model: 97.14
Big model: 98.15

mwpt5/MAWPS + gsm8k
Big model: 79.16 (confirmed analytically)

In [64]:
new_validation = list(v)[:300]+validation

In [51]:
# Positions (within results['results']) of the examples that did not pass.
wrong = [
    position
    for position, outcome in enumerate(results['results'])
    if not outcome['pass']
]
# Inspect the fifth failing example.
results['results'][wrong[4]]

{'api_error': False,
 'format_error': False,
 'pass': False,
 'prompt': '\nYou are an expert at solving math word problems. Given a question, provide a solution as a numerical value only. \nReturn the numerical value only.\n\n<question>\nTheresa has 32 crayons . Janice has 12 crayons . She shares 13 with Nancy . How many crayons will Theresa have ?\n</question>\n',
 'response': '32',
 'predicted': 32,
 'predicted_thoughts': None,
 'Question': 'Theresa has N_00 crayons . Janice has N_01 crayons . She shares N_02 with Nancy . How many crayons will Theresa have ?',
 'Equation': 'N_00 - N_02',
 'Answer': 19.0,
 'Numbers': '32.0 12.0 13.0',
 'answer': '19',
 'question': 'Theresa has 32 crayons . Janice has 12 crayons . She shares 13 with Nancy . How many crayons will Theresa have ?'}

In [73]:
len(ds['train'])

700

In [77]:
2242/(2242+5231)

0.30001338150675766

In [83]:
len(v)

1574

In [78]:
lv = len(v)
lt = int(lv*0.7)
lt

1101

In [85]:
ds = {
    'train': training,
    'valid': validation,
}
len(ds['train']), len(ds['valid'])

(1050, 450)

In [86]:
# Mix the two sources: the first `lt` rows of `v` extend the training split
# and the remaining rows extend the validation split.
mawps_rows = list(v)
ds = dict(
    train=training + mawps_rows[:lt],
    valid=validation + mawps_rows[lt:],
)
tuple(len(ds[split]) for split in ('train', 'valid'))

(2151, 923)

In [87]:
all_results = []
for ds_name in ds.keys():
    for model in [SMALL_MODEL, MEDIUM_MODEL, BIG_MODEL]:
        cfg = configure(model, 'predict_naive')
        results = await evaluate(ds[ds_name], *cfg, max_concurrent_requests=10)
        
        results['metrics'] = calculate_pass_rate(results['results'])
        results['model'] = model
        results['dataset'] = ds_name
        all_results.append(results.copy())

  7%|██▊                                    | 156/2151 [00:10<02:11, 15.22it/s]

Exception: invalid literal for int() with base 10: '4 0'


 17%|██████▋                                | 366/2151 [00:25<01:42, 17.50it/s]

Exception: invalid literal for int() with base 10: '- Total contribution from students'


 34%|█████████████▍                         | 739/2151 [00:45<01:14, 18.94it/s]

Exception: invalid literal for int() with base 10: '£12'


 38%|██████████████▉                        | 824/2151 [00:50<01:05, 20.21it/s]

Exception: invalid literal for int() with base 10: '- Cookies'


 48%|██████████████████▎                   | 1038/2151 [01:06<01:05, 16.96it/s]

Exception: invalid literal for int() with base 10: '- Mika initially had 20 stickers'


 49%|██████████████████▋                   | 1061/2151 [01:09<02:51,  6.36it/s]

Exception: invalid literal for int() with base 10: '9 0'


 79%|██████████████████████████████▏       | 1707/2151 [01:47<00:26, 16.75it/s]

Exception: invalid literal for int() with base 10: '4 million'


 87%|█████████████████████████████████▏    | 1878/2151 [02:00<00:26, 10.39it/s]

Exception: invalid literal for int() with base 10: '- Weasels'


 92%|██████████████████████████████████▊   | 1971/2151 [02:07<00:14, 12.49it/s]

Exception: invalid literal for int() with base 10: '- Total tickets'


100%|██████████████████████████████████████| 2151/2151 [02:20<00:00, 15.33it/s]
  3%|█▎                                      | 68/2151 [00:05<02:08, 16.21it/s]

Exception: invalid literal for int() with base 10: '5 cases and 0 extra boxes'


 18%|███████▏                               | 396/2151 [00:30<01:41, 17.29it/s]

Exception: invalid literal for int() with base 10: '2 cases and 0 extra boxes'


 36%|█████████████▉                         | 768/2151 [00:57<01:21, 16.99it/s]

Exception: invalid literal for int() with base 10: '4 boxes 0 gumballs'


 51%|███████████████████▍                  | 1103/2151 [01:22<01:09, 15.18it/s]

Exception: invalid literal for int() with base 10: '9 0'


 65%|████████████████████████▊             | 1407/2151 [01:44<01:02, 11.97it/s]

Exception: invalid literal for int() with base 10: '1 case plus 0 boxes'


 85%|████████████████████████████████▏     | 1823/2151 [02:14<00:22, 14.91it/s]

Exception: invalid literal for int() with base 10: '68 bolts and 39 nuts'


100%|██████████████████████████████████████| 2151/2151 [02:38<00:00, 13.56it/s]
  3%|█▏                                      | 67/2151 [00:05<03:13, 10.76it/s]

Exception: invalid literal for int() with base 10: '4 0'


 40%|███████████████▌                       | 857/2151 [01:08<01:58, 10.93it/s]

Exception: invalid literal for int() with base 10: '£11'


 76%|████████████████████████████▊         | 1629/2151 [02:06<00:33, 15.78it/s]

Exception: invalid literal for int() with base 10: ' ninety-three'


 82%|███████████████████████████████▏      | 1762/2151 [02:15<00:28, 13.50it/s]

Exception: invalid literal for int() with base 10: '4 million'


 94%|███████████████████████████████████▋  | 2017/2151 [02:34<00:08, 15.49it/s]

Exception: invalid literal for int() with base 10: '35%'


100%|██████████████████████████████████████| 2151/2151 [02:44<00:00, 13.04it/s]
 16%|██████▏                                 | 144/923 [00:07<00:44, 17.59it/s]

Exception: invalid literal for int() with base 10: '24 * 0'


 28%|███████████▏                            | 259/923 [00:14<00:48, 13.64it/s]

Exception: invalid literal for int() with base 10: '37 2 2'


 43%|█████████████████▏                      | 398/923 [00:22<00:28, 18.75it/s]

Exception: invalid literal for int() with base 10: ' 4 hours'


 91%|████████████████████████████████████▍   | 841/923 [00:49<00:03, 21.78it/s]

Exception: invalid literal for int() with base 10: 'orn 3 miles/day for 3 years (from age 13 to 16)'


100%|████████████████████████████████████████| 923/923 [00:53<00:00, 17.10it/s]
 40%|████████████████                        | 370/923 [00:27<00:35, 15.71it/s]

Exception: invalid literal for int() with base 10: '3 cases and 0 extra boxes'


100%|████████████████████████████████████████| 923/923 [01:07<00:00, 13.68it/s]
 14%|█████▊                                  | 133/923 [00:10<00:57, 13.78it/s]

Exception: invalid literal for int() with base 10: '90%'


100%|████████████████████████████████████████| 923/923 [01:13<00:00, 12.60it/s]


In [90]:
# One flat record per run: its metric values plus the model and split names.
rows = [
    dict(r['metrics'], model=r['model'], dataset=r['dataset'])
    for r in all_results
]

df_metrics = pd.DataFrame(rows)
df_metrics

Unnamed: 0,pass_rate,format_error_rate,api_error_rate,model,dataset
0,56.578336,0.41841,0.0,gpt-4.1-nano,train
1,73.733147,0.27894,0.0,gpt-4.1-mini,train
2,77.684798,0.23245,0.0,gpt-4.1,train
3,57.204767,0.433369,0.0,gpt-4.1-nano,valid
4,74.214518,0.108342,0.0,gpt-4.1-mini,valid
5,78.981582,0.108342,0.0,gpt-4.1,valid


In [89]:
import pickle
# Persist every evaluation run so the evaluation does not have to be repeated.
with open('./data/mixture_dataset_naive.pkl', 'wb') as pkl_file:
    pickle.dump(all_results, pkl_file)

In [None]:
# Restore the saved evaluation runs.
# Caution: pickle.load can execute arbitrary code — only load files that were
# produced by this notebook or come from a trusted source.
with open('./data/mixture_dataset_naive.pkl', 'rb') as pkl_file:
    all_results = pickle.load(pkl_file)

In [92]:
from collections import defaultdict

# Pivot the runs into one entry per question text:
#   rows[question] == {'dataset': <split name>, <model name>: <pass flag>, ...}
# NOTE(review): entries are keyed by the question text alone, so a question
# that occurs more than once (within a split or across splits) shares a single
# entry and later occurrences overwrite earlier ones.
rows = defaultdict(dict)
for r in all_results:
    for item in r['results']:
        entry = rows[item['question']]
        entry['dataset'] = r['dataset']
        entry[r['model']] = item['pass']

In [94]:
MODELS = [SMALL_MODEL, MEDIUM_MODEL, BIG_MODEL]

# One row per question: its text, its split, and one pass flag per model
# (in the order given by MODELS).
records = [
    (question, entry['dataset'], *(entry[model] for model in MODELS))
    for question, entry in rows.items()
]
results_df = pd.DataFrame(records, columns=['question', 'dataset', *MODELS])
results_df

Unnamed: 0,question,dataset,gpt-4.1-nano,gpt-4.1-mini,gpt-4.1
0,Tyler had 15 dogs . Each dog had 5 puppies . H...,train,True,True,True
1,Steven wants to split a collection of cards in...,train,True,True,True
2,Katie had 85 files on her computer . She delet...,train,True,True,True
3,Luna's monthly food budget is equal to 60% of ...,train,False,False,False
4,James buys 3 dirt bikes for $150 each and 4 of...,train,False,False,False
...,...,...,...,...,...
3041,Mike had 34 peaches at his roadside fruit dish...,valid,True,True,True
3042,"Of 96 oranges, half were ripe. If 1/4 of the r...",valid,False,False,False
3043,Tim had 50 cents . He paid 45 cents for a cand...,valid,True,True,True
3044,On Sunday Alice bought 4 pints of strawberry i...,valid,True,False,False


In [95]:
results_df.to_csv('./data/dataset_mixture_train.csv', index=False)