In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to the local helper modules imported below
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import asyncio
import json
import os
import pickle
import random
import re
from collections import defaultdict
from typing import List

import pandas as pd
from datasets import load_dataset
from langdetect import detect
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
from tqdm.asyncio import tqdm_asyncio

# Project-local star imports: original order preserved, because a later module
# can shadow same-named symbols exported by an earlier one.
from predictors import *
from llm import *
from prompts import *
from tools import *
from evaluations import *
from parsers import *
from math_method import *
from math_solver import *

# Model identifiers for the three model tiers used in the experiments below.
BIG_MODEL = "gpt-4.1"
MEDIUM_MODEL = "gpt-4.1-mini"
SMALL_MODEL = "gpt-4.1-nano"

# All tiers collected in one list, ordered small -> medium -> big.
MODELS = [SMALL_MODEL, MEDIUM_MODEL, BIG_MODEL]


In [3]:
# Load the "main" configuration of the GSM8K dataset.
dataset = load_dataset("gsm8k", "main")
# Build the training / validation sets with the project helper `build_datasets`.
# NOTE(review): presumably 1000 is the number of sampled problems and 0.7 the
# training fraction (the evaluation progress bars further down show 300
# validation items) — confirm against the helper's definition.
training, validation = build_datasets(dataset, 1000, 0.7)

In [4]:
# Pick one training example as the running sample; `question` is reused later
# for a single-question smoke test of the brainstorming pipeline.
idx = 11
question = training[idx]['question']
question

'Tobias is buying a new pair of shoes that costs $95. He has been saving up his money each month for the past three months. He gets a $5 allowance a month. He also mows lawns and shovels driveways. He charges $15 to mow a lawn and $7 to shovel. After buying the shoes, he has $15 in change. If he mows 4 lawns, how many driveways did he shovel?'

In [5]:
# Display the full record for the sample: question, reference thoughts, answer.
training[idx]

{'question': 'Tobias is buying a new pair of shoes that costs $95. He has been saving up his money each month for the past three months. He gets a $5 allowance a month. He also mows lawns and shovels driveways. He charges $15 to mow a lawn and $7 to shovel. After buying the shoes, he has $15 in change. If he mows 4 lawns, how many driveways did he shovel?',
 'thoughts': 'He saved up $110 total because 95 + 15 = <<95+15=110>>110\nHe saved $15 from his allowance because 3 x 5 = <<3*5=15>>15\nHe earned $60 mowing lawns because 4 x 15 = <<4*15=60>>60\nHe earned $35 shoveling driveways because 110 - 60 - 15 = <<110-60-15=35>>35\nHe shoveled 5 driveways because 35 / 7 = <<35/7=5>>5',
 'answer': '5'}

In [5]:

# Module-level async OpenAI client used by `chat` below. It is constructed
# without arguments, so no credentials are passed explicitly here.
# NOTE(review): `openai` is not imported explicitly in this notebook; presumably
# it arrives through one of the star imports — confirm.
openai_client = openai.AsyncOpenAI()

def pm(m):
    """Pretty-print a single chat message.

    `m` is indexed with the keys 'role' and 'content'. The role and content are
    written first, followed by a fixed separator line.
    """
    print(
        f"ROLE: {m['role']}\nCONTENT:\n{m['content']}",
        "====== PRINT SEP ======",
        sep="\n",
    )
    
def pms(ms):
    """Print every message of the iterable `ms`, in order, by delegating to `pm`."""
    for message in ms:
        pm(message)

async def chat(messages, model, verbose):
    """Send `messages` to the chat-completions endpoint and return the reply.

    Args:
        messages: list of chat messages, forwarded unchanged to the API.
        model: model identifier forwarded to the API.
        verbose: when truthy, the last outgoing message and the reply are
            printed with `pm`.

    Returns:
        A dict with the keys 'content' and 'role', both taken from the message
        of the first returned choice.
    """
    if verbose:
        pm(messages[-1])

    # Every request is issued with temperature=0.
    completion = await openai_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    reply = completion.choices[0].message
    res = {'content': reply.content, 'role': reply.role}

    if verbose:
        pm(res)
    return res


In [6]:
async def solve_question(question):
    """Solve one word problem by modelling it as an equation system, then solving it.

    The question is first passed to `create_method_agent`; when that yields a
    system (anything other than None), the system is handed to
    `solve_method_agent`. Both steps use `chat` bound to SMALL_MODEL with
    verbose output disabled.

    Returns:
        A dict with 'predicted' (the solver result, or None when no system was
        produced), 'response' (always True) and 'system' (the modelled system).
    """
    fn_llm = partial(chat, model=SMALL_MODEL, verbose=False)
    system = await create_method_agent(question, fn_llm)
    # Skip the solving step entirely when no system could be built.
    solution = None if system is None else await solve_method_agent(system, fn_llm)
    return {'predicted': solution, "response": True, 'system': system}

# Run `solve_question` over the validation set.
# NOTE(review): 'smartest' looks like a run label — confirm against `evaluate`.
# The saved output below shows this run was cancelled at 5/300, so any `result`
# inspected in later cells may come from an earlier, complete run.
result = await evaluate(validation, solve_question, 'smartest')

  2%|▋                                          | 5/300 [00:10<10:25,  2.12s/it]


CancelledError: 

In [None]:
# Summarise the equation-solver run (pass / format-error / API-error rates).
calculate_pass_rate(result['results'])

In [31]:
# Pick the first validation result that did not pass, for manual inspection.
# `next(..., None)` makes the "no failure" case explicit: `r` becomes None
# instead of silently holding the last *passing* result (or raising NameError
# when the result list is empty), which the previous for/break loop did.
r = next((res for res in result['results'] if not res['pass']), None)
r

{'api_error': False,
 'format_error': False,
 'pass': False,
 'predicted': 1,
 'response': True,
 'system': ProblemModel(quantity_name=['number_of_rows', 'seats_per_row', 'cost_per_seat', 'discount_rate', 'total_seats', 'total_cost_before_discount', 'total_discount', 'total_amount_to_pay'], equations=['total_seats = number_of_rows * seats_per_row', 'total_cost_before_discount = total_seats * cost_per_seat', 'total_discount = (total_seats / 10) * (10 * cost_per_seat * discount_rate)', 'total_amount_to_pay = total_cost_before_discount - total_discount'], answer='total_amount_to_pay'),
 'question': 'A school is adding 5 rows of seats to the auditorium. Each row has 8 seats and each seat costs $30. A parent, being a seat manufacturer, offered a 10% discount on each group of 10 seats purchased. How much will the school pay for the new seats?',
 'thoughts': 'Ten seats amount to $30 x 10 = $<<30*10=300>>300.\nSo there is $300 x 10/100 = $<<300*10/100=30>>30 discount for each 10 seats purchase

In [28]:
# Replay only the solving step for the inspected result `r`, with verbose=True
# so that `chat` prints the outgoing prompt and the model's reply.
fn_llm = partial(chat, model=SMALL_MODEL, verbose=True)
await solve_method_agent(r['system'], fn_llm)

ROLE: user
CONTENT:

You are a Python expert with experience in numerical optimization.
Given a JSON describing a system of equations, you must write Python code that builds and solves the system using SciPy.
The system of equations in JSON format contains three keys:
- "quantity_name": a list of str with the system's variables.
- "equations": a list of str with the equations of the system.
- "answer": the system variable that must be calculated to solve the problem.
Equations are not necessarily linear.

You must produce Python code. You can reason before coding.
1. Use this interface to solve the system:
```python
from scipy.optimize import root
sol = root(system_fn, initial_guess, method='hybr')
```

2. Inside system_fn(vars):
  2.a. Interpret vars as a list of floats.
  2.b. Unpack the variables in the same order as given in "quantity_name".
2.c. Return a list of residuals (i.e., lhs - rhs) for each equation, in any order, but of the same length as vars.

3. Your initial guess shou

18

In [4]:

from openai import OpenAI

# NOTE: no client is created in this cell; the OpenRouter client (`ooclient`) is built further down with `AsyncOpenAI`.



In [7]:
# Stage-1 prompt: asks the model for as many step-by-step deductions as it can
# produce. `{question}` is the template's only placeholder.
prompt_template_brainstorming = """
You are an expert at solving math word problems.
Your goal is to think step by step, writing as many deductions as possible, to provide valuable insights to find the answer.
I will provide your insights to an expert who will analyze its correctness and output the final answer.
Write as many deductions as you can.

{question}
"""


def make_prompt_brainstorming(question):
    """Return the brainstorming prompt with `question` filled into the template."""
    substitutions = {"question": question}
    return prompt_template_brainstorming.format_map(substitutions)
    


In [12]:
# Stage-2 prompt: asks for a bare numeric answer, given the question and a
# (possibly wrong) piece of reasoning. Placeholders: `{question}`, `{reasoning}`.
# The reasoning is wrapped in <reasoning> tags (previously misspelled
# "<resoning>"), so the tag name matches the word used in the instructions.
prompt_template_guess = """
You are an expert at solving math word problems.
Given a question, provide a solution as a numerical value only.
You will also be provided with the reasoning produced by a student. The reasoning can help you find the answer to the problem. But read it thoroughly with the question in mind; there is no guarantee that the reasoning is correct.

<question>
{question}
</question>

<reasoning>
{reasoning}
</reasoning>

Return the numerical value only. No characters, no reasoning. Only an integer.
"""

def make_prompt_guess(question, reasoning):
    """Return the answer-extraction prompt for `question` and `reasoning`.

    Args:
        question: text substituted inside the <question> tags.
        reasoning: text substituted inside the <reasoning> tags.

    Returns:
        The fully formatted prompt string.
    """
    return prompt_template_guess.format(question=question, reasoning=reasoning)

from openai import AsyncOpenAI


# Async client pointed at OpenRouter's endpoint (note the `base_url`).
# The API key is read from the OPENROUTER_API_KEY environment variable; a
# missing variable fails fast with KeyError instead of sending bad credentials.
# SECURITY: a literal key was previously committed on this line. Treat that key
# as leaked and revoke/rotate it — removing it from the notebook is not enough.
ooclient = AsyncOpenAI(
    api_key=os.environ["OPENROUTER_API_KEY"],
    base_url="https://openrouter.ai/api/v1",
)

async def llama_llm(messages, model="deepseek/deepseek-r1-distill-llama-8b", temperature=0.3):
    """Send `messages` through `ooclient` and return the reply text.

    Args:
        messages: chat messages, forwarded unchanged to the API.
        model: model identifier to request. Defaults to the value that was
            previously hard-coded, so existing one-argument calls are unchanged.
        temperature: sampling temperature. Defaults to the previously
            hard-coded 0.3.

    Returns:
        A dict with the single key 'content', holding the message content of
        the first returned choice.
    """
    response = await ooclient.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return {'content': response.choices[0].message.content}

async def predict_brainstorming(question, model_small, model_big):
    """Two-stage prediction: brainstorm deductions, then extract a numeric answer.

    Stage 1 sends the brainstorming prompt to `llama_llm`. Stage 2 sends the
    question together with that brainstorm to `chat`, using `model_big`.

    Args:
        question: the word-problem text.
        model_small: currently NOT used — stage 1 always goes through
            `llama_llm`. The parameter is kept so existing callers that pass it
            keep working. (The dead `partial(chat, model=model_small, ...)`
            local and the commented-out call that used it were removed.)
        model_big: model identifier for the stage-2 `chat` call.

    Returns:
        A dict with 'response' (raw stage-2 reply text), 'predicted'
        (`parse_int` applied to that text) and 'brainstorm' (stage-1 text).
    """
    # Stage 1: free-form deductions about the problem.
    brainstorm_prompt = make_prompt_brainstorming(question)
    brainstorm_reply = await llama_llm([{'role': 'user', 'content': brainstorm_prompt}])
    brainstorming = brainstorm_reply['content']

    # Stage 2: turn question + deductions into a bare number.
    fn_llm_big = partial(chat, model=model_big, verbose=False)
    guess_prompt = make_prompt_guess(question, brainstorming)
    guess_reply = await fn_llm_big([{'role': 'user', 'content': guess_prompt}])
    final_guess = guess_reply['content']

    return {
        'response': final_guess,
        'predicted': parse_int(final_guess),
        'brainstorm': brainstorming,
    }

# Smoke-test the two-stage pipeline on the sample `question`.
# NOTE: this rebinds `r`, which an earlier cell used for the inspected failing result.
r=await predict_brainstorming(question, SMALL_MODEL, SMALL_MODEL)
r

{'response': '5',
 'predicted': 5,
 'brainstorm': 'Tobias saved $5 each month for three months, totaling $15. He earned $60 from mowing 4 lawns and $35 from shoveling driveways. Together, his savings and earnings amount to $110, which covers the $95 cost of the shoes with $15 remaining. Therefore, he shoveled 5 driveways.\n\n**Answer:** Tobias shoveled \\boxed{5} driveways.'}

In [35]:
# Bind both stages to the small model. The keyword names must match the
# `predict_brainstorming(question, model_small, model_big)` signature; the
# previous `model1=` / `model2=` keywords make the partial raise TypeError as
# soon as it is called.
predict_fn = partial(predict_brainstorming, model_small=SMALL_MODEL, model_big=SMALL_MODEL)

In [36]:
# Evaluate `predict_fn` on the validation set.
# NOTE(review): 'ss' looks like a run label — confirm against `evaluate`.
# The execution counts of this group of cells (In [35]-[37]) are out of order
# relative to the surrounding cells, so the saved output likely comes from an
# earlier version of the pipeline — re-run before trusting the number.
results = await evaluate(validation, predict_fn, 'ss')

100%|█████████████████████████████████████████| 300/300 [02:04<00:00,  2.41it/s]


In [37]:
# Summarise the `results` run (pass / format-error / API-error rates).
calculate_pass_rate(results['results'])

{'pass_rate': 93.0, 'format_error_rate': 0.0, 'api_error_rate': 0.0}

In [13]:
# Brainstorming predictor with both model arguments bound to SMALL_MODEL,
# leaving `question` as the only remaining argument.
predict_fn2 = partial(predict_brainstorming, model_small=SMALL_MODEL, model_big=SMALL_MODEL)

In [14]:
# Evaluate `predict_fn2` on the validation set.
# NOTE(review): 'ss' looks like a run label — confirm against `evaluate`.
results2 = await evaluate(validation, predict_fn2, 'ss')

100%|█████████████████████████████████████████| 300/300 [06:18<00:00,  1.26s/it]


In [15]:
# Summarise the `results2` run (pass / format-error / API-error rates).
calculate_pass_rate(results2['results'])

{'pass_rate': 73.33333333333333,
 'format_error_rate': 0.0,
 'api_error_rate': 0.0}

In [None]:
# Validation pass rate (%) by brainstorming model + answering model:
# DeepSeek+Big: 83.666
# DeepSeek+Small: 73.333
# Mixstral+Big: TODO (not yet measured)

In [16]:
# Sanity check: number of per-question entries recorded for the `results2` run.
len(results2['results'])

300