In [103]:
from openai import OpenAI
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os
import json
from pydantic import BaseModel,Field
from langchain.llms import OpenAI
from dotenv import load_dotenv
from langchain_experimental.pal_chain.base import PALChain
from langchain.chat_models import ChatOpenAI, init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from tqdm import tqdm
import re


In [93]:
file_path = 'data/GSM8K/test.jsonl'

with open(file_path, "r", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f]

In [98]:
print(dataset[0]["question"])
print(dataset[0]["answer"])

Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18


In [127]:
load_dotenv()
class Step(BaseModel):
    explanation: str
    output: str

class MathReasoning(BaseModel):
    steps: list[Step]
    final_answer: str
load_dotenv()
model=init_chat_model('gpt-4o-mini',model_provider='openai')
messages=[
        SystemMessage(content="You are a math expert. For every question, respond ONLY by calling the 'MathReasoning' tool. You must explain the solution step-by-step using 'steps', and include the final numeric result in 'final_answer'.The result should be a number only, with no units."),
    HumanMessage("Eliza's rate per hour for the first 40 hours she works each week is $10. She also receives an overtime pay of 1.2 times her regular hourly rate. If Eliza worked for 45 hours this week, how much are her earnings for this week?")
]
model_with_tools = model.bind_tools([MathReasoning])
ai_msg = model_with_tools.invoke(messages)

In [128]:
tool_call = ai_msg.tool_calls[0] 
tool_args = tool_call['args']     
tool_args['final_answer']

'460'

In [104]:
def extract_ground_truth(answer: str) -> str:
    match = re.search(r"####\s*(\d+)", answer)
    return match.group(1).strip() if match else ""

def compare_answers(predicted: str, actual: str) -> bool:
    return predicted.strip() == actual.strip()

In [131]:
correct = 0
total = len(dataset)
model_with_tools = model.bind_tools([MathReasoning])
for item in tqdm(dataset):
    question = item["question"]
    true_answer = extract_ground_truth(item["answer"])
    messages = [
        SystemMessage(content="""
        You are a math expert. For every question, you must respond by calling the `MathReasoning` tool.
        Use a list of steps with explanations and intermediate outputs, and provide the final numeric result in `final_answer`.
        Do not answer in plain text.
        """),
        HumanMessage(content=question)
    ]
    try:
        ai_msg = model_with_tools.invoke(messages)
        tool_calls = ai_msg.tool_calls
        if tool_calls and isinstance(tool_calls[0], dict):
            predicted_answer = tool_calls[0]["args"]["final_answer"]
        else:
            predicted_answer = ai_msg.tool_calls[0].args["final_answer"] 
        
        if compare_answers(predicted_answer, true_answer):
            correct += 1
    except Exception as e:
        print(f"Error on question: {question[:60]}... => {e}")
        continue
accuracy = correct / total * 100
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total})")


  1%|          | 13/1319 [00:52<1:25:39,  3.94s/it]

Error on question: Carlos is planting a lemon tree. The tree will cost $90 to p... => 'final_answer'


  2%|▏         | 30/1319 [02:03<1:12:54,  3.39s/it]

Error on question: Gloria is shoe shopping when she comes across a pair of boot... => 'final_answer'


  9%|▉         | 119/1319 [07:35<1:10:18,  3.52s/it]

Error on question: A customs officer at the main port of SeaSide clearances cou... => 'final_answer'


 16%|█▌        | 210/1319 [14:34<1:40:16,  5.43s/it]

Error on question: Twenty dozen cups cost $1200 less than the total cost of hal... => 'final_answer'


 26%|██▋       | 349/1319 [24:47<59:37,  3.69s/it]  

Error on question: A factory used to make tractors, but now makes silos. When t... => 'final_answer'


 30%|███       | 397/1319 [28:10<1:06:52,  4.35s/it]

Error on question: Jenny is dividing up a pizza with 12 slices. She gives 1/3 t... => 'final_answer'


 32%|███▏      | 419/1319 [29:45<1:06:09,  4.41s/it]

Error on question: Zack's locker is half as big as Timothy's locker. Peter's lo... => 'final_answer'


 32%|███▏      | 420/1319 [29:48<59:29,  3.97s/it]  

Error on question: Stanley was standing on the side of the mountain, at an elev... => 'final_answer'


 37%|███▋      | 487/1319 [34:39<45:40,  3.29s/it]  

Error on question: James runs 12 miles a day for 5 days a week.  If he runs 10 ... => 'final_answer'


 40%|████      | 531/1319 [37:45<1:01:46,  4.70s/it]

Error on question: After tracking his workout routine for 8 weeks, Josh discove... => 'final_answer'


 40%|████      | 534/1319 [37:58<57:57,  4.43s/it]  

Error on question: Errol bought a computer, 2 monitors, and a printer for $2,40... => 'final_answer'


 44%|████▍     | 581/1319 [41:04<50:10,  4.08s/it]  

Error on question: Maggie's oven is malfunctioning. When she sets it to 450 the... => 'final_answer'


 45%|████▌     | 596/1319 [42:08<41:54,  3.48s/it]  

Error on question: Jerry is twice as old as he was 5 years ago. How old will Je... => 'final_answer'


 48%|████▊     | 628/1319 [44:16<30:58,  2.69s/it]  

Error on question: On a particular week, a tow truck pulled ten cars for each o... => 'final_answer'


 48%|████▊     | 631/1319 [44:26<31:37,  2.76s/it]

Error on question: Tiffany is measuring how many surfers can ride a big wave wi... => 'final_answer'


 49%|████▉     | 649/1319 [45:43<35:17,  3.16s/it]

Error on question: Howie wants to buy cupcakes for everyone in his class as a s... => 'final_answer'


 50%|████▉     | 653/1319 [46:00<45:05,  4.06s/it]

Error on question: James hires a horse-drawn carriage from 5 PM to 9 PM.  He ge... => 'final_answer'


 51%|█████     | 669/1319 [47:02<27:14,  2.51s/it]  

Error on question: In a company of 50 employees, 20% of the employees are manag... => 'final_answer'


 52%|█████▏    | 692/1319 [48:47<48:40,  4.66s/it]  

Error on question: John brings his dog to the vet. His dog needs 2 vaccines, wh... => 'final_answer'


 61%|██████    | 803/1319 [57:13<39:40,  4.61s/it]  

Error on question: Bubbles collects stuffed animals. She has three stuffed pupp... => 'final_answer'


 62%|██████▏   | 814/1319 [3:24:12<30:33:44, 217.87s/it]  

Error on question: John and DeSean bought a bag of marshmallows to make s'mores... => 'final_answer'


 68%|██████▊   | 895/1319 [3:32:27<1:53:55, 16.12s/it]  

Error on question: Last month, Tasha made $80 from selling lemonade and mowing ... => 'final_answer'


 77%|███████▋  | 1018/1319 [3:42:16<25:38,  5.11s/it] 

Error on question: Olivia uploaded 72 pictures to Facebook.  She put the same n... => 'final_answer'


 90%|█████████ | 1188/1319 [4:00:55<3:09:42, 86.89s/it]

Error on question: Jay is making snowballs to prepare for a snowball fight with... => 'final_answer'


 99%|█████████▉| 1307/1319 [4:09:42<00:53,  4.42s/it]  

Error on question: Dr. Hugo Grumpus and his assistant, Igor, were preparing to ... => 'final_answer'


100%|██████████| 1319/1319 [4:10:32<00:00, 11.40s/it]

Accuracy: 87.19% (1150/1319)



