In [1]:
from openai import OpenAI
from dateutil.relativedelta import relativedelta
import os
import json
from pydantic import BaseModel,Field
from langchain.llms import OpenAI
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI, init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from tqdm import tqdm
import re


In [2]:
file_path = '../data/GSM8K/test.jsonl'

with open(file_path, "r", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f]

In [12]:
load_dotenv()
class Step(BaseModel):
    explanation: str
    output: str

class MathReasoning(BaseModel):
    steps: list[Step]
    final_answer: str
load_dotenv()
model=init_chat_model('gpt-4o-mini',model_provider='openai',temperature=0.2)
messages=[
    SystemMessage(content="You are a math expert. For every question, respond ONLY by calling the 'MathReasoning' tool. You must explain the solution step-by-step using 'steps', and include the final numeric result in 'final_answer'.The result should be a number only, with no units."),
    HumanMessage("Eliza's rate per hour for the first 40 hours she works each week is $10. She also receives an overtime pay of 1.2 times her regular hourly rate. If Eliza worked for 45 hours this week, how much are her earnings for this week?")
]
model_with_tools = model.with_structured_output(MathReasoning)
ai_msg = model_with_tools.invoke(messages)

In [14]:
def extract_ground_truth(answer: str) -> str:
    match = re.search(r"####\s*(\d+)", answer)
    return match.group(1).strip() if match else ""

def compare_answers(predicted: str, actual: str) -> bool:
    return predicted.strip() == actual.strip()

In [17]:
correct = 0
total = len(dataset[:300])
model_with_tools = model.bind_tools([MathReasoning])
for item in tqdm(dataset[:300]):
    question = item["question"]
    true_answer = extract_ground_truth(item["answer"])
    messages = [
        SystemMessage(content="""
        You are a math expert.
        For every question, you **must** respond using the `MathReasoning` tool.
        - Do not respond with plain text or natural language.
        - Use a list of `Step`s to break down the reasoning.
        - Include a `final_answer` as a single number, no units or symbols.
        - If you cannot solve it, return a final_answer of "unknown".
        """),
        HumanMessage(content=question)
    ]
    try:
        ai_msg = model_with_tools.invoke(messages)
        tool_calls = ai_msg.tool_calls
        if tool_calls and isinstance(tool_calls[0], dict):
            predicted_answer = tool_calls[0]["args"]["final_answer"]
            tool_output = ai_msg.tool_calls[0]["args"]
            with open("result_COT.json", "w", encoding="utf-8") as f:
                json.dump(tool_output, f, ensure_ascii=False, indent=2)
        else:
            predicted_answer = ai_msg.tool_calls[0].args["final_answer"] 
        if compare_answers(predicted_answer, true_answer):
            correct += 1
    except Exception as e:
        print(f"Error on question: {question[:60]}... => {e}")
        continue
accuracy = correct / total * 100
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total})")


 61%|██████    | 182/300 [11:54<06:00,  3.06s/it]

Error on question: Sandra, the florist around the corner, is very unhappy with ... => 'final_answer'


100%|██████████| 300/300 [20:44<00:00,  4.15s/it]

Accuracy: 88.33% (265/300)



