In [1]:
import json
import os
import re

from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from openai import OpenAI
from pydantic import BaseModel

from mint.config import DATA_DIR

In [2]:
def load_jsonl(path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: One decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank / whitespace-only lines (e.g. a trailing newline at EOF) are
        # not JSON documents; skip them instead of letting json.loads fail.
        return [json.loads(line) for line in f if line.strip()]

# Resolve the GSM8K directory through the project's DATA_DIR helper and load
# the test split; each record is expected to carry 'question' and 'answer'
# keys (both are read by the evaluation loop further down).
data_dir = DATA_DIR("GSM8K")

data = load_jsonl(os.path.join(data_dir, 'test.jsonl'))

In [3]:
class FinalAnswer(BaseModel):
    # Structured-output schema handed to `client.responses.parse` as
    # `text_format`: a single numeric field holding the extracted answer.
    answer: float

In [4]:
# Load environment variables from a .env file. No credentials are set anywhere
# else in this notebook, so presumably the OpenAI API key comes from here
# (NOTE(review): confirm).
load_dotenv()

True

In [5]:
# LangChain chat model that produces the free-text answers being evaluated.
model = init_chat_model(
    model="gpt-4o-mini",
    model_provider="openai",
    temperature=0.2,
)

In [6]:
# One hand-written example prompt (system instruction + a single word problem)
# used to try out the answering model before the full evaluation loop.
messages = [
        SystemMessage('As a Q&A expert, please help me answer the following questions.'),
        HumanMessage("Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market? Only provide the final answer."),
    ]

In [7]:
# Send the example prompt to the chat model. The stripped reply text is kept
# in `raw_answer` because the structured-extraction call reads it.
response = model.invoke(messages)
raw_answer = response.content.strip()
print(response.content)

Janet makes $18 every day at the farmers' market.


In [8]:
# Plain OpenAI SDK client, separate from the LangChain `model`; it is used for
# the `responses.parse` calls that turn free text into a FinalAnswer.
client = OpenAI()

In [9]:
# Second model call: given the free-text reply in `raw_answer`, ask for just
# the number, with the output parsed into the FinalAnswer schema.
parsed_response = client.responses.parse(
        model="gpt-4o-mini",
        input=[
            {"role": "system", "content": "Extract the numerical answer only."},
            {"role": "user", "content": raw_answer},
        ],
        text_format=FinalAnswer,
    )

In [10]:
# Show the numeric value extracted into FinalAnswer.answer.
print(parsed_response.output_parsed.answer)  

18.0


In [19]:
def extract_final_result(text):
    """Parse the number that follows the last ``####`` marker in ``text``.

    Args:
        text: Answer string; the final result is expected after ``####``.

    Returns:
        The value after the last marker as a float (thousands separators
        removed), or ``None`` when ``text`` contains no marker.

    Raises:
        ValueError: If the text after the marker is not a valid float.
    """
    marker = "####"
    if marker not in text:
        return None
    tail = text.split(marker)[-1]
    cleaned = tail.strip().replace(',', '')
    return float(cleaned)

In [20]:
def _fallback_extract_float(text):
    """Best-effort fallback: return the last number found in ``text``.

    Used only when the structured-extraction call fails. Returns ``None`` when
    ``text`` contains no number, so the caller can score the sample as wrong
    instead of crashing.
    """
    # Optional sign, digits with optional thousands separators, optional decimals.
    numbers = re.findall(r"-?\d+(?:,\d{3})*(?:\.\d+)?", text)
    if not numbers:
        return None
    return float(numbers[-1].replace(',', ''))


# Evaluate the answering model on the first 400 test questions.
correct = 0
test_samples = data[:400]
total = len(test_samples)

for item in test_samples:
    question = item['question']
    true_answer = extract_final_result(item['answer'].strip())
    if true_answer is None:
        # Reference answer has no "####" marker, so it cannot be scored; it
        # stays in the denominator and counts as incorrect.
        continue

    messages = [
        SystemMessage('As a Q&A expert, please help me answer the following questions.'),
        HumanMessage(question)
    ]

    response = model.invoke(messages)
    raw_answer = response.content.strip()

    try:
        parsed_response = client.responses.parse(
            model="gpt-4o-mini",
            input=[
                {"role": "system", "content": "Extract the numerical answer only."},
                {"role": "user", "content": raw_answer},
            ],
            text_format=FinalAnswer,
        )
        model_answer = parsed_response.output_parsed.answer
    except Exception:
        # Deliberately broad: any failure of the extraction call (API error,
        # missing parsed output) falls back to a local regex heuristic so one
        # bad sample does not abort the whole run.
        model_answer = _fallback_extract_float(raw_answer)

    # No number could be recovered from the reply -> counts as incorrect.
    if model_answer is None:
        continue

    if abs(model_answer - true_answer) < 1e-3:
        correct += 1

# Guard against an empty dataset so the summary line never divides by zero.
accuracy = correct / total if total else 0.0
print(f"Accuracy on GSM8K test subset ({total} samples): {accuracy:.2%}")

Accuracy on GSM8K test subset (400 samples): 93.25%
