In [10]:
from openai import OpenAI
from dateutil.relativedelta import relativedelta
import os
import json
from pydantic import BaseModel,Field
from langchain.llms import OpenAI
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI, init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from tqdm import tqdm
import re


In [11]:
file_path = '../data/GSM8K/test.jsonl'

with open(file_path, "r", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f]

In [12]:
load_dotenv()
class Step(BaseModel):
    explanation: str
    output: str

class MathReasoning(BaseModel):
    steps: list[Step]
    final_answer: str
load_dotenv()
model=init_chat_model('gpt-4o-mini',model_provider='openai',temperature=0.2)
messages=[
    SystemMessage(content="You are a math expert. For every question, respond ONLY by calling the 'MathReasoning' tool. You must explain the solution step-by-step using 'steps', and include the final numeric result in 'final_answer'.The result should be a number only, with no units."),
    HumanMessage("Tìm x. 2x-3=9")
]
model_with_tools = model.with_structured_output(MathReasoning)
ai_msg = model_with_tools.invoke(messages)

In [13]:
ai_msg

MathReasoning(steps=[Step(explanation='Bắt đầu với phương trình 2x - 3 = 9.', output='2x - 3 = 9'), Step(explanation='Cộng 3 vào cả hai vế để loại bỏ -3.', output='2x = 9 + 3'), Step(explanation='Tính toán bên phải: 9 + 3 = 12.', output='2x = 12'), Step(explanation='Chia cả hai vế cho 2 để tìm x.', output='x = 12 / 2'), Step(explanation='Tính toán: 12 / 2 = 6.', output='x = 6')], final_answer='6')

In [14]:
def extract_ground_truth(answer: str) -> str:
    match = re.search(r"####\s*(\d+)", answer)
    return match.group(1).strip() if match else ""

def compare_answers(predicted: str, actual: str) -> bool:
    return predicted.strip() == actual.strip()

In [15]:
correct = 0
results = []
total = len(dataset[:300])
for item in tqdm(dataset[:300]):
    question = item["question"]
    true_answer = extract_ground_truth(item["answer"])
    messages = [
        SystemMessage(content="""
        You are a math expert.
        For every question, you **must** respond using the `MathReasoning` tool.
        - Do not respond with plain text or natural language.
        - Use a list of `Step`s to break down the reasoning.
        - Include a `final_answer` as a single number, no units or symbols.
        - If you cannot solve it, return a final_answer of "unknown".
        """),
        HumanMessage(content=question)
    ]
    try:
        ai_msg = model_with_tools.invoke(messages)
        predicted_answer = ai_msg.final_answer  
        result = {
            "question": question,
            "true_answer": true_answer,
            "predicted_answer": predicted_answer,
            "steps": [step.model_dump() for step in ai_msg.steps], 
            "correct": compare_answers(predicted_answer, true_answer)
        }
        results.append(result)
        if compare_answers(predicted_answer, true_answer):
            correct += 1
    except Exception as e:
        print(f"Error on question: {question[:60]}... => {e}")
        continue
accuracy = correct / total * 100
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total})")


100%|██████████| 300/300 [12:23<00:00,  2.48s/it]

Accuracy: 93.00% (279/300)





In [16]:
# Lưu ra file JSONL
output_path = "CoT_results.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for item in results:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")
print(f"Đã lưu kết quả vào {output_path}")

Đã lưu kết quả vào CoT_results.jsonl
