In [11]:
from openai import OpenAI
from mint.config import DATA_DIR
import os
import json
from pydantic import BaseModel,Field
from langchain.llms import OpenAI
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from tqdm import tqdm
import re
import textwrap
from langgraph.graph import StateGraph, END
from typing import TypedDict, Optional
from openai import OpenAI
from pydantic import BaseModel
import traceback
from langsmith import Client, traceable, evaluate
from langgraph.graph import StateGraph, END
from typing import TypedDict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from langsmith import traceable, trace

In [12]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; json.loads("") would raise on them.
            if not line:
                continue
            records.append(json.loads(line))
    return records

# Locate the GSM8K data directory and load its test split into memory.
data_dir = DATA_DIR("GSM8K")
test_path = os.path.join(data_dir, "test.jsonl")
dataset = load_jsonl(test_path)

In [13]:
# Load variables from a local .env file before building the chat model
# (presumably this is where the OpenAI credentials come from; verify).
load_dotenv()

# Chat model shared by every graph node in this notebook.
model = init_chat_model(
    "gpt-4.1-mini",
    model_provider="openai",
    temperature=0.2,
)

In [14]:
class State(TypedDict):
    """State dictionary passed between the nodes of the PoT graph."""
    # The problem statement; the only key supplied when the graph is invoked.
    question: str
    # Python source produced by pot_node.
    program: Optional[str]
    # String form of the `result` variable captured by exec_node (None on failure).
    result: Optional[str]
    # Value written by write_final_answer_node.
    final_answer: Optional[str]
    # Exception message recorded by exec_node, or None when execution succeeded.
    error: Optional[str]
# Structured-output schema handed to `model.with_structured_output` in pot_node.
# NOTE(review): documented with comments rather than a class docstring because a
# docstring would presumably end up in the schema description sent to the model
# -- confirm before adding one.
class IntermediateProgram(BaseModel):
    # Python source code generated by the model for the current question.
    program: str


In [16]:
@traceable(run_type="prompt")
def pot_node(state: State) -> State:
    """Ask the chat model for a Program-of-Thoughts solution to the question.

    The system message carries three few-shot (question, solver) examples and
    the human message carries the actual question plus output instructions.
    The model is called with structured output so the reply is parsed into an
    `IntermediateProgram`.

    Args:
        state: Graph state; only `state["question"]` is read.

    Returns:
        A copy of `state` with `"program"` set to the generated source code.
    """
    # The third few-shot example previously had a truncated question and a
    # solver whose addition was split across two lines without a continuation,
    # i.e. invalid Python. Both are repaired so every example is a complete,
    # runnable demonstration.
    # NOTE(review): the restored question wording is taken from the GSM8K item
    # -- confirm against the dataset.
    pot_messages = [
        SystemMessage("""
# Answer this question by implementing a solver() function.
# Write a Python program, and then return the answer.
Question: Carlos is planing a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he
can sell for $1.5 each. It costs $3 a year to water and feed the tree. How many years will it take before he starts earning money on the lemon tree?
def solver():
    total_cost = 90
    cost_of_watering_and_feeding = 3
    cost_of_each_lemon = 1.5
    num_of_lemon_per_year = 7
    ans = 0
    while total_cost > 0:
        total_cost += cost_of_watering_and_feeding
        total_cost -= num_of_lemon_per_year * cost_of_each_lemon
        ans += 1
    return ans
Question: When Freda cooks canned tomatoes into sauce, they lose half their volume. Each 16 ounce can of
tomatoes that she uses contains three tomatoes. Freda’s last batch of tomato sauce made 32 ounces of sauce. How
many tomatoes did Freda use?
def solver():
    lose_rate = 0.5
    num_tomato_contained_in_per_ounce_sauce = 3 / 16
    ounce_sauce_in_last_batch = 32
    num_tomato_in_last_batch = ounce_sauce_in_last_batch * num_tomato_contained_in_per_ounce_sauce
    ans = num_tomato_in_last_batch / (1 - lose_rate)
    return ans

Question: Jordan wanted to surprise her mom with a homemade birthday cake. From reading the instructions, she
knew it would take 20 minutes to make the cake bajer and 30 minutes to bake the cake. The cake would require 2
hours to cool and an additional 10 minutes to frost the cake. If she plans to make the cake all on the same day,
what is the latest time of day that Jordan can start making the cake to be ready to serve it at 5:00 pm?
def solver():
    minutes_to_make_bajer = 20
    minutes_to_bake_cake = 30
    minutes_to_cool_cake = 2 * 60
    minutes_to_frost_cake = 10
    total_minutes = minutes_to_make_bajer + minutes_to_bake_cake + minutes_to_cool_cake + minutes_to_frost_cake
    total_hours = total_minutes / 60
    ans = 5 - total_hours
    return ans
"""),
        HumanMessage(content=f"""
# Question: {state["question"]}
# If the final result is a decimal ending in .0, convert it to an integer before returning.
# Call the solver function and 'MUST' assign the variale 'result'.
# Before returning the final result, DOUBLE-CHECK each variable assignment and calculation to ensure they match the problem statement.
""")]

    # Structured output makes the model reply as an IntermediateProgram object.
    model_pot = model.with_structured_output(IntermediateProgram)
    structured_reply = model_pot.invoke(pot_messages)
    code = structured_reply.program
    return {**state, "program": code}
    
@traceable(run_type="parser")
def exec_node(state: State) -> State:
    """Execute the generated program and capture its `result` variable.

    Args:
        state: Graph state; `state["program"]` must hold Python source that
            assigns a top-level variable named `result`.

    Returns:
        A copy of `state` where `"result"` is `str(result)` and `"error"` is
        None on success, or `"result"` is None and `"error"` is the exception
        message when the program is missing, raises, or leaves `result` unset.
    """
    try:
        program = state.get("program")
        if not program:
            raise ValueError("Missing `program`")
        # Use ONE dict as both globals and locals. With separate dicts the
        # code runs as if inside a class body: top-level imports and helper
        # functions land in the locals dict and are invisible from inside
        # solver(), which raises NameError (e.g. `import math` + math.sqrt).
        # SECURITY NOTE: this executes model-generated code in-process with no
        # sandbox and no timeout; only run it in a disposable environment.
        namespace = {}
        exec(program, namespace)
        result = namespace.get("result", None)
        if result is None:
            raise ValueError("Missing `result`")
        return {**state, "result": str(result), "error": None}
    except Exception as e:
        # Deliberate best-effort: any failure is recorded in the state so the
        # graph can continue to the final-answer node.
        return {**state, "result": None, "error": str(e)}

def check_eos(state: State) -> bool:
    """Return True when the state carries no execution error, else False."""
    return state["error"] is None

def write_final_answer_node(state:State)->State:
    """Fill in `final_answer` from the execution outcome.

    On success the answer is the string form of `state["result"]`; when an
    error was recorded the placeholder "9999" is used instead.
    """
    failed = state["error"] is not None
    answer = str(9999) if failed else str(state["result"])
    return {**state, "final_answer": answer}


# Linear pipeline: PoT (generate code) -> Exec (run it) -> write_final_answer -> END.
builder = StateGraph(State)

for _node_name, _node_fn in (
    ("PoT", pot_node),
    ("Exec", exec_node),
    ("write_final_answer", write_final_answer_node),
):
    builder.add_node(_node_name, _node_fn)

builder.set_entry_point("PoT")

for _src, _dst in (
    ("PoT", "Exec"),
    ("Exec", "write_final_answer"),
    ("write_final_answer", END),
):
    builder.add_edge(_src, _dst)

graph = builder.compile()


In [17]:
@traceable(run_type="tool")
def extract_ground_truth(answer: str) -> str:
    """Extract the numeric value that follows the `####` marker in an answer.

    Args:
        answer: Reference solution text containing `#### <number>`.

    Returns:
        The number as a string with thousands separators removed, keeping an
        optional leading minus sign and decimal part; "" if no marker/number
        is found.
    """
    # Accept an optional sign and an optional fractional part so that values
    # such as "-5" or "0.5" are not lost or truncated.
    match = re.search(r"####\s*(-?\d[\d,]*(?:\.\d+)?)", answer)
    if match:
        # Strip commas so the result is a plain, parseable number.
        return match.group(1).replace(",", "").strip()
    return ""

@traceable(run_type="tool")
def compare_answers(predicted: str, actual: str, eps: float = 1e-3) -> bool:
    """Decide whether a predicted answer matches the reference answer.

    Both values are parsed as floats and compared with an absolute tolerance;
    if either is not numeric, a case-insensitive string comparison is used.

    Args:
        predicted: Answer produced by the pipeline.
        actual: Reference answer.
        eps: Absolute tolerance for the numeric comparison.

    Returns:
        True when the answers match, False otherwise (including when either
        argument is None).
    """
    if predicted is None or actual is None:
        return False
    predicted_text = predicted.strip()
    actual_text = actual.strip()
    try:
        # Compare the raw floats: rounding both sides to integers first would
        # make `eps` meaningless (2.4 would equal 2) and would raise
        # OverflowError for "inf".
        return abs(float(predicted_text) - float(actual_text)) < eps
    except ValueError:
        return predicted_text.lower() == actual_text.lower()



In [18]:
@traceable(run_type="chain")
def process_item(item):
    """Run the graph on one dataset item and grade the prediction.

    Returns a record with the question, generated program, reference answer,
    predicted answer and a `correct` flag; if anything raises while running
    or grading, returns a dict with `error` and `question` instead.
    """
    question = item["question"]
    true_answer = extract_ground_truth(item["answer"])
    try:
        final_state = graph.invoke({"question": question})
        program = final_state["program"]
        predicted = final_state["final_answer"]
        record = {
            "question": question,
            "program": program,
            "true_answer": true_answer,
            "predicted_answer": predicted,
            "correct": compare_answers(predicted, true_answer),
        }
    except Exception as exc:
        return {"error": str(exc), "question": question}
    return record

# Evaluate the first EVAL_LIMIT items concurrently and report accuracy.
EVAL_LIMIT = 10
MAX_WORKERS = 5

results = []
correct = 0
# Slice once so the submitted items and the denominator cannot drift apart.
eval_items = dataset[:EVAL_LIMIT]
total = len(eval_items)
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(process_item, item) for item in eval_items]
    # Records are collected in completion order, not dataset order.
    for future in tqdm(as_completed(futures), total=total):
        result = future.result()
        if "error" in result:
            print(f"Error on question: {result['question'][:60]}... => {result['error']}")
            continue
        results.append(result)
        if result["correct"]:
            correct += 1
# Failed items still count in `total`; guard against an empty evaluation set.
accuracy = correct / total * 100 if total else 0.0
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total})")


100%|███████████████████████████████████████████| 10/10 [00:11<00:00,  1.12s/it]

Accuracy: 100.00% (10/10)








In [20]:
# Keep only the items graded as incorrect and write them out for error analysis.
wrong_answers = [entry for entry in results if not entry["correct"]]
output_path = "PoT_results.json"
with open(output_path, mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(wrong_answers, ensure_ascii=False, indent=2))

print(f"Đã lưu kết quả vào {output_path}")


Đã lưu kết quả vào PoT_results.json
