In [184]:
from openai import OpenAI
import os
import json
from pydantic import BaseModel,Field
from langchain.llms import OpenAI
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from tqdm import tqdm
import re
import textwrap
from langgraph.graph import StateGraph, END
from typing import TypedDict, Optional
from openai import OpenAI
from pydantic import BaseModel
import traceback
from langgraph.graph import StateGraph, END
from typing import TypedDict, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed

In [185]:
file_path = '../data/GSM8K/test.jsonl'

with open(file_path, "r", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f]

In [186]:
load_dotenv()
model = init_chat_model("gpt-4o-mini", model_provider="openai", temperature=0.2)

In [187]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, Optional
class State(TypedDict):
    question: str
    program: Optional[str]
    result: Optional[str]
    final_answer: Optional[str]
    steps: Optional[list]
    error: Optional[str]
class Step(BaseModel):
    explanation: str
    output: str
class IntermediateProgram(BaseModel):
    program: str
class FinalAnswer(BaseModel):
    steps: list[Step]
    final_answer: str
class DecisionOutput(BaseModel):
    decision: str

In [188]:
def pot_node(state: State) -> State:
    pot_prompt = SystemMessage(content="""
    You are a math expert. Read the question and write code to solve it. Assign final result to 'result'. If the final result is a decimal ending in .0, convert it to an integer using round().
""")
    pot_messages = [pot_prompt, HumanMessage(content=state["question"])]
    model_pot = model.with_structured_output(IntermediateProgram)
    model_invoke=model_pot.invoke(pot_messages)
    code=model_invoke.program
    return {**state, "program": code}

def exec_node(state: State) -> State:
    try:
        exec_globals = {}
        exec(state["program"], {}, exec_globals)
        result = exec_globals.get("result", None)
        if result is None:
            raise ValueError("Missing `result`")
        return {**state, "result": str(result), "error": None}
    except Exception as e:
        return {**state, "result": None, "error": str(e)}

def check_eos(state: State) -> bool:
    if state["error"] is None:
        return True
    else:
        return False
def write_final_answer_node(state:State)->State:
    result=str(state["result"])
    return {**state,"final_answer":result}

def cot_node(state: State) -> State:
    final_prompt = SystemMessage(content="""
Based on the question and result, return only the numeric final answer.
Final numeric result in 'final_answer'.
The result should be a number only, with no units.
""")
    context = state["question"] + f"\nAccording to the program: ans = {state['result']}"
    prompt_messages = [final_prompt, HumanMessage(content=context)]
    model_final = model.with_structured_output(FinalAnswer)
    model_invoke=model_final.invoke(prompt_messages)
    final_answer=model_invoke.final_answer
    steps=model_invoke.steps
    return {**state, "final_answer": final_answer, "steps": steps}

builder = StateGraph(State)
builder.add_node("PoT", pot_node)
builder.add_node("Exec", exec_node)
builder.add_node("CoT", cot_node)
builder.add_node("write_final_answer",write_final_answer_node)

builder.set_entry_point("PoT")
builder.add_edge("PoT", "Exec")
builder.add_edge("CoT", END)
builder.add_edge("write_final_answer",END)
builder.add_conditional_edges("Exec", check_eos, {
    True: "write_final_answer",     
    False: "CoT"   
})
graph = builder.compile()


In [189]:
question = "Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he can sell for $1.5 each. It costs $3 a year to water and feed the tree. How many years will it take before he starts earning money on the lemon tree?"
result = graph.invoke({"question": question})

In [190]:
result

{'question': 'Carlos is planting a lemon tree. The tree will cost $90 to plant. Each year it will grow 7 lemons, which he can sell for $1.5 each. It costs $3 a year to water and feed the tree. How many years will it take before he starts earning money on the lemon tree?',
 'program': '# Constants\ncost_to_plant = 90\nlemons_per_year = 7\nprice_per_lemon = 1.5\nannual_cost = 3\n\n# Initial setup\nyears = 0\nnet_profit = -cost_to_plant\n\n# Calculate profit year by year until profit is positive\nwhile net_profit < 0:\n    years += 1\n    revenue = lemons_per_year * price_per_lemon\n    expenses = annual_cost\n    net_profit += (revenue - expenses)\n\n# Assign final result\nresult = years\n\n# Convert to integer if it ends with .0\nif result % 1 == 0:\n    result = round(result)\n\nresult',
 'result': '12',
 'final_answer': '12',
 'error': None}

In [191]:
def extract_ground_truth(answer: str) -> str:
    match = re.search(r"####\s*(\d+)", answer)
    return match.group(1).strip() if match else ""

def compare_answers(predicted: str, actual: str) -> bool:
    return predicted.strip() == actual.strip()

In [194]:

def process_item(item):
    question = item["question"]
    true_answer = extract_ground_truth(item["answer"])
    try:
        result = graph.invoke({"question": question})
        return {
            "question": question,
            "true_answer": true_answer,
            "predicted_answer": result["final_answer"],
            "correct": compare_answers(result["final_answer"], true_answer)
        }
    except Exception as e:
        return {"error": str(e), "question": question}

results = []
correct = 0
total = len(dataset[:400])
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(process_item, item) for item in dataset[:400]]
    for future in tqdm(as_completed(futures), total=total):
        result = future.result()
        if "error" not in result:
            results.append(result)
            if result["correct"]:
                correct += 1
        else:
            print(f"Error on question: {result['question'][:60]}... => {result['error']}")
accuracy = correct / total * 100
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total})")


100%|██████████| 400/400 [03:44<00:00,  1.78it/s]

Accuracy: 91.75% (367/400)





In [195]:
# Lưu ra file JSONL
output_path = "PoT_results.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for item in results:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")
print(f"Đã lưu kết quả vào {output_path}")

Đã lưu kết quả vào PoT_results.jsonl
