In [93]:
from mint.config import DATA_DIR
from langchain_core.messages import HumanMessage, SystemMessage
from langchain.chat_models import init_chat_model
from langchain_core.messages.ai import UsageMetadata
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel
from langsmith import Client, traceable, evaluate
from collections import defaultdict
from typing import List, Union
from collections import defaultdict
from tqdm.auto import tqdm
from tabulate import tabulate
import pandas as pd
import langsmith as ls
import openai
import datetime
import os
import json
import math

In [2]:
def load_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

data_dir = DATA_DIR("GSM8K")

data = load_jsonl(os.path.join(data_dir, 'test.jsonl'))

In [3]:
class FinalAnswer(BaseModel):
    answer : float

In [4]:
load_dotenv()

True

In [5]:
model = init_chat_model(
        model='gpt-4.1-mini', 
        model_provider='openai',
        temperature = 0.2
    )

In [57]:
messages = [
        SystemMessage("You are a Math Q&A expert. These are math problems suitable for students in grade 8 or below. Please solve the following question step by step, showing your reasoning clearly. Use simple and clear language appropriate for middle school students. Give the final answer as a number at the end in the format Answer: <number>"),
        HumanMessage("Janet\u2019s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"),
    ]

In [58]:
response = model.invoke(messages)
raw_answer = response.content.strip()
print(response.content)

Let's break down the problem step by step:

1. Janet's ducks lay 16 eggs per day.
2. She eats 3 eggs for breakfast.
3. She uses 4 eggs to bake muffins.
4. The remaining eggs are sold at the farmers' market for $2 each.

First, find out how many eggs are left to sell:

Total eggs laid = 16  
Eggs eaten for breakfast = 3  
Eggs used for muffins = 4  

Remaining eggs = Total eggs - Eggs eaten - Eggs used  
Remaining eggs = 16 - 3 - 4 = 9 eggs

Next, find out how much money she makes by selling these eggs:

Price per egg = $2  
Number of eggs sold = 9  

Total money made = Number of eggs sold × Price per egg  
Total money made = 9 × 2 = $18

Answer: 18


In [59]:
client = OpenAI()

In [60]:
result = -9999
result = client.responses.parse(
        model="gpt-4.1-mini",
        input=[
            {"role": "system", "content": "Extract the numerical answer only."},
            {"role": "user", "content": raw_answer},
        ],
        text_format=FinalAnswer,
    )

In [61]:
print(result.output_parsed.answer)  

18.0


In [13]:
def extract_final_result(text):
    marker = "####"
    if marker in text:
        return float(text.split(marker)[-1].strip().replace(',', ''))

In [23]:
results = []
incorrect_results = []

In [24]:
correct = 0
test_samples = data[:500]
total = len(test_samples)

for item in test_samples:
    question = item['question']
    true_answer = extract_final_result(item['answer'].strip())

    messages = [
        SystemMessage('As a Q&A expert, please help me answer the following questions.'),
        HumanMessage(question)
    ]

    response = model.invoke(messages)
    raw_answer = response.content.strip()

    try:
        parsed_response = client.responses.parse(
            model="gpt-4.1-mini",
            input=[
                {"role": "system", "content": "Extract the numerical answer only."},
                {"role": "user", "content": raw_answer},
            ],
            text_format=FinalAnswer,
        )
        model_answer = parsed_response.output_parsed.answer
    except Exception:
        model_answer = extract_float(raw_answer)

    if abs(model_answer - true_answer) < 1e-3:
        correct += 1
        results.append({
            "question": question,
            "true_answer": true_answer,
            "raw_answer": raw_answer,
            "parsed_answer": model_answer
        })
    else:
        incorrect_results.append({
            "question": question,
            "true_answer": true_answer,
            "raw_answer": raw_answer,
            "parsed_answer": model_answer
        })

with open("zero-shot_responses.json", "w", encoding="utf-8") as f:
    json.dump({
        "correct_results": results,
        "incorrect_results": incorrect_results
    }, f, ensure_ascii=False, indent=2)

accuracy = correct / total
print(f"Accuracy on GSM8K test subset (500 samples): {accuracy:.2%}")

Accuracy on GSM8K test subset (500 samples): 95.80%


In [62]:
MODEL_NAME = "gpt-4.1-mini"
MODEL_PROVIDER = "openai"

openai_client = OpenAI()

class final_answer(BaseModel):
    answer : float
    
@traceable(run_type="parser") 
def parsed_answer(raw_answer) -> float:
    result = -9999
    result = openai_client.responses.parse(
        model = MODEL_NAME,
        input=[
            {"role": "system", "content": "Extract the numerical answer only."},
            {"role": "user", "content": raw_answer},
        ],
        text_format=final_answer,
    )
    return result.output_parsed.answer

@traceable(run_type="prompt")
def build_prompt(question: str):
    messages = [
        SystemMessage("""
        You are a Math Q&A expert. These are math problems suitable for students in grade 8 or below. 
        Please solve the following question step by step, showing your reasoning clearly. 
        Use simple and clear language appropriate for middle school students. 
        Give the final answer as a number at the end in the format:
        Answer: <number>
        """),
        HumanMessage(question)
    ]
    return messages

@traceable(
    run_type="llm",
    metadata={
        "ls_provider": MODEL_PROVIDER,
        "ls_model_name": MODEL_NAME
    }
)
def call_openai(messages: List[dict]) -> str:
    model = init_chat_model(
        model = MODEL_NAME, 
        model_provider = MODEL_PROVIDER,
        temperature = 0.2
    )
    response = model.invoke(messages)
    raw_ans = response.content.strip()
    token = response.usage_metadata
    return token, raw_ans

@traceable(run_type="chain")
def mathqa_mas(question: str):
    msg = build_prompt(question)
    token, raw_answer = call_openai(msg)
    result = parsed_answer(raw_answer)
    output = {
        "answer": raw_answer,
        "parsed": result
    }
    return token, output

In [63]:
client = Client()
@traceable(run_type="tool")
def compare_result(inputs: dict, reference_outputs: dict, outputs: dict):
    reference_response = reference_outputs["parsed"]
    run_response = outputs["parsed"]

    score = math.isclose(float(reference_response), run_response, rel_tol=1e-3)
    return {"key": "is_correct", "score": int(score)}
    
@traceable(run_type="chain")
def target_function(inputs: dict):
    token, result = mathqa_mas(inputs["question"])
    rt = ls.get_current_run_tree()
    rt.metadata["input_tokens"] = token["input_tokens"]
    rt.metadata["output_tokens"] = token["output_tokens"]
    rt.metadata["total_tokens"] = token["total_tokens"]
    return result

evaluate(
    target_function,
    data=client.list_examples(dataset_name= "MathQA_MAS", splits=["Test Dataset"]), 
    evaluators=[compare_result],
    experiment_prefix="GSM8K - Test Dataset"
)

View the evaluation results for experiment: 'GSM8K - Test Dataset-039053ec' at:
https://smith.langchain.com/o/b7e81006-483c-41d8-b361-cc1ea3aa3974/datasets/09bf135d-d2b9-4518-b221-aa8b6ba23acf/compare?selectedSessions=463e50e1-371b-426f-9396-226776de0d81




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.answer,outputs.parsed,error,reference.answer,reference.parsed,feedback.is_correct,execution_time,example_id,id
0,"Carlos, Jim and Carrey were at the beach playi...",Let's solve the problem step by step.\n\n1. **...,20.0,,"Jim collected 27 seashells, 5 more than what C...",20,1,7.349218,f93f7632-18e1-4963-9903-71ccf920856d,d1576534-050a-44cb-9a47-f59c3ffe7faf
1,Tim wanted to make lemonade for a pool party. ...,Let's break down the problem step by step:\n\n...,36.0,,Tim is going to make one gallon twice as tart ...,36,1,7.842071,f0f3b06a-0d7f-43c6-afc7-0d92c339f1a2,d20f88e0-4e5e-4406-9dd2-160d68467781
2,The Kennel house keeps 3 German Shepherds and ...,Let's break down the problem step by step:\n\n...,147.0,,The German Shepherds consume 3 x 5 = <<3*5=15>...,147,1,3.065564,dd7cbc36-6ea4-4200-84b8-e412a2f8c7d7,0f35fe87-33fb-4697-b721-9e9bd13571cc
3,Indras has 6 letters in her name. Her sister's...,Let's solve the problem step by step:\n\n1. In...,13.0,,I = <<6=6>>6\nSister = 6/2 + 4 = <<6/2+4=7>>7\...,13,1,5.303601,db08a88c-fe03-4fdc-935a-8279b95a64fc,4622e5b1-2a4c-4a3b-bb43-eee7cce9f062
4,John decides to do several activities while ou...,Let's solve this step by step.\n\n1. **Time sp...,8.0,,He spent 6/2=<<6/2=3>>3 hours swimming\nHe spe...,20,0,4.88759,d77fe2aa-f59b-4555-9991-c668dcc0fb70,a0bcb76e-0db5-48e8-9d48-80856fe9d52c
5,Gerald and Julia divided $100 in the ratio 3:2...,Let's solve the problem step by step.\n\n1. Ge...,50.0,,The $100 was divided into 3 + 2 = <<3+2=5>>5 p...,50,1,5.349323,cf360108-2b45-438a-a478-34ebb76ab5c8,35cf1108-0a73-43fc-94b2-b798e77d650a
6,Britany records 18 4-minute TikTok videos each...,Let's break down the problem step by step.\n\n...,18.8,,First find how long Britany spends recording t...,1128,0,3.999026,c5b18837-4032-40ad-991a-70c249f65a2f,4d69e299-3a9c-47c2-8f66-be3bdd3660da
7,Ali has four $10 bills and six $20 bills that ...,Let's solve the problem step by step.\n\n1. **...,32.0,,Four $10 bills have a value of 4*$10 =$<<4*10=...,32,1,4.866059,c2433c2d-6923-4bfa-ac8e-bcb019c08c64,cad8acef-9b34-4ebb-b37f-afe1d04acab5
8,Brandon's iPhone is four times as old as Ben's...,Let's analyze the problem step by step.\n\n1. ...,12.0,,Ben’s iPhone is 1*2 = <<1*2=2>>2 years old.\nB...,8,0,6.799779,c18f4afe-22bb-4d9d-a4eb-47e5ac26d08e,a13c4f4b-2bc0-42b5-887b-aa5662028968
9,Jamal's phone can hold 6 times more photograph...,Let's break down the problem step by step.\n\n...,6.0,,"Since Jamal's phone can hold 1800 photos, whic...",6,1,4.856462,bdd7f842-4137-47b0-b546-670399e32abd,921f486a-022a-4606-8bc9-caa6d6939d68


In [64]:
experiment_name = "GSM8K - Test Dataset-039053ec" 

runs = list(client.list_runs(project_name=experiment_name, execution_order=1))

data = []
count = 0
for run in runs:
    count+=1
    metadata = run.extra.get("metadata", {})
    row = {
        "run_id": run.id,
        "input": run.inputs,
        "output": run.outputs,
        "error": run.error,
        "latency_sec": (run.end_time - run.start_time).total_seconds() if run.end_time and run.start_time else None,
        "total_cost": run.total_cost,
        "total_tokens": metadata.get("total_tokens"),
        "input_tokens": metadata.get("input_tokens"),
        "output_tokens": metadata.get("output_tokens"),
    }
    data.append(row)

df_test_gsm8k = pd.DataFrame(data)
print("Total requests: ",count)
df_test_gsm8k

Total requests:  50


Unnamed: 0,run_id,input,output,error,latency_sec,total_cost,total_tokens,input_tokens,output_tokens
0,599b4832-5d2e-4896-a67b-3511ade6199c,{'inputs': {'question': 'Steve decides to star...,{'answer': 'Let's break down the problem step ...,,3.263311,0.0003644,332,139,193
1,97df556a-e84e-4ac6-8a92-19b6fdaf6d91,"{'inputs': {'question': 'Together Lily, David,...",{'answer': 'Let's use variables to represent t...,,4.413376,0.0004608,381,124,257
2,8ca27529-8452-4d32-9486-618786016e46,{'inputs': {'question': 'Marilyn wants to buy ...,{'answer': 'Let's break down the problem step ...,,3.019183,0.0003304,322,154,168
3,835ccdcc-3e28-461b-a45f-99a832382510,{'inputs': {'question': 'Fern is trying to dec...,{'answer': 'Let's solve this step by step. 1....,,3.977341,0.000524,455,170,285
4,cbd69983-0e0e-478e-a2aa-cdfc28069ec5,{'inputs': {'question': 'Four books are arrang...,{'answer': 'Let's find the thickness of each b...,,3.478541,0.0003016,301,150,151
5,6470a473-539a-4dca-b36e-a2d3f8c4274a,{'inputs': {'question': 'Ten stalls have 20 co...,{'answer': 'Let's solve the problem step by st...,,3.999285,0.0003512,320,134,186
6,22b37267-8418-4d2c-8b82-f5b06fd3fb17,{'inputs': {'question': 'Dolly has two books. ...,{'answer': 'Let's understand the problem step ...,,3.03539,0.000288,273,124,149
7,7ca9958b-6a7b-4c6a-b212-374292f62ea1,{'inputs': {'question': 'Mary is making ice cu...,{'answer': 'Let's solve the problem step by st...,,4.164412,0.0004404,384,145,239
8,53a5aec3-6886-4ebe-ab0f-1e49fea664ae,{'inputs': {'question': 'Ruby is 6 times older...,{'answer': 'Let's use variables to represent t...,,5.240443,0.0004664,380,118,262
9,93174e28-d7a1-44c0-8dbc-e6cf05fa880d,{'inputs': {'question': 'A bakery produces 60 ...,{'answer': 'Let's solve the problem step by st...,,4.630161,0.0004436,380,137,243


In [104]:
def load_json(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

data_dir = DATA_DIR("TATQA")

data_tatqa = load_json(os.path.join(data_dir, 'tatqa_dataset_train.json'))

In [82]:
def linearize_table(table_data):
    return tabulate(table_data, headers="firstrow", tablefmt="github")

def linearize_paragraphs(paragraphs):
    return "\n".join([f"({p['order']}) {p['text']}" for p in sorted(paragraphs, key=lambda x: x['order'])])

def build_prompt_tatqa(item, question_obj):
    table_text = linearize_table(item["table"]["table"])
    para_text = linearize_paragraphs(item["paragraphs"])
    question = question_obj["question"]

    prompt = f"""Context:
Table:
{table_text}

Paragraphs:
{para_text}

Question:
{question}
"""
    return prompt

In [83]:
model = init_chat_model(
        model='gpt-4.1-mini', 
        model_provider='openai',
        temperature = 0.2
    )

In [105]:
item = data_tatqa[0]
q = item["questions"][0]
prompt_test = build_prompt_tatqa(item, q)

In [106]:
messages = [
        SystemMessage("""As a helpful Q&A expert that answers questions based on table and text context. 
        No further explanation needed. If the final answer is a number, return a single number only."""),
        HumanMessage(prompt_test),
    ]

In [109]:
response = model. invoke(messages)
raw_answer = response.content.strip()
print(response.content)

Rate of inflation, Rate of increase in salaries, Discount rate


In [110]:
result = -9999
result = openai_client.responses.parse(
        model="gpt-4.1-mini",
        input=[
            {"role": "system", "content": "Extract the result in JSON under key `answer`. It can be a string, a float, a list of strings."},
            {"role": "user", "content": raw_answer},
        ],
        text_format=OutputTATQA,
    )

print(result.output_parsed.answer) 

['Rate of inflation', 'Rate of increase in salaries', 'Discount rate']


In [102]:
class OutputTATQA(BaseModel):
    answer: Union[str, float, List[str]]

In [114]:
def compare_values(a, b):
    if type(a) != type(b):
        return False

    if isinstance(a, str):
        return a == b

    if isinstance(a, float):
        return math.isclose(a, b, rel_tol=1e-3)

    if isinstance(a, list):
        return a == b  

    return a == b

In [92]:
results = []
incorrect_results = []

In [121]:
correct = 0
count = 0

samples_data_tatqa = data_tatqa[:300]
total = len(samples_data_tatqa)

for item in samples_data_tatqa:
    for q in item["questions"]:
        if count == 300:
            break

        count += 1
        question = q["question"]
        ref_output = q["answer"]

        messages = [
        SystemMessage("""As a helpful Q&A expert that answers questions based on table and text context. 
        No further explanation needed. If the final answer is a number, return a single number only."""),
        HumanMessage(build_prompt_tatqa(item, q)),
    ]

        response = model.invoke(messages)
        raw_answer = response.content.strip()
        
        parsed_response = openai_client.responses.parse(
            model="gpt-4.1-mini",
            input=[
                {"role": "system", "content": "Extract the result in JSON under key `answer`. It can be a string, a float, a list of strings."},
                {"role": "user", "content": raw_answer},
            ],
            text_format=OutputTAT_QA,)
        model_answer = parsed_response.output_parsed.answer

        if compare_values(model_answer, ref_output):
            correct += 1
            results.append({
                "question": question,
                "ref_output": ref_output,
                "output": model_answer
            })
        else:
            incorrect_results.append({
                "question": question,
                "ref_output": ref_output,
                "output": model_answer
            })
    if count == 300:
        break

with open("zero-shot_responses_tatqa.json", "w", encoding="utf-8") as f:
    json.dump({
        "correct_results": results,
        "incorrect_results": incorrect_results
    }, f, ensure_ascii=False, indent=2)

accuracy = correct / total
print(f"Accuracy on TATQA train subset (300 samples): {accuracy:.2%}")

Accuracy on TATQA train subset (300 samples): 17.67%
