In [3]:
import os
import json
import random
import math
import pandas as pd
import langsmith as ls
from collections import Counter
from dotenv import load_dotenv
from langsmith import Client, traceable, evaluate
from langchain_core.messages import HumanMessage, SystemMessage
from pydantic import BaseModel
from langchain.chat_models import init_chat_model
from openai import OpenAI
from tabulate import tabulate
from itertools import islice
from typing import List


In [4]:
# Load variables from a local .env file, then read the model settings that the
# rest of the notebook passes to init_chat_model and the OpenAI client.
load_dotenv()
MODEL_NAME = os.environ.get("MODEL_NAME")
MODEL_PROVIDER = os.environ.get("MODEL_PROVIDER")

In [5]:
# Notebook-level chat model used by the chain-of-thought section below.
model = init_chat_model(
    model=MODEL_NAME,
    model_provider=MODEL_PROVIDER,
    temperature=0.2,
)

## Zero-shot

In [6]:
# LangSmith client: dataset examples, runs, and feedback.
client = Client()
# Raw OpenAI client: used only to extract the numeric answer from free text.
openai_client = OpenAI()

In [35]:
# Structured-output schema passed as `text_format` to `responses.parse` in
# `parsed_answer`; it carries the single number extracted from a free-text answer.
# (Documented with comments rather than a class docstring so the class object
# handed to the API stays exactly as it was.)
class final_answer(BaseModel):
    # The extracted numeric answer.
    answer : float
    
@traceable(run_type="parser") 
def parsed_answer(raw_answer) -> float:
    """Extract the final numeric answer from a free-text model response.

    Sends ``raw_answer`` to the OpenAI Responses API with ``final_answer`` as
    the structured-output schema and returns the parsed ``answer`` field.

    Args:
        raw_answer: The model's full worked solution as text.

    Returns:
        The number extracted from ``raw_answer``.

    Raises:
        ValueError: If the API response carries no parsed payload (for example
            when the model declines to answer), so the failure is reported
            clearly instead of as an AttributeError on ``None``.
    """
    # NOTE(review): MODEL_NAME is sent to the OpenAI client regardless of
    # MODEL_PROVIDER — confirm it is always an OpenAI model name.
    response = openai_client.responses.parse(
        model=MODEL_NAME,
        input=[
            {"role": "system", "content": "Extract the numerical answer only."},
            {"role": "user", "content": raw_answer},
        ],
        text_format=final_answer,
    )
    extracted = response.output_parsed
    if extracted is None:
        raise ValueError("No numerical answer could be parsed from the model response.")
    return extracted.answer

@traceable(run_type="prompt")
def build_prompt(question: str):
    """Assemble the zero-shot chat prompt for a single math question.

    Args:
        question: The problem text; it becomes the human turn.

    Returns:
        A two-element list: the fixed system instructions followed by the
        question wrapped in a ``HumanMessage``.
    """
    # The instruction text (including its indentation) is part of the prompt
    # and is kept exactly as written.
    system_turn = SystemMessage("""
        You are a Math Q&A expert. These are math problems suitable for students in grade 8 or below. 
        Please solve the following question step by step, showing your reasoning clearly. 
        Use simple and clear language appropriate for middle school students. 
        Give the final answer as a number at the end in the format:
        Answer: <number>
        """)
    human_turn = HumanMessage(question)
    return [system_turn, human_turn]

@traceable(
    run_type="llm",
    metadata={
        "ls_provider": MODEL_PROVIDER,
        "ls_model_name": MODEL_NAME
    }
)
def call_openai(messages: List[dict]) -> str:
    """Send a chat prompt to the configured model and return its text reply.

    Args:
        messages: The chat messages to send (the zero-shot pipeline passes the
            list produced by ``build_prompt``).

    Returns:
        The response content with leading and trailing whitespace removed.
    """
    # Reuse the notebook-level `model`, which is created with the same
    # MODEL_NAME, MODEL_PROVIDER and temperature=0.2, instead of constructing
    # an identical chat model on every request. This also keeps the zero-shot
    # and chain-of-thought pipelines on one shared model configuration.
    response = model.invoke(messages)
    return response.content.strip()

@traceable(run_type="chain")
def mathqa_mas(question: str):
    """Run the zero-shot pipeline for one question.

    Builds the prompt, queries the model, then extracts the numeric answer.

    Args:
        question: The math problem text.

    Returns:
        dict with ``"answer"`` (the raw model text) and ``"parsed"`` (the
        number extracted from that text).
    """
    completion = call_openai(build_prompt(question))
    return {
        "answer": completion,
        "parsed": parsed_answer(completion),
    }

In [66]:
@traceable(run_type="tool")
def compare_result(inputs: dict, reference_outputs: dict, outputs: dict):
    """Evaluator for the zero-shot runs: is the parsed number close to the reference?

    Args:
        inputs: The example inputs (unused; part of the evaluator signature).
        reference_outputs: Reference outputs of the example; ``"parsed"`` holds
            the expected number.
        outputs: Outputs of the target function; ``"parsed"`` holds the
            predicted number.

    Returns:
        ``{"key": "is_correct", "score": 1}`` when the prediction matches the
        reference within a relative tolerance of 1e-3, otherwise score ``0``.
        A missing or non-numeric prediction also scores ``0`` so that failed
        runs count as incorrect instead of raising inside the evaluator.
    """
    expected = float(reference_outputs["parsed"])

    # `outputs` may be empty or lack "parsed" when the target run failed.
    prediction = (outputs or {}).get("parsed")
    try:
        prediction = float(prediction)
    except (TypeError, ValueError):
        return {"key": "is_correct", "score": 0}

    matches = math.isclose(expected, prediction, rel_tol=1e-3)
    return {"key": "is_correct", "score": int(matches)}
    
@traceable(run_type="chain")
def target_function(inputs: dict):
    """Evaluation target: feed the example's question to the zero-shot pipeline.

    Args:
        inputs: Example inputs; the ``"question"`` entry is the problem text.

    Returns:
        Whatever ``mathqa_mas`` returns for that question.
    """
    question = inputs["question"]
    return mathqa_mas(question)


# Run the zero-shot target over the "Test Dataset" split of the MathQA_MAS
# dataset and score each run with compare_result.
evaluate(
    target_function,
    experiment_prefix="Zero-shot_GSM8K",
    evaluators=[compare_result],
    data=client.list_examples(
        dataset_name="MathQA_MAS",
        splits=["Test Dataset"],
    ),
)

View the evaluation results for experiment: 'Zero-shot_GSM8K-a672f7d3' at:
https://smith.langchain.com/o/b7e81006-483c-41d8-b361-cc1ea3aa3974/datasets/09bf135d-d2b9-4518-b221-aa8b6ba23acf/compare?selectedSessions=9534d827-6d0a-4d43-b0be-a0338ebcca35




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.answer,outputs.parsed,error,reference.answer,reference.parsed,feedback.is_correct,execution_time,example_id,id
0,"Carlos, Jim and Carrey were at the beach playi...",Let's solve this step by step.\n\n1. **Identif...,20.0,,"Jim collected 27 seashells, 5 more than what C...",20,1,8.389464,f93f7632-18e1-4963-9903-71ccf920856d,c3f84639-4a4c-42ed-8a12-e09321fd5c20
1,Tim wanted to make lemonade for a pool party. ...,Let's break down the problem step by step:\n\n...,36.0,,Tim is going to make one gallon twice as tart ...,36,1,7.101982,f0f3b06a-0d7f-43c6-afc7-0d92c339f1a2,f4a3196d-b7d2-4aa5-9ddc-40ca39e3fe5c
2,The Kennel house keeps 3 German Shepherds and ...,Let's solve this step by step:\n\n1. Number of...,147.0,,The German Shepherds consume 3 x 5 = <<3*5=15>...,147,1,3.131233,dd7cbc36-6ea4-4200-84b8-e412a2f8c7d7,734b6075-090d-4a8a-b63f-212619bf29d6
3,Indras has 6 letters in her name. Her sister's...,Let's solve the problem step by step:\n\n1. In...,7.0,,I = <<6=6>>6\nSister = 6/2 + 4 = <<6/2+4=7>>7\...,13,0,3.884708,db08a88c-fe03-4fdc-935a-8279b95a64fc,2f4bbe08-314d-428e-9fe1-17b083779c7f
4,John decides to do several activities while ou...,Let's break down the problem step by step.\n\n...,8.0,,He spent 6/2=<<6/2=3>>3 hours swimming\nHe spe...,20,0,5.917918,d77fe2aa-f59b-4555-9991-c668dcc0fb70,cb4e424c-291e-4743-9e61-796056679916
5,Gerald and Julia divided $100 in the ratio 3:2...,Step 1: Understand the problem. \nGerald and ...,50.0,,The $100 was divided into 3 + 2 = <<3+2=5>>5 p...,50,1,3.668846,cf360108-2b45-438a-a478-34ebb76ab5c8,226d4f3c-cc8b-4bfe-a5a2-0f30af793473
6,Britany records 18 4-minute TikTok videos each...,Let's break down the problem step by step.\n\n...,1128.0,,First find how long Britany spends recording t...,1128,1,4.870849,c5b18837-4032-40ad-991a-70c249f65a2f,7c322d80-ec87-4ea4-ae31-a65f9806fc13
7,Ali has four $10 bills and six $20 bills that ...,Let's solve the problem step by step.\n\n1. **...,32.0,,Four $10 bills have a value of 4*$10 =$<<4*10=...,32,1,3.662607,c2433c2d-6923-4bfa-ac8e-bcb019c08c64,ee7e9822-1d29-47c8-881d-292b5eef5139
8,Brandon's iPhone is four times as old as Ben's...,Let's break down the problem step by step.\n\n...,8.0,,Ben’s iPhone is 1*2 = <<1*2=2>>2 years old.\nB...,8,1,4.229681,c18f4afe-22bb-4d9d-a4eb-47e5ac26d08e,fee91e4e-f55d-4c20-ba13-9ab7265eca3e
9,Jamal's phone can hold 6 times more photograph...,Let's break down the problem step by step:\n\n...,6.0,,"Since Jamal's phone can hold 1800 photos, whic...",6,1,6.052751,bdd7f842-4137-47b0-b546-670399e32abd,a751f433-b7b7-414e-8035-919b696b92d2


In [7]:
# Collect per-request metrics (latency, cost, token usage, correctness) for the
# zero-shot experiment from LangSmith into a DataFrame.
experiment_name = "Zero-shot_GSM8K-a672f7d3" 

runs = list(client.list_runs(project_name=experiment_name, execution_order=1))

# Fetch feedback for a batch of runs per request instead of issuing one
# request per run; the "is_correct" score is then looked up by run id.
FEEDBACK_BATCH_SIZE = 50
is_correct_by_run = {}
for start in range(0, len(runs), FEEDBACK_BATCH_SIZE):
    batch_ids = [run.id for run in runs[start:start + FEEDBACK_BATCH_SIZE]]
    for fb in client.list_feedback(run_ids=batch_ids):
        if fb.key == "is_correct":
            is_correct_by_run[str(fb.run_id)] = fb.score

data = []
for run in runs:
    row = {
        "run_id": run.id,
        "error": run.error,
        "latency_sec": (run.end_time - run.start_time).total_seconds() if run.end_time and run.start_time else None,
        "total_cost": run.total_cost,
        "input_tokens": run.prompt_tokens,
        "output_tokens": run.completion_tokens,
        "total_tokens": run.total_tokens,
        # None when the run has no "is_correct" feedback.
        "is_correct": is_correct_by_run.get(str(run.id)),
    }
    data.append(row)
count = len(data)

df_zeroshot_gsm8k = pd.DataFrame(data)
print("Total requests: ",count)
df_zeroshot_gsm8k

Total requests:  50


Unnamed: 0,run_id,error,latency_sec,total_cost,input_tokens,output_tokens,total_tokens,is_correct
0,eb3ff841-eb94-44e7-9e51-d6bd017754d9,,3.752059,0.0002732,139,136,275,0.0
1,a6619972-9aa3-4d2c-ad45-1745a4c2c5ff,,4.807726,0.0004544,124,253,377,1.0
2,e70689f8-9efd-4b4e-908e-0c10805baf56,,3.46293,0.0003208,154,162,316,1.0
3,d6a5f154-d977-4947-b832-3e6027d6286e,,5.948651,0.0005224,170,284,454,1.0
4,46e0d125-69fc-4386-9b4f-26f3dd61c214,,3.28463,0.0003016,150,151,301,1.0
5,4622b2df-3e86-49aa-851a-33dcd9519656,,3.523911,0.0003368,134,177,311,0.0
6,271a76c3-6cd4-46bd-b985-d8d8ce2bddca,,4.84584,0.000296,124,154,278,1.0
7,9250874c-9908-41af-a1d2-7d42ee81ff87,,4.77193,0.0004404,145,239,384,1.0
8,d0d195bb-458c-4640-92a7-b69dbb92043f,,7.971507,0.0004648,118,261,379,1.0
9,1ddabba8-8641-4558-9970-afc9831209f8,,4.209305,0.0003412,137,179,316,0.0


## CoT

In [67]:
# One entry in the list of reasoning steps of a MathReasoning result.
# (Documented with comments rather than docstrings so the schema object itself
# is unchanged.)
class Step(BaseModel):
    # presumably the reasoning text for this step — filled in by the model
    explanation: str
    # presumably the intermediate result of this step, kept as text
    output: str
# Structured-output schema for the chain-of-thought pipeline; `call_ai` passes
# it to `model.with_structured_output` and reads `final_answer` from the result.
class MathReasoning(BaseModel):
    # Ordered reasoning steps.
    steps: list[Step]
    # The final numeric answer.
    # NOTE(review): the CoT system prompt asks for a final_answer of "unknown"
    # when the problem cannot be solved, which a float field cannot hold —
    # confirm the intended behavior.
    final_answer: float

In [68]:
@traceable(run_type="prompt")
def build_prompt_cot(question: str):
    """Assemble the chain-of-thought chat prompt for a single math question.

    Args:
        question: The problem text; it becomes the human turn.

    Returns:
        A two-element list: the system instructions that ask for a
        ``MathReasoning`` response, followed by the question as a
        ``HumanMessage``.
    """
    # NOTE(review): the instructions ask for a final_answer of "unknown" when
    # the problem cannot be solved, but MathReasoning.final_answer is a float;
    # the text is left unchanged so the experiment prompt stays identical.
    system_text = """
        You are a math expert.
        For every question, you **must** respond using the `MathReasoning` tool.
        - Do not respond with plain text or natural language.
        - Use a list of `Step`s to break down the reasoning.
        - Include a `final_answer` as a single number, no units or symbols.
        - If you cannot solve it, return a final_answer of "unknown".
        - When dealing with money, do not round to thousands unless explicitly stated.
        """
    system_turn = SystemMessage(content=system_text)
    human_turn = HumanMessage(content=question)
    return [system_turn, human_turn]

@traceable(
    run_type="llm",
    metadata={
        "ls_provider": MODEL_PROVIDER,
        "ls_model_name": MODEL_NAME
    }
)
def call_ai(messages: List[dict]):
    """Query the shared model for a structured ``MathReasoning`` response.

    Args:
        messages: The chat messages to send.

    Returns:
        A tuple ``(reasoning, final_answer)``: the structured result and its
        ``final_answer`` field.
    """
    structured_model = model.with_structured_output(MathReasoning)
    reasoning = structured_model.invoke(messages)
    return reasoning, reasoning.final_answer
    
@traceable(run_type="tool")
def compare_result(inputs: dict, reference_outputs: dict, outputs: dict):
    """Evaluator for the chain-of-thought runs: is the answer close to the reference?

    Args:
        inputs: The example inputs (unused; part of the evaluator signature).
        reference_outputs: Reference outputs of the example; ``"parsed"`` holds
            the expected number.
        outputs: Outputs of the target function; ``"answer"`` holds the
            predicted number.

    Returns:
        ``{"key": "is_correct", "score": 1}`` when the prediction matches the
        reference within a relative tolerance of 1e-3, otherwise score ``0``.
        A missing or non-numeric prediction also scores ``0`` so that failed
        runs count as incorrect instead of raising inside the evaluator.
    """
    expected = float(reference_outputs["parsed"])

    # `outputs` may be empty or lack "answer" when the target run failed.
    prediction = (outputs or {}).get("answer")
    try:
        prediction = float(prediction)
    except (TypeError, ValueError):
        return {"key": "is_correct", "score": 0}

    matches = math.isclose(expected, prediction, rel_tol=1e-3)
    return {"key": "is_correct", "score": int(matches)}
    
@traceable(run_type="chain")
def process(question: str):
    """Run the chain-of-thought pipeline for one question.

    Args:
        question: The math problem text.

    Returns:
        dict with ``"steps"`` (the reasoning steps of the structured result)
        and ``"answer"`` (its final numeric answer).
    """
    reasoning, answer = call_ai(build_prompt_cot(question))
    return {
        "steps": reasoning.steps,
        "answer": answer,
    }
    
@traceable(run_type="chain")
def target(inputs: dict):
    """Evaluation target: feed the example's question to the CoT pipeline.

    Args:
        inputs: Example inputs; the ``"question"`` entry is the problem text.

    Returns:
        Whatever ``process`` returns for that question.
    """
    question = inputs["question"]
    return process(question)

# Run the chain-of-thought target over the "Test Dataset" split of the
# MathQA_MAS dataset and score each run with compare_result.
evaluate(
    target,
    experiment_prefix="CoT_GSM8K",
    evaluators=[compare_result],
    data=client.list_examples(
        dataset_name="MathQA_MAS",
        splits=["Test Dataset"],
    ),
)

View the evaluation results for experiment: 'CoT_GSM8K-d3cfc4b3' at:
https://smith.langchain.com/o/b7e81006-483c-41d8-b361-cc1ea3aa3974/datasets/09bf135d-d2b9-4518-b221-aa8b6ba23acf/compare?selectedSessions=f8d72656-7958-43c1-8d84-97268f1d3279




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.steps,outputs.answer,error,reference.answer,reference.parsed,feedback.is_correct,execution_time,example_id,id
0,"Carlos, Jim and Carrey were at the beach playi...","[explanation='Jim collected 27 seashells, whic...",20.0,,"Jim collected 27 seashells, 5 more than what C...",20,1,4.327589,f93f7632-18e1-4963-9903-71ccf920856d,1ce2d6fa-712c-49c8-8aa9-a7ffc37ebe47
1,Tim wanted to make lemonade for a pool party. ...,[explanation='Tim needs 1 cup of lemon juice p...,36.0,,Tim is going to make one gallon twice as tart ...,36,1,3.440491,f0f3b06a-0d7f-43c6-afc7-0d92c339f1a2,8d62aec5-091f-4381-9aea-87412e7032a0
2,The Kennel house keeps 3 German Shepherds and ...,[explanation='Calculate the total amount of do...,147.0,,The German Shepherds consume 3 x 5 = <<3*5=15>...,147,1,3.180601,dd7cbc36-6ea4-4200-84b8-e412a2f8c7d7,c24ae4b7-bac8-48d8-a2ae-42c7af83a245
3,Indras has 6 letters in her name. Her sister's...,[explanation='Indras has 6 letters in her name...,13.0,,I = <<6=6>>6\nSister = 6/2 + 4 = <<6/2+4=7>>7\...,13,1,2.458625,db08a88c-fe03-4fdc-935a-8279b95a64fc,7e2bf955-70f9-48fe-bac2-88d6e29189b5
4,John decides to do several activities while ou...,[explanation='John spends 6 hours boating.' ou...,8.0,,He spent 6/2=<<6/2=3>>3 hours swimming\nHe spe...,20,0,4.407518,d77fe2aa-f59b-4555-9991-c668dcc0fb70,177c34b3-9a5d-434f-8446-75ed2483f0eb
5,Gerald and Julia divided $100 in the ratio 3:2...,[explanation='Calculate the total parts of the...,50.0,,The $100 was divided into 3 + 2 = <<3+2=5>>5 p...,50,1,2.750981,cf360108-2b45-438a-a478-34ebb76ab5c8,e1f57bfd-f446-4cda-bf02-2d3c58da7e4f
6,Britany records 18 4-minute TikTok videos each...,[explanation='Calculate the total time spent r...,18.8,,First find how long Britany spends recording t...,1128,0,4.874594,c5b18837-4032-40ad-991a-70c249f65a2f,5218f8e9-81e4-4133-bc11-dbfb8a0ec169
7,Ali has four $10 bills and six $20 bills that ...,[explanation='Calculate the total amount of mo...,32.0,,Four $10 bills have a value of 4*$10 =$<<4*10=...,32,1,3.532087,c2433c2d-6923-4bfa-ac8e-bcb019c08c64,2c12c018-a19e-4a81-8048-cf9f4ede9511
8,Brandon's iPhone is four times as old as Ben's...,"[explanation=""Suzy's iPhone is 1 year old."" ou...",8.0,,Ben’s iPhone is 1*2 = <<1*2=2>>2 years old.\nB...,8,1,2.32478,c18f4afe-22bb-4d9d-a4eb-47e5ac26d08e,9e8130ce-4d20-4789-8b21-811a73e6f5a1
9,Jamal's phone can hold 6 times more photograph...,"[explanation=""Let the number of photographs Br...",6.0,,"Since Jamal's phone can hold 1800 photos, whic...",6,1,3.024128,bdd7f842-4137-47b0-b546-670399e32abd,d80d68bb-ca1b-4e88-b2d3-eaba32b7735d


In [8]:
# Collect per-request metrics (latency, cost, token usage, correctness) for the
# chain-of-thought experiment from LangSmith into a DataFrame.
experiment_name = "CoT_GSM8K-d3cfc4b3" 

runs = list(client.list_runs(project_name=experiment_name, execution_order=1))

# Fetch feedback for a batch of runs per request instead of issuing one
# request per run; the "is_correct" score is then looked up by run id.
FEEDBACK_BATCH_SIZE = 50
is_correct_by_run = {}
for start in range(0, len(runs), FEEDBACK_BATCH_SIZE):
    batch_ids = [run.id for run in runs[start:start + FEEDBACK_BATCH_SIZE]]
    for fb in client.list_feedback(run_ids=batch_ids):
        if fb.key == "is_correct":
            is_correct_by_run[str(fb.run_id)] = fb.score

data = []
for run in runs:
    row = {
        "run_id": run.id,
        "error": run.error,
        "latency_sec": (run.end_time - run.start_time).total_seconds() if run.end_time and run.start_time else None,
        "total_cost": run.total_cost,
        "input_tokens": run.prompt_tokens,
        "output_tokens": run.completion_tokens,
        "total_tokens": run.total_tokens,
        # None when the run has no "is_correct" feedback.
        "is_correct": is_correct_by_run.get(str(run.id)),
    }
    data.append(row)
count = len(data)

df_cot_gsm8k = pd.DataFrame(data)
print("Total requests: ",count)
df_cot_gsm8k

Total requests:  50


Unnamed: 0,run_id,error,latency_sec,total_cost,input_tokens,output_tokens,total_tokens,is_correct
0,36d9c73d-12ae-4f3c-a998-3311053038df,,2.729905,0.00038,270,170,440,1.0
1,357d1a6d-0fc5-4ac7-87ce-bc0e5e823270,,4.015743,0.0004988,255,248,503,1.0
2,bd96fc41-9877-4626-8f7b-e5d94af2d525,,3.323122,0.0004436,285,206,491,1.0
3,062b160d-a385-4d37-821e-c1c4264e8fd5,,2.577318,0.0004244,301,190,491,1.0
4,b8b1e6e8-3614-4bd5-864f-8b9b5f9c1b5b,,2.064469,0.0003572,281,153,434,1.0
5,046f7990-c3aa-4a75-a71d-be6884edca05,,2.923681,0.0004036,265,186,451,0.0
6,1e40a740-6118-4ae1-bee2-f11a2375b8d0,,1.946372,0.0002892,255,117,372,1.0
7,fb67ca82-b9f8-4312-88ec-17e4e7a76535,,3.441111,0.0004784,276,230,506,1.0
8,3c5781f4-d9cf-44e2-89e9-ee3766e5241e,,4.44125,0.0005956,249,310,559,1.0
9,97f890ae-68e8-4496-a319-0083665c856f,,3.581353,0.000344,268,148,416,0.0


In [9]:
# Persist both metric tables as CSV files (without the index column) in the
# working directory.
for frame, csv_path in (
    (df_cot_gsm8k, "cot_gsm8k.csv"),
    (df_zeroshot_gsm8k, "zeroshot_gsm8k.csv"),
):
    frame.to_csv(csv_path, index=False)