In [6]:
from mint.config import DATA_DIR
import os
import re
import json
import pandas as pd
from collections import Counter
from dotenv import load_dotenv
from langsmith import Client
from openevals.llm import create_llm_as_judge
from openevals.prompts import CORRECTNESS_PROMPT
from openai import OpenAI
from tabulate import tabulate

### Tải dataset GSM8K

In [2]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. an extra empty line at the end of the file) carry
        # no record; json.loads would raise on them, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

# Resolve the GSM8K directory through the DATA_DIR helper imported from
# mint.config, then read both splits from their JSONL files.
data_dir = DATA_DIR("GSM8K")

test_path = os.path.join(data_dir, 'test.jsonl')
train_path = os.path.join(data_dir, 'train.jsonl')

test_data = load_jsonl(test_path)
train_data = load_jsonl(train_path)

In [3]:
# Load variables from a .env file into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the credentials that Client() reads
# further down — confirm which variables are required.
load_dotenv()

True

In [4]:
# Wrap each split's records in a DataFrame and register both under the
# display names used by the reporting cells.
df_train, df_test = pd.DataFrame(train_data), pd.DataFrame(test_data)

dataset = dict(Train=df_train, Test=df_test)

### Thống kê số lượng mẫu có trong từng tập

In [5]:
# Print the row count of every split, then keep the combined total for the
# ratio computation.
for name, df in dataset.items():
    print(f"Length of {name} Dataset: {len(df)}")

len_total = sum(len(split_df) for split_df in dataset.values())

Length of Train Dataset: 7473
Length of Test Dataset: 1319


### Tìm tỷ lệ giữa tập train và tập test

In [6]:
# Share of each split in the whole dataset, in percent.
# The sizes are read from the loaded frames rather than repeated as literals,
# so the ratios stay correct if the data files change.
train_ratio = (len(df_train) / len_total) * 100
test_ratio = (len(df_test) / len_total) * 100

print(f"Train ratio: {train_ratio:.2f}%")
print(f"Test ratio: {test_ratio:.2f}%")

Train ratio: 85.00%
Test ratio: 15.00%


### Tạo dataset

In [7]:
# Read the saved zero-shot run and keep only its list of incorrect results
# (an empty list when the key is missing).
file_path = 'zero-shot_responses.json'

with open(file_path, encoding='utf-8') as f:
    responses_data = json.loads(f.read())

incorrect_results = responses_data.get('incorrect_results', [])

In [8]:
df_incorrect_result = pd.DataFrame(incorrect_results)

# Inner-join the incorrect results with the test split on the question text,
# drop the parsed/true answer columns and expose the raw model output as
# "ai_response".
matched = (
    df_incorrect_result
    .merge(df_test, on=["question"], how="inner")
    .drop(columns=["parsed_answer", "true_answer"])
    .rename(columns={"raw_answer": "ai_response"})
)

In [9]:
# Dump every matched record as question / reference answer / model response,
# separated by a dashed rule.
matched_list = matched[["question", "answer", "ai_response"]].to_dict(orient="records")
separator = "-" * 100
sections = (
    ("Question:\n", "question"),
    ("Answer:\n", "answer"),
    ("AI Response:\n", "ai_response"),
)
for item in matched_list:
    for label, key in sections:
        print(label, item[key], "\n")
    print(separator, "\n")

Question:
 Gene is sewing a quilt out of old souvenir t-shirts. He has one shirt from each vacation he has been on. Every shirt is its own quilt block. Each row is made of blocks from a different year of vacations. He goes on four vacations a year and has been vacationing since he was 23 years old. He is now 34. How many quilt blocks does he have in total? 

Answer:
 Gene has been on 34 - 23 = <<34-23=11>>11 years of vacations, so he has 11 rows of blocks.
He goes on 4 vacations a year, so he has been on 4 * 11 = <<4*11=44>>44 vacations.
Gene has 1 shirt from each vacation, so he has 44 * 1 = <<44*1=44>>44 quilt blocks in all.
#### 44 

AI Response:
 Let's break down the problem step-by-step:

1. Gene has been vacationing since he was 23 years old.
2. He is now 34 years old.
3. Each year, he goes on 4 vacations.
4. Each vacation corresponds to one t-shirt, which corresponds to one quilt block.
5. Each row in the quilt corresponds to a different year of vacations.

First, calculate how 

In [10]:
def extract_operators(text):
    """List the arithmetic operators used inside ``<<...>>`` annotations.

    Every ``+``, ``-``, ``*`` and ``/`` character found between ``<<`` and
    ``>>`` is collected in order of appearance. The whole annotation is
    scanned, including the part to the right of ``=``, so the sign of a
    negative result is also reported as ``-``.

    Args:
        text: Answer text that may contain ``<<expression=result>>`` spans.

    Returns:
        The operators joined by ", " (an empty string when none are found).
    """
    operators = []
    for annotation in re.finditer(r"<<(.*?)>>", text):
        operators.extend(re.findall(r"[+\-*/]", annotation.group(1)))
    return ", ".join(operators)

def categorize_main_operator(op_str):
    """Classify a comma-separated operator string by its dominant operator.

    Args:
        op_str: Operators separated by commas, e.g. "-, *, *" (the format
            returned by ``extract_operators``). May be empty or None.

    Returns:
        "unknown" when the input holds no operator at all; "add", "sub",
        "mul" or "div" when one of ``+ - * /`` makes up strictly more than
        half of the operators; "other" when some different token dominates;
        "mixed" otherwise.
    """
    if not op_str:
        return "unknown"

    # str.split never returns an empty list, so empty tokens are filtered out
    # explicitly; whitespace- or comma-only input then counts as "unknown".
    pieces = (piece.strip() for piece in op_str.replace(" ", "").split(","))
    ops = [piece for piece in pieces if piece]
    if not ops:
        return "unknown"

    most_common_op, freq = Counter(ops).most_common(1)[0]

    # A strict majority is required; an exact 50/50 split is "mixed".
    if freq / len(ops) <= 0.5:
        return "mixed"

    return {
        "+": "add",
        "-": "sub",
        "*": "mul",
        "/": "div"
    }.get(most_common_op, "other")

# Tag every matched row with the operators of its reference answer and with
# the dominant operation type derived from them.
matched["operate_count"] = matched["answer"].map(extract_operators)
matched["operation_type"] = matched["operate_count"].map(categorize_main_operator)
matched.head(21)

Unnamed: 0,question,ai_response,answer,operate_count,operation_type
0,Gene is sewing a quilt out of old souvenir t-s...,Let's break down the problem step-by-step:\n\n...,Gene has been on 34 - 23 = <<34-23=11>>11 year...,"-, *, *",mul
1,Lee used to be able to run the 400-meter hurdl...,Let's denote:\n\n- \( G \) = Gerald's original...,If Lee runs the 400-meter hurdles in 38 second...,"+, *, -",mixed
2,Adrien's total salary was 30 percent higher th...,Let's define the variables and analyze the pro...,Since Adrien was earning $40000 four years ago...,"/, *, /, *, +, +",mixed
3,Jean is two years older than Mark. Two years ...,Let's define the variables:\n\n- Let Mark's cu...,2 years ago Jan was 30-2=<<30-2=28>>28 years o...,"-, +, /, +, +",add
4,Mandy owes Benedict $100. They agreed to have ...,To calculate how much Mandy should pay Benedic...,The monthly interest is $100 x 2/100 = $<<100*...,"*, /, *, +",mixed
5,Steve decides to start eating more tomatoes an...,Let's break down the problem step-by-step:\n\n...,His girlfriend eats 6/2=<<6/2=3>>3 tomatoes pe...,"/, +, *, /",mixed
6,Dolly has two books. Pandora has one. If both ...,"Dolly has 2 books, Pandora has 1 book.\n\nIf b...",There are 2 + 1 = <<2+1=3>>3 books in total.\n...,"+, *",mixed
7,"Griffin had 24 french fries, but Kyle took 5 o...",Let's define the variables and analyze the pro...,"Griffin had 24 french fries, but Kyle took 5 o...","*, -",mixed
8,Ten stalls have 20 cows each. Mr. Sylas buys 4...,"First, let's analyze the problem step-by-step:...",When Mr. Sylas puts an equal number of the cow...,"/, *, *, +",mixed
9,"Three friends: Mike, Jim, and Tony decided to ...",Let's analyze the problem step-by-step:\n\n1. ...,After 3 rounds Jim has 21 points - 3 points = ...,"-, *, +, +, +",add


In [11]:
# extract_operators and categorize_main_operator are defined once, earlier in
# this notebook; the identical copies that used to sit in this cell only
# shadowed those definitions and have been removed.
df_test["operate_count"] = df_test["answer"].apply(extract_operators)
df_test["operation_type"] = df_test["operate_count"].apply(categorize_main_operator)

type_counts_test = df_test["operation_type"].value_counts()

# Share of each operation type in the test split, in percent.
type_percent_test = (type_counts_test / type_counts_test.sum()) * 100

type_percent_test = type_percent_test.round(2)

print(type_percent_test)

operation_type
mixed      51.25
mul        23.96
add        13.27
sub         4.85
div         4.47
unknown     2.20
Name: count, dtype: float64


In [12]:
# Scale each percentage to a total of 50 rows and print the rounded quota.
for op_type_test, percent_test in type_percent_test.items():
    quota = round(50 /100 * percent_test)
    print(f"{op_type_test}: {quota} ")

mixed: 26 
mul: 12 
add: 7 
sub: 2 
div: 2 
unknown: 1 


In [13]:
# Remove the row labelled 19 and renumber the index from 0.
# NOTE(review): why this particular row is excluded is not visible here — confirm.
# This cell is not idempotent: each re-run drops whichever row is labelled 19
# at that moment, so run it once per freshly built `matched`.
matched = matched.drop(index=19).reset_index(drop=True)
matched.head(20)

Unnamed: 0,question,ai_response,answer,operate_count,operation_type
0,Gene is sewing a quilt out of old souvenir t-s...,Let's break down the problem step-by-step:\n\n...,Gene has been on 34 - 23 = <<34-23=11>>11 year...,"-, *, *",mul
1,Lee used to be able to run the 400-meter hurdl...,Let's denote:\n\n- \( G \) = Gerald's original...,If Lee runs the 400-meter hurdles in 38 second...,"+, *, -",mixed
2,Adrien's total salary was 30 percent higher th...,Let's define the variables and analyze the pro...,Since Adrien was earning $40000 four years ago...,"/, *, /, *, +, +",mixed
3,Jean is two years older than Mark. Two years ...,Let's define the variables:\n\n- Let Mark's cu...,2 years ago Jan was 30-2=<<30-2=28>>28 years o...,"-, +, /, +, +",add
4,Mandy owes Benedict $100. They agreed to have ...,To calculate how much Mandy should pay Benedic...,The monthly interest is $100 x 2/100 = $<<100*...,"*, /, *, +",mixed
5,Steve decides to start eating more tomatoes an...,Let's break down the problem step-by-step:\n\n...,His girlfriend eats 6/2=<<6/2=3>>3 tomatoes pe...,"/, +, *, /",mixed
6,Dolly has two books. Pandora has one. If both ...,"Dolly has 2 books, Pandora has 1 book.\n\nIf b...",There are 2 + 1 = <<2+1=3>>3 books in total.\n...,"+, *",mixed
7,"Griffin had 24 french fries, but Kyle took 5 o...",Let's define the variables and analyze the pro...,"Griffin had 24 french fries, but Kyle took 5 o...","*, -",mixed
8,Ten stalls have 20 cows each. Mr. Sylas buys 4...,"First, let's analyze the problem step-by-step:...",When Mr. Sylas puts an equal number of the cow...,"/, *, *, +",mixed
9,"Three friends: Mike, Jim, and Tony decided to ...",Let's analyze the problem step-by-step:\n\n1. ...,After 3 rounds Jim has 21 points - 3 points = ...,"-, *, +, +, +",add


In [14]:
# Number of rows of each operation type currently present in `matched`.
type_counts = matched.loc[:, "operation_type"].value_counts()

print(type_counts)

operation_type
mixed    14
mul       3
add       3
Name: count, dtype: int64


In [15]:
# Questions already present in `matched`; they must not be drawn again.
used_questions = set(matched["question"])


def add_samples(sample_size, op_type):
    """Append randomly drawn test questions of one operation type to ``matched``.

    Reads the notebook-level ``matched``, ``df_test`` and ``used_questions``
    and records the drawn questions in ``used_questions``.

    Args:
        sample_size: Number of rows to draw; values below zero are treated
            as zero.
        op_type: Value of the "operation_type" column to draw from.

    Returns:
        A new DataFrame holding the current ``matched`` rows followed by the
        drawn rows, re-indexed from 0.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when ``sample_size``
            exceeds the number of unused rows of that type.
    """
    filtered_df = df_test[
        (df_test["operation_type"] == op_type) &
        (~df_test["question"].isin(used_questions))
    ]

    # A type that is already at or above its quota gives a non-positive
    # shortfall; DataFrame.sample rejects negative sizes, so draw nothing.
    sample_size = max(sample_size, 0)

    samples = filtered_df.sample(sample_size, random_state=42)
    used_questions.update(samples["question"])

    return pd.concat([matched, samples], ignore_index=True)


matched = matched.copy()

for op_type_test, percent_test in type_percent_test.items():
    # Quota of this type in a 50-row set that mirrors the test distribution.
    n = round(50 / 100 * percent_test)
    # Count from the current frame (not from an earlier snapshot) so that
    # re-running this cell does not add rows beyond the quota.
    count = int((matched["operation_type"] == op_type_test).sum())
    sample_size = n - count
    matched = add_samples(sample_size, op_type_test)

matched

Unnamed: 0,question,ai_response,answer,operate_count,operation_type
0,Gene is sewing a quilt out of old souvenir t-s...,Let's break down the problem step-by-step:\n\n...,Gene has been on 34 - 23 = <<34-23=11>>11 year...,"-, *, *",mul
1,Lee used to be able to run the 400-meter hurdl...,Let's denote:\n\n- \( G \) = Gerald's original...,If Lee runs the 400-meter hurdles in 38 second...,"+, *, -",mixed
2,Adrien's total salary was 30 percent higher th...,Let's define the variables and analyze the pro...,Since Adrien was earning $40000 four years ago...,"/, *, /, *, +, +",mixed
3,Jean is two years older than Mark. Two years ...,Let's define the variables:\n\n- Let Mark's cu...,2 years ago Jan was 30-2=<<30-2=28>>28 years o...,"-, +, /, +, +",add
4,Mandy owes Benedict $100. They agreed to have ...,To calculate how much Mandy should pay Benedic...,The monthly interest is $100 x 2/100 = $<<100*...,"*, /, *, +",mixed
5,Steve decides to start eating more tomatoes an...,Let's break down the problem step-by-step:\n\n...,His girlfriend eats 6/2=<<6/2=3>>3 tomatoes pe...,"/, +, *, /",mixed
6,Dolly has two books. Pandora has one. If both ...,"Dolly has 2 books, Pandora has 1 book.\n\nIf b...",There are 2 + 1 = <<2+1=3>>3 books in total.\n...,"+, *",mixed
7,"Griffin had 24 french fries, but Kyle took 5 o...",Let's define the variables and analyze the pro...,"Griffin had 24 french fries, but Kyle took 5 o...","*, -",mixed
8,Ten stalls have 20 cows each. Mr. Sylas buys 4...,"First, let's analyze the problem step-by-step:...",When Mr. Sylas puts an equal number of the cow...,"/, *, *, +",mixed
9,"Three friends: Mike, Jim, and Tony decided to ...",Let's analyze the problem step-by-step:\n\n1. ...,After 3 rounds Jim has 21 points - 3 points = ...,"-, *, +, +, +",add


In [29]:
# Upload every row of `matched` to the LangSmith dataset below: the question
# goes in as the example input, the reference answer as the example output.
client = Client()

dataset_id = "09bf135d-d2b9-4518-b221-aa8b6ba23acf"

matched_list = matched.to_dict(orient="records")

inputs, outputs = [], []
for item in matched_list:
    inputs.append({"question": item["question"]})
    outputs.append({"answer": item["answer"]})

client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset_id)


{'example_ids': ['6b5c24b2-431a-43e7-879b-e90f18af2a6d',
  'bb4a6058-484d-42a0-8e6f-3b70c3b65bf7',
  '86362124-e94b-48b0-a037-d5086de969ae',
  '31b51189-25a6-4397-a9a9-5f4c9f71126e',
  '5387df10-3f64-4a18-bd3d-62cd931311ac',
  '29d9a9cd-834c-4842-98e4-e39e65f4d11a',
  'd77fe2aa-f59b-4555-9991-c668dcc0fb70',
  '0b4e95f7-0e0c-4071-b0f3-af30ba6ad922',
  '9a21d71c-2762-4e32-a03a-3cb2d86d73cf',
  'c2433c2d-6923-4bfa-ac8e-bcb019c08c64',
  'ac62c028-80d5-418f-9f7e-4dba3689f733',
  '71346934-7891-45d4-bd34-1bcfcfd5b904',
  '88a286ea-f55a-4aa4-8e66-31b338468e62',
  '9df3f5bd-6018-478e-9e3e-85cee872a3f8',
  '2591c2d1-4daf-44dd-876e-a404b2381669',
  'b0ee41ea-3118-475a-99fa-485a14e61e70',
  '00694dd9-f529-4683-a719-5967f2ed11e0',
  '4f27e006-97b0-4b20-8543-128f721d3a03',
  'b7873120-bee5-4c12-b713-331cd9476df3',
  'db08a88c-fe03-4fdc-935a-8279b95a64fc',
  'f93f7632-18e1-4963-9903-71ccf920856d',
  'c5b18837-4032-40ad-991a-70c249f65a2f',
  '88816b9d-4c95-429b-916a-580fb84e3651',
  'bdd7f842-4137-47

In [24]:
# Classify every training answer by its operators, then compute the share of
# each operation type in percent (two decimals).
df_train["operate_count"] = df_train["answer"].map(extract_operators)
df_train["operation_type"] = df_train["operate_count"].map(categorize_main_operator)

type_counts_train = df_train["operation_type"].value_counts()
type_percent_train = (type_counts_train / type_counts_train.sum() * 100).round(2)

print("Số lượng mẫu cần lấy từ tập Train: ", round(50 / 0.15 * 0.85))
print(type_percent_train)

Số lượng mẫu cần lấy từ tập Train:  283
operation_type
mixed      51.83
mul        22.64
add        13.25
div         5.35
sub         4.47
unknown     2.46
Name: count, dtype: float64


In [25]:
# Scale each percentage to a total of 283 rows and print the rounded quota.
for op_type_train, percent_train in type_percent_train.items():
    quota_train = round(283 /100 * percent_train)
    print(f"{op_type_train}: {quota_train} ")

mixed: 147 
mul: 64 
add: 37 
div: 15 
sub: 13 
unknown: 7 


In [26]:
used_questions = set()
train_samples_list = []


def add_samples_train(sample_size, op_type):
    """Draw unused training questions of one operation type.

    The drawn rows are appended to the notebook-level ``train_samples_list``
    and their questions are recorded in ``used_questions``.

    Args:
        sample_size: Number of rows to draw.
        op_type: Value of the "operation_type" column to draw from.

    Returns:
        All rows drawn so far, concatenated and re-indexed from 0.
    """
    is_type = df_train["operation_type"] == op_type
    is_unused = ~df_train["question"].isin(used_questions)
    picked = df_train[is_type & is_unused].sample(sample_size, random_state=42)

    used_questions.update(picked["question"])
    train_samples_list.append(picked)

    return pd.concat(train_samples_list, ignore_index=True)


# One draw per operation type, sized as that type's share of 283 rows.
for op_type, percent in type_percent_train.items():
    train_samples = add_samples_train(round(283 / 100 * percent), op_type)

print("Tổng số mẫu trong train_samples:", len(train_samples))

Tổng số mẫu trong train_samples: 283


In [27]:
# Upload the sampled training rows to the LangSmith dataset below: the
# question goes in as the example input, the reference answer as the output.
dataset_id = "62a0d112-9d0b-4425-8125-39ee93bb233f"

train_samples_list = train_samples.to_dict(orient="records")

inputs, outputs = [], []
for item in train_samples_list:
    inputs.append({"question": item["question"]})
    outputs.append({"answer": item["answer"]})

client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset_id)

{'example_ids': ['2356645c-7011-4310-a20d-a9fcca48d30a',
  '63330dfb-08af-4d92-85c5-a841e4b5cecd',
  'fad1c0c7-2c9a-4a99-ad43-88daeedfe8b9',
  'd35c949f-a7a7-484d-b95d-fac55d243381',
  'ade50585-38cf-4cf4-a55c-1bcaf62d18b8',
  '4e0f22b1-1e75-46f3-918c-060127f82ba0',
  '9fa368ea-8fdc-4625-b7f0-9382b469efc6',
  '1b2f0494-150d-40a7-b924-33e923bd7715',
  'd6b33a97-b154-47e1-89fc-45f4a7b66eaf',
  'c4aa49e2-0e17-4e83-9bc7-1f9203beca6c',
  '75f70fdd-f1b8-44c4-b943-f8b3782a9faf',
  '4f18116b-e57c-4b8d-9e75-f02024b9a707',
  '7bbc17c0-e9fb-4693-933e-dc6a3d946491',
  'ea71cdc2-7f22-42fa-a9fd-0eb486da8135',
  '69ac3426-b239-4b8b-85d5-76df9577b465',
  '9e5adb1e-9b22-40cf-972a-270b29077c2b',
  'ed19e954-4326-475a-b3fc-aba83e62a122',
  'cd2ebbaa-6cd4-48b3-a873-4edcd78d2302',
  '4c72bf7c-2240-461f-894c-1b24a4f6e809',
  '72855a39-d2bb-4461-98d1-4323720b6f71',
  'f41af350-4c70-475f-89c4-501cbe8bc4ee',
  '14748295-6ef4-4486-8ee5-055695a87747',
  '97755d81-f5c3-441c-b1b2-94ba4a12ce7a',
  'e925a326-f11c-46

In [17]:
def extract_final_result(text):
    """Parse the number that follows the last "####" marker in an answer.

    Args:
        text: Answer text of the form "... #### <number>"; may be None.

    Returns:
        The value after the last marker as a float, with "," thousands
        separators removed, or None when ``text`` is not a string or
        contains no marker.

    Raises:
        ValueError: If the text after the marker is not a valid number.
    """
    marker = "####"
    # Missing or non-text answers have nothing to parse.
    if not isinstance(text, str) or marker not in text:
        return None
    return float(text.split(marker)[-1].strip().replace(',', ''))

In [30]:
# Add a numeric "parsed" field, taken from the text after "####" in the
# reference answer, to every example of the MathQA_MAS dataset.
client = Client()
examples = client.list_examples(dataset_name="MathQA_MAS")

for example in examples:
    # An example without outputs or without an "answer" has nothing to parse;
    # skip it instead of failing in the middle of the update run.
    output = (example.outputs or {}).get("answer")
    if output is None:
        continue

    new_output = example.outputs.copy()
    new_output["parsed"] = extract_final_result(output)

    client.update_example(
        example_id=example.id,
        inputs=example.inputs,
        outputs=new_output,
    )