In [1]:
import json
from prettytable import PrettyTable
import json
import os
import random
import pandas as pd

# Read the jsonl file and convert it to a JSON list
def jsonl_to_json_list(jsonl_file_path):
    """Load a JSON Lines file into a list of parsed objects.

    Args:
        jsonl_file_path: Path of a UTF-8 text file with one JSON document per line.

    Returns:
        list: The parsed value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # A blank line (e.g. an extra newline at the end of the file) holds no
            # record, and json.loads('') would raise, so skip it.
            if not stripped:
                continue
            json_list.append(json.loads(stripped))  # Parse each line as JSON

    return json_list

# Save the JSON list to a file
def save_as_json(json_list, output_file_path):
    """Serialize ``json_list`` into ``output_file_path`` as a single JSON document.

    The file is opened for writing as UTF-8 (replacing any existing content) and
    the document is pretty-printed with a 4-space indent.
    """
    with open(output_file_path, mode='w', encoding='utf-8') as sink:
        json.dump(json_list, fp=sink, indent=4)

def save_as_jsonl(json_list, output_file_path):
    """Write ``json_list`` to ``output_file_path`` in JSON Lines form.

    Each element is serialized as JSON and followed by a newline, so the file
    holds one record per line. The file is opened for writing as UTF-8.
    """
    with open(output_file_path, mode='w', encoding='utf-8') as sink:
        for record in json_list:
            json.dump(record, sink)
            sink.write('\n')

In [45]:
def load_json(file_path):
    """Read the UTF-8 file at ``file_path`` and return its parsed JSON content."""
    with open(file_path, encoding='utf-8') as handle:
        return json.load(handle)

def deduplicate_data(data):
    """Drop items whose ``'realidx'`` value has already been seen.

    Returns a new list that keeps the first item for each distinct ``'realidx'``,
    in the order those first items appear in ``data``.
    """
    # Dicts keep insertion order, and setdefault never overwrites an existing
    # key, so this retains exactly the first item per realidx.
    first_by_realidx = {}
    for record in data:
        first_by_realidx.setdefault(record['realidx'], record)
    return list(first_by_realidx.values())

def calculate_accuracy(data):
    """Return the fraction of items whose prediction equals the reference answer.

    Args:
        data: List of dicts. Each is expected to carry ``'answer_idx'`` and
            ``'predicted_answer'``; ``'realidx'`` is used only for reporting.

    Returns:
        The number of items where ``answer_idx == predicted_answer`` divided by
        ``len(data)``, or 0 when ``data`` is empty.

    Items without a ``'predicted_answer'`` key have their ``'realidx'`` printed
    and are counted as incorrect; they still count toward the denominator.
    """
    total_predictions = len(data)
    if total_predictions == 0:
        return 0

    correct_predictions = 0
    for item in data:
        if 'predicted_answer' not in item:
            # Report the item, then skip the comparison: indexing the missing
            # key below would abort the whole computation with a KeyError.
            print(item.get('realidx'))
            continue
        if item['answer_idx'] == item['predicted_answer']:
            correct_predictions += 1
    return correct_predictions / total_predictions

def calculate_cost_from_token_usage(data, model):
    """Return the average cost per item for ``model``, based on token usage.

    Args:
        data: List of dicts, each with a ``'token_usage'`` dict holding
            ``'prompt_tokens'`` and ``'completion_tokens'``.
        model: Model name used to select the price pair.

    Returns:
        The summed cost of all items divided by ``len(data)``. Returns 0 when
        ``data`` is empty. A model with no price entry contributes no cost, so
        the result is 0.0 for it.
    """
    # Price per 1,000,000 tokens as (prompt, completion).
    pricing = {
        'gpt-4o-mini': (0.15, 0.6),
        'gpt-4o': (2.5, 10),
        'o3-mini': (1.1, 4.4),
        'o1-mini': (1.1, 4.4),
    }

    if not data:
        # Avoid dividing by zero; mirrors calculate_accuracy's empty-input result.
        return 0
    if model not in pricing:
        # Unpriced models accumulate nothing; token_usage is not even read.
        return 0.0

    prompt_price, completion_price = pricing[model]
    total_cost = 0
    for item in data:
        usage = item['token_usage']
        total_cost += (
            usage['prompt_tokens'] * prompt_price / 1000000
            + usage['completion_tokens'] * completion_price / 1000000
        )
    return total_cost / len(data)

def calculate_time_from_data(data):
    """Return the mean of the ``'time_elapsed'`` values across ``data``.

    Args:
        data: List of dicts, each with a numeric ``'time_elapsed'`` entry.

    Returns:
        The sum of ``'time_elapsed'`` divided by ``len(data)``, or 0 when
        ``data`` is empty.
    """
    if not data:
        # Avoid dividing by zero on an empty result set.
        return 0
    return sum(item['time_elapsed'] for item in data) / len(data)

# Benchmarks to report on; each maps to the dataset splits evaluated for it.
tasks = {
    'medqa': ['test_hard', 'test'],
    'pubmedqa': ['test_hard', 'test'],
    'medmcqa': ['test_hard', 'test'],
    'medbullets': ['test_hard', 'test'],
    'mmlu': ['test_hard', 'test'],
    'mmlu-pro': ['test_hard', 'test'],
    'afrimedqa': ['test_hard', 'test'],
}
# Models and prompting methods whose result files are aggregated below.
models = ['o3-mini', 'gpt-4o-mini', 'gpt-4o', 'o1-mini']
methods = ['zero_shot']

table = PrettyTable()
table.field_names = [
    "Model",
    "Task",
    "Subtask",
    "Method",
    "Accuracy",
    "Cost per sample(USD)",
    "Time per sample(s)",
    "Total Number",
]

# Running sum of the cost over every result file processed.
total_cost = 0

for task, subtasks in tasks.items():
    for subtask in subtasks:
        for model in models:
            for method in methods:
                # One result file per (task, model, subtask, method) combination.
                file_path = f'./output/{task}/{model}-{task}-{subtask}-{method}.json'
                data = load_json(file_path)
                deduplicated_data = deduplicate_data(data)

                accuracy = calculate_accuracy(deduplicated_data)
                total = len(deduplicated_data)
                cost_per_sample = calculate_cost_from_token_usage(deduplicated_data, model)
                # Per-sample cost times the sample count gives this file's total cost.
                total_cost += cost_per_sample * total
                time_per_sample = calculate_time_from_data(deduplicated_data)

                row = [
                    model,
                    task,
                    subtask,
                    method,
                    f"{accuracy * 100:.1f}%",
                    cost_per_sample,
                    time_per_sample,
                    total,
                ]
                table.add_row(row)

print(table)
print(f"\nTotal cost of experiment: ${total_cost:.2f}")

+-------------+------------+-----------+-----------+----------+------------------------+--------------------+--------------+
|    Model    |    Task    |  Subtask  |   Method  | Accuracy |  Cost per sample(USD)  | Time per sample(s) | Total Number |
+-------------+------------+-----------+-----------+----------+------------------------+--------------------+--------------+
|   o3-mini   |   medqa    | test_hard | zero_shot |  80.8%   |  0.002662619205298013  | 12.060769792424132 |     302      |
| gpt-4o-mini |   medqa    | test_hard | zero_shot |   1.7%   | 4.426688741721854e-05  | 0.7814563741747117 |     302      |
|    gpt-4o   |   medqa    | test_hard | zero_shot |  66.2%   |  0.000744801324503311  | 0.8367288538951747 |     302      |
|   o1-mini   |   medqa    | test_hard | zero_shot |  75.5%   | 0.0029081304635761585  | 5.3800361377513966 |     302      |
|   o3-mini   |   medqa    |    test   | zero_shot |  92.7%   | 0.0016330031421838173  | 7.229265965726182  |     1273     |


In [2]:
import glob
import shutil
import os

def rename_subtask_in_filename(path, old, new):
    """Return ``path`` with ``old`` replaced by ``new`` in the file name only.

    The directory part is left untouched, so a directory whose name happens to
    contain ``old`` is not rewritten.
    """
    directory, filename = os.path.split(path)
    return os.path.join(directory, filename.replace(old, new))


# Match every JSON file with "test_hard" in its filename, exactly one directory
# level below ./output. The pattern has no "**", so glob's recursive flag would
# have no effect and is not passed.
pattern = os.path.join('.', 'output', '*', '*test_hard*.json')
files = glob.glob(pattern)
print(f"Found {len(files)} file(s) matching pattern: {pattern}")

for file_path in files:
    # Generate new filenames by replacing "test_hard" with "test" and "test_good"
    new_file_test = rename_subtask_in_filename(file_path, "test_hard", "test")
    new_file_test_good = rename_subtask_in_filename(file_path, "test_hard", "test_good")

    # Copy the original file to the new file for "test".
    # NOTE: shutil.copy overwrites the destination if it already exists.
    shutil.copy(file_path, new_file_test)

    # Copy the original file to the new file for "test_good"
    shutil.copy(file_path, new_file_test_good)

    print(f"Copied {file_path} to {new_file_test} and {new_file_test_good}")

Found 28 file(s) matching pattern: ./output/*/*test_hard*.json
Copied ./output/mmlu-pro/o3-mini-mmlu-pro-test_hard-zero_shot.json to ./output/mmlu-pro/o3-mini-mmlu-pro-test-zero_shot.json and ./output/mmlu-pro/o3-mini-mmlu-pro-test_good-zero_shot.json
Copied ./output/mmlu-pro/gpt-4o-mini-mmlu-pro-test_hard-zero_shot.json to ./output/mmlu-pro/gpt-4o-mini-mmlu-pro-test-zero_shot.json and ./output/mmlu-pro/gpt-4o-mini-mmlu-pro-test_good-zero_shot.json
Copied ./output/mmlu-pro/gpt-4o-mmlu-pro-test_hard-zero_shot.json to ./output/mmlu-pro/gpt-4o-mmlu-pro-test-zero_shot.json and ./output/mmlu-pro/gpt-4o-mmlu-pro-test_good-zero_shot.json
Copied ./output/mmlu-pro/o1-mini-mmlu-pro-test_hard-zero_shot.json to ./output/mmlu-pro/o1-mini-mmlu-pro-test-zero_shot.json and ./output/mmlu-pro/o1-mini-mmlu-pro-test_good-zero_shot.json
Copied ./output/pubmedqa/gpt-4o-mini-pubmedqa-test_hard-zero_shot.json to ./output/pubmedqa/gpt-4o-mini-pubmedqa-test-zero_shot.json and ./output/pubmedqa/gpt-4o-mini-pubme