In [1]:
import json
from prettytable import PrettyTable
import json
import os
import random
import pandas as pd

# Read the jsonl file and convert it to a JSON list
def jsonl_to_json_list(jsonl_file_path):
    """Parse a JSON Lines file into a list of Python objects.

    Args:
        jsonl_file_path: Path of a UTF-8 text file holding one JSON
            document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; json.loads('') would raise on them, so skip them.
            if not stripped:
                continue
            json_list.append(json.loads(stripped))
    return json_list

# Save the JSON list to a file
def save_as_json(json_list, output_file_path):
    """Write ``json_list`` to ``output_file_path`` as 4-space-indented JSON.

    The file is opened for writing as UTF-8, so any existing content is
    replaced.
    """
    serialized = json.dumps(json_list, indent=4)
    with open(output_file_path, 'w', encoding='utf-8') as sink:
        sink.write(serialized)

def save_as_jsonl(json_list, output_file_path):
    """Write each element of ``json_list`` as one JSON document per line.

    The file is opened for writing as UTF-8, so any existing content is
    replaced. Every record, including the last, is followed by a newline.
    """
    with open(output_file_path, 'w', encoding='utf-8') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in json_list)

In [4]:
def load_json(file_path):
    """Read the UTF-8 file at ``file_path`` and return its parsed JSON content."""
    with open(file_path, 'r', encoding='utf-8') as source:
        return json.load(source)

def deduplicate_data(data):
    """Return the items of ``data`` with duplicate ``'id'`` values removed.

    For each distinct ``item['id']`` only the first item encountered is
    kept, and the kept items stay in their original relative order.
    """
    first_by_id = {}
    for record in data:
        # setdefault stores the record only when its id is new, so a later
        # duplicate never replaces the first occurrence.
        first_by_id.setdefault(record['id'], record)
    return list(first_by_id.values())

def calculate_accuracy(data):
    """Return the fraction of items whose prediction equals the gold answer.

    An item counts as correct when ``item['gold_answer']`` equals
    ``item['pred_answer']``. Empty input yields ``0``.
    """
    sample_count = len(data)
    if sample_count <= 0:
        return 0
    hits = sum(
        1 for record in data
        if record['gold_answer'] == record['pred_answer']
    )
    return hits / sample_count

def calculate_cost_from_token_usage(data, model):
    """Return the average cost per sample implied by recorded token usage.

    Args:
        data: Sequence of items, each providing ``'total_prompt_tokens'``
            and ``'total_completion_tokens'``.
        model: Model name selecting the price pair; ``'gpt-4o-mini'`` and
            ``'gpt-4o'`` are recognised.

    Returns:
        The summed cost divided by ``len(data)``. Returns ``0`` for empty
        input (matching ``calculate_accuracy``) instead of raising
        ``ZeroDivisionError``. An unrecognised ``model`` contributes no
        cost, so the result is ``0.0``.
    """
    # (prompt, completion) price applied per 1,000,000 tokens.
    price_per_million = {
        'gpt-4o-mini': (0.15, 0.6),
        'gpt-4o': (2.5, 10),
    }

    sample_count = len(data)
    if sample_count == 0:
        return 0
    if model not in price_per_million:
        # Preserve the historical behaviour: unknown models cost nothing
        # and the token fields are never read.
        return 0 / sample_count

    prompt_price, completion_price = price_per_million[model]
    total_cost = 0
    for item in data:
        total_cost += (
            item['total_prompt_tokens'] * prompt_price / 1000000
            + item['total_completion_tokens'] * completion_price / 1000000
        )
    return total_cost / sample_count

def calculate_time_from_data(data):
    """Return the mean of ``item['total_time']`` over ``data``.

    Args:
        data: Sequence of items, each providing a numeric ``'total_time'``.

    Returns:
        The sum of the ``'total_time'`` values divided by ``len(data)``.
        Returns ``0`` for empty input (matching ``calculate_accuracy``)
        instead of raising ``ZeroDivisionError``.
    """
    sample_count = len(data)
    if sample_count == 0:
        return 0
    total_time = 0
    for item in data:
        total_time += item['total_time']
    return total_time / sample_count

# Benchmarks to report, each mapped to the subtasks evaluated for it.
# Commented-out entries are excluded from this run.
tasks = {
    'medqa': ['test_hard'],
    # 'pubmedqa': ['test_hard'],
    'medmcqa': ['test_hard'],
    # 'medbullets': ['test_hard'],
    # 'mmlu': ['test_hard'],
    # 'mmlu-pro': ['test_hard'],
}
models = ['gpt-4o-mini', 'gpt-4o']
methods = ['syn_verif']

table = PrettyTable()
table.field_names = ["Model", "Task", "Subtask", "Method", "Accuracy", "Cost per sample(USD)", "Time per sample(s)", "Total Number"]

# Running sum of the cost across every (model, task, subtask, method) run.
total_cost = 0

# Every combination to report, ordered model -> task -> subtask -> method.
run_grid = [
    (model, task, subtask, method)
    for model in models
    for task in tasks
    for subtask in tasks[task]
    for method in methods
]

for model, task, subtask, method in run_grid:
    file_path = f'./output/{task}/{model}-{task}-{subtask}-{method}.json'
    data = load_json(file_path)
    deduplicated_data = deduplicate_data(data)
    total = len(deduplicated_data)
    accuracy = calculate_accuracy(deduplicated_data)
    cost_per_sample = calculate_cost_from_token_usage(deduplicated_data, model)
    # cost_per_sample is an average, so scale it back up by the sample
    # count to get this run's contribution to the overall total.
    total_cost += cost_per_sample * total
    time_per_sample = calculate_time_from_data(deduplicated_data)
    table.add_row([
        model, task, subtask, method,
        f"{accuracy * 100:.1f}%",
        cost_per_sample,
        time_per_sample,
        total
    ])

print(table)
print(f"\nTotal cost of experiment: ${total_cost:.2f}")

+-------------+---------+-----------+-----------+----------+-----------------------+--------------------+--------------+
|    Model    |   Task  |  Subtask  |   Method  | Accuracy |  Cost per sample(USD) | Time per sample(s) | Total Number |
+-------------+---------+-----------+-----------+----------+-----------------------+--------------------+--------------+
| gpt-4o-mini |  medqa  | test_hard | syn_verif |  43.4%   | 0.0064627395695364236 | 61.51443644075204  |     302      |
| gpt-4o-mini | medmcqa | test_hard | syn_verif |  32.0%   |  0.005034318893756836 |  44.7301887677246  |     913      |
|    gpt-4o   |  medqa  | test_hard | syn_verif |  68.0%   |  0.09486659999999998  | 52.70672381480535  |     300      |
|    gpt-4o   | medmcqa | test_hard | syn_verif |  49.4%   |  0.07847013643067843  | 48.72816071010972  |     678      |
+-------------+---------+-----------+-----------+----------+-----------------------+--------------------+--------------+

Total cost of experiment: $88.2