In [3]:
import json
from prettytable import PrettyTable
import json
import os
import random
import pandas as pd

# Read the jsonl file and convert it to a JSON list
def jsonl_to_json_list(jsonl_file_path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is parsed as one JSON document.
    Blank or whitespace-only lines (e.g. a trailing empty line) are
    skipped instead of being handed to ``json.loads``, which would
    reject an empty string.

    Args:
        jsonl_file_path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # An empty line carries no record; parsing it would raise.
                continue
            json_list.append(json.loads(stripped))

    return json_list

# Save the JSON list to a file
def save_as_json(json_list, output_file_path):
    """Write ``json_list`` to ``output_file_path`` as a single JSON document.

    The file is opened in write mode with UTF-8 encoding and the data is
    serialised with a 4-space indent.
    """
    with open(output_file_path, 'w', encoding='utf-8') as sink:
        # Serialise after opening so the target is truncated first.
        sink.write(json.dumps(json_list, indent=4))

def save_as_jsonl(json_list, output_file_path):
    """Write each element of ``json_list`` as one JSON line.

    The file is opened in write mode with UTF-8 encoding; every record is
    serialised compactly on its own line and terminated by a newline.
    """
    with open(output_file_path, 'w', encoding='utf-8') as sink:
        for record in json_list:
            sink.write(json.dumps(record))
            sink.write('\n')

In [4]:
def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return the result."""
    with open(file_path, 'r', encoding='utf-8') as source:
        return json.load(source)

def deduplicate_data(data):
    """Return the items of ``data`` with duplicate ``'realidx'`` values removed.

    Only the first item seen for each ``'realidx'`` is kept, and the
    original order of those first occurrences is preserved.
    """
    first_by_idx = {}
    for entry in data:
        # setdefault stores the entry only if this realidx is new, and
        # dicts keep insertion order, so first occurrences win in order.
        first_by_idx.setdefault(entry['realidx'], entry)
    return list(first_by_idx.values())

def calculate_accuracy(data):
    """Return the fraction of items whose prediction matches the answer.

    An item counts as correct when its ``'predicted_answer'`` equals its
    ``'answer_idx'``. Returns 0 when ``data`` is empty.
    """
    sample_count = len(data)
    if not sample_count > 0:
        return 0
    hits = sum(
        1 for entry in data
        if entry['predicted_answer'] == entry['answer_idx']
    )
    return hits / sample_count

def calculate_cost_from_token_usage(data, model):
    """Return the average per-item cost computed from recorded token usage.

    Each item's cost is ``prompt_tokens * prompt_rate / 1e6 +
    completion_tokens * completion_rate / 1e6``, using the rates
    hard-coded below for ``model``; the per-item costs are summed and
    divided by the number of items.

    Args:
        data: Sequence of dicts; for priced models each must hold
            ``item['token_usage']['prompt_tokens']`` and
            ``item['token_usage']['completion_tokens']``.
        model: Model name used to select the rates.

    Returns:
        The mean cost per item. Returns 0.0 when ``data`` is empty
        (instead of dividing by zero) and when ``model`` has no entry in
        the rate table, in which case token usage is not read at all.
    """
    # (prompt rate, completion rate) applied per 1,000,000 tokens.
    rates_per_million = {
        'gpt-4o-mini': (0.15, 0.6),
        'gpt-4o': (2.5, 10),
    }

    if len(data) == 0:
        # Nothing to average; mirrors calculate_accuracy's empty handling.
        return 0.0
    if model not in rates_per_million:
        # Unpriced models contribute no cost.
        return 0.0

    prompt_rate, completion_rate = rates_per_million[model]
    total_cost = 0
    for item in data:
        usage = item['token_usage']
        total_cost += (
            usage['prompt_tokens'] * prompt_rate / 1000000
            + usage['completion_tokens'] * completion_rate / 1000000
        )
    return total_cost / len(data)

def calculate_time_from_data(data):
    """Return the mean of ``item['time_elapsed']`` over ``data``.

    Args:
        data: Sequence of dicts, each with a numeric ``'time_elapsed'``.

    Returns:
        The average elapsed time per item, or 0 when ``data`` is empty
        (instead of raising ``ZeroDivisionError``).
    """
    if len(data) == 0:
        # Nothing to average; mirrors calculate_accuracy's empty handling.
        return 0
    total_time = 0
    for item in data:
        total_time += item['time_elapsed']
    return total_time / len(data)

# Benchmark name -> list of subtask splits to report on.
tasks = {
    'medqa': ['test_hard'],
    'pubmedqa': ['test_hard'],
    'medmcqa': ['test_hard'],
    'medbullets': ['test_hard'],
    'mmlu': ['test_hard'],
    'mmlu-pro': ['test_hard'],
    'afrimedqa': ['test_hard']
}
models = ['gpt-4o-mini', 'gpt-4o', 'deepseek-V3']
difficulties = ['adaptive']

# The results are emitted twice: as markdown rows printed while looping,
# and as a PrettyTable printed at the end.
# NOTE(review): the markdown header labels its fourth column "Method"
# (rows print the literal "MDAgents"), while the PrettyTable labels it
# "Difficulty" and stores the difficulty value - confirm this is intended.
print("| Model | Task | Subtask | Method | Accuracy | Cost per sample(USD) | Time per sample(s) | Total Number |")
print("|-------|------|---------|---------|----------|---------------------|-------------------|--------------|")
table = PrettyTable()
table.field_names = ["Model", "Task", "Subtask", "Difficulty", "Accuracy", "Cost per sample(USD)", "Time per sample(s)", "Total Number"]

# Accumulated cost over all successfully processed result files.
total_cost = 0

for task, subtasks in tasks.items():
    for model in models:
        for subtask in subtasks:
            for difficulty in difficulties:
                # Built outside the try so the except handler can always
                # reference it.
                file_path = f'./output/{task}/{model}_{task}_{subtask}_{difficulty}.json'
                try:
                    data = load_json(file_path)
                    # Re-save an unmodified copy under the shared output
                    # tree, renaming "deepseek-V3" to "DeepSeek-V3".
                    output_path = f'../../output/{task}/{model if model != "deepseek-V3" else "DeepSeek-V3"}-{task}-{subtask}-mdagents.json'
                    save_as_json(data, output_path)

                    # Metrics are computed on one record per 'realidx'.
                    deduplicated_data = deduplicate_data(data)
                    accuracy = calculate_accuracy(deduplicated_data)
                    total = len(deduplicated_data)
                    cost_per_sample = calculate_cost_from_token_usage(deduplicated_data, model)
                    # Computed once and reused for both output formats.
                    time_per_sample = calculate_time_from_data(deduplicated_data)
                    # cost_per_sample is a mean, so scale back to a total.
                    total_cost += cost_per_sample * total

                    print(f"| {model} | {task} | {subtask} | MDAgents | {accuracy * 100:.1f}% | {cost_per_sample} | {time_per_sample} | {total} |")
                    table.add_row([
                        model, task, subtask, difficulty,
                        f"{accuracy * 100:.1f}%",
                        cost_per_sample,
                        time_per_sample,
                        total
                    ])
                except Exception as e:
                    # Best-effort report: any failure for this file (loading,
                    # re-saving, or computing metrics) skips its row.
                    print(f"Error loading file {file_path}: {e}")

print(table)
print(f"\nTotal cost of experiment: ${total_cost:.2f}")

| Model | Task | Subtask | Method | Accuracy | Cost per sample(USD) | Time per sample(s) | Total Number |
|-------|------|---------|---------|----------|---------------------|-------------------|--------------|
| gpt-4o-mini | medqa | test_hard | MDAgents | 22.0% | 0.012128373000000001 | 79.52511752843857 | 100 |
| gpt-4o | medqa | test_hard | MDAgents | 36.0% | 0.11012845 | 60.32874982357025 | 100 |
| deepseek-V3 | medqa | test_hard | MDAgents | 44.0% | 0.0 | 162.24405320167543 | 100 |
| gpt-4o-mini | pubmedqa | test_hard | MDAgents | 23.0% | 0.010930074000000001 | 107.77486172676086 | 100 |
| gpt-4o | pubmedqa | test_hard | MDAgents | 11.0% | 0.06935595 | 108.19114663362502 | 100 |
| deepseek-V3 | pubmedqa | test_hard | MDAgents | 15.0% | 0.0 | 141.52637087345124 | 100 |
| gpt-4o-mini | medmcqa | test_hard | MDAgents | 16.0% | 0.0117990975 | 44.20571577072143 | 100 |
| gpt-4o | medmcqa | test_hard | MDAgents | 22.0% | 0.025111974999999998 | 18.03709844827652 | 100 |
| deepseek-V3 | m