In [1]:
import json
from prettytable import PrettyTable
import json
import os
import random
import pandas as pd

def jsonl_to_json_list(jsonl_file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        jsonl_file_path: Path to a UTF-8 encoded text file holding one JSON
            document per line.

    Returns:
        list: The decoded objects, in the order they appear in the file.
        Lines that are empty after stripping whitespace are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # A blank line (for example a trailing newline at the end of the
            # file) carries no record, and json.loads('') would raise on it.
            if not stripped:
                continue
            json_list.append(json.loads(stripped))
    return json_list

def save_as_json(json_list, output_file_path):
    """Serialize ``json_list`` into ``output_file_path`` as one JSON document.

    The target file is opened for writing with UTF-8 encoding (replacing any
    previous content) and the document is pretty-printed with a four-space
    indent.
    """
    with open(output_file_path, 'w', encoding='utf-8') as sink:
        sink.write(json.dumps(json_list, indent=4))

def save_as_jsonl(json_list, output_file_path):
    """Write each element of ``json_list`` as its own line of JSON.

    The target file is opened for writing with UTF-8 encoding (replacing any
    previous content); every record is followed by a newline character.
    """
    with open(output_file_path, 'w', encoding='utf-8') as sink:
        sink.writelines(f"{json.dumps(record)}\n" for record in json_list)

def load_json(file_path):
    """Return the object decoded from the UTF-8 JSON file at ``file_path``."""
    with open(file_path, 'r', encoding='utf-8') as source:
        return json.loads(source.read())

def load_jsonl(file_path):
    """Load every record from a JSON Lines file.

    Args:
        file_path: Path to a UTF-8 encoded text file holding one JSON document
            per line.

    Returns:
        list: The decoded objects in file order. Lines that are empty after
        stripping whitespace are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (such as a trailing newline) hold no record and would
        # make json.loads raise, so they are filtered out before decoding.
        return [json.loads(line) for line in stripped_lines if line]

def deduplicate_data(data):
    """Keep only the first item seen for each distinct ``'realidx'`` value.

    Items are returned in the order in which their ``'realidx'`` first
    appeared in ``data``; later items sharing that value are dropped.
    """
    first_by_idx = {}
    for record in data:
        # setdefault leaves an existing entry untouched, so the earliest
        # record for a given key is the one that survives.
        first_by_idx.setdefault(record['realidx'], record)
    return list(first_by_idx.values())

# Price per one million tokens as (prompt_rate, completion_rate), keyed by the
# model names this function recognizes. The evaluation cell prints the result
# with a "$" prefix, so the rates are presumably in US dollars.
_PRICE_PER_MILLION_TOKENS = {
    'gpt-4o-mini': (0.15, 0.6),
    'gpt-4o': (2.5, 10),
    'o3-mini': (1.1, 4.4),
    'o1-mini': (1.1, 4.4),
    'claude-3-5-sonnet': (3.0, 15.0),
    'claude-3-5-haiku': (0.8, 4.0),
    'qwq': (1.2, 1.2),
    'qwen2.5': (0.3, 0.3),
    'r1': (2.5, 7),
    'v3': (1.25, 1.25),
    'llama3.3': (0.88, 0.88),
}


def _aggregate_token_usage(item):
    """Return ``item``'s ``'all'`` usage dict, accepting either key spelling."""
    # Records have been addressed both as 'token_usage' and 'tokens_usage';
    # accept whichever one is present instead of tying the spelling to a model.
    for key in ('token_usage', 'tokens_usage'):
        if key in item:
            return item[key]['all']
    raise KeyError('token_usage')


def calculate_cost_from_token_usage(data, model):
    """Compute the average per-item cost of a run from its token counts.

    Args:
        data: Sequence of result records. Each record must hold a
            ``'token_usage'`` (or ``'tokens_usage'``) mapping whose ``'all'``
            entry provides ``'prompt_tokens'`` and ``'completion_tokens'``.
        model: Model name used to select the price table entry, e.g.
            ``'gpt-4o-mini'``.

    Returns:
        float: Total cost divided by ``len(data)``. Returns ``0.0`` when
        ``data`` is empty, and ``0.0`` (with a ``UserWarning``) when ``model``
        has no known price.

    Raises:
        KeyError: If a record lacks the usage mapping or one of its fields.
    """
    import warnings

    rates = _PRICE_PER_MILLION_TOKENS.get(model)
    if rates is None:
        # Previously an unknown model fell through every branch and silently
        # produced a cost of 0; keep the value but make the problem visible.
        warnings.warn(
            f"No pricing known for model {model!r}; reporting a cost of 0.",
            stacklevel=2,
        )
        return 0.0

    if len(data) == 0:
        # An average over zero items is undefined; report zero cost instead of
        # raising ZeroDivisionError.
        return 0.0

    prompt_rate, completion_rate = rates
    total_cost = 0
    for item in data:
        usage = _aggregate_token_usage(item)
        total_cost += (
            usage['prompt_tokens'] * prompt_rate / 1000000
            + usage['completion_tokens'] * completion_rate / 1000000
        )
    return total_cost / len(data)

def calculate_time_from_data(data):
    """Return the mean of the ``'time_taken'`` field across ``data``.

    Args:
        data: Sequence of result records, each holding a numeric
            ``'time_taken'`` value.

    Returns:
        The sum of all ``'time_taken'`` values divided by ``len(data)``, or
        ``0.0`` when ``data`` is empty.

    Raises:
        KeyError: If a record has no ``'time_taken'`` entry.
    """
    if len(data) == 0:
        # An average over zero items is undefined; report 0.0 instead of
        # raising ZeroDivisionError.
        return 0.0
    return sum(item['time_taken'] for item in data) / len(data)

In [14]:
# Load one results file, drop repeated items, and report the average cost,
# average time and accuracy of the run.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable base directory so the notebook runs elsewhere.
file_path = "/home/ubuntu/MedAgents-2/output/medxpertqa-r/20250404/gpt-4o-mini-medxpertqa-r-test_hard-retrieve-20-rerank-32-rewrite-True-review-False-adaptive_rag-auto-similarity_strategy-reuse-agent_memory-True.json"
# Reuse the notebook's own loader rather than repeating open()/json.load().
data = load_json(file_path)

deduplicated_data = deduplicate_data(data)
if not deduplicated_data:
    # Every metric below is a per-item average, so an empty run cannot be
    # scored; fail with a clear message instead of a ZeroDivisionError.
    raise ValueError(f"No items to evaluate in {file_path}")

avg_cost = calculate_cost_from_token_usage(deduplicated_data, 'gpt-4o-mini')
avg_time = calculate_time_from_data(deduplicated_data)
print(f"After deduplication: {len(deduplicated_data)} items")
print(f"Average cost per item: ${avg_cost:.6f}")
print(f"Average time taken per item: {avg_time:.2f} seconds")

# An item counts as correct when the 'answer' of its last entry in
# 'answer_by_turns' equals its 'answer_idx'.
correct_count = 0
for item in deduplicated_data:
    if item['answer_idx'] == item['answer_by_turns'][-1]['answer']:
        correct_count += 1

print(f"Accuracy: {correct_count / len(deduplicated_data) * 100:.2f}%")

After deduplication: 100 items
Average cost per item: $0.043552
Average time taken per item: 159.41 seconds
Accuracy: 10.00%
