In [2]:
import json
from prettytable import PrettyTable
import json
import os
import random
import pandas as pd

# Read the jsonl file and convert it to a JSON list
def jsonl_to_json_list(jsonl_file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        jsonl_file_path: Path to a UTF-8 encoded file holding one JSON
            document per line.

    Returns:
        list: The parsed objects, in the order they appear in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Blank or whitespace-only lines hold no record; json.loads('')
            # would raise, so skip them instead of aborting the whole load.
            if not line:
                continue
            json_list.append(json.loads(line))

    return json_list

# Save the JSON list to a file
def save_as_json(json_list, output_file_path):
    """Write ``json_list`` to ``output_file_path`` as a single JSON document.

    The file is opened in write mode (overwriting any existing content) with
    UTF-8 encoding, and the document is indented by 4 spaces.
    """
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        serialized = json.dumps(json_list, indent=4)
        outfile.write(serialized)

def save_as_jsonl(json_list, output_file_path):
    """Write each element of ``json_list`` as one JSON line.

    The file at ``output_file_path`` is opened in write mode (overwriting any
    existing content) with UTF-8 encoding; every record is followed by a
    newline, so an empty ``json_list`` produces an empty file.
    """
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        for record in json_list:
            outfile.write(json.dumps(record) + '\n')

In [6]:
import json
import os
import random
import pandas as pd

# Load the PubMedQA test set (a JSON object keyed by record id).
# The `with` block closes the file handle as soon as parsing finishes; a bare
# open() passed straight to json.load would leave it open.
with open(os.path.join('./data/pubmedqa', 'test_set.json'), 'r', encoding='utf-8') as test_set_file:
    pubmedqa_test_set = json.load(test_set_file)
# Flatten {id: record} into a list of records, carrying the key as 'realidx'.
pubmedqa_test_set = [{'realidx': idx, **item} for idx, item in pubmedqa_test_set.items()]
df = pd.DataFrame(pubmedqa_test_set)

# Set the metadata columns aside in df_meta and remove them from the QA frame.
meta_columns = ['reasoning_required_pred', 'reasoning_free_pred', 'YEAR', 'MESHES', 'LABELS']
df_meta = df[meta_columns]
df = df.drop(columns=meta_columns)

# rename columns
df = df.rename(columns={'QUESTION': 'question', 'CONTEXTS': 'context', 'final_decision': 'answer', 'LONG_ANSWER': 'answer_text'})

# Every row gets the same three options; answer_idx is the option letter
# matching the textual answer.
df['options'] = [{'A': 'yes', 'B': 'no', 'C': 'maybe'} for _ in range(len(df))]
df['answer_idx'] = df['answer'].map({'yes': 'A', 'no': 'B', 'maybe': 'C'})
# Each context is a sequence of strings; join them into one newline-separated string.
df['context'] = df['context'].apply(lambda x: '\n'.join(x))

# save df as jsonl
pubmedqa_test_set = df.to_dict(orient='records')
save_as_jsonl(pubmedqa_test_set, os.path.join('./data/pubmedqa', 'test_set.jsonl'))

# Fixed random_state keeps the 50-row sample reproducible across runs.
sampled_50_pubmedqa = df.sample(50, random_state=42).to_dict(orient='records')
save_as_jsonl(sampled_50_pubmedqa, os.path.join('./data/pubmedqa', 'sampled_50.jsonl'))

df.head(10)

Unnamed: 0,realidx,question,context,answer,answer_text,options,answer_idx
0,12377809,Is anorectal endosonography valuable in dysche...,Dyschesia can be provoked by inappropriate def...,yes,Linear anorectal endosonography demonstrated i...,"{'A': 'yes', 'B': 'no', 'C': 'maybe'}",A
1,26163474,Is there a connection between sublingual varic...,Sublingual varices have earlier been related t...,yes,An association was found between sublingual va...,"{'A': 'yes', 'B': 'no', 'C': 'maybe'}",A
2,19100463,Is the affinity column-mediated immunoassay me...,Tacrolimus is a potent immunosuppressive drug ...,yes,The ACMIA method used for a tacrolimus assay i...,"{'A': 'yes', 'B': 'no', 'C': 'maybe'}",A
3,18537964,Does a physician's specialty influence the rec...,To determine the impact of a physician's speci...,yes,Physicians appear to document more frequently ...,"{'A': 'yes', 'B': 'no', 'C': 'maybe'}",A
4,12913878,Locoregional opening of the rodent blood-brain...,Nd:YAG laser-induced thermo therapy (LITT) of ...,yes,LITT induces a locoregional passage of chemoth...,"{'A': 'yes', 'B': 'no', 'C': 'maybe'}",A
5,12765819,Spinal subdural hematoma: a sequela of a ruptu...,A case of spinal subdural hematoma (SSDH) foll...,yes,Although the exact mechanism of SSDH in this c...,"{'A': 'yes', 'B': 'no', 'C': 'maybe'}",A
6,25475395,Is there a correlation between androgens and s...,"For women, the correlation between circulating...",yes,"In the present study, FT and androstenedione w...","{'A': 'yes', 'B': 'no', 'C': 'maybe'}",A
7,19130332,Is the zeolite hemostatic agent beneficial in ...,Uncontrolled hemorrhage is the leading cause o...,yes,"According to the physiological parameters, we ...","{'A': 'yes', 'B': 'no', 'C': 'maybe'}",A
8,9427037,Are endothelial cell patterns of astrocytomas ...,The most common primary brain tumors in childr...,yes,Evaluation of astrocytomas utilizing antibody ...,"{'A': 'yes', 'B': 'no', 'C': 'maybe'}",A
9,24481006,Should cavitation in proximal surfaces be repo...,79 adjacent proximal surfaces without restorat...,yes,CBCT was more accurate in detecting cavitation...,"{'A': 'yes', 'B': 'no', 'C': 'maybe'}",A


In [6]:
def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return the decoded object."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def deduplicate_data(data):
    """Return the items of ``data`` with repeated ``item['id']`` values removed.

    The first item seen for each id is kept and the original order is
    preserved. A new list is returned; ``data`` itself is not modified.
    """
    first_by_id = {}
    for item in data:
        # setdefault only stores the item when its id has not been seen yet.
        first_by_id.setdefault(item['id'], item)
    return list(first_by_id.values())

def calculate_accuracy(data):
    """Return the fraction of items whose 'gold_answer' equals 'pred_answer'.

    Returns 0 when ``data`` is empty, so there is no division by zero.
    """
    total_predictions = len(data)
    if not total_predictions > 0:
        return 0
    hits = sum(1 for item in data if item['gold_answer'] == item['pred_answer'])
    return hits / total_predictions

# Evaluation grid: each task maps to the subtasks that were run for it.
tasks = {
    'medqa': ['sampled_50', 'sampled_50_hard', 'sampled_50_5options'],
    'pubmedqa': ['sampled_50'],
}
models = ['gpt-4o-mini', 'gpt-4o']
methods = ['syn_verif']

table = PrettyTable()
table.field_names = ["Model", "Task", "Subtask", "Method", "Accuracy", "Total Number"]

# Enumerate every run, model-major: model, then task, then subtask, then method.
runs = [
    (model, task, subtask, method)
    for model in models
    for task, subtasks in tasks.items()
    for subtask in subtasks
    for method in methods
]

for model, task, subtask, method in runs:
    file_path = f'./output/{task}/{model}-{task}-{subtask}-{method}.json'
    data = load_json(file_path)
    # Score each id only once; "Total Number" is the count after deduplication.
    deduplicated_data = deduplicate_data(data)
    accuracy = calculate_accuracy(deduplicated_data)
    total = len(deduplicated_data)
    table.add_row([model, task, subtask, method, f"{accuracy * 100:.1f}%", total])

print(table)

+-------------+----------+---------------------+-----------+----------+--------------+
|    Model    |   Task   |       Subtask       |   Method  | Accuracy | Total Number |
+-------------+----------+---------------------+-----------+----------+--------------+
| gpt-4o-mini |  medqa   |      sampled_50     | syn_verif |  78.0%   |      50      |
| gpt-4o-mini |  medqa   |   sampled_50_hard   | syn_verif |  50.0%   |      50      |
| gpt-4o-mini |  medqa   | sampled_50_5options | syn_verif |  80.0%   |      50      |
| gpt-4o-mini | pubmedqa |      sampled_50     | syn_verif |  66.0%   |      50      |
|    gpt-4o   |  medqa   |      sampled_50     | syn_verif |  90.0%   |      50      |
|    gpt-4o   |  medqa   |   sampled_50_hard   | syn_verif |  78.0%   |      50      |
|    gpt-4o   |  medqa   | sampled_50_5options | syn_verif |  88.0%   |      50      |
|    gpt-4o   | pubmedqa |      sampled_50     | syn_verif |  65.4%   |      26      |
+-------------+----------+-----------------