In [1]:
import os
import sys

# Make the parent of the current working directory importable. Presumably this
# is what lets the `medbench` imports in the next cell resolve when the notebook
# is run from its own folder.
# NOTE(review): this depends on the kernel's working directory — confirm.
# The membership check keeps a re-run of this cell from appending the same
# entry to sys.path again.
_PARENT_DIR = os.path.join(os.getcwd(), "../")
if _PARENT_DIR not in sys.path:
    sys.path.append(_PARENT_DIR)


In [5]:
import json
from datetime import datetime, timezone

import pandas as pd

from medbench.datasets import (
    CORRECT_TAG,
    Data,
    Dataset,
    Instance,
    MediaObject,
    Reference,
    EMediaObjectType,
)
from medbench.models import Model, ModelOutput, ModelRun

In [29]:
# Folder holding the dataset export and the serialized model runs.
DATA_FOLDER_PATH = "../data/arena/segmed/segmed"

# Dataset definition; it is attached to every model run before parsing.
DATA_FILE_PATH = os.path.join(DATA_FOLDER_PATH, "data.json")

# Serialized model runs, one JSON file per model.
CXRREPORTGEN_FILE_PATH = os.path.join(DATA_FOLDER_PATH, "cxrreportgen.json")
GPT_4O_FILE_PATH = os.path.join(DATA_FOLDER_PATH, "gpt-4o.json")

# Folder that receives the merged `<clinical task>.jsonl` arena file.
MEDBENCH_ARENA_DATASET_TARGET_PATH = "../data/arena/segmed/"

In [30]:
# Load the dataset definition. `dataset_data` keeps the raw JSON (it is attached
# to each model run further down); `dataset` is the parsed object.
# The file is decoded as UTF-8 explicitly rather than with the platform's
# default encoding, which is not UTF-8 on every system.
with open(DATA_FILE_PATH, "r", encoding="utf-8") as f:
    dataset_data = json.load(f)

# Parsing happens after the file handle is closed; it only needs the loaded JSON.
dataset = Dataset.from_json(dataset_data)


In [31]:
def _load_model_run(file_path: str, dataset_json: dict) -> tuple[dict, ModelRun]:
    """Read one serialized model run and parse it together with the dataset.

    Args:
        file_path: Path of the JSON file holding the model run.
        dataset_json: Raw dataset JSON stored under the run's ``"dataset"`` key
            (overwriting any existing value) before parsing. The object itself
            is attached, not a copy.

    Returns:
        A ``(run_json, model_run)`` pair: the raw run JSON with ``"dataset"``
        set, and the ``ModelRun`` built from it.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        run_json = json.load(f)

    run_json["dataset"] = dataset_json
    return run_json, ModelRun.from_json(run_json)


# Both runs are loaded the same way and share the one `dataset_data` payload.
cxrreportgen_data, cxrreportgen_model_run = _load_model_run(CXRREPORTGEN_FILE_PATH, dataset_data)
gpt4o_data, gpt4o_model_run = _load_model_run(GPT_4O_FILE_PATH, dataset_data)

In [32]:
# Model runs to export. Within a clinical task, the export below emits the model
# columns in this order, and the first run supplies the `input` column.
model_runs = [cxrreportgen_model_run, gpt4o_model_run]

In [33]:
# Collect every model's outputs as
#   {clinical task (dataset name): {model name: [record, ...]}}
# where each record holds the instance id, the JSON form of the instance's first
# image input, and the text of the model's completion for that instance.
clinical_tasks_data: dict[str, dict[str, list[dict]]] = {}
for model_run in model_runs:
    clinical_task = model_run.dataset.name

    arena_dataset_data: list[dict] = []
    # NOTE(review): instances and results are paired purely by position and zip()
    # stops at the shorter sequence — confirm results are always aligned
    # one-to-one with the dataset instances.
    # The loop variable is deliberately not called `input`: at notebook top level
    # that would rebind the builtin `input()` for the rest of the kernel session.
    for instance, output in zip(model_run.dataset.instances, model_run.results):
        # Only the first image of the instance input is exported.
        image = next(
            (
                content
                for content in instance.input.content
                if content.type == EMediaObjectType.IMAGE
            ),
            None,
        )
        if image is None:
            raise ValueError(
                f"Instance {instance.id!r} of {clinical_task!r} has no image input"
            )

        arena_dataset_data.append(
            {
                "id": instance.id,
                "input": image.to_json(),
                "output": output.completions.get_text(),
            }
        )

    # Create the per-task mapping on first use, then file the records by model.
    clinical_tasks_data.setdefault(clinical_task, {})[model_run.model.name] = arena_dataset_data

# Write one JSONL file per clinical task. Each line is one dataset instance with
# its `input` image plus one column per model holding that model's output text.
print("Saving dataset data")
for clinical_task, records_by_model in clinical_tasks_data.items():
    print(f"\tSaving {clinical_task} dataset data")
    merged_df = None
    for model_name, records in records_by_model.items():
        # Rename the generic "output" column after the model so that every model
        # ends up with its own column in the merged frame.
        cur_df = pd.DataFrame(records).rename(columns={"output": model_name})

        if merged_df is None:
            # The first model seeds the frame, including the `input` column.
            merged_df = cur_df
            continue

        print("\t\tMerging with", model_name, cur_df.columns)
        # `input` is dropped from the right-hand side so the join does not
        # produce a second, suffixed copy of the column.
        merged_df = pd.merge(
            merged_df,
            cur_df.drop(columns=["input"]),
            how="outer",
            on=["id"],
        )

        # The outer join keeps ids that only this model produced, but those rows
        # have no `input` because the column comes from the earlier frames only.
        # Backfill them from this model's own records.
        has_input = merged_df["input"].notna()
        if not has_input.all():
            input_by_id = dict(zip(cur_df["id"], cur_df["input"]))
            merged_df["input"] = merged_df["input"].where(
                has_input, merged_df["id"].map(input_by_id.get)
            )

    # `id` is only the join key; it is not part of the exported records.
    merged_df = merged_df.drop(columns=["id"])
    merged_df.to_json(
        os.path.join(MEDBENCH_ARENA_DATASET_TARGET_PATH, f"{clinical_task}.jsonl"),
        orient="records",
        lines=True,
    )

Saving dataset data
	Saving segmed dataset data
		Merging with gpt-4o Index(['id', 'input', 'gpt-4o'], dtype='object')
