# LLM as a judge over MedHelm datasets

Current learnings:
- Limitations on TPM (tokens-per-minute rate limits)
- Limitations on output length


## Dependencies

In [1]:
import os
import sys

# Put the parent of the current working directory on the import path.
# Presumably this is what lets the `medbench` imports below resolve when the
# notebook is run from its own subdirectory — confirm against the repo layout.
sys.path.append(os.path.join(os.getcwd(), ".."))


In [2]:
import logging

# Configure root logging at DEBUG, the most verbose standard level.
logging.basicConfig(level=logging.DEBUG)

In [3]:
%load_ext autoreload
%autoreload 2

import json

from dotenv import load_dotenv

from medbench.config import settings
from medbench.datasets import Dataset
from medbench.evaluators import MultimodalEvaluatorRunner, SummaryEvaluatorRunner
from medbench.models import ModelOutput, ModelRun, SystemPromptModel, Runner
from medbench.models.azureoai import OpenAIChatModel, OpenAIReasoningModel
from medbench.models.cxrreportgen import CXRReportGenModel
from medbench.utils import load_arena_data

# Load environment variables from the `.env` file one directory above the
# current working directory.
# NOTE(review): this runs after `medbench.config.settings` has been imported
# above; if `settings` reads the environment at import time, values from this
# file may arrive too late — confirm.
load_dotenv("../.env")


INFO:root:Environment variable BABELBENCH_AML_WORKSPACE_NAME not found.


True

## Load data

In [4]:
# Root of the local data directory, relative to the current working directory.
DATA_PATH = "../data/"
# Everything MedHELM-related lives under `metrics/medhelm` inside the data root.
MEDHELM_DATA_PATH = os.path.join(DATA_PATH, "metrics/medhelm")
# Arena dataset and metric folders (not read by the visible cells of this notebook).
MEDHELM_DATASETS_PATH = os.path.join(MEDHELM_DATA_PATH, "arena/datasets/")
MEDHELM_METRICS_PATH = os.path.join(MEDHELM_DATA_PATH, "arena/metrics/")
# Folder of per-run JSON files that the next cell scans for model runs.
MEDHELM_MODEL_RUNS_PATH = os.path.join(MEDHELM_DATA_PATH, "func_output/")

In [5]:
# Group the stored model runs by dataset name. Only runs whose aggregated
# metrics contain a "bert_score" entry are kept; all others are skipped.
freetext_model_runs: dict[str, list[ModelRun]] = {}
for run_filename in os.listdir(MEDHELM_MODEL_RUNS_PATH):
    if not run_filename.endswith(".json"):
        continue

    run_path = os.path.join(MEDHELM_MODEL_RUNS_PATH, run_filename)
    with open(run_path, "r") as run_file:
        payload = json.load(run_file)

    aggregated_metrics = payload["metrics_results"]["aggregated_metrics"]
    if "bert_score" not in aggregated_metrics:
        continue

    parsed_run = ModelRun.from_json(payload["original_run"]["model_run"])
    freetext_model_runs.setdefault(parsed_run.dataset.name, []).append(parsed_run)

# Show which datasets ended up with at least one run.
freetext_model_runs.keys()

dict_keys(['aci_bench', 'mtsamples', 'medication_qa', 'mtsamples_replicate'])

## Evaluation

In [None]:
# Destination folder for the evaluation outputs written by this notebook;
# created up front so later cells can write into it.
EVALUATION_OUTPUT_PATH = os.path.join(MEDHELM_DATA_PATH, "medbench_evaluation/")
os.makedirs(EVALUATION_OUTPUT_PATH, exist_ok=True)

In [None]:
def _filesystem_safe(name: str) -> str:
    """Return *name* with path separators replaced so it can be used in a file name."""
    return name.replace("/", "-")


def _dump_model_run(run, output_path: str, filename: str) -> None:
    """Serialize ``run.to_json()`` as indented JSON to ``output_path/filename``."""
    with open(os.path.join(output_path, filename), "w") as f:
        json.dump(run.to_json(), f, indent=2)


async def batch_summarization_eval(
    model_runs: list[ModelRun],
    llm_evaluator: SystemPromptModel,
    output_path: str,
    questions_generator_runner: Runner = None,
):
    """Evaluate summarization model runs one after another and persist the results.

    For every model run a ``SummaryEvaluatorRunner`` is built (with
    ``skip_errors=True``) and awaited. The evaluator's answerer and evaluator
    model runs are then written to ``output_path`` as
    ``<model>-answers.json`` and ``<model>-summ.json``.

    Args:
        model_runs (list[ModelRun]): List of model runs to evaluate.
            Note that all model runs are expected to be from the **same** dataset.
        llm_evaluator (SystemPromptModel): LLM evaluator to use for evaluation.
        output_path (str): Existing directory that receives the JSON outputs.
        questions_generator_runner (Runner, optional): Questions generator to
            reuse for every evaluation. When omitted, the one created by the
            first evaluator is reused for the remaining runs and its model run
            is saved as ``<dataset>-questions.json``.

    Returns:
        dict: The evaluators that were run, keyed by ``model_run.model.name``
        (unsanitized). Empty when ``model_runs`` is empty.
    """
    summarization_evaluators = {}
    for model_run in model_runs:
        # Only forward the questions runner when there is one, so the
        # evaluator falls back to its own default otherwise.
        kwargs = {}
        if questions_generator_runner is not None:
            kwargs["questions_generator_runner"] = questions_generator_runner

        evaluator = SummaryEvaluatorRunner(
            predictions_model_run=model_run,
            evaluator=llm_evaluator,
            skip_errors=True,
            **kwargs,
        )

        summarization_evaluators[model_run.model.name] = evaluator

        await evaluator.evaluate()

        if questions_generator_runner is None:
            # First evaluation without a supplied runner: keep its questions
            # generator for the remaining runs and persist the questions once.
            questions_generator_runner = evaluator.questions_generator_runner

            # Sanitize the dataset name the same way as model names below, so a
            # "/" in the name cannot point the write into a missing subfolder.
            dataset_name = _filesystem_safe(model_run.dataset.name)
            _dump_model_run(
                evaluator.questions_generator_runner._model_run,
                output_path,
                f"{dataset_name}-questions.json",
            )

        model_name = _filesystem_safe(model_run.model.name)
        _dump_model_run(
            evaluator.answerer_runner._model_run, output_path, f"{model_name}-answers.json"
        )
        _dump_model_run(
            evaluator.evaluator_runner._model_run, output_path, f"{model_name}-summ.json"
        )

    return summarization_evaluators


In [None]:
# Judge model handed to the evaluators in this notebook; deployment name,
# version, endpoint and key all come from `settings`.
llm_text_evaluator = OpenAIReasoningModel(
    name=settings.azure_openai_o3_deployment,
    version=settings.azure_openai_o3_version,
    endpoint=settings.azure_openai_o3_endpoint,
    api_key=settings.azure_openai_o3_api_key,
    vision_enabled=False,
    # Prompts are defined by the evaluator runner.
    system_prompt="",
    # Values from AI foundry playground
    max_tokens=40000,
    stop=None,
    stream=False,
)

# Echo the configured deployment name and version as the cell output.
llm_text_evaluator.name, llm_text_evaluator.version

### ACI Bench

In [None]:
# Folder for all ACI Bench evaluation outputs; created if it does not exist yet.
ACI_BENCH_OUTPUT_PATH = os.path.join(EVALUATION_OUTPUT_PATH, "aci_bench/")
os.makedirs(ACI_BENCH_OUTPUT_PATH, exist_ok=True)

In [None]:
# Model runs loaded for the "aci_bench" dataset; the evaluator cells below iterate over them.
model_runs = freetext_model_runs["aci_bench"]

#### Summary evaluator

In [None]:
# Folder for the summary-evaluator outputs of ACI Bench; created if it does not exist yet.
ACI_BENCH_SUMM_OUTPUT_PATH = os.path.join(ACI_BENCH_OUTPUT_PATH, "summ/")
os.makedirs(ACI_BENCH_SUMM_OUTPUT_PATH, exist_ok=True)

In [None]:
# Reuse previously generated questions when a cached file exists; otherwise
# leave `questions_model_run` as None so the evaluation generates them.
questions_model_run = None

questions_cache_path = os.path.join(ACI_BENCH_SUMM_OUTPUT_PATH, "aci_bench-questions.json")
try:
    # Open read-only: a write mode such as "w+" truncates the file on open,
    # which wipes the cache and leaves nothing for `json.load` to parse.
    with open(questions_cache_path, "r") as f:
        questions_model_run = ModelRun.from_json(json.load(f))
except FileNotFoundError:
    # No cache yet (e.g. first run): nothing to reuse.
    logging.info("No cached questions found at %s.", questions_cache_path)
except Exception:
    # Best effort: an unreadable or stale cache must not block the evaluation,
    # but record why it was ignored instead of failing silently.
    logging.warning(
        "Ignoring unreadable questions cache at %s.", questions_cache_path, exc_info=True
    )
    questions_model_run = None

# NOTE(review): this is a ModelRun, yet it is later passed as
# `questions_generator_runner`, which `batch_summarization_eval` annotates as
# a Runner — confirm that SummaryEvaluatorRunner accepts a ModelRun here.
questions_model_run


In [None]:
# Run the summary evaluator over every ACI Bench model run. Per-model JSON
# outputs are written to ACI_BENCH_SUMM_OUTPUT_PATH and the evaluator objects
# are kept, keyed by model name.
aci_bench_evaluators = await batch_summarization_eval(
    model_runs=model_runs,
    llm_evaluator=llm_text_evaluator,
    output_path=ACI_BENCH_SUMM_OUTPUT_PATH,
    questions_generator_runner=questions_model_run,
)

In [None]:
# Read back every persisted summary evaluation (files ending in "summ.json").
# The dictionary is keyed by the file name, not by the bare model name.
summary_model_runs = {}
summ_filenames = [
    entry for entry in os.listdir(ACI_BENCH_SUMM_OUTPUT_PATH) if entry.endswith("summ.json")
]
for output_filename in summ_filenames:
    summ_path = os.path.join(ACI_BENCH_SUMM_OUTPUT_PATH, output_filename)
    with open(summ_path, "r") as summ_file:
        summary_model_runs[output_filename] = ModelRun.from_json(json.load(summ_file))

In [None]:
# Write one plain-text report per summary evaluation run for manual inspection.
# `model_name` is the key of `summary_model_runs`, i.e. the "*summ.json" file
# name the run was loaded from.
for model_name, model_run in summary_model_runs.items():
    # Collect the pieces in a list and join once instead of growing a string
    # with repeated `+=`.
    report_parts = [
        f"Model: {model_name}\n",
        f"Results count: {len(model_run.results)}\n",
    ]
    for res in model_run.results:
        # Results without completions have no text to report.
        if res.completions is None:
            continue
        # NOTE(review): the "First results" label is emitted for every result,
        # not only the first — kept as-is to leave the report format unchanged.
        report_parts.append(f"First results: {res.completions.get_text()}\n")
        report_parts.append("\n____________________________________________________________\n")

    with open(os.path.join(ACI_BENCH_SUMM_OUTPUT_PATH, f"aci_bench-{model_name}-results.txt"), "w") as f:
        f.write("".join(report_parts))

#### TBFact evaluator

This section runs the TBFact evaluator on the ACI Bench model runs and saves the results for further analysis and human review.

In [None]:
# Folder for the TBFact outputs of ACI Bench, plus the JSON file used to store
# the reference facts inside it.
ACI_BENCH_TBFACT_OUTPUT_PATH = os.path.join(ACI_BENCH_OUTPUT_PATH, "tbfact/")
ACI_BENCH_TBFACT_REFERENCE_FACTS_PATH = os.path.join(ACI_BENCH_OUTPUT_PATH, "tbfact/reference_facts/reference_facts.json")

# Create the output folder and the parent folder of the reference-facts file.
os.makedirs(ACI_BENCH_TBFACT_OUTPUT_PATH, exist_ok=True)
os.makedirs(os.path.dirname(ACI_BENCH_TBFACT_REFERENCE_FACTS_PATH), exist_ok=True)

In [None]:
from medbench.evaluators.tbfact.runner import TBFactEvaluatorRunner
from medbench.models import SystemPromptModel, Runner

tbfact_evaluators = {}
for model_run in model_runs:
    tbfact_evaluator = TBFactEvaluatorRunner(
        predictions_model_run=model_run,
        evaluator=llm_text_evaluator,
        reference_facts_path=ACI_BENCH_TBFACT_REFERENCE_FACTS_PATH,
    )
    await tbfact_evaluator.evaluate()

    if not os.path.exists(ACI_BENCH_TBFACT_REFERENCE_FACTS_PATH):
        tbfact_evaluator.tbfact.save_reference_facts(ACI_BENCH_TBFACT_REFERENCE_FACTS_PATH)

    tbfact_evaluators[model_run.model.name] = tbfact_evaluator

    model_name = model_run.model.name.replace("/", "-")
    with open(os.path.join(ACI_BENCH_TBFACT_OUTPUT_PATH, f"{model_name}-tbfact.json"), "w+") as f:
        json.dump(tbfact_evaluator.tbfact_evaluation_model_run.to_json(), f, indent=2)
    
    with open(os.path.join(ACI_BENCH_TBFACT_OUTPUT_PATH, f"{model_name}-fact-extraction.json"), "w+") as f:
        json.dump(tbfact_evaluator.fact_extraction_model_run.to_json(), f, indent=2)
    
    with open(os.path.join(ACI_BENCH_TBFACT_OUTPUT_PATH, f"{model_name}-entailment.json"), "w+") as f:
        json.dump(tbfact_evaluator.entailment_model_run.to_json(), f, indent=2)

The TBFact evaluation results are now saved in the output directory for each model. These can be further analyzed or loaded into MedBench Arena for human expert review.