# LLM as an evaluator

Current learnings:
- Limitations on TPM (tokens per minute)
- Limitations on output length


## Dependencies

In [None]:
import os
import sys

# Make the parent of the current working directory importable.
# NOTE(review): presumably this is the repository root that holds the
# `medbench` package imported by this notebook — confirm.
_parent_dir = os.path.join(os.getcwd(), "..")
sys.path.append(_parent_dir)


In [None]:
%load_ext autoreload
%autoreload 2

import json

from dotenv import load_dotenv

from medbench.config import settings
from medbench.datasets import Dataset
from medbench.evaluators import MultimodalEvaluatorRunner, SummaryEvaluatorRunner
from medbench.models import ModelOutput, ModelRun
from medbench.models.azureoai import OpenAIChatModel, OpenAIReasoningModel
from medbench.models.cxrreportgen import CXRReportGenModel
from medbench.utils import load_arena_data

# Load the `.env` file located one directory above the working directory.
# NOTE(review): presumably this is where the Azure OpenAI values exposed
# through `settings` come from — confirm against `medbench.config`.
load_dotenv("../.env")


## Load data

### Segmed

In [None]:
# Locations of the Segmed arena files: the dataset itself plus one
# predictions file per model.
segmed_gpt4o_path = "../data/arena/segmed/segmed/gpt-4o.json"
segmed_cxrreportgen_path = "../data/arena/segmed/segmed/cxrreportgen.json"
segmed_dataset_path = "../data/arena/segmed/segmed/data.json"

# JSON files are read explicitly as UTF-8 so that loading does not depend on
# the platform's default locale encoding. Each `with` block only covers the
# read; the objects are built once the file has been closed.
with open(segmed_dataset_path, "r", encoding="utf-8") as f:
    segmed_dataset_json = json.load(f)

# The description is set here (replacing any value stored in the file)
# before the Dataset is built from the JSON payload.
segmed_dataset_json["description"] = (
    "Image to findings dataset. Given Chest X-ray images, the model predicts the findings."
)
segmed_dataset = Dataset.from_json(segmed_dataset_json)

with open(segmed_gpt4o_path, "r", encoding="utf-8") as f:
    segmed_gpt4o_json = json.load(f)

# GPT-4o predictions over the Segmed dataset.
segmed_gpt4o_model_run = ModelRun(
    id="gpt-4o-sampled_segmed_gpt4",
    model=OpenAIChatModel.from_json(segmed_gpt4o_json["model"]),
    dataset=segmed_dataset,
    results=[ModelOutput.from_json(o) for o in segmed_gpt4o_json["results"]],
)

with open(segmed_cxrreportgen_path, "r", encoding="utf-8") as f:
    segmed_cxrreportgen_json = json.load(f)

# CXRReportGen predictions over the same dataset object.
segmed_cxrreportgen_model_run = ModelRun(
    id="cxrreportgen-sampled_segmed_gpt4",
    model=CXRReportGenModel.from_json(segmed_cxrreportgen_json["model"]),
    dataset=segmed_dataset,
    results=[ModelOutput.from_json(o) for o in segmed_cxrreportgen_json["results"]],
)

# Display the GPT-4o run as the cell output.
segmed_gpt4o_model_run

### MT Samples - Pharmacy

In [None]:
# Locations of the MTSamples pharmacy arena files: GPT-4o predictions and
# the dataset they were produced from.
mtsamples_gpt4o_path = (
    "../../data/medbench/arena/mtsamples-pharmacy/sampled_mtsamples_gpt4/gpt-4o.json"
)
mtsamples_dataset_path = (
    "../../data/medbench/arena/mtsamples-pharmacy/sampled_mtsamples_gpt4/data.json"
)

# JSON files are read explicitly as UTF-8 so that loading does not depend on
# the platform's default locale encoding. Each `with` block only covers the
# read; the objects are built once the file has been closed.
with open(mtsamples_dataset_path, "r", encoding="utf-8") as f:
    mtsamples_dataset_json = json.load(f)

# The description is set here (replacing any value stored in the file)
# before the Dataset is built from the JSON payload.
mtsamples_dataset_json["description"] = (
    "The dataset is a collection of Clinical Notes from MTSamples.com, and the expected output is a summary targeting the pharmacy professionals."
)
mtsamples_dataset = Dataset.from_json(mtsamples_dataset_json)

with open(mtsamples_gpt4o_path, "r", encoding="utf-8") as f:
    mtsamples_gpt4o_json = json.load(f)

# GPT-4o predictions over the MTSamples pharmacy dataset.
mtsamples_gpt4o_model_run = ModelRun(
    id="gpt-4o-sampled_mtsamples_gpt4",
    model=OpenAIChatModel.from_json(mtsamples_gpt4o_json["model"]),
    dataset=mtsamples_dataset,
    results=[ModelOutput.from_json(o) for o in mtsamples_gpt4o_json["results"]],
)

# Display the run as the cell output.
mtsamples_gpt4o_model_run

### MT Samples - Patient summary

Data used in Arena

In [None]:
# Read the JSONL file: every non-blank line is parsed as one JSON record.
# Blank lines (for example a trailing newline at the end of the file) are
# skipped instead of raising a JSONDecodeError, and the file is decoded as
# UTF-8 regardless of the platform's default encoding.
with open(
    "../../data/medbench/arena/mtsamples-pharmacy/formatted_multi-output-dataset.jsonl",
    "r",
    encoding="utf-8",
) as f:
    mtsamples_summary_data = [json.loads(line) for line in f if line.strip()]

# Show the number of records and the keys of the first one.
len(mtsamples_summary_data), mtsamples_summary_data[0].keys()

In [None]:
# Description attached to the dataset that `load_arena_data` builds.
_summary_dataset_description = (
    "The dataset is a collection of Clinical Notes from MTSamples.com, and the objective "
    "of the dataset is summarization of clinical notes into a structured patient summary."
)

# Turn the arena records into model runs. The result is indexed by output
# key below (e.g. "deepseek").
model_runs = load_arena_data(
    data=mtsamples_summary_data,
    data_key="clinical_note",
    data_split="eval",
    dataset_name="mtsamples-summary",
    dataset_description=_summary_dataset_description,
    output_keys=["gpt4o", "gpt4o-mini", "deepseek"],
    max_instances=None,
)

# Show how many instances the "deepseek" run's dataset holds, plus all runs.
len(model_runs["deepseek"].dataset.instances), model_runs

## Eval

In [None]:
# Text-only judge model; its connection details are read from `settings`.
llm_text_evaluator = OpenAIReasoningModel(
    # Connection details.
    name=settings.azure_openai_deployment,
    version=settings.azure_openai_version,
    endpoint=settings.azure_openai_endpoint,
    api_key=settings.azure_openai_api_key,
    # Left empty: prompts are defined by the evaluator runner.
    system_prompt="",
    vision_enabled=False,
    # Generation values taken from the AI foundry playground.
    max_tokens=4000,
    stop=None,
    stream=False,
)

# Show which deployment name and version are in use.
llm_text_evaluator.name, llm_text_evaluator.version

In [None]:
# Vision-enabled judge model.
# NOTE(review): name, version, endpoint and api_key are passed as literal
# strings here, whereas the text evaluator reads its equivalents from
# `settings`. These look like placeholder names of settings attributes rather
# than real values — confirm and replace before running.
llm_vision_evaluator = OpenAIReasoningModel(
    # Connection details (see the review note above).
    name="azure_openai_o1_deployment",
    version="azure_openai_o1_version",
    endpoint="azure_openai_o1_endpoint",
    api_key="azure_openai_o1_api_key",
    # Left empty: prompts are defined by the evaluator runner.
    system_prompt="",
    vision_enabled=True,
    # Generation values taken from the AI foundry playground.
    max_tokens=4000,
    stop=None,
    stream=False,
)

# Show which deployment name and version are in use.
llm_vision_evaluator.name, llm_vision_evaluator.version

### Segmed

In [None]:
# One multimodal evaluator runner per Segmed model run, all judged by the
# vision evaluator and keyed by a short model label.
segmed_evaluators = {}

for _label, _model_run in (
    ("gpt-4o", segmed_gpt4o_model_run),
    ("cxrreportgen", segmed_cxrreportgen_model_run),
):
    evaluator = MultimodalEvaluatorRunner(
        predictions_model_run=_model_run, evaluator=llm_vision_evaluator
    )
    segmed_evaluators[_label] = evaluator

In [None]:
# Run each Segmed evaluation in turn.
# NOTE(review): another cell in this notebook calls
# `await evaluator.evaluate()` on the same runner class; if `evaluate` is a
# coroutine, these calls need `await` as well — confirm.
for _segmed_runner in segmed_evaluators.values():
    _segmed_runner.evaluate()

In [None]:
# Write one JSON file per Segmed evaluator, named after its key.
for eval_key, evaluator in segmed_evaluators.items():
    with open(f"../data/arena/segmed/eval/{eval_key}.json", "w+") as f:
        # The payload is the JSON form of the evaluator runner's model run.
        evaluation_run = evaluator.evaluator_runner._model_run
        json.dump(evaluation_run.to_json(), f, indent=2)

In [None]:
# Inspection: number of content items in the first instance's input of the
# CXRReportGen run's dataset.
len(segmed_cxrreportgen_model_run.dataset.instances[0].input.content)

In [None]:
# Inspection: show the user input that the CXRReportGen evaluator runner
# builds for the first instance of its own model run's dataset.
e = segmed_evaluators["cxrreportgen"]
_runner = e.evaluator_runner
_runner.build_user_input(_runner._model_run.dataset.instances[0])

In [None]:
# Inspection: second content item of the first instance's input. Relies on
# `e` having been assigned by an earlier cell.
e.evaluator_runner._model_run.dataset.instances[0].input.content[1]

### Arena data eval - MTSamples

Evaluate using a simple prompt, with a base model as the judge

In [None]:
# One multimodal evaluator runner per arena model run, all judged by the
# text evaluator and keyed like `model_runs`.
arena_evaluators = {}

for model_key in model_runs:
    arena_evaluators[model_key] = evaluator = MultimodalEvaluatorRunner(
        evaluator=llm_text_evaluator,
        predictions_model_run=model_runs[model_key],
    )

In [None]:
# Run each arena evaluation in turn.
# NOTE(review): another cell in this notebook calls
# `await evaluator.evaluate()` on the same runner class; if `evaluate` is a
# coroutine, these calls need `await` as well — confirm.
for _arena_runner in arena_evaluators.values():
    _arena_runner.evaluate()

In [None]:
# Write one JSON file per arena evaluator, named after its key.
for eval_key, evaluator in arena_evaluators.items():
    with open(f"../data/eval/{eval_key}.json", "w+") as f:
        # The payload is the JSON form of the evaluator runner's model run.
        evaluation_run = evaluator.evaluator_runner._model_run
        json.dump(evaluation_run.to_json(), f, indent=2)

Evaluate using the Summary Evaluator approach

In [None]:
arena_summ_evaluators = {}

# The questions generator runner taken from the first evaluator that exposes
# one is handed to every SummaryEvaluatorRunner created afterwards.
questions_generator_runner = None
for model_key in model_runs:
    # Pass the generator only once one is available; until then the runner
    # is constructed without that argument.
    kwargs = (
        {}
        if questions_generator_runner is None
        else {"questions_generator_runner": questions_generator_runner}
    )

    evaluator = SummaryEvaluatorRunner(
        predictions_model_run=model_runs[model_key],
        evaluator=llm_text_evaluator,
        skip_errors=True,
        **kwargs,
    )
    arena_summ_evaluators[model_key] = evaluator

    # NOTE(review): another cell in this notebook awaits `evaluate()` on a
    # different runner class; confirm whether this call needs `await`.
    evaluator.evaluate()

    # Keep this evaluator's questions generator for the remaining models.
    if questions_generator_runner is None:
        questions_generator_runner = evaluator.questions_generator_runner

In [None]:
# Print the key of every model run, one per line.
for model_key in model_runs:
    print(model_key)

Save data

In [None]:
# For every summary evaluator, write three JSON files: the model runs of its
# questions generator, its answerer and its evaluator runner.
for eval_key, evaluator in arena_summ_evaluators.items():
    _outputs = (
        (f"../data/eval/{eval_key}-questions.json", "questions_generator_runner"),
        (f"../data/eval/{eval_key}-answers.json", "answerer_runner"),
        (f"../data/eval/{eval_key}-summ.json", "evaluator_runner"),
    )
    for _out_path, _runner_attr in _outputs:
        with open(_out_path, "w+") as f:
            # The runner is looked up after the file is opened, matching the
            # order of operations of three separate `with` blocks.
            _runner = getattr(evaluator, _runner_attr)
            json.dump(_runner._model_run.to_json(), f, indent=2)

### MT Samples - Pharmacy eval

In [None]:
# Evaluator runner for the MTSamples pharmacy GPT-4o predictions, judged by
# the text evaluator.
evaluator = MultimodalEvaluatorRunner(
    predictions_model_run=mtsamples_gpt4o_model_run,
    evaluator=llm_text_evaluator
)

In [None]:
# Run the pharmacy evaluation.
# NOTE(review): this is the only cell that awaits `evaluate()`; the other
# evaluation cells call it without `await`. Confirm whether `evaluate` is a
# coroutine and make the cells consistent.
await evaluator.evaluate()

In [None]:
# Write the JSON form of `evaluator.predictions_model_run` to disk.
# NOTE(review): the target file is named "gpt-4o-evaluated.json" and the
# other save cells write `evaluator.evaluator_runner._model_run`; confirm
# that the predictions run is really what should be stored here.
with open("../../data/medbench/arena/mtsamples-pharmacy/sampled_mtsamples_gpt4/gpt-4o-evaluated.json", "w+") as f:
    json.dump(evaluator.predictions_model_run.to_json(), f)