In [1]:
import os
import sys
import yaml
sys.path.append(".")
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from tqdm import tqdm
tqdm.pandas()

from src.data import files_to_df, Prompt, create_examples
from src.generative_models import LlamaInstruct
from src.generate import safe_generate
from src.utils import setup_logger, log_info, path_with_datetime, load_config, log_config

# Load configuration
config = load_config(config_path="scripts/MMed-Llama-3-8B-EnIns/config.yaml")

N_EXPECTED_SAMPLES = config["N_EXPECTED_SAMPLES"]
# N_EXAMPLES = config["N_EXAMPLES"]
N_EXAMPLES = 0
MODEL_ID = config["MODEL_ID"]
SOURCE_PATH = config["SOURCE_PATH"]
TEMPLATES_PATH = config["TEMPLATES_PATH"]
OUTPUT_PATH = path_with_datetime(config["OUTPUT_PATH"])

# Ensure output directory exists
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH, exist_ok=True)

# write config to file in OUTPUT_PATH
with open(os.path.join(OUTPUT_PATH, "config.yaml"), "w") as f:
    yaml.dump(config, f)
        
# Setup logger
setup_logger(os.path.join(OUTPUT_PATH, "app.log"))
log_config(config)

def load_file_content(filepath):
    """Safely load text file content."""
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")
    with open(filepath, "r") as file:
        return file.read().strip()  # Strip trailing spaces

def load_datasets():
    """Load generated, original, human evaluation, and automatic evaluation datasets.
    
    Returns:
        df_pairs: pd.DataFrame, pairs of generated and original samples
        df_human: pd.DataFrame, human evaluation scores
        df_auto: pd.DataFrame, automatic evaluation scores
    """
    
    # 1. Load generated and original datasets and format it
    df_gen = files_to_df(os.path.join(SOURCE_PATH, "generated"))
    df_gen["filenameid"] = df_gen["filenameid"].str.replace("_transformed_step1", "")
    
    df_orig = files_to_df(os.path.join(SOURCE_PATH, "original"))
    df_pairs = df_orig.merge(df_gen, on="filenameid", suffixes=("_orig", "_gen"))
    assert len(df_pairs) == N_EXPECTED_SAMPLES, f"Expected {N_EXPECTED_SAMPLES} samples, got {len(df_pairs)}"
    
    df_pairs.rename(columns={"text_orig": "clinical_case", "text_gen": "discharge_summary"}, inplace=True)
    
    # 2. Load human evaluation dataset and format it
    
    # Input: human_eval.csv (From Google Forms)
    # Timestamp,Email Address,Original file name (e.g. 36951253),Overall validation [Content Relevance],Overall validation [Information Completeness],
    # Overall validation [Clarity and Structure],Overall validation [Content Accuracy],Overall validation [Hallucinations],Overall validation [Impact of Hallucinations],
    # Overall validation [Relevance to Practice],Overall validation [Overall Quality],
    # Positive highlights: Describe what aspects of the synthetic discharge summaries resemble the best real EHRs? (Empty if nothing remarkable),
    # Negative highlights: Which aspects of the synthetic discharge summaries do not resemble well real EHRs? (Empty if nothing remarkable),
    # Other Comments: Do you have any other feedback or comment on the generated synthetic discharge summaries or in the original cases? (Empty if nothing remarkable)

    df_human = pd.read_csv(os.path.join(SOURCE_PATH, "human_eval.csv")).rename(columns={"Original file name (e.g. 36951253)": "filenameid"}).drop(columns=["Email Address", "Timestamp"]).fillna("")
    d_score_cols = {
                            "Overall validation [Content Relevance]": "Content Relevance",
                            "Overall validation [Information Completeness]": "Information Completeness",
                            "Overall validation [Clarity and Structure]": "Clarity and Structure",
                            "Overall validation [Content Accuracy]": "Content Accuracy",
                            "Overall validation [Hallucinations]": "Hallucinations",
                            "Overall validation [Impact of Hallucinations]": "Impact of Hallucinations",
                            "Overall validation [Relevance to Practice]": "Relevance to Practice",
                            "Overall validation [Overall Quality]": "Overall Quality",
                            "Positive highlights: Describe what aspects of the synthetic discharge summaries resemble the best real EHRs? (Empty if nothing remarkable)": "Positive highlights",
                            "Negative highlights: Which aspects of the synthetic discharge summaries do not resemble well real EHRs? (Empty if nothing remarkable)": "Negative highlights",
                            "Other Comments: Do you have any other feedback or comment on the generated synthetic discharge summaries or in the original cases? (Empty if nothing remarkable)": "Other Comments"
    }

    df_human.rename(columns=d_score_cols, inplace=True)
    df_human.rename(columns={"Original file name (e.g. 36951253)": "filenameid"}, inplace=True)
    df_human["human_score"] = df_human.drop(columns=["filenameid"]).to_dict(orient="records")

    
    # Output: df_human
    # | filenameid |                  human_score                     |
    # | 33857916   | {'Content Relevance': 1, 'Information Complete...|
    
    # 3. Load automatic evaluation dataset and format it
    
    # Input: auto_eval.csv (From Google Forms)
    # filename,precision,recall,f1,tp,fp,fn,cluster

    df_auto = pd.read_csv(os.path.join(SOURCE_PATH, "auto_eval.csv")).drop(columns=["cluster"]).rename(columns={"filename": "filenameid"})
    df_auto["auto_score"] = df_auto.drop(columns=["filenameid"]).to_dict(orient="records")
    
    # Ensure filenameid is string
    df_pairs["filenameid"] = df_pairs["filenameid"].map(str)
    df_human["filenameid"] = df_human["filenameid"].map(str)
    df_auto["filenameid"] = df_auto["filenameid"].map(str)
    
    # Output: df_auto
    # | filenameid |                  auto_score                     |
    # | 33857916   | {'precision': 0.5, 'recall': 0.5, 'f1': 0.5,...|
    
    return df_pairs, df_human, df_auto

def select_examples(df_prompt, n=5, seed=42, examples_ids=None):
    """Select a few examples for few-shot learning."""
    
    if not examples_ids:
        example_filenames = df_prompt.sample(n, random_state=seed)["filenameid"].tolist()
    else:
        example_filenames = df_prompt[df_prompt["filenameid"].isin(examples_ids)]
    
    log_info(f"Selected Examples: {example_filenames}")
    
    return df_prompt[df_prompt["filenameid"].isin(example_filenames)]

def prepare_prompt_data(df_pairs, df_human, df_auto, examples_ids=None):
    """Merge datasets and prepare prompt inputs."""
    df_prompt = df_pairs.merge(df_human[["filenameid", "human_score"]], on="filenameid").merge(df_auto[["filenameid", "auto_score"]], on="filenameid")
    if examples_ids:
        df_prompt = df_prompt[df_prompt["filenameid"].isin(examples_ids)]
    return df_prompt

def generate_prompts(df_prompt, guidelines, template, examples):
    """Generate prompts for LLM processing."""
    df_prompt["prompts"] = df_prompt.progress_apply(lambda x: Prompt(
                                                                        guidelines=guidelines,
                                                                        template=template,
                                                                        clinical_case=x["clinical_case"],
                                                                        discharge_summary=x["discharge_summary"],
                                                                        examples=str(examples),
                                                                    ).text, axis=1)
    return df_prompt

def compute_correlations(df_human_preds, df_preds):
    """Compute Pearson correlation between human and model scores."""
    return pearsonr(df_human_preds["Overall Quality"], df_preds["Overall Quality"])

def plot_correlation_heatmap(df_hm, df_llm, suffixes=("_hm", "_llm")):
    """Plot a heatmap of correlations."""
    
    df_hm_llm_corr = df_hm.merge(df_llm, on="filenameid", suffixes=suffixes)
    df_hm_llm_corr = df_hm_llm_corr.select_dtypes(np.number).corr()
    
    fig, ax = plt.subplots(figsize=(10, 10))
    x_suffix, y_suffix = suffixes[0], suffixes[1]
    x_cols = [col for col in df_hm_llm_corr.columns if col.endswith(x_suffix)]
    y_cols = [col for col in df_hm_llm_corr.columns if col.endswith(y_suffix)]

    corr_matrix = df_hm_llm_corr.loc[x_cols, y_cols]
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", vmin=-1, vmax=1, ax=ax)
    ax.set_title("Correlation Heatmap: Human vs LLM")
    fig.tight_layout()
    # plt.savefig(os.path.join(output_path, "correlation_heatmap.png"))   
    return fig , corr_matrix




2025-04-05 10:57:47,372 - INFO - Configuration Settings:
2025-04-05 10:57:47,373 - INFO - N_EXPECTED_SAMPLES: 35
2025-04-05 10:57:47,373 - INFO - N_EXAMPLES: 7
2025-04-05 10:57:47,373 - INFO - MODEL_ID: /gpfs/projects/bsc14/abecerr1/hub/models--Henrychur--MMed-Llama-3-8B-EnIns/snapshots/45936f724a7eabfce59cd8eaed318970db468cad
2025-04-05 10:57:47,374 - INFO - SOURCE_PATH: output/samples/en/phase_2/
2025-04-05 10:57:47,374 - INFO - TEMPLATES_PATH: utils/templates/basic
2025-04-05 10:57:47,374 - INFO - OUTPUT_PATH: output/evaluation/MMed-Llama-3-8B-EnIns


In [2]:

"""Main execution function."""

log_info(f"Starting evaluation of {MODEL_ID}")
log_info("")
log_info(f"Loading templates and guidelines from {TEMPLATES_PATH}")
guidelines = load_file_content(os.path.join(TEMPLATES_PATH, "guidelines.txt"))
template = load_file_content(os.path.join(TEMPLATES_PATH, "template.txt"))
example_template = load_file_content(os.path.join(TEMPLATES_PATH,"example_template.txt"))
system_prompt = load_file_content(os.path.join(TEMPLATES_PATH,"system.txt"))

log_info(f"Loading datasets from {SOURCE_PATH}")
df_pairs, df_human, df_auto = load_datasets()
df_prompt = prepare_prompt_data(df_pairs, df_human, df_auto)

log_info("")
log_info(f"Selecting {N_EXAMPLES} examples for few-shot learning and generating prompts")
if N_EXAMPLES > 0:
    df_examples = select_examples(df_prompt, n=N_EXAMPLES)
    few_shot_examples = df_examples.to_dict(orient="records")
    examples = create_examples(few_shot_examples, example_template=example_template)
else:
    examples = [""]

df_prompt = generate_prompts(df_prompt, guidelines, template, examples)

log_info("")
log_info("Starting generation of evaluation results")
# model = LlamaInstruct(MODEL_ID, device="cuda:1")


2025-04-05 10:57:47,465 - INFO - Starting evaluation of /gpfs/projects/bsc14/abecerr1/hub/models--Henrychur--MMed-Llama-3-8B-EnIns/snapshots/45936f724a7eabfce59cd8eaed318970db468cad
2025-04-05 10:57:47,466 - INFO - 
2025-04-05 10:57:47,466 - INFO - Loading templates and guidelines from utils/templates/basic
2025-04-05 10:57:47,467 - INFO - Loading datasets from output/samples/en/phase_2/
2025-04-05 10:57:47,481 - INFO - 
2025-04-05 10:57:47,481 - INFO - Selecting 0 examples for few-shot learning and generating prompts
100%|██████████| 15/15 [00:00<00:00, 25987.01it/s]
2025-04-05 10:57:47,484 - INFO - 
2025-04-05 10:57:47,484 - INFO - Starting generation of evaluation results


In [3]:
import torch
import json
import re
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load MedAlpaca model + tokenizer
model_path = "/gpfs/projects/bsc14/abecerr1/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16).to("cuda:1")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:

# Helper function to safely extract JSON
def safe_json_extract(text):
    match = re.search(r"\{[^{}]*\"feedback\"[^{}]*\"score\"\s*:\s*\d[^{}]*\}", text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            return None
    return None

# Core evaluation function
def evaluate_clinical_summaries(clinical_case, discharge_summary, rubric, output_file=None):
    """
    Uses MedAlpaca to evaluate a clinical discharge summary using rubric.
    Returns: dict with 'feedback' and 'score'
    """
    prompt = f"""
You are a clinical evaluator. Given the clinical case and the generated discharge summary, compare them and respond with a JSON object containing:

- "feedback": A short explanation (1-3 sentences)
- "score": An integer from 1 to 5 based on the rubric

Only output a valid JSON object. Do not include any other text.

If the clinical case is too long, summarize it preserving the most important details.

### Evaluation Criterion:
{rubric['criteria']}

### Scoring Rubric:
1 - {rubric['score1_description']}
2 - {rubric['score2_description']}
3 - {rubric['score3_description']}
4 - {rubric['score4_description']}
5 - {rubric['score5_description']}

### Clinical Case:
{clinical_case}

### Discharge Summary:
{discharge_summary}

### JSON Response:
"""

    for attempt in range(3):
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=300,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                repetition_penalty=1.1,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id
            )

        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        json_candidate = decoded.replace(prompt, "").strip()

        result = safe_json_extract(json_candidate)
        if result:
            if output_file:
                with open(output_file, "w") as f:
                    json.dump(result, f, indent=2)
            return result

        print(f"⚠️ Attempt {attempt+1}: Failed to extract JSON\nRaw Output:\n{json_candidate}")

    return {"feedback": "Unable to parse model output after retries.", "score": None}


clinical_case = df_prompt["clinical_case"].iloc[11]
discharge_summary = df_prompt["discharge_summary"].iloc[11]

rubric = {
    "criteria": "Does the summary focus on clinically relevant information?",
    "score1_description": "The summary largely misses the clinically relevant details.",
    "score2_description": "The summary includes only a few clinically relevant details.",
    "score3_description": "The summary covers some relevant information but omits key aspects.",
    "score4_description": "The summary covers most clinically relevant details with minor omissions.",
    "score5_description": "The summary is entirely focused on clinically relevant information."
}

out = evaluate_clinical_summaries(clinical_case, discharge_summary, rubric)
print(out)


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


{'feedback': "The discharge summary effectively conveys the patient's complex clinical scenario, highlighting the critical role of orthotopic heart transplantation and its associated complications. However, there is a need for more concise language and better organization to facilitate clear understanding by non-experts.", 'score': 4}


In [None]:

df_prompt["generated"] = df_prompt.progress_apply(
    lambda x: evaluate_clinical_summaries(
        clinical_case=x["clinical_case"],
        discharge_summary=x["discharge_summary"],
        rubric=rubric
    ),
    axis=1
)

df_llm = df_prompt[["filenameid", "generated"]].rename(columns={"generated": "llm_score"})
df_llm["llm_score"] = df_llm["llm_score"].apply(lambda x: x if isinstance(x, dict) else json.loads(x))
score_cols = df_llm["llm_score"].apply(lambda x: pd.Series(x))
df_llm = pd.concat([df_llm["filenameid"], score_cols], axis=1)


  0%|          | 0/15 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 13%|█▎        | 2/15 [00:01<00:09,  1.41it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 20%|██        | 3/15 [00:01<00:07,  1.56it/s]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 27%|██▋       | 4/15 [00:05<00:20,  1.90s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 33%|███▎      | 5/15 [00:07<00:17,  1.78s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 40%|████      | 6/15 [00:08<00:13,  1.49s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 47%|████▋     | 7/15 [00:09<00:10,  1.34s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 53%|█████▎    | 8/15 [00:11<00:11,  1.71s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
 60%|██████    | 9/15 [00:13<00:09,  1.50s/it]Setting `p

In [12]:
df_llm = df_prompt[["filenameid", "generated"]].rename(columns={"generated": "llm_score"})
df_llm["llm_score"] = df_llm["llm_score"].apply(lambda x: x if isinstance(x, dict) else json.loads(x))
score_cols = df_llm["llm_score"].apply(lambda x: pd.Series(x))
df_llm = pd.concat([df_llm["filenameid"], score_cols], axis=1)
df_llm

Unnamed: 0,filenameid,feedback,score
0,29617510,The summary accurately captures the primary di...,4
1,31204375,The discharge summary effectively communicates...,5
2,30072860_2,The summary effectively captures the core deta...,4
3,31056078,The discharge summary effectively summarizes t...,4
4,31486515,The discharge summary effectively conveys the ...,4
5,31512669,The discharge summary adequately covers the ma...,4
6,32997782,The discharge summary effectively summarizes t...,4
7,31557359,The discharge summary effectively conveys the ...,4
8,26989133,The discharge summary effectively conveys crit...,4
9,31049155,The summary provides an overview of the patien...,4


In [13]:
df_llm["feedback"].values

array(['The summary accurately captures the primary diagnosis and key interventions, highlighting the successful percutaneous closure of the PVL and subsequent hemodynamic stability. However, minor details such as specific medication lists and follow-up appointment schedules could be included for completeness.',
       "The discharge summary effectively communicates the patient's diagnosis, treatment, and follow-up instructions, focusing on the clinically relevant details.",
       "The summary effectively captures the core details of the clinical scenario, including the patient's diagnosis, interventions, and outcomes. However, some minor omissions regarding specific test results and medications are present.",
       "The discharge summary effectively summarizes the patient's clinical history, diagnosis, and treatment plan, focusing on clinically relevant information. However, there could be more detail about the patient's symptoms and laboratory results.",
       "The discharge summa

In [None]:

# df_prompt = df_prompt.sample(3) # For testing
df_prompt["generation"] = df_prompt.progress_apply(lambda x: evaluate_clinical_summaries(clinical_case=x["clinical_case"],
    discharge_summary=x["discharge_summary"],
    rubric={
        "criteria": "Does the summary focus on clinically relevant information?",
        "score1_description": "The summary largely misses the clinically relevant details.",
        "score2_description": "The summary includes only a few clinically relevant details.",
        "score3_description": "The summary covers some relevant information but omits key aspects.",
        "score4_description": "The summary covers most clinically relevant details with minor omissions.",
        "score5_description": "The summary is entirely focused on clinically relevant information."
    }
), axis=1)

: 

In [None]:
df_prompt.iloc[11]

Unnamed: 0,filenameid,clinical_case,discharge_summary,human_score,auto_score,prompts
7,31557359,"A 61‐year‐old man, diabetic and hypertensive, ...",**Discharge Summary**\n\n**Patient Information...,"{'Content Relevance': 5, 'Information Complete...","{'precision': 0.599999940000006, 'recall': 0.6...","Look at these guidelines carefully, i have als..."


In [None]:
df_prompt["generation"][7]

{'feedback': 'The summary focuses on clinically relevant information, though minor details about sensing and pacing rhythms would be useful for optimizing therapy',
 'score': 4}

In [58]:

df_human_preds = pd.DataFrame(df_prompt["human_score"].tolist()).assign(filenameid=df_prompt["filenameid"].values)
df_preds = pd.DataFrame(df_prompt["generation"].tolist()).assign(filenameid=df_prompt["filenameid"].values)

eval_metric = compute_correlations(df_human_preds, df_preds)
log_info(f"Evaluation Metric: {eval_metric}")


AttributeError: 'NoneType' object has no attribute 'keys'

In [None]:

fig, df_hm_llm_corr = plot_correlation_heatmap(df_human_preds, df_preds)

log_info("")
log_info(f"Saving results to {OUTPUT_PATH}")
    
fig.savefig(os.path.join(OUTPUT_PATH, "correlation_heatmap.png"))
df_human_preds.to_csv(os.path.join(OUTPUT_PATH, "human_predictions.csv"), index=False)
df_preds.to_csv(os.path.join(OUTPUT_PATH, "llm_predictions.csv"), index=False)
df_hm_llm_corr.to_csv(os.path.join(OUTPUT_PATH, "correlation_matrix.csv"), index=True)
df_prompt.to_csv(os.path.join(OUTPUT_PATH, "prompt_data.csv"), index=False)


if N_EXAMPLES > 0:
    df_examples_human = pd.DataFrame(df_examples["human_score"].tolist()).assign(filenameid=df_examples["filenameid"].values)
    df_examples_preds = df_prompt[df_prompt["filenameid"].isin(df_examples["filenameid"].values)]
    df_examples_preds = pd.DataFrame(df_examples_preds["generation"].tolist()).assign(filenameid=df_examples_preds["filenameid"].values)
    df_examples_auto = pd.DataFrame(df_examples["auto_score"].tolist()).assign(filenameid=df_examples["filenameid"].values)
    
    df_examples_human.to_csv(os.path.join(OUTPUT_PATH, "examples_human_eval.csv"), index=False)
    df_examples_preds.to_csv(os.path.join(OUTPUT_PATH, "examples_predictions.csv"), index=False)
    df_examples_auto.to_csv(os.path.join(OUTPUT_PATH, "examples_auto_eval.csv"), index=False)
    
log_info(f"Results saved to {OUTPUT_PATH}")

if __name__ == "__main__":
main()
