Step1: Sample 20 images for each organ

In [1]:
import json
import random
from collections import defaultdict

# Path to your metadata file
metadata_path = "../../editing/editing_metadata.json"
output_path = "sampled_ids_by_organ.json"

# Sampling parameters. The seed makes the draw reproducible: the later ranking
# cells read `output_path`, so re-running this cell should give the same IDs.
SAMPLES_PER_ORGAN = 20
SAMPLING_SEED = 42

# Load metadata
with open(metadata_path, "r") as f:
    data = json.load(f)["samples"]

# Group IDs by Organ (samples without an "Organ" value or without an "id" are skipped)
organ_to_ids = defaultdict(list)
for sample in data:
    organ = sample.get("Organ")
    sample_id = sample.get("id")
    if organ and sample_id is not None:
        organ_to_ids[organ].append(sample_id)

# Sample up to SAMPLES_PER_ORGAN IDs per organ (every ID when an organ has fewer)
rng = random.Random(SAMPLING_SEED)
sampled = {
    organ: rng.sample(ids, min(SAMPLES_PER_ORGAN, len(ids)))
    for organ, ids in organ_to_ids.items()
}

# Save to JSON
with open(output_path, "w") as f:
    json.dump(sampled, f, indent=2)

print(f"✅ Sampled IDs saved to: {output_path}")


Step2: Human evaluation by ranking

In [None]:
import os
import json
import tkinter as tk
from tkinter import messagebox
from PIL import Image, ImageTk
from collections import defaultdict
from tqdm import tqdm

# === Configuration ===
base_dir = os.path.abspath("../../")
metadata_path = os.path.join(base_dir, "editing/editing_metadata.json")
sampled_ids_path = "sampled_ids_by_organ.json"
results_path = "human_rank.json"

# Each evaluated model writes its images to <base_dir>/generated_images/<model name>.
model_dirs = {
    name: os.path.join(base_dir, f"generated_images/{name}")
    for name in (
        "gemini_2_flash",
        "seedx",
        "imagic-sd-v1-4",
        "instruct-pix2pix",
        "instruct-diffusion",
        "paint-by-inpaint",
        "icedit",
    )
}

# Metadata is indexed by the string form of each sample id.
with open(metadata_path, "r") as f:
    metadata = {str(entry["id"]): entry for entry in json.load(f)["samples"]}

# {organ: [sample ids]} as written by the sampling step.
with open(sampled_ids_path, "r") as f:
    organ_to_ids = json.load(f)

# Resume from saved progress when a results file exists; otherwise start empty.
if not os.path.exists(results_path):
    results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
else:
    with open(results_path, "r") as f:
        results = json.load(f)

model_names = list(model_dirs)

def load_and_resize(path, size):
    """Open the image at ``path``, convert it to RGB, resize it to ``size``
    with bilinear resampling, and return it as a Tk-displayable PhotoImage."""
    rgb_image = Image.open(path).convert("RGB")
    thumbnail = rgb_image.resize(size, Image.BILINEAR)
    return ImageTk.PhotoImage(thumbnail)

def create_panel(prompt, prev_path, changed_path, sid, callback):
    """Show one scoring window for sample ``sid`` and block until it is closed.

    The window shows the prompt, the "Previous" and "Changed" reference images,
    and one column per entry of ``model_names`` with that model's edited image
    (or a grey "Missing" placeholder) plus three score entries.

    Args:
        prompt: Text displayed at the top of the window.
        prev_path: Path of the "Previous" reference image.
        changed_path: Path of the "Changed" reference image.
        sid: Sample id as a string; used in the title and to locate
            ``<model dir>/<sid>_0.png``.
        callback: Called with ``"next"`` when the sample is skipped or a
            reference image is missing, or with
            ``{"accuracy"|"context"|"quality": {model: int}}`` on submit.
    """
    window = tk.Tk()
    window.title(f"Rank the Models for ID {sid}")

    tk.Label(window, text=prompt, font=("Arial", 14), wraplength=1000).pack()

    thumb_size = (128, 128)

    # Reference pair: give up on this sample as soon as one file is missing.
    reference_photos = []
    for image_path in (prev_path, changed_path):
        if os.path.exists(image_path):
            reference_photos.append(load_and_resize(image_path, thumb_size))
            continue
        messagebox.showerror("Error", f"Image not found: {image_path}")
        window.destroy()
        callback("next")
        return

    reference_row = tk.Frame(window)
    for caption, photo in zip(("Previous", "Changed"), reference_photos):
        tk.Label(reference_row, text=caption).pack(side="left")
        tk.Label(reference_row, image=photo).pack(side="left")
    reference_row.pack()

    # (model, PhotoImage or None) in model_names order; the list also keeps the
    # PhotoImage objects referenced while the window is open.
    model_photos = []
    for model in model_names:
        candidate = os.path.join(model_dirs[model], f"{sid}_0.png")
        photo = load_and_resize(candidate, thumb_size) if os.path.exists(candidate) else None
        model_photos.append((model, photo))

    metric_captions = {
        "accuracy": "Editing Accuracy (0-6)",
        "context": "Contextual Preservation (0-6)",
        "quality": "Visual Quality (0-6)",
    }

    models_row = tk.Frame(window)
    # metric -> {model: Entry widget}
    entry_boxes = {metric: {} for metric in metric_captions}

    for model, photo in model_photos:
        column = tk.Frame(models_row)
        if photo:
            tk.Label(column, image=photo).pack()
        else:
            tk.Label(column, text="Missing", width=20, height=10, bg="gray").pack()

        for metric, caption in metric_captions.items():
            tk.Label(column, text=caption, font=("Arial", 8)).pack()
            field = tk.Entry(column, width=5)
            field.pack()
            entry_boxes[metric][model] = field

        column.pack(side="left", padx=5)
    models_row.pack()

    def submit_and_exit():
        # Saves what is already in `results`; the values currently typed in
        # this window are not read here.
        save_results()
        print("👋 Exiting and saving progress.")
        window.update()  # ensure all events are processed
        window.destroy()
        os._exit(0)  # force quit in Jupyter environment

    allowed_values = {str(value) for value in range(7)}

    def submit():
        # Every entry must hold an integer 0..6; otherwise an error dialog is
        # shown and the window stays open.
        try:
            scores = {}
            for metric, fields in entry_boxes.items():
                scores[metric] = {}
                for model, field in fields.items():
                    raw = field.get().strip()
                    if raw not in allowed_values:
                        raise ValueError(f"Invalid score {raw} for {metric} - must be 0 to 6")
                    scores[metric][model] = int(raw)
            window.destroy()
            callback(scores)
        except Exception as exc:
            messagebox.showerror("Error", str(exc))

    def skip():
        window.destroy()
        callback("next")

    button_row = tk.Frame(window)
    tk.Button(button_row, text="✔ Submit", command=submit).pack(side="left", padx=10)
    tk.Button(button_row, text="💾 Save & Quit", command=submit_and_exit).pack(side="left", padx=10)
    tk.Button(button_row, text="⏭ Skip", command=skip).pack(side="left", padx=10)
    button_row.pack(pady=10)

    window.mainloop()

def save_results():
    """Write the module-level ``results`` dict to ``results_path`` as indented JSON.

    The JSON is first written to ``<results_path>.tmp`` and then moved over the
    real file with ``os.replace``. A failure while serialising or writing
    therefore leaves the previously saved progress intact rather than a
    truncated file.
    """
    tmp_path = f"{results_path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(results, f, indent=2)
    os.replace(tmp_path, results_path)
    print(f"✅ Progress saved to {results_path}")

def run_full_ranking():
    """Walk through every sampled id of every organ and collect scores for it.

    Ids already present in ``results`` for their organ are skipped. For each
    remaining id the scoring window is shown repeatedly until its callback
    reports either a skip or a submitted score set; submitted scores are stored
    under ``results[organ][sid]`` and saved immediately. Results are saved once
    more after the last sample.
    """
    for organ, ids in organ_to_ids.items():
        for sid in tqdm(ids, desc=f"Ranking {organ}"):
            sid_str = str(sid)
            already_ranked = results.get(organ, {})
            if sid_str in already_ranked:
                continue

            sample = metadata[sid_str]
            prompt = sample["prompt"]
            prev_path = os.path.join(base_dir, sample["previous_image"])
            changed_path = os.path.join(base_dir, sample["changed_image"])

            finished = False

            def store(scores):
                # Receives "next", "quit", or the submitted score dict.
                nonlocal finished
                if scores == "next":
                    finished = True
                    return
                if scores == "quit":
                    save_results()
                    print("👋 Exiting...")
                    exit(0)
                    return
                for metric in scores:
                    results.setdefault(organ, {}).setdefault(sid_str, {})[metric] = scores[metric]
                save_results()
                finished = True

            # The window is shown again if it closes without the callback
            # having marked this sample as finished.
            while not finished:
                create_panel(prompt, prev_path, changed_path, sid_str, store)

    save_results()
    print("\n✅ Completed full ranking.")

if __name__ == "__main__":
    run_full_ranking()


In [None]:
import os
import json
import tkinter as tk
from tkinter import messagebox
from PIL import Image, ImageTk
from collections import defaultdict
from tqdm import tqdm

# === Configuration ===
base_dir = os.path.abspath("../../")
metadata_path = os.path.join(base_dir, "editing/editing_metadata.json")
sampled_ids_path = "sampled_ids_by_organ.json"
results_path = os.path.join(base_dir, "evaluation_result/human_ranked_scores.json")
automated_metrics_path = os.path.join(base_dir, "evaluation_result/automated_metrics_ranked_scores.json")
human_prev_scores_path = os.path.join(base_dir, "evaluation_result/human_ranked_scores_prev.json")

# Each evaluated model writes its images to <base_dir>/generated_images/<model name>.
model_dirs = {
    name: os.path.join(base_dir, f"generated_images/{name}")
    for name in (
        "gemini_2_flash",
        "seedx",
        "imagic-sd-v1-4",
        "instruct-pix2pix",
        "instruct-diffusion",
        "paint-by-inpaint",
        "icedit",
    )
}

# Load metadata and scores
with open(metadata_path, "r") as f:
    metadata = {str(entry["id"]): entry for entry in json.load(f)["samples"]}

with open(sampled_ids_path, "r") as f:
    organ_to_ids = json.load(f)

# Automated ranked scores are required; they provide fallback default values.
with open(automated_metrics_path, "r") as f:
    automated_metrics = json.load(f)

# Scores from an earlier human pass are optional.
if not os.path.exists(human_prev_scores_path):
    human_prev_scores = {}
else:
    with open(human_prev_scores_path, "r") as f:
        human_prev_scores = json.load(f)

# Resume from saved progress when a results file exists; otherwise start empty.
if not os.path.exists(results_path):
    results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
else:
    with open(results_path, "r") as f:
        results = json.load(f)

model_names = list(model_dirs)

def load_and_resize(path, size):
    """Open the image at ``path``, convert it to RGB, resize it to ``size``
    with bilinear resampling, and return it as a Tk-displayable PhotoImage."""
    rgb_image = Image.open(path).convert("RGB")
    thumbnail = rgb_image.resize(size, Image.BILINEAR)
    return ImageTk.PhotoImage(thumbnail)

def create_panel(prompt, prev_path, changed_path, sid, organ, callback):
    """Show one scoring window for sample ``sid`` and block until it is closed.

    Same layout as the plain scoring window, but every score entry is
    pre-filled: first from ``human_prev_scores[organ][sid][metric][model]``,
    and if that is absent from ``automated_metrics[organ][sid]`` using the
    automated metric mapped to that human metric in ``default_fields``.

    Args:
        prompt: Text displayed at the top of the window.
        prev_path: Path of the "Previous" reference image.
        changed_path: Path of the "Changed" reference image.
        sid: Sample id as a string; used in the title, to locate
            ``<model dir>/<sid>_0.png``, and as the key for default scores.
        organ: Organ name used as the key for default scores.
        callback: Called with ``"next"`` when the sample is skipped or a
            reference image is missing, or with
            ``{"accuracy"|"context"|"quality": {model: int}}`` on submit.
    """
    window = tk.Tk()
    window.title(f"Rank the Models for ID {sid}")

    tk.Label(window, text=prompt, font=("Arial", 14), wraplength=1000).pack()

    thumb_size = (128, 128)

    # Reference pair: give up on this sample as soon as one file is missing.
    reference_photos = []
    for image_path in (prev_path, changed_path):
        if os.path.exists(image_path):
            reference_photos.append(load_and_resize(image_path, thumb_size))
            continue
        messagebox.showerror("Error", f"Image not found: {image_path}")
        window.destroy()
        callback("next")
        return

    reference_row = tk.Frame(window)
    for caption, photo in zip(("Previous", "Changed"), reference_photos):
        tk.Label(reference_row, text=caption).pack(side="left")
        tk.Label(reference_row, image=photo).pack(side="left")
    reference_row.pack()

    # (model, PhotoImage or None) in model_names order; the list also keeps the
    # PhotoImage objects referenced while the window is open.
    model_photos = []
    for model in model_names:
        candidate = os.path.join(model_dirs[model], f"{sid}_0.png")
        photo = load_and_resize(candidate, thumb_size) if os.path.exists(candidate) else None
        model_photos.append((model, photo))

    metric_captions = {
        "accuracy": "Editing Accuracy (0-6)",
        "context": "Contextual Preservation (0-6)",
        "quality": "Visual Quality (0-6)",
    }
    # Automated metric used as the fallback default for each human metric.
    default_fields = {
        "accuracy": "gpt4o_editing_accuracy_detailed",
        "context": "masked_ssim",
        "quality": "gpt4o_visual_quality_detailed",
    }

    def initial_value(metric, model):
        # Try human prev scores first, then fallback to automated.
        # NOTE(review): pre-filling from automated ranks may anchor the
        # annotator toward the metrics later correlated with these scores —
        # confirm this is intended.
        previous = human_prev_scores.get(organ, {}).get(sid, {}).get(metric, {}).get(model)
        if previous is not None:
            return previous
        fallback_key = default_fields[metric]
        return automated_metrics.get(organ, {}).get(sid, {}).get(fallback_key, {}).get(model)

    models_row = tk.Frame(window)
    # metric -> {model: Entry widget}
    entry_boxes = {metric: {} for metric in metric_captions}

    for model, photo in model_photos:
        column = tk.Frame(models_row)
        if photo:
            tk.Label(column, image=photo).pack()
        else:
            tk.Label(column, text="Missing", width=20, height=10, bg="gray").pack()

        for metric, caption in metric_captions.items():
            tk.Label(column, text=caption, font=("Arial", 8)).pack()
            field = tk.Entry(column, width=5)

            starting_value = initial_value(metric, model)
            if starting_value is not None:
                field.insert(0, str(starting_value))

            field.pack()
            entry_boxes[metric][model] = field

        column.pack(side="left", padx=5)
    models_row.pack()

    def submit_and_exit():
        # Saves what is already in `results`; the values currently typed in
        # this window are not read here.
        save_results()
        print("👋 Exiting and saving progress.")
        window.update()
        window.destroy()
        os._exit(0)

    allowed_values = {str(value) for value in range(7)}

    def submit():
        # Every entry must hold an integer 0..6; otherwise an error dialog is
        # shown and the window stays open.
        try:
            scores = {}
            for metric, fields in entry_boxes.items():
                scores[metric] = {}
                for model, field in fields.items():
                    raw = field.get().strip()
                    if raw not in allowed_values:
                        raise ValueError(f"Invalid score {raw} for {metric} - must be 0 to 6")
                    scores[metric][model] = int(raw)
            window.destroy()
            callback(scores)
        except Exception as exc:
            messagebox.showerror("Error", str(exc))

    def skip():
        window.destroy()
        callback("next")

    button_row = tk.Frame(window)
    tk.Button(button_row, text="✔ Submit", command=submit).pack(side="left", padx=10)
    tk.Button(button_row, text="💾 Save & Quit", command=submit_and_exit).pack(side="left", padx=10)
    tk.Button(button_row, text="⏭ Skip", command=skip).pack(side="left", padx=10)
    button_row.pack(pady=10)

    window.mainloop()

def save_results():
    """Write the module-level ``results`` dict to ``results_path`` as indented JSON.

    The JSON is first written to ``<results_path>.tmp`` and then moved over the
    real file with ``os.replace``. A failure while serialising or writing
    therefore leaves the previously saved progress intact rather than a
    truncated file.
    """
    tmp_path = f"{results_path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(results, f, indent=2)
    os.replace(tmp_path, results_path)
    print(f"✅ Progress saved to {results_path}")

def run_full_ranking():
    """Walk through every sampled id of every organ and collect scores for it.

    Ids already present in ``results`` for their organ are skipped. For each
    remaining id the pre-filled scoring window is shown repeatedly until its
    callback reports either a skip or a submitted score set; submitted scores
    are stored under ``results[organ][sid]`` and saved immediately. Results are
    saved once more after the last sample.
    """
    for organ, ids in organ_to_ids.items():
        for sid in tqdm(ids, desc=f"Ranking {organ}"):
            sid_str = str(sid)
            already_ranked = results.get(organ, {})
            if sid_str in already_ranked:
                continue

            sample = metadata[sid_str]
            prompt = sample["prompt"]
            prev_path = os.path.join(base_dir, sample["previous_image"])
            changed_path = os.path.join(base_dir, sample["changed_image"])

            finished = False

            def store(scores):
                # Receives "next", "quit", or the submitted score dict.
                nonlocal finished
                if scores == "next":
                    finished = True
                    return
                if scores == "quit":
                    save_results()
                    print("👋 Exiting...")
                    exit(0)
                    return
                for metric in scores:
                    results.setdefault(organ, {}).setdefault(sid_str, {})[metric] = scores[metric]
                save_results()
                finished = True

            # The window is shown again if it closes without the callback
            # having marked this sample as finished.
            while not finished:
                create_panel(prompt, prev_path, changed_path, sid_str, organ, store)

    save_results()
    print("\n✅ Completed full ranking.")

if __name__ == "__main__":
    run_full_ranking()


Step3: Convert the detailed automated-metric scores into ranked scores.

In [9]:
import json
import pandas as pd
from collections import defaultdict
import random

# === Configuration ===
metric_file = "../../evaluation_result/gpt4o_detailed_scores.json"
sample_file = "sampled_ids_by_organ.json"
output_file = "../../evaluation_result/gpt4o_ranked_scores.json"
# Seed for the shuffle that breaks ties, so re-running yields the same ranks.
TIE_BREAK_SEED = 42


# === Load input JSONs ===
with open(metric_file, "r") as f:
    metric_data = json.load(f)

with open(sample_file, "r") as f:
    sampled_ids = json.load(f)

# Metric names are taken from a single reference entry.
# NOTE(review): assumes every model/sample uses the same metric keys as
# gemini_2_flash sample "1" — confirm against the scores file.
metrics = list(metric_data["gemini_2_flash"]["1"].keys())
# === Format into nested dict: {organ: {sample_id: {metric: {model: rank}}}} ===
nested_result = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

tie_break_rng = random.Random(TIE_BREAK_SEED)

# === Process each metric ===
for target_metric in metrics:
    for organ, ids in sampled_ids.items():
        for sid in ids:
            sid = str(sid)
            # One row per model that has this metric for this sample.
            rows = [
                {"model": model, "score": samples[sid][target_metric]}
                for model, samples in metric_data.items()
                if sid in samples and target_metric in samples[sid]
            ]
            if not rows:
                # No model scored this sample on this metric; an empty frame
                # has no "score" column, so there is nothing to rank.
                continue

            tie_break_rng.shuffle(rows)  # shuffle to randomize rank for tied scores
            df = pd.DataFrame(rows)
            df["rank_score"] = (
                df["score"].rank(method="first", ascending=False) - 1
            ).astype(int)
            # Invert so the highest score gets len(rows) - 1 (6 when all seven
            # models are present) and the lowest gets 0.
            df["rank_score"] = len(rows) - 1 - df["rank_score"]

            for model, rank_score in zip(df["model"], df["rank_score"]):
                nested_result[organ][sid][target_metric][model] = int(rank_score)

# === Save to file ===
with open(output_file, "w") as f:
    json.dump(nested_result, f, indent=2)

print(f"✅ Saved ranked scores to: {output_file}")


✅ Saved ranked scores to: ../../evaluation_result/gpt4o_ranked_scores.json


Step4: Calculate correlation scores between human evaluation and automated evaluation.

In [None]:
import json
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

# === Configuration ===
# Automated metric names looked up in the ranked-scores file.
metrics = [
    "imagereward",
    "imagereward2",
    "clip_prompt_score",
    "clip_prompt_score_2",
    "clip_image_image_score",
    "editclip",
    "editclip_2",
    "medclip_prompt_score",
    "medclip_prompt_score_2",
    "medclip_image_image_score",
    "medclip_editclip",
    "medclip_editclip_2",
    "psnr",
    "lpips",
    "masked_ssim",
    "gpt4o_editing_accuracy_prompt",
    "gpt4o_contextual_preservation_prompt",
    "gpt4o_visual_quality_prompt",
    "gpt4o_editing_accuracy_detailed",
    "gpt4o_contextual_preservation_detailed",
    "gpt4o_visual_quality_detailed",
]

# The three criteria scored in the human ranking step.
human_metrics = ["accuracy", "context", "quality"]

human_path = "../../evaluation_result/human_ranked_scores.json"
metric_path = "../../evaluation_result/automated_metrics_ranked_scores.json"

# === Load JSONs ===
with open(human_path, "r") as f:
    human_eval = json.load(f)

with open(metric_path, "r") as f:
    metric_eval = json.load(f)

# === Aggregate function across all organs
def average_scores_all_organs(data, metric):
    """Return each model's mean score for ``metric`` over all samples of all organs.

    ``data`` is indexed as ``data[organ][sample_id][metric_name][model] -> score``.
    Samples without ``metric`` contribute nothing; a model that never appears
    under ``metric`` is absent from the returned dict.
    """
    tallies = {}  # model -> [running total, number of scores]
    for samples in data.values():
        for per_metric in samples.values():
            for model, score in per_metric.get(metric, {}).items():
                tally = tallies.setdefault(model, [0, 0])
                tally[0] += score
                tally[1] += 1
    return {model: total / count for model, (total, count) in tallies.items()}

# === Loop over each metric and compare to human evaluation
# The per-model averages depend only on the metric name, so each is computed
# once here rather than for every (human metric, automated metric) pair.
human_averages = {name: average_scores_all_organs(human_eval, name) for name in human_metrics}
metric_averages = {name: average_scores_all_organs(metric_eval, name) for name in metrics}

for human_metric in human_metrics:
    human_score = human_averages[human_metric]
    for metric_metric in metrics:
        metric_score = metric_averages[metric_metric]

        # Sorted so points and labels come out in the same order on every run.
        common_models = sorted(set(human_score.keys()) & set(metric_score.keys()))
        if len(common_models) < 2:
            continue  # not enough data to compare

        x = [human_score[m] for m in common_models]
        y = [metric_score[m] for m in common_models]
        labels = common_models

        plt.figure(figsize=(8, 6))
        sns.regplot(x=x, y=y, ci=None, scatter_kws={"s": 80})
        # Annotate each point with its model name, nudged slightly to the right.
        for x_val, y_val, label in zip(x, y, labels):
            plt.text(x_val + 0.01, y_val, label, fontsize=10)

        corr, _p_value = pearsonr(x, y)
        plt.title(f"Correlation between {human_metric} and {metric_metric} (All Organs)\nPearson r = {corr:.2f}")
        plt.xlabel(f"Avg {human_metric} Score (Human Ranking)")
        plt.ylabel(f"Avg {metric_metric} Score (Automated Ranking)")
        plt.grid(True)
        plt.tight_layout()
        plt.show()


In [None]:
import json
import pandas as pd

# Load files
# NOTE(review): this reads human_ranked_scores_prev.json, while the Pearson
# cell reads human_ranked_scores.json — confirm which file is intended here.
with open("../../evaluation_result/human_ranked_scores_prev.json", "r") as f:
    human_data = json.load(f)

with open("../../evaluation_result/automated_metrics_ranked_scores.json", "r") as f:
    auto_data = json.load(f)

# Detect metric names: every key found under any organ/sample, sorted.
human_metrics = sorted(
    {
        metric_name
        for samples in human_data.values()
        for sample_scores in samples.values()
        for metric_name in sample_scores
    }
)
auto_metrics = sorted(
    {
        metric_name
        for samples in auto_data.values()
        for sample_scores in samples.values()
        for metric_name in sample_scores
    }
)

# Convert scores to ranks (default: higher score = better rank)
def score_to_rank(score_dict, reverse=True):
    """Map each model to its 1-based position after sorting by score.

    With ``reverse=True`` the highest score gets rank 1; with ``reverse=False``
    the lowest score does. Tied scores keep their order from ``score_dict``
    and still receive distinct consecutive ranks.
    """
    direction = -1 if reverse else 1
    ordered_models = sorted(score_dict, key=lambda name: direction * score_dict[name])
    return {name: position for position, name in enumerate(ordered_models, start=1)}

# Compute Spearman correlation for rank vectors
def compute_spearman_rho(rh, rm):
    """Average the per-sample Spearman rho between two aligned lists of rank dicts.

    For each sample ``i`` it computes ``1 - 6 * sum(d^2) / (m * (m^2 - 1))``,
    where ``d`` is the rank difference per model and ``m`` is the number of
    models in ``rh[0]``. The model set of ``rh[0]`` is used for every sample.

    Args:
        rh: List of ``{model: rank}`` dicts, one per sample.
        rm: List of ``{model: rank}`` dicts aligned index-by-index with ``rh``.

    Returns:
        The mean rho rounded to 4 decimals, or ``None`` when there are fewer
        than two samples or fewer than two models (the formula's denominator
        is zero in that case).

    Raises:
        KeyError: If a sample in ``rh`` or ``rm`` lacks a model of ``rh[0]``.
        IndexError: If ``rm`` is shorter than ``rh``.
    """
    n = len(rh)
    # Checked before touching rh[0], so an empty input returns None.
    if n < 2:
        return None
    models = list(rh[0])
    m = len(models)
    if m < 2:
        return None
    rhos = []
    for i in range(n):
        diff = 0
        for model in models:
            diff += (rh[i][model] - rm[i][model]) ** 2
        rho = 1 - (6 * diff) / (m * (m**2 - 1))
        rhos.append(rho)
    return round(sum(rhos) / n, 4)

def _correlation_rows(organs):
    """Return one ``{automated metric: mean rho}`` row per human metric,
    pooling every sample of the given ``organs`` (keys of ``human_data``)."""
    rows = []
    for h_metric in human_metrics:
        row = {}
        for a_metric in auto_metrics:
            # "lpips" is the only metric ranked ascending (lowest value gets rank 1).
            # NOTE(review): presumably because LPIPS is a distance — confirm the
            # ranked-scores file has not already inverted it.
            reverse_order = a_metric != "lpips"
            all_rh = []
            all_rm = []
            for organ in organs:
                for sid in human_data[organ]:
                    h_scores = human_data[organ][sid][h_metric]
                    a_scores = auto_data[organ][sid][a_metric]
                    all_rh.append(score_to_rank(h_scores))
                    all_rm.append(score_to_rank(a_scores, reverse=reverse_order))
            row[a_metric] = compute_spearman_rho(all_rh, all_rm)
        rows.append(row)
    return rows


def _as_correlation_table(rows):
    """Wrap correlation rows in a DataFrame indexed by human metric."""
    table = pd.DataFrame(rows, index=human_metrics)
    table.index.name = "Human Metric"
    table.columns.name = "Automated Metric"
    return table


# === GLOBAL CORRELATION TABLE ===
# Iterating human_data yields its organ keys, so this pools all organs.
global_results = _correlation_rows(human_data)
df = _as_correlation_table(global_results)

# === PER ORGAN CORRELATION TABLES ===
per_organ_dfs = {
    organ: _as_correlation_table(_correlation_rows([organ]))
    for organ in human_data
}

# This cell's own imports do not include os, so import it here; otherwise the
# code below only works if an earlier cell happened to import it.
import os

# === Display with Highlights ===
# Show complete tables: no column/row truncation and no line wrapping.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

print("\n📊 Global Spearman Rank Correlation Table:\n")
# Highlight, per human metric (row), the automated metric with the highest rho.
display(df.style.highlight_max(axis=1, color='lightgreen'))
output_dir = "../../evaluation_result/spearman_outputs"
os.makedirs(output_dir, exist_ok=True)
# Save global table to CSV
df.to_csv(os.path.join(output_dir, "spearman_global.csv"))
print("✅ Global correlation saved to 'spearman_global.csv'")

# Display and save each per-organ table
for organ, organ_df in per_organ_dfs.items():
    print(f"\n📊 Spearman Correlation for Organ: {organ}\n")
    display(organ_df.style.highlight_max(axis=1, color='lightyellow'))

    # Save to CSV (file name is the lower-cased organ name)
    filename = f"spearman_{organ.lower()}.csv"
    organ_df.to_csv(os.path.join(output_dir, filename))
    print(f"✅ Saved: {filename}")
