# Initialization


## Imports


In [None]:
import numpy as np
import pandas as pd
import altair as alt
from scipy import stats

alt.data_transformers.enable("vegafusion")

## Utils


In [None]:
def read_wandb_table(path: str) -> pd.DataFrame:
    """Load a JSON table export into a DataFrame.

    The file must contain a JSON object with a ``"columns"`` entry (the
    column names) and a ``"data"`` entry (the rows).

    Args:
        path: Location of the JSON file to read.

    Returns:
        A DataFrame built from ``"data"`` with ``"columns"`` as its header.

    Raises:
        KeyError: If ``"columns"`` or ``"data"`` is missing from the object.
    """
    import json

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return pd.DataFrame(data["data"], columns=data["columns"])

# Metrics


## Download


In [None]:
# import wandb

# from utils.wandb import wandb_path

# runs = wandb.Api().runs(
#     wandb_path(False),
#     filters={
#         "jobType": "test",
#         "createdAt": {"$gt": "2025-09-01T12:00:00Z", "$lt": "2025-11-04T00:00:00Z"},
#     },
# )

# for run in runs:
#     run_id = run.name.split(" ")[2]
#     dataset = run.config["test_dataset"].replace("-test", "")
#     model = run.config["model"]
#     group = run.group + ("-r" if "deeplab" in model else "")
#     run.logged_artifacts()[1].download(
#         f"logs/wandb/2_metrics/{group} {dataset} {run_id}"
#     )
#     print(group, dataset, run_id)

In [None]:
# import os

# df_list = []

# wandb_dir = "logs/wandb/2_metrics"
# for dir in os.listdir(wandb_dir):
#     group, dataset, _ = dir.split(" ")
#     df = read_wandb_table(f"{wandb_dir}/{dir}/metrics.table.json")
#     df.drop(columns=["type", "epoch", "loss"], inplace=True)
#     df.insert(0, "dataset", dataset)
#     df.insert(0, "method", group)
#     df_list.append(df)

# meta_metrics_df = pd.concat(df_list)
# meta_metrics_df.to_csv("logs/wandb/2_meta_metrics.csv", index=False)

## Preparation


In [None]:
# Load the cached per-run metrics table.
meta_metrics_df = pd.read_csv("logs/wandb/2_meta_metrics.csv")

# Rescale both IoU columns by 100 and add their average as "iou".
meta_metrics_df["iou_cup"] *= 100
meta_metrics_df["iou_disc"] *= 100
meta_metrics_df["iou"] = meta_metrics_df["iou_cup"].add(meta_metrics_df["iou_disc"]) / 2

# Translate the stored group names into short method codes; an unknown group
# name raises KeyError.
method_mapping = {
    "PS": "PS-U",
    "PS-r": "PS-DL",
    "PA": "PA-U",
    "PA-r": "PA-DL",
    "PAS-nc": "PAA-U",
    "PAS-nc-2": "PAA-2-U",
    "PAS-nc-r": "PAA-DL",
    "PAS-nc-r-2": "PAA-2-DL",
    "PAS": "PAAC-U",
    "PAS-2": "PAAC-2-U",
    "PAS-r": "PAAC-DL",
    "PAS-r-2": "PAAC-2-DL",
}
meta_metrics_df["method"] = meta_metrics_df["method"].map(
    lambda code: method_mapping[code]
)

# Only these four methods are analysed in the rest of the notebook.
meta_metrics_df = meta_metrics_df.loc[
    meta_metrics_df["method"].isin(["PS-U", "PS-DL", "PA-U", "PA-DL"])
]

meta_metrics_df

## Comparison


In [None]:
def compare_metrics(target_column: str, use_best: bool) -> pd.DataFrame:
    """Summarise one score column per dataset and method with a 1.96-sigma band.

    Reads the module-level ``meta_metrics_df``, drops the rows where
    ``sparsity_mode == "point"`` and ``sparsity_value == 1``, and replaces the
    short method codes with display names on a private copy (the shared frame
    is left unchanged).

    Args:
        target_column: Column of ``meta_metrics_df`` to aggregate.
        use_best: If true, aggregate per (dataset, method, shot,
            sparsity_mode, sparsity_value) and keep, for every dataset and
            method, only the configuration whose mean ``"iou"`` is highest.
            If false, aggregate over all rows of each (dataset, method).

    Returns:
        One row per kept group with the columns ``iou`` (mean of
        ``target_column``), ``iou_std``, ``iou_count``, ``iou_std_err``,
        ``iou_low`` and ``iou_high``. With ``use_best`` the configuration
        columns and ``iou_ref`` (mean of ``"iou"``) are included as well.

    Raises:
        KeyError: If a method code is not one of the four mapped below.
    """
    method_mapping = {
        "PS-U": "miniUNet w/o Q2S",
        "PS-DL": "DeepLabv3+ w/o Q2S",
        "PA-U": "miniUNet w/ Q2S",
        "PA-DL": "DeepLabv3+ w/ Q2S",
    }

    single_point = (meta_metrics_df["sparsity_mode"] == "point") & (
        meta_metrics_df["sparsity_value"] == 1
    )
    # Explicit copy: assigning a column on a filtered slice is chained
    # assignment and triggers SettingWithCopyWarning.
    data = meta_metrics_df[~single_point].copy()
    data["method"] = data["method"].apply(lambda x: method_mapping[x])

    if use_best:
        config_keys = ["dataset", "method", "shot", "sparsity_mode", "sparsity_value"]
        comparison_df = data[[*config_keys, target_column]].copy()
        # Configurations are always ranked by the combined IoU, whatever the
        # target column is.
        comparison_df["iou_ref"] = data["iou"]
        comparison_df = (
            comparison_df.groupby(config_keys, dropna=False)
            .agg(
                iou_ref=("iou_ref", "mean"),
                iou=(target_column, "mean"),
                iou_std=(target_column, "std"),
                iou_count=(target_column, "count"),
            )
            .reset_index()
        )
        comparison_df = comparison_df.loc[
            comparison_df.groupby(["dataset", "method"])["iou_ref"].idxmax()
        ]
    else:
        comparison_df = (
            data[["dataset", "method", target_column]]
            .groupby(["dataset", "method"])
            .agg(
                iou=(target_column, "mean"),
                iou_std=(target_column, "std"),
                iou_count=(target_column, "count"),
            )
        ).reset_index()

    # Band of 1.96 standard errors around the mean.
    comparison_df["iou_std_err"] = (
        comparison_df["iou_std"] / comparison_df["iou_count"] ** 0.5
    )
    comparison_df["iou_low"] = (
        comparison_df["iou"] - 1.96 * comparison_df["iou_std_err"]
    )
    comparison_df["iou_high"] = (
        comparison_df["iou"] + 1.96 * comparison_df["iou_std_err"]
    )

    return comparison_df

In [None]:
# Summaries pooled over all configurations, for the combined, disc and cup IoU.
comparison_df, disc_comparison_df, cup_comparison_df = (
    compare_metrics(column, False) for column in ("iou", "iou_disc", "iou_cup")
)
# The same three summaries restricted to each method's best configuration.
best_comparison_df, best_disc_comparison_df, best_cup_comparison_df = (
    compare_metrics(column, True) for column in ("iou", "iou_disc", "iou_cup")
)

In [None]:
# Show the summary ordered by dataset and then by mean IoU, both descending.
comparison_df.sort_values(by=["dataset", "iou"], ascending=False)

In [None]:
# Average the per-dataset mean IoU of each method (datasets weighted equally)
# and list the methods from highest to lowest.
comparison_df.groupby("method")[["iou"]].mean().sort_values(by="iou", ascending=False)

## Visualization


In [None]:
def compose_bar_chart(
    data: pd.DataFrame,
    scale: tuple[float, float],
    title: str,
    hide_header: bool = False,
):
    """Build a bar chart of ``iou`` per method, faceted by dataset.

    Each facet layers bars for ``iou``, black error bars running from
    ``iou_low`` to ``iou_high``, and a text label of ``iou`` formatted
    with ``.0f``.

    Args:
        data: Frame with ``dataset``, ``method``, ``iou``, ``iou_low`` and
            ``iou_high`` columns.
        scale: Clamped domain of the y axis.
        title: Title placed over the dataset columns.
        hide_header: If true, the facet header labels get font size 0.

    Returns:
        The faceted Altair chart.
    """
    ordered_methods = [
        "miniUNet w/o Q2S",
        "DeepLabv3+ w/o Q2S",
        "miniUNet w/ Q2S",
        "DeepLabv3+ w/ Q2S",
    ]
    y_scale = alt.Scale(domain=scale, clamp=True)

    # The x encoding is shared by all three layers; its axis is hidden because
    # the colour legend already names the methods.
    base = alt.Chart(data).encode(
        x=alt.X(
            "method:N",
            title=None,
            sort=ordered_methods,
            axis=alt.Axis(labels=False, ticks=False),
        ),
    )

    bars = base.mark_bar().encode(
        y=alt.Y("iou:Q", title=None, scale=y_scale),
        color=alt.Color(
            "method:N",
            scale=alt.Scale(domain=ordered_methods, scheme="category10"),
            title="Variant",
            legend=alt.Legend(
                orient="bottom",
                direction="horizontal",
                titleAnchor="start",
                columns=4,
            ),
        ),
    )
    whiskers = base.mark_errorbar(
        extent="ci", thickness=2.0, ticks=True, color="black"
    ).encode(
        y=alt.Y("iou_low:Q", title=None, scale=y_scale),
        y2="iou_high:Q",
    )
    labels = base.mark_text(
        align="center", baseline="top", dy=85, fontSize=16
    ).encode(
        text=alt.Text("iou:Q", format=".0f"),
    )

    layered = (bars + whiskers + labels).properties(width=150, height=200)  # type: ignore

    if hide_header:
        header = alt.Header(labelFontSize=0)
    else:
        header = alt.Header()
    return layered.facet(
        column=alt.Column("dataset:N", title=title, header=header),
        spacing=4,
    )

In [None]:
# Disc (OD) and cup (OC) IoU bar charts side by side, with one font
# configuration applied to the concatenated chart.
(
    (
        compose_bar_chart(disc_comparison_df, (55, 95), "OD IoU (%)")
        | compose_bar_chart(cup_comparison_df, (35, 75), "OC IoU (%)")
    )
    .configure_axis(labelFontSize=14, titleFontSize=16)
    .configure_header(labelFontSize=14, titleFontSize=16)
    .configure_legend(labelFontSize=14, titleFontSize=16)
)

In [None]:
def compose_line_chart():
    """Plot mean IoU against sparsity value for the "PA-DL" method.

    Reads the module-level ``meta_metrics_df``, drops the rows where
    ``sparsity_mode == "point"`` and ``sparsity_value == 1``, and keeps only
    the "PA-DL" rows. Scores are first summarised per dataset and then pooled
    over datasets for every (shot, sparsity_mode, sparsity_value):

    * the pooled mean is the mean of the per-dataset means;
    * the pooled variance is the mean per-dataset variance plus the variance
      of the per-dataset means;
    * the band is the pooled mean +/- ``t.ppf(0.975, count)`` standard errors,
      where ``count`` is the total number of scores.

    Sparsity values of the "point" mode are divided by 50 before plotting.

    Returns:
        An Altair chart faceted by shot count, with a line, points and a
        shaded band per sparsity mode.
    """
    color_scale = alt.Scale(
        domain=["point", "contour", "grid", "region", "skeleton"],
        scheme="category10",
    )

    single_point = (meta_metrics_df["sparsity_mode"] == "point") & (
        meta_metrics_df["sparsity_value"] == 1
    )
    # meta_metrics_df stores the short method codes, hence "PA-DL".
    # Explicit copy: the "shot" column is rewritten below, which would be a
    # chained assignment on a filtered slice otherwise.
    new_data = meta_metrics_df[
        ~single_point & (meta_metrics_df["method"] == "PA-DL")
    ].copy()
    new_data["shot"] = new_data["shot"].apply(
        lambda x: f"{x} shot" if x == 1 else f"{x} shots"
    )

    per_dataset = (
        new_data.groupby(["dataset", "shot", "sparsity_mode", "sparsity_value"])
        .agg(
            iou_mean=("iou", "mean"),
            iou_var=("iou", "var"),
            count=("iou", "count"),
        )
        .reset_index()
    )

    # Pool the per-dataset summaries; groupby only yields combinations that
    # actually occur, so no empty subsets need to be skipped.
    pooled_rows = []
    for (shot, sparsity_mode, sparsity_value), subset in per_dataset.groupby(
        ["shot", "sparsity_mode", "sparsity_value"], sort=False
    ):
        count = subset["count"].sum()
        iou_mean = subset["iou_mean"].mean()
        iou_var = subset["iou_var"].mean() + np.var(subset["iou_mean"])
        iou_delta = stats.t.ppf(0.975, count) * np.sqrt(iou_var / count)
        if sparsity_mode == "point":
            # Rescale so "point" values fit the shared x domain.
            sparsity_value = sparsity_value / 50
        pooled_rows.append(
            {
                "shot": shot,
                "sparsity_mode": sparsity_mode,
                "sparsity_value": sparsity_value,
                "iou_mean": iou_mean,
                "iou_lower": iou_mean - iou_delta,
                "iou_upper": iou_mean + iou_delta,
                "iou_var": iou_var,
            }
        )
    new_data = pd.DataFrame(pooled_rows)

    encodings = {
        "x": alt.X(
            "sparsity_value", title=None, scale=alt.Scale(domain=[0.1, 1.0], clamp=True)
        ),
        "color": alt.Color(
            "sparsity_mode:N",
            title="Sparse Label Type",
            scale=color_scale,
            legend=alt.Legend(
                orient="bottom", direction="horizontal", titleAnchor="start"
            ),
        ),
    }
    y_kwargs = {
        "title": "Mean IoU (%)",
        "scale": alt.Scale(domain=[66, 78], clamp=True),
    }

    error_area = (
        alt.Chart(new_data)
        .mark_area(opacity=0.1)
        .encode(
            y=alt.Y("iou_upper", **y_kwargs),
            y2=alt.Y2("iou_lower"),
            **encodings,
        )
    )
    line = (
        alt.Chart(new_data)
        .mark_line(strokeWidth=1.5)
        .encode(y=alt.Y("iou_mean", **y_kwargs), **encodings)
    )
    point = (
        alt.Chart(new_data)
        .mark_point(size=7)
        .encode(y=alt.Y("iou_mean", **y_kwargs), **encodings)
    )

    combined_chart = line + point + error_area  # type: ignore
    combined_chart = (
        combined_chart.properties(width=170, height=200)
        .facet(
            column=alt.Column(
                "shot",
                sort=["1 shot", "5 shots", "10 shots", "15 shots", "20 shots"],
                header=alt.Header(title="Density Values", titleOrient="bottom"),
            ),
            spacing=10,
        )
        .resolve_scale(x="independent")
        .configure_axis(labelFontSize=12, titleFontSize=16)
        .configure_header(labelFontSize=16, titleFontSize=16)
        .configure_legend(labelFontSize=14, titleFontSize=16)
    )

    return combined_chart

In [None]:
# Render the sparsity line chart.
compose_line_chart()

## Tables


In [None]:
# Join the best-configuration disc and cup summaries on their first five
# columns; the remaining columns get "_disc" / "_cup" suffixes.
best_results_df = best_disc_comparison_df.merge(
    best_cup_comparison_df,
    on=list(best_disc_comparison_df.columns[:5]),
    suffixes=("_disc", "_cup"),
)

# Fixed presentation order of the methods within each dataset.
best_results_df["method_order"] = best_results_df["method"].map(
    {
        name: position
        for position, name in enumerate(
            [
                "miniUNet w/o Q2S",
                "DeepLabv3+ w/o Q2S",
                "miniUNet w/ Q2S",
                "DeepLabv3+ w/ Q2S",
            ]
        )
    }
)
best_results_df = best_results_df.sort_values(["dataset", "method_order"])

best_results_df

In [None]:
# Print one LaTeX table row per entry of best_results_df.
for i in range(len(best_results_df)):
    row = best_results_df.iloc[i]
    shot = int(row["shot"])
    # "point" values are printed as integers, every other mode with two decimals.
    if row["sparsity_mode"] == "point":
        sparsity = "point - " + str(int(row["sparsity_value"]))
    else:
        sparsity = row["sparsity_mode"] + f" ({row['sparsity_value']:.2f})"
    # The first method of each dataset opens a new multirow group.
    # NOTE(review): the multirow span is hard-coded to 7 — confirm it matches
    # the number of rows per dataset in the final table.
    if row["method"] == "miniUNet w/o Q2S":
        print("\\hline")
        print("\\multirow{7}{*}{" + row["dataset"] + "}", end="")
    print(
        f" & {row['method']} & & {shot} & {sparsity} & &"
        f" {row['iou_disc']:.2f} & {row['iou_low_disc']:.2f}-{row['iou_high_disc']:.2f} & &"
        f" {row['iou_cup']:.2f} & {row['iou_low_cup']:.2f}-{row['iou_high_cup']:.2f} \\\\"
    )

In [None]:
# meta_metrics_df stores the short method codes assigned in the Preparation
# cell (compare_metrics relabels only its own filtered copy), so the
# DeepLabv3+ w/ Q2S rows are the ones coded "PA-DL". The display label is
# accepted as well in case the frame has been relabelled in place.
selected_metrics_df = meta_metrics_df[
    meta_metrics_df["method"].isin(["PA-DL", "DeepLabv3+ w/ Q2S"])
]

# Mean IoU per dataset first, then averaged over datasets so that every
# dataset carries the same weight.
selected_metrics_df = (
    selected_metrics_df.groupby(["dataset", "sparsity_mode", "sparsity_value", "shot"])[
        "iou"
    ]
    .mean()
    .reset_index()
    .groupby(["sparsity_mode", "sparsity_value", "shot"])["iou"]
    .mean()
    .reset_index()
    .sort_values(["sparsity_mode", "sparsity_value", "shot"])
)

selected_metrics_df

In [None]:
# 5 x 5 grid of lists, each laid out as [sparsity_value, iou, iou, ...].
# Row i of the sorted frame goes to cell [i // 25][(i % 25) // 5], i.e. the
# indexing assumes consecutive runs of 5 rows per cell and 25 rows per outer
# entry.
selected_metrics_ls = [[[] for _ in range(5)] for _ in range(5)]

for i in range(len(selected_metrics_df)):
    row = selected_metrics_df.iloc[i]
    ls = selected_metrics_ls[i // 25][i % 25 // 5]
    # The first value stored in a cell is preceded by its sparsity value.
    ls.extend([row["iou"]] if ls else [row["sparsity_value"], row["iou"]])

In [None]:
# Labels printed for the five outer entries of selected_metrics_ls, in order.
sparsity_modes = ["contours", "grid", "point", "regions", "skeleton"]

for i in range(5):
    print("\\multirow{5}{*}{mode}  ".replace("mode", sparsity_modes[i]), end="")
    # One LaTeX row per cell: the sparsity value, an empty column, then the
    # stored scores with two decimals.
    print(
        " \\\\\n".join(
            f"& {ls[0]} & & " + " & ".join(f"{np.mean(v):.2f}" for v in ls[1:])
            for ls in selected_metrics_ls[i]
        )
        + " \\\\"
    )
    print("\\hline")

In [None]:
# Mean scores per dataset, method and configuration.
mean_dataset_method = (
    meta_metrics_df.groupby(
        ["dataset", "method", "shot", "sparsity_mode", "sparsity_value"],
        dropna=False,
    )[["iou", "iou_disc", "iou_cup"]]
    .mean()
    .reset_index()
)

# Average over datasets, so that every dataset carries the same weight.
mean_method = (
    mean_dataset_method.groupby(
        ["method", "shot", "sparsity_mode", "sparsity_value"],
        dropna=False,
    )[["iou"]]
    .mean()
    .reset_index()
)

# Configuration with the highest dataset-averaged IoU for each method.
best_method = mean_method.loc[mean_method.groupby(["method"])["iou"].idxmax()]

# meta_metrics_df stores the short method codes ("PA-DL", "PA-U"); the display
# labels are accepted too in case the frame has been relabelled in place.
# Filtering by the display label alone selects nothing and makes .iloc[0] fail.
best_method_1 = best_method[
    best_method["method"].isin(["PA-DL", "DeepLabv3+ w/ Q2S"])
].iloc[0]
best_method_2 = best_method[
    best_method["method"].isin(["PA-U", "miniUNet w/ Q2S"])
].iloc[0]

# Per-dataset scores of the two methods at their best configuration.
mdm = mean_dataset_method
mdm[
    (
        (mdm["method"] == best_method_1["method"])
        & (mdm["shot"] == best_method_1["shot"])
        & (mdm["sparsity_mode"] == best_method_1["sparsity_mode"])
        & (mdm["sparsity_value"] == best_method_1["sparsity_value"])
    )
    | (
        (mdm["method"] == best_method_2["method"])
        & (mdm["shot"] == best_method_2["shot"])
        & (mdm["sparsity_mode"] == best_method_2["sparsity_mode"])
        & (mdm["sparsity_value"] == best_method_2["sparsity_value"])
    )
].sort_values(["method", "dataset"])

## Hypotheses


In [None]:
def test_two_methods_score_t(
    df: pd.DataFrame, higher_method: str, lower_method: str
) -> pd.DataFrame:
    results = []
    for dataset in df["dataset"].unique():
        subset = df[df["dataset"] == dataset]
        higher_row = subset[subset["method"] == higher_method].iloc[0]
        lower_row = subset[subset["method"] == lower_method].iloc[0]

        t_statistic = (higher_row["iou"] - lower_row["iou"]) / np.sqrt(
            (higher_row["iou_std"] ** 2 / higher_row["iou_count"])
            + (lower_row["iou_std"] ** 2 / lower_row["iou_count"])
        )
        p_value = stats.t.sf(
            t_statistic, df=higher_row["iou_count"] + lower_row["iou_count"] - 2
        )
        results.append({"dataset": dataset, "stat": t_statistic, "p_value": p_value})
    return pd.DataFrame(results)

In [None]:
# Per dataset: is the mean IoU of DeepLabv3+ w/ Q2S greater than that of
# miniUNet w/ Q2S?
test_two_methods_score_t(comparison_df, "DeepLabv3+ w/ Q2S", "miniUNet w/ Q2S")

In [None]:
# Per dataset: is the mean IoU of DeepLabv3+ w/ Q2S greater than that of
# DeepLabv3+ w/o Q2S?
test_two_methods_score_t(comparison_df, "DeepLabv3+ w/ Q2S", "DeepLabv3+ w/o Q2S")

In [None]:
# Per dataset: is the mean IoU of DeepLabv3+ w/o Q2S greater than that of
# miniUNet w/o Q2S?
test_two_methods_score_t(comparison_df, "DeepLabv3+ w/o Q2S", "miniUNet w/o Q2S")

In [None]:
# Per dataset: is the mean IoU of miniUNet w/ Q2S greater than that of
# miniUNet w/o Q2S?
test_two_methods_score_t(comparison_df, "miniUNet w/ Q2S", "miniUNet w/o Q2S")

In [None]:
def test_two_methods_score_wilcoxon(
    higher_method: str, lower_method: str, use_best: bool
) -> pd.DataFrame:
    """Run a one-sided Wilcoxon signed-rank test per dataset on raw IoU scores.

    Reads the module-level ``meta_metrics_df`` (without the rows where
    ``sparsity_mode == "point"`` and ``sparsity_value == 1``) and
    ``best_comparison_df``. The alternative is that the scores of
    ``higher_method`` are greater than those of ``lower_method``.

    Args:
        higher_method: Display name of the method expected to score higher.
        lower_method: Display name of the method expected to score lower.
        use_best: If true, each method contributes only the scores of its
            best configuration according to ``best_comparison_df``. If false,
            all scores of the method in the dataset are used.

    Returns:
        One row per dataset with ``dataset``, ``stat`` and ``p_value``.

    Raises:
        IndexError: With ``use_best``, if ``best_comparison_df`` has no row
            for a method in a dataset.
        ValueError: Raised by ``scipy.stats.wilcoxon`` when the two samples
            cannot be paired (for example, different lengths).
    """
    display_names = {
        "PS-U": "miniUNet w/o Q2S",
        "PS-DL": "DeepLabv3+ w/o Q2S",
        "PA-U": "miniUNet w/ Q2S",
        "PA-DL": "DeepLabv3+ w/ Q2S",
    }
    # Scores are paired position by position, so both samples are ordered by
    # configuration and then batch; ordering by batch alone leaves the order
    # of configurations within a batch arbitrary.
    # NOTE(review): assumes "batch" identifies the same test samples for both
    # methods — confirm against the logging code.
    pairing_keys = ["shot", "sparsity_mode", "sparsity_value", "batch"]

    single_point = (meta_metrics_df["sparsity_mode"] == "point") & (
        meta_metrics_df["sparsity_value"] == 1
    )
    data = meta_metrics_df[~single_point]
    # The shared frame stores short method codes while callers and
    # best_comparison_df use display names; translate on a private copy.
    # Values that already are display names pass through unchanged.
    data = data.assign(method=data["method"].replace(display_names))

    def matches(column: str, value) -> pd.Series:
        # Equality mask that also pairs a missing value with missing values.
        if pd.isna(value):
            return data[column].isna()
        return data[column] == value

    def scores(row_filter: pd.Series):
        selected = data.loc[row_filter, [*pairing_keys, "iou"]]
        return selected.sort_values(pairing_keys)["iou"].to_numpy()

    results = []
    for dataset in best_comparison_df["dataset"].unique():
        high_filter = (data["dataset"] == dataset) & (data["method"] == higher_method)
        low_filter = (data["dataset"] == dataset) & (data["method"] == lower_method)

        if use_best:
            best_rows = best_comparison_df[best_comparison_df["dataset"] == dataset]
            higher_row = best_rows[best_rows["method"] == higher_method].iloc[0]
            lower_row = best_rows[best_rows["method"] == lower_method].iloc[0]
            for key in ("shot", "sparsity_mode", "sparsity_value"):
                high_filter = high_filter & matches(key, higher_row[key])
                low_filter = low_filter & matches(key, lower_row[key])

        stat, p_value = tuple(
            stats.wilcoxon(
                scores(high_filter), scores(low_filter), alternative="greater"
            )
        )
        results.append(
            {
                "dataset": dataset,
                "stat": stat,
                "p_value": p_value,
            }
        )

    return pd.DataFrame(results)

In [None]:
# Per dataset, over all configurations: DeepLabv3+ w/ Q2S greater than
# miniUNet w/ Q2S?
test_two_methods_score_wilcoxon("DeepLabv3+ w/ Q2S", "miniUNet w/ Q2S", False)

In [None]:
# Per dataset, over all configurations: DeepLabv3+ w/ Q2S greater than
# DeepLabv3+ w/o Q2S?
test_two_methods_score_wilcoxon("DeepLabv3+ w/ Q2S", "DeepLabv3+ w/o Q2S", False)

In [None]:
# Per dataset, over all configurations: DeepLabv3+ w/o Q2S greater than
# miniUNet w/o Q2S?
test_two_methods_score_wilcoxon("DeepLabv3+ w/o Q2S", "miniUNet w/o Q2S", False)

In [None]:
# Per dataset, over all configurations: miniUNet w/ Q2S greater than
# miniUNet w/o Q2S?
test_two_methods_score_wilcoxon("miniUNet w/ Q2S", "miniUNet w/o Q2S", False)

# Distance Metrics


## Download


In [None]:
# import wandb

# from utils.wandb import wandb_path

# runs = wandb.Api().runs(
#     wandb_path(False),
#     filters={
#         "jobType": "test",
#         "createdAt": {"$gt": "2025-11-05T00:00:00Z", "$lt": "2025-11-06T12:00:00Z"},
#     },
# )

# for run in runs:
#     run_id = run.name.split(" ")[2]
#     dataset = run.config["test_dataset"].replace("-test", "")
#     model = run.config["model"]
#     group = run.group + ("-r" if "deeplab" in model else "")
#     run.logged_artifacts()[1].download(
#         f"logs/wandb/2_distance_metrics/{group} {dataset} {run_id}"
#     )
#     print(group, dataset, run_id)

In [None]:
# import os

# df_list = []

# wandb_dir = "logs/wandb/2_distance_metrics"
# for dir in os.listdir(wandb_dir):
#     group, dataset, _ = dir.split(" ")
#     df = read_wandb_table(f"{wandb_dir}/{dir}/metrics.table.json")
#     df.drop(columns=["type", "epoch", "loss"], inplace=True)
#     df.insert(0, "dataset", dataset)
#     df.insert(0, "method", group)
#     df_list.append(df)

# meta_dist_metrics_df = pd.concat(df_list)
# meta_dist_metrics_df.to_csv("logs/wandb/2_meta_dist_metrics.csv", index=False)

## Preparation


In [None]:
# Load the cached per-run distance metrics table.
meta_dist_metrics_df = pd.read_csv("logs/wandb/2_meta_dist_metrics.csv")

# Translate the stored group names directly into display names; an unknown
# group name raises KeyError.
method_mapping = {
    "PS": "miniUNet w/o Q2S",
    "PS-r": "DeepLabv3+ w/o Q2S",
    "PA": "miniUNet w/ Q2S",
    "PA-r": "DeepLabv3+ w/ Q2S",
}
meta_dist_metrics_df["method"] = meta_dist_metrics_df["method"].map(
    lambda code: method_mapping[code]
)

# Discard the metric columns that this section does not analyse.
meta_dist_metrics_df = meta_dist_metrics_df.drop(
    columns=[
        "iou_cup",
        "iou_disc",
        "boundary_iou_cup",
        "boundary_iou_disc",
        "nsd_cup",
        "nsd_disc",
        "hd_cup",
        "hd_disc",
        "hd_perc_cup",
        "hd_perc_disc",
    ]
)

# Combined scores: the average of the cup and disc values.
meta_dist_metrics_df["assd"] = (
    meta_dist_metrics_df["assd_cup"].add(meta_dist_metrics_df["assd_disc"]) / 2
)
meta_dist_metrics_df["masd"] = (
    meta_dist_metrics_df["masd_cup"].add(meta_dist_metrics_df["masd_disc"]) / 2
)

meta_dist_metrics_df

In [None]:
# Count the missing values of every column within each method.
meta_dist_metrics_df.groupby("method").apply(lambda x: x.isna().sum())

In [None]:
# Mean of every numeric column per (method, dataset).
meta_dist_metrics_df.groupby(["method", "dataset"]).mean(
    numeric_only=True
).reset_index()

## Comparison


In [None]:
def compare_metrics(target_column: str) -> pd.DataFrame:
    """Summarise one distance column per dataset and method.

    Reads the module-level ``meta_dist_metrics_df`` and ignores the rows
    where ``sparsity_mode == "point"`` and ``sparsity_value == 1``.

    Args:
        target_column: Column of ``meta_dist_metrics_df`` to aggregate.

    Returns:
        One row per (dataset, method) with ``mean``, ``std``, ``count``,
        ``std_err`` and the band ``low`` / ``high`` of 1.96 standard errors
        around the mean.
    """
    single_point = (meta_dist_metrics_df["sparsity_mode"] == "point") & (
        meta_dist_metrics_df["sparsity_value"] == 1
    )
    data = meta_dist_metrics_df[~single_point]

    # data.loc[:,target_column] = data.loc[:,target_column].fillna(100)

    comparison_df = (
        data.groupby(["dataset", "method"])[target_column]
        .agg(mean="mean", std="std", count="count")
        .reset_index()
    )

    comparison_df["std_err"] = comparison_df["std"] / comparison_df["count"] ** 0.5
    half_width = 1.96 * comparison_df["std_err"]
    comparison_df["low"] = comparison_df["mean"] - half_width
    comparison_df["high"] = comparison_df["mean"] + half_width

    return comparison_df

In [None]:
# ASSD summaries for the combined, disc and cup scores.
assd_comparison_df, assd_disc_comparison_df, assd_cup_comparison_df = (
    compare_metrics(column) for column in ("assd", "assd_disc", "assd_cup")
)

# MASD summaries for the combined, disc and cup scores.
masd_comparison_df, masd_disc_comparison_df, masd_cup_comparison_df = (
    compare_metrics(column) for column in ("masd", "masd_disc", "masd_cup")
)

In [None]:
# Show the combined MASD summary.
masd_comparison_df

## Visualization


In [None]:
def compose_bar_chart(
    data: pd.DataFrame,
    scale: tuple[float, float],
    title: str,
    hide_header: bool = False,
):
    """Build a bar chart of ``mean`` per method, faceted by dataset.

    Each facet layers bars for ``mean``, black error bars running from
    ``low`` to ``high``, and a text label of ``mean`` formatted with ``.2f``.

    Args:
        data: Frame with ``dataset``, ``method``, ``mean``, ``low`` and
            ``high`` columns.
        scale: Clamped domain of the y axis.
        title: Title placed over the dataset columns.
        hide_header: If true, the facet header labels get font size 0.

    Returns:
        The faceted Altair chart.
    """
    ordered_methods = [
        "miniUNet w/o Q2S",
        "DeepLabv3+ w/o Q2S",
        "miniUNet w/ Q2S",
        "DeepLabv3+ w/ Q2S",
    ]
    y_scale = alt.Scale(domain=scale, clamp=True)

    # The x encoding is shared by all three layers; its axis is hidden because
    # the colour legend already names the methods.
    base = alt.Chart(data).encode(
        x=alt.X(
            "method:N",
            title=None,
            sort=ordered_methods,
            axis=alt.Axis(labels=False, ticks=False),
        ),
    )

    bars = base.mark_bar().encode(
        y=alt.Y("mean:Q", title=None, scale=y_scale),
        color=alt.Color(
            "method:N",
            scale=alt.Scale(domain=ordered_methods, scheme="category10"),
            title="Variant",
            legend=alt.Legend(
                orient="bottom",
                direction="horizontal",
                titleAnchor="start",
                columns=4,
            ),
        ),
    )
    whiskers = base.mark_errorbar(
        extent="ci", thickness=2.0, ticks=True, color="black"
    ).encode(
        y=alt.Y("low:Q", title=None, scale=y_scale),
        y2="high:Q",
    )
    labels = base.mark_text(
        align="center", baseline="top", dy=85, fontSize=13
    ).encode(
        text=alt.Text("mean:Q", format=".2f"),
    )

    layered = (bars + whiskers + labels).properties(width=150, height=200)  # type: ignore

    if hide_header:
        header = alt.Header(labelFontSize=0)
    else:
        header = alt.Header()
    return layered.facet(
        column=alt.Column("dataset:N", title=title, header=header),
        spacing=4,
    )

In [None]:
# Disc (OD) and cup (OC) MASD bar charts side by side, with one font
# configuration applied to the concatenated chart.
(
    (
        compose_bar_chart(masd_disc_comparison_df, (2, 12), "MASD OD (pixels)")
        | compose_bar_chart(masd_cup_comparison_df, (6, 22), "MASD OC (pixels)")
    )
    .configure_axis(labelFontSize=14, titleFontSize=16)
    .configure_header(labelFontSize=14, titleFontSize=16)
    .configure_legend(labelFontSize=14, titleFontSize=16)
)

## Hypotheses


In [None]:
def test_two_methods_score_mannwhitney(
    lower_method: str, higher_method: str
) -> pd.DataFrame:
    """Run a one-sided Mann-Whitney U test per dataset on the ``masd`` scores.

    Reads the module-level ``meta_dist_metrics_df`` (without the rows where
    ``sparsity_mode == "point"`` and ``sparsity_value == 1``) and takes the
    dataset names from ``masd_comparison_df``. Missing ``masd`` values are
    dropped. The alternative is that the scores of ``lower_method`` are
    smaller than those of ``higher_method``.

    Args:
        lower_method: Method expected to have the smaller ``masd``.
        higher_method: Method expected to have the larger ``masd``.

    Returns:
        One row per dataset with ``dataset``, ``stat`` and ``p_value``.
    """
    single_point = (meta_dist_metrics_df["sparsity_mode"] == "point") & (
        meta_dist_metrics_df["sparsity_value"] == 1
    )
    data = meta_dist_metrics_df[~single_point]

    def masd_scores(dataset, method: str) -> pd.Series:
        # Non-missing MASD values of one method on one dataset.
        selected = (data["dataset"] == dataset) & (data["method"] == method)
        return data[selected]["masd"].dropna()

    rows = []
    for dataset in masd_comparison_df["dataset"].unique():
        higher_subset = masd_scores(dataset, higher_method)
        lower_subset = masd_scores(dataset, lower_method)
        outcome = stats.mannwhitneyu(lower_subset, higher_subset, alternative="less")
        stat, p_value = tuple(outcome)
        rows.append({"dataset": dataset, "stat": stat, "p_value": p_value})

    return pd.DataFrame(rows)

In [None]:
# Per dataset: is the MASD of DeepLabv3+ w/ Q2S smaller than that of
# miniUNet w/ Q2S?
test_two_methods_score_mannwhitney("DeepLabv3+ w/ Q2S", "miniUNet w/ Q2S")

In [None]:
# Per dataset: is the MASD of DeepLabv3+ w/ Q2S smaller than that of
# DeepLabv3+ w/o Q2S?
test_two_methods_score_mannwhitney("DeepLabv3+ w/ Q2S", "DeepLabv3+ w/o Q2S")

In [None]:
# Per dataset: is the MASD of DeepLabv3+ w/o Q2S smaller than that of
# miniUNet w/o Q2S?
test_two_methods_score_mannwhitney("DeepLabv3+ w/o Q2S", "miniUNet w/o Q2S")

In [None]:
# Per dataset: is the MASD of miniUNet w/ Q2S smaller than that of
# miniUNet w/o Q2S?
test_two_methods_score_mannwhitney("miniUNet w/ Q2S", "miniUNet w/o Q2S")