# Initialization


## Imports


In [None]:
from typing import Literal

import numpy as np
import pandas as pd
import altair as alt
from scipy import stats

# Switch Altair to its "vegafusion" data transformer for every chart rendered
# below.
# NOTE(review): presumably enabled so the larger metric tables can be plotted;
# confirm the vegafusion package is installed in the notebook environment.
alt.data_transformers.enable("vegafusion")

## Utils


In [None]:
def read_wandb_table(path: str) -> pd.DataFrame:
    """Load a JSON table file into a DataFrame.

    The file must contain a JSON object with a ``"columns"`` entry (the column
    names) and a ``"data"`` entry (the rows).

    Parameters:
    path: Location of the ``*.table.json`` file.

    Returns:
    DataFrame built from the rows, labelled with the stored column names.
    """
    import json

    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's locale encoding, which may differ.
    with open(path, "r", encoding="utf-8") as file:
        table = json.load(file)
    return pd.DataFrame(table["data"], columns=table["columns"])

# Metrics


## Download


In [None]:
# import wandb

# from utils.wandb import wandb_path

# runs = wandb.Api().runs(
#     wandb_path(False),
#     filters={"jobType": "test"},
# )

# for i, run in enumerate(runs):
#     run_id = run.name.split(" ")[-1]
#     dataset = run.config["test_dataset"].replace("-test", "")
#     group = run.group
#     if group == "SL":
#         pass
#     elif len(group.split("-")) == 1:
#         group += "-new"
#     elif group.split("-")[1] == "b":
#         group = group.replace("-b", "-new-b")
#     run.logged_artifacts()[2].download(f"logs/wandb/1_metrics/{group} {dataset} {run_id}")
#     print(group, dataset, run_id)

In [None]:
# import os

# df_list = []

# wandb_dir = "logs/wandb/1_metrics"
# for dir in os.listdir(wandb_dir):
#     if os.path.isfile(f"{wandb_dir}/{dir}"):
#         continue
#     if not dir.startswith("SL"):
#         continue
#     if dir.split(" ")[-1] not in ["ZAr", "eMD", "N8k"]:
#         continue
#     _, dataset, _ = dir.split(" ")
#     df = read_wandb_table(f"{wandb_dir}/{dir}/metrics.table.json")
#     df.drop(columns=["type", "epoch"], inplace=True)
#     df.insert(0, "dataset", dataset)
#     df_list.append(df)

# simple_metrics_df = pd.concat(df_list)
# simple_metrics_df.to_csv("logs/wandb/1_simple_metrics.csv", index=False)

In [None]:
# import os

# df_list = []

# wandb_dir = "logs/wandb/1_metrics"
# for dir in os.listdir(wandb_dir):
#     if os.path.isfile(f"{wandb_dir}/{dir}"):
#         continue
#     if dir.startswith("SL"):
#         continue
#     group, dataset, _ = dir.split(" ")
#     df = read_wandb_table(f"{wandb_dir}/{dir}/metrics.table.json")
#     df.drop(columns=["type", "epoch"], inplace=True)
#     df.insert(0, "dataset", dataset)
#     df.insert(0, "method", group)
#     df_list.append(df)

# meta_metrics_df = pd.concat(df_list)
# meta_metrics_df.to_csv("logs/wandb/1_meta_metrics.csv", index=False)

## Preparation


In [None]:
# Load the SL metrics, rescale the cup/disc IoU to percent and derive the
# combined IoU as the plain average of the two rescaled columns.
simple_metrics_df = pd.read_csv("logs/wandb/1_simple_metrics.csv").assign(
    iou_cup=lambda frame: frame["iou_cup"] * 100,
    iou_disc=lambda frame: frame["iou_disc"] * 100,
    iou=lambda frame: (frame["iou_cup"] + frame["iou_disc"]) / 2,
)
# Every row of this file is tagged with the method name "SL".
simple_metrics_df.insert(0, "method", "SL")

simple_metrics_df

In [None]:
meta_metrics_df = pd.read_csv("logs/wandb/1_meta_metrics.csv")
# Drop every method whose name ends with "-B". Take an explicit copy so the
# column assignments below write to an independent frame instead of a slice
# of the CSV frame (assigning to such a slice triggers pandas'
# SettingWithCopyWarning).
meta_metrics_df = meta_metrics_df[
    ~meta_metrics_df["method"].str.endswith("-B")
].copy()

# Express the per-structure IoU values in percent.
meta_metrics_df["iou_cup"] = meta_metrics_df["iou_cup"] * 100
meta_metrics_df["iou_disc"] = meta_metrics_df["iou_disc"] * 100

# Combined IoU: plain average of the cup and disc IoU.
meta_metrics_df["iou"] = (meta_metrics_df["iou_cup"] + meta_metrics_df["iou_disc"]) / 2

meta_metrics_df

## Comparison


In [None]:
def compare_metrics(target_column: str, use_best: bool) -> pd.DataFrame:
    """Summarise one metric column per dataset and method.

    Rows of the module-level ``simple_metrics_df`` and ``meta_metrics_df`` are
    stacked and aggregated. The output columns ``iou``, ``iou_std`` and
    ``iou_count`` always describe ``target_column`` (mean, standard deviation
    and count), whichever column that is.

    Parameters:
    target_column: Metric column to summarise (e.g. "iou", "iou_disc", "iou_cup").
    use_best: If False, pool every row of a (dataset, method) pair. If True,
        aggregate per (dataset, method, shot, sparsity_mode, sparsity_value)
        and keep, for each (dataset, method), only the configuration with the
        highest mean of the combined "iou" column (reported as ``iou_ref``).

    Returns:
    DataFrame with the grouping columns, the aggregates above and
    ``iou_std_err``, ``iou_low`` and ``iou_high`` (mean -/+ 1.96 standard errors).
    """
    base_keys = ["dataset", "method"]
    config_keys = ["shot", "sparsity_mode", "sparsity_value"]

    def _pick(frame: pd.DataFrame, keys: list[str]) -> pd.DataFrame:
        # Copy the target into a neutral "_target" column (and the reference
        # into "iou_ref") so that target_column == "iou" does not select the
        # same column twice; the duplicated selection made the original
        # use_best branch fail for "iou".
        picked = frame[keys].copy()
        picked["_target"] = frame[target_column].to_numpy()
        if use_best:
            picked["iou_ref"] = frame["iou"].to_numpy()
        return picked

    target_aggs = {
        "iou": ("_target", "mean"),
        "iou_std": ("_target", "std"),
        "iou_count": ("_target", "count"),
    }

    if use_best:
        # SL rows carry no shot/sparsity columns; they become NaN after the
        # concat, hence dropna=False to keep them as their own group.
        stacked = pd.concat(
            [
                _pick(simple_metrics_df, base_keys),
                _pick(meta_metrics_df, base_keys + config_keys),
            ]
        )
        comparison_df = (
            stacked.groupby(base_keys + config_keys, dropna=False)
            .agg(iou_ref=("iou_ref", "mean"), **target_aggs)
            .reset_index()
        )
        # Keep the configuration with the highest reference IoU per pair.
        comparison_df = comparison_df.loc[
            comparison_df.groupby(base_keys)["iou_ref"].idxmax()
        ]
    else:
        stacked = pd.concat(
            [
                _pick(simple_metrics_df, base_keys),
                _pick(meta_metrics_df, base_keys),
            ]
        )
        comparison_df = stacked.groupby(base_keys).agg(**target_aggs).reset_index()

    comparison_df["iou_std_err"] = (
        comparison_df["iou_std"] / comparison_df["iou_count"] ** 0.5
    )
    comparison_df["iou_low"] = (
        comparison_df["iou"] - 1.96 * comparison_df["iou_std_err"]
    )
    comparison_df["iou_high"] = (
        comparison_df["iou"] + 1.96 * comparison_df["iou_std_err"]
    )

    return comparison_df

In [None]:
# Statistics pooled over every row of each (dataset, method) pair
# (use_best=False), for the combined, disc and cup IoU respectively.
comparison_df = compare_metrics("iou", False)
disc_comparison_df = compare_metrics("iou_disc", False)
cup_comparison_df = compare_metrics("iou_cup", False)
# Statistics restricted to each method's best configuration (use_best=True).
# The combined-IoU variant is currently disabled.
# best_comparison_df = compare_metrics("iou", True)
best_disc_comparison_df = compare_metrics("iou_disc", True)
best_cup_comparison_df = compare_metrics("iou_cup", True)

In [None]:
def compare_methods_with_sl_by_dataset(df):
    """
    Performs one-tailed Welch t-tests comparing the SL method with every other
    method within each dataset, from summary statistics only.
    H0: μ_SL <= μ_other
    H1: μ_SL > μ_other

    Datasets that contain no "SL" row are skipped, since there is nothing to
    compare against.

    Parameters:
    df: DataFrame containing columns 'dataset', 'method', 'iou', 'iou_std', 'iou_count'

    Returns:
    DataFrame with one row per (dataset, non-SL method), sorted by dataset and
    p-value, numeric columns rounded to 6 decimals. It is empty (but has the
    result columns) when no comparison could be made.
    """
    result_columns = [
        "dataset",
        "method_compared",
        "sl_iou",
        "method_iou",
        "iou_diff",
        "t_statistic",
        "degrees_of_freedom",
        "p_value",
    ]
    results = []

    for dataset in df["dataset"].unique():
        dataset_df = df[df["dataset"] == dataset]
        is_sl = dataset_df["method"] == "SL"
        if not is_sl.any():
            # No SL baseline in this dataset: skip it instead of failing.
            continue

        # The first SL row provides the baseline statistics.
        sl_stats = dataset_df[is_sl].iloc[0]
        sl_mean = sl_stats["iou"]
        sl_n = sl_stats["iou_count"]
        # Squared standard error of the SL mean.
        sl_sq_err = sl_stats["iou_std"] ** 2 / sl_n

        for _, row in dataset_df[~is_sl].iterrows():
            other_mean = row["iou"]
            other_n = row["iou_count"]
            other_sq_err = row["iou_std"] ** 2 / other_n

            # Standard error of the difference (unequal variances, not pooled).
            std_err = np.sqrt(sl_sq_err + other_sq_err)
            t_stat = (sl_mean - other_mean) / std_err

            # Welch-Satterthwaite degrees of freedom.
            df_welch = (sl_sq_err + other_sq_err) ** 2 / (
                sl_sq_err**2 / (sl_n - 1) + other_sq_err**2 / (other_n - 1)
            )

            # Upper-tail probability; sf() keeps precision for tiny p-values
            # where 1 - cdf() would round to zero.
            p_value = stats.t.sf(t_stat, df_welch)

            results.append(
                {
                    "dataset": dataset,
                    "method_compared": row["method"],
                    "sl_iou": sl_mean,
                    "method_iou": other_mean,
                    "iou_diff": sl_mean - other_mean,
                    "t_statistic": t_stat,
                    "degrees_of_freedom": df_welch,
                    "p_value": p_value,
                }
            )

    result_df = pd.DataFrame(results, columns=result_columns)
    if result_df.empty:
        return result_df

    result_df = result_df.sort_values(["dataset", "p_value"])

    # Round numerical columns for better readability
    numeric_cols = result_columns[2:]
    result_df[numeric_cols] = result_df[numeric_cols].round(6)

    return result_df


# compare_methods_with_sl_by_dataset(best_comparison_df)

## Visualization


In [None]:
# Display order of the methods paired with the colour each one is drawn in.
method_palette = [
    ("ProtoSeg", "#ffda03"),  # Yellow
    ("O-ProtoSeg", "#e85d04"),  # Orange
    ("EO-ProtoSeg", "#d00000"),  # Red
    ("SL", "#757575"),  # Gray
    ("WeaSeL", "#43b0f1"),  # Blue
    ("O-WeaSeL", "#2ec4b6"),  # Turquoise
    ("EO-WeaSeL", "#2d6a4f"),  # Green
]

ordered_methods = [method_name for method_name, _ in method_palette]

# Colour scale whose domain follows ordered_methods one-to-one.
color_scale = alt.Scale(
    domain=ordered_methods,
    range=[hex_color for _, hex_color in method_palette],
)

In [None]:
def compose_bar_chart(
    dataframes: list[tuple[pd.DataFrame, str]], scale: tuple[float, float] | None = None
):
    """Build a faceted bar chart of mean IoU per method.

    Parameters:
    dataframes: (frame, label) pairs. Each frame is tagged with its label in an
        "iou_type" column and the frames are stacked; the label becomes the
        facet column. Frames need the columns "dataset", "method", "iou",
        "iou_low" and "iou_high".
    scale: Optional (min, max) domain for the y axis; values outside it are
        clamped. When omitted, every facet gets an independent y scale.

    Returns:
    Faceted Altair chart (rows: dataset, columns: IoU type) layering bars,
    error bars spanning iou_low..iou_high and a text label of the rounded IoU.
    """
    data = pd.concat([frame.assign(iou_type=label) for frame, label in dataframes])

    y_scale = alt.Scale() if scale is None else alt.Scale(domain=scale, clamp=True)

    # Shared x encoding: one slot per method, in the global display order; the
    # axis labels are hidden.
    base = alt.Chart(data).encode(
        x=alt.X(
            "method:N",
            title=None,
            sort=ordered_methods,
            axis=alt.Axis(labels=False, ticks=False),
        ),
    )

    method_legend = alt.Legend(
        orient="bottom",
        direction="horizontal",
        titleAnchor="start",
        columns=4,
    )
    bars = base.mark_bar().encode(
        y=alt.Y("iou:Q", title="Mean IoU", scale=y_scale),
        color=alt.Color(
            "method:N",
            scale=color_scale,
            title="Method",
            legend=method_legend,
        ),
    )
    error_bars = base.mark_errorbar(
        extent="ci", thickness=2.0, ticks=True, color="black"
    ).encode(
        y=alt.Y("iou_low:Q", title=None, scale=y_scale),
        y2="iou_high:Q",
    )
    value_labels = base.mark_text(
        align="center", baseline="top", dy=85, fontSize=14
    ).encode(
        text=alt.Text("iou:Q", format=".0f"),
    )

    layered = (bars + error_bars + value_labels).properties(  # type: ignore
        width=200, height=200
    )

    faceted = layered.facet(
        row=alt.Row("dataset:N", title="Dataset"),
        column=alt.Column("iou_type:N", title="IoU (%)", sort="descending"),
    )
    chart = (
        faceted.configure_axis(labelFontSize=12, titleFontSize=16)
        .configure_header(labelFontSize=12, titleFontSize=16)
        .configure_legend(labelFontSize=14, titleFontSize=16)
    )

    # Without a fixed domain each facet chooses its own y range.
    if scale is None:
        chart = chart.resolve_scale(y="independent")

    return chart

In [None]:
# Optic disc: overall vs. best-configuration mean IoU, with the y axis fixed to
# the 50-100 range.
compose_bar_chart(
    [
        (disc_comparison_df, "Overall Mean - Optic Disc"),
        (best_disc_comparison_df, "Best Mean - Optic Disc"),
    ],
    (50, 100),
)

In [None]:
# Optic cup: overall vs. best-configuration mean IoU, with the y axis fixed to
# the 0-80 range. The facet label previously contained a doubled space
# ("Best Mean  - Optic Cup"); it now matches the format of the other labels.
compose_bar_chart(
    [
        (cup_comparison_df, "Overall Mean - Optic Cup"),
        (best_cup_comparison_df, "Best Mean - Optic Cup"),
    ],
    (0, 80),
)

In [None]:
def compose_line_chart(data: pd.DataFrame):
    """Build a faceted line chart of mean IoU against sparsity value.

    Parameters:
    data: Frame with the columns "shot", "sparsity_mode", "sparsity_value",
        "iou" and "method". It is not modified; a copy is relabelled.

    Returns:
    Faceted Altair chart (rows: sparsity mode, columns: shot count) layering an
    error band, a line and points per method.
    """

    def _shot_label(shot) -> str:
        return f"{shot} shot" if shot == 1 else f"{shot} shots"

    new_data = data.copy()

    # Derive the facet order from the numeric shot values BEFORE they are
    # turned into text. The previous hard-coded list ("1-shot", "5-shot", ...)
    # never matched the generated labels ("1 shot", "5 shots", ...), so the
    # custom order was not applied.
    shot_values = new_data["shot"].dropna().drop_duplicates().sort_values()
    shot_order = shot_values.apply(_shot_label).tolist()

    new_data["shot"] = new_data["shot"].apply(_shot_label)
    new_data["sparsity_mode"] = "IoU " + new_data["sparsity_mode"] + " (%)"
    encodings = {
        "x": alt.X("sparsity_value", title=None),
        "y": alt.Y("mean(iou)", title=None),
        "color": alt.Color(
            "method",
            title="Methods",
            scale=color_scale,
            legend=alt.Legend(
                orient="bottom", direction="horizontal", titleAnchor="start"
            ),
        ),
    }

    error_band = (
        alt.Chart(new_data).mark_errorband(extent="ci", opacity=0.5).encode(**encodings)
    )
    line = alt.Chart(new_data).mark_line(strokeWidth=1).encode(**encodings)
    point = alt.Chart(new_data).mark_point(size=5).encode(**encodings)

    combined_chart = error_band + line + point  # type: ignore
    combined_chart = (
        combined_chart.properties(width=150, height=150)
        .facet(
            row=alt.Row("sparsity_mode", title=None),
            column=alt.Column(
                "shot",
                sort=shot_order,
                # The column header is placed at the bottom and titled
                # "Sparsity Values", so it reads as the shared x-axis title.
                header=alt.Header(title="Sparsity Values", titleOrient="bottom"),
            ),
            spacing=10,
        )
        .resolve_scale(x="independent")
        .configure_axis(labelFontSize=12)
        .configure_header(labelFontSize=16, titleFontSize=16)
        .configure_legend(labelFontSize=14, titleFontSize=16)
    )

    return combined_chart

In [None]:
# Mean IoU vs. sparsity value for every method in meta_metrics_df, faceted by
# sparsity mode (rows) and shot count (columns).
compose_line_chart(meta_metrics_df)

In [None]:
# meta_metrics_df_with_ref = meta_metrics_df.copy()

# meta_metrics_df_with_ref["method_parent"] = (
#     meta_metrics_df_with_ref["method"].str.split("-").str[-1]
# )
# meta_metrics_df_with_ref["method_child"] = meta_metrics_df_with_ref["method"].apply(
#     lambda x: x.split("-")[0] if "-" in x else "original"
# )

# meta_metrics_df_with_ref = pd.merge(
#     meta_metrics_df_with_ref,
#     simple_metrics_df.groupby(["dataset"])["iou"].mean(),
#     on="dataset",
#     suffixes=("", "_ref"),
# )

# meta_metrics_df_with_ref

In [None]:
# base = alt.Chart(meta_metrics_df_with_ref)

# lines = base.mark_line().encode(
#     x=alt.X("sparsity_value", title=None),
#     y="mean(iou)",
#     color=alt.Color(
#         "method_parent",
#         scale=alt.Scale(domain=["ProtoSeg", "WeaSeL"], range=["#ff4444", "#77aaff"]),
#     ),
#     strokeDash="method_child",
# )

# ref_lines = base.mark_rule(color="#33cc33").encode(y="mean(iou_ref)")

# (lines + ref_lines).properties(width=150, height=150).facet(
#     row="sparsity_mode", column="shot"
# ).resolve_scale(x="independent")

In [None]:
# data = meta_metrics_df[
#     (meta_metrics_df["method"].str.endswith("ProtoSeg"))
#     & (meta_metrics_df["dataset"] == "REFUGE")
# ]
# alt.Chart(data).mark_errorband(extent="ci").encode(
#     x="sparsity_value",
#     y="mean(iou)",
#     color="method",
# ).properties(width=300, height=200).facet(
#     row="sparsity_mode", column="shot"
# ).resolve_scale(
#     x="independent",
# )

In [None]:
# data = meta_metrics_df[
#     (meta_metrics_df["method"].str.endswith("WeaSeL"))
#     & (meta_metrics_df["dataset"] == "REFUGE")
# ]
# alt.Chart(data).mark_errorband(extent="ci").encode(
#     x="sparsity_value",
#     y="mean(iou)",
#     color="method",
# ).properties(width=300, height=200).facet(
#     row="sparsity_mode", column="shot"
# ).resolve_scale(
#     x="independent",
# )

## Tables


In [None]:
# Join the best-configuration disc and cup statistics side by side. The first
# five columns of the disc frame are used as the join key; every other column
# gets a "_disc" / "_cup" suffix.
best_results_df = best_disc_comparison_df.merge(
    best_cup_comparison_df,
    on=best_disc_comparison_df.columns.tolist()[:5],
    suffixes=("_disc", "_cup"),
)

# Rank each method by its position in the presentation order, then sort the
# table by dataset and that rank.
best_results_df["method_order"] = best_results_df["method"].map(
    {
        method_name: position
        for position, method_name in enumerate(
            [
                "ProtoSeg",
                "O-ProtoSeg",
                "EO-ProtoSeg",
                "SL",
                "WeaSeL",
                "O-WeaSeL",
                "EO-WeaSeL",
            ]
        )
    }
)
best_results_df = best_results_df.sort_values(["dataset", "method_order"])

best_results_df

In [None]:
# Print one LaTeX table row per entry of best_results_df:
# method, shot, sparsity setting, disc IoU with its low-high interval and
# cup IoU with its low-high interval.
for _, row in best_results_df.iterrows():
    if row["method"] == "SL":
        # SL rows have no shot / sparsity setting.
        shot, sparsity = "-", "-"
    else:
        shot = int(row["shot"])
        if row["sparsity_mode"] == "point":
            # "point" sparsity values are printed as integers.
            sparsity = "point - " + str(int(row["sparsity_value"]))
        else:
            sparsity = row["sparsity_mode"] + " - " + f"{row['sparsity_value']:.2f}"

    cells = [
        f"{row['method']}",
        f"{shot}",
        f"{sparsity}",
        f"{row['iou_disc']:.2f}",
        f"{row['iou_low_disc']:.2f}-{row['iou_high_disc']:.2f}",
        f"{row['iou_cup']:.2f}",
        f"{row['iou_low_cup']:.2f}-{row['iou_high_cup']:.2f}",
    ]
    print("& " + " & ".join(cells) + " \\\\")

In [None]:
# Rows of the EO-ProtoSeg method only.
eop_metrics_df = meta_metrics_df.loc[meta_metrics_df["method"] == "EO-ProtoSeg"]

# Mean combined IoU per (sparsity mode, sparsity value, shot), ordered by
# those three keys and flattened back into regular columns.
agg_eop_metrics_df = eop_metrics_df.groupby(
    ["sparsity_mode", "sparsity_value", "shot"]
)[["iou"]].mean()
agg_eop_metrics_df = agg_eop_metrics_df.sort_values(
    ["sparsity_mode", "sparsity_value", "shot"]
).reset_index()

agg_eop_metrics_df

In [None]:
# Nested table of the aggregated EO-ProtoSeg results: one outer entry per
# sparsity mode, one inner entry per sparsity value, each holding
# [sparsity_value, iou for the 1st shot setting, iou for the 2nd, ...].
#
# Entries are grouped by the actual key columns instead of by row position
# (i // 25, (i % 25) // 5). The positional version only worked for a complete
# 5 x 5 x 5 grid; it put rows in the wrong cell when a combination was missing
# and raised IndexError with more than 125 rows. sort=False keeps the order in
# which agg_eop_metrics_df already lists the rows.
agg_eop_metrics_ls = [
    [
        [sparsity_value, *cell_df["iou"].tolist()]
        for sparsity_value, cell_df in mode_df.groupby("sparsity_value", sort=False)
    ]
    for _, mode_df in agg_eop_metrics_df.groupby("sparsity_mode", sort=False)
]

In [None]:
sparsity_modes = ["contours", "grid", "point", "regions", "skeleton"]

# Emit the LaTeX body of the EO-ProtoSeg table: one \multirow group per
# sparsity mode, one line per entry of that mode in agg_eop_metrics_ls (every
# number formatted with two decimals), closed by an \hline.
for mode_idx, mode_name in enumerate(sparsity_modes):
    print("\\multirow{5}{*}{" + mode_name + "}  ", end="")
    latex_rows = [
        "& " + " & ".join(f"{np.mean(v):.2f}" for v in ls)
        for ls in agg_eop_metrics_ls[mode_idx]
    ]
    print(" \\\\\n".join(latex_rows) + " \\\\")
    print("\\hline")

## Hypothesis Testing


In [None]:
def test_two_methods_score_od_oc(
    higher_method: str, lower_method: str
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Paired one-tailed t-test of two methods on cup and disc IoU, per dataset.

    Rows of the module-level ``meta_metrics_df`` for the two methods are paired
    on ("dataset", "batch") and the differences ``higher - lower`` are tested
    against zero (H1: mean difference > 0).

    NOTE(review): the merge pairs every row of one method with every row of the
    other that shares (dataset, batch); if a method has several rows per batch
    (e.g. one per configuration) the pairs are a cross product — confirm this
    is intended.

    Parameters:
    higher_method: Method expected to score higher.
    lower_method: Method expected to score lower.

    Returns:
    (t_vals, p_vals): two DataFrames indexed by dataset with the columns
    "iou_cup_diff" and "iou_disc_diff".
    """
    df = meta_metrics_df.drop(
        columns=["loss", "iou", "shot", "sparsity_mode", "sparsity_value"]
    )
    higher_df = df[df["method"] == higher_method]
    lower_df = df[df["method"] == lower_method]

    merged_df = pd.merge(
        higher_df, lower_df, on=["dataset", "batch"], suffixes=("_h", "_l")
    )

    merged_df["iou_cup_diff"] = merged_df["iou_cup_h"] - merged_df["iou_cup_l"]
    merged_df["iou_disc_diff"] = merged_df["iou_disc_h"] - merged_df["iou_disc_l"]

    grouped_df = merged_df.groupby(["dataset"])[["iou_cup_diff", "iou_disc_diff"]]

    # The statistic is computed per dataset, so the sample size must be each
    # dataset's own number of pairs. The previous code used the total number
    # of merged rows, which inflated both the t-values and the degrees of
    # freedom.
    n = grouped_df.count()
    t_vals = grouped_df.mean() / (grouped_df.std() / np.sqrt(n))

    p_vals = pd.DataFrame(
        stats.t.sf(t_vals.to_numpy(), (n - 1).to_numpy()),
        index=t_vals.index,
        columns=t_vals.columns,
    )

    return t_vals, p_vals

In [None]:
# def test_two_methods_score(higher_method: str, lower_method: str):
#     df = meta_metrics_df.drop(
#         columns=[
#             "loss",
#             "iou_cup",
#             "iou_disc",
#             "shot",
#             "sparsity_mode",
#             "sparsity_value",
#         ]
#     )
#     higher_df = df[df["method"] == higher_method]
#     lower_df = df[df["method"] == lower_method]
#     merged_df = pd.merge(
#         higher_df, lower_df, on=["dataset", "batch"], suffixes=("_h", "_l")
#     )

#     grouped_df = merged_df.groupby(["dataset"])[["iou_h", "iou_l"]]
#     return grouped_df.apply(lambda x: stats.wilcoxon(x["iou_h"], x["iou_l"], alternative="greater"))

# merged_df["iou_diff"] = merged_df["iou_h"] - merged_df["iou_l"]
# grouped_df = merged_df.groupby(["dataset"])["iou_diff"]
# n = len(merged_df)
# t_vals = grouped_df.mean() / (grouped_df.std() / np.sqrt(n))
# p_vals = t_vals.apply(lambda x: stats.t.sf(x, n - 1))
# return t_vals, p_vals

In [None]:
def test_two_methods_score(higher_method: str, lower_method: str):
    """Wilcoxon signed-rank test of two methods' best configurations, per dataset.

    For every (dataset, method) the configuration (shot, sparsity mode,
    sparsity value) with the highest mean "iou" in the module-level
    ``meta_metrics_df`` is selected. The rows of the two methods' best
    configurations are paired on ("dataset", "batch"), after renumbering
    "batch" so that it starts at 1, and tested with H1: higher > lower.

    Parameters:
    higher_method: Method expected to score higher.
    lower_method: Method expected to score lower.

    Returns:
    The result of applying ``scipy.stats.wilcoxon`` to each dataset group.
    """
    config_keys = ["dataset", "method", "shot", "sparsity_mode", "sparsity_value"]
    # For each (dataset, method): the full index tuple of its best configuration.
    best_indices = (
        meta_metrics_df.groupby(config_keys, dropna=False)["iou"]
        .mean()
        .groupby(["dataset", "method"])
        .idxmax()
    )

    def get_best_ious(method: str) -> pd.DataFrame:
        df_list = []
        for ds in meta_metrics_df["dataset"].unique():
            _, _, shot, sparsity_mode, sparsity_value = best_indices[ds, method]
            best_rows = meta_metrics_df[
                (meta_metrics_df["dataset"] == ds)
                & (meta_metrics_df["method"] == method)
                & (meta_metrics_df["shot"] == shot)
                & (meta_metrics_df["sparsity_mode"] == sparsity_mode)
                & (meta_metrics_df["sparsity_value"] == sparsity_value)
            ]
            # Renumber batches from 1 so both methods share the same keys.
            # assign() builds a new frame; writing into the filtered slice
            # with .loc triggered pandas' SettingWithCopyWarning.
            best_rows = best_rows.assign(
                batch=best_rows["batch"] - best_rows["batch"].min() + 1
            )
            df_list.append(best_rows[["dataset", "method", "batch", "iou"]])
        return pd.concat(df_list).reset_index(drop=True)

    higher_df = get_best_ious(higher_method)
    lower_df = get_best_ious(lower_method)
    merged_df = pd.merge(
        higher_df, lower_df, on=["dataset", "batch"], suffixes=("_h", "_l")
    )

    grouped_df = merged_df.groupby(["dataset"])[["iou_h", "iou_l"]]
    return grouped_df.apply(
        lambda x: stats.wilcoxon(x["iou_h"], x["iou_l"], alternative="greater")
    )

In [None]:
# One-sided Wilcoxon test per dataset, H1: EO-ProtoSeg scores higher than ProtoSeg.
test_two_methods_score("EO-ProtoSeg", "ProtoSeg")

In [None]:
# One-sided Wilcoxon test per dataset, H1: EO-WeaSeL scores higher than WeaSeL.
test_two_methods_score("EO-WeaSeL", "WeaSeL")

In [None]:
# One-sided Wilcoxon test per dataset, H1: EO-ProtoSeg scores higher than EO-WeaSeL.
test_two_methods_score("EO-ProtoSeg", "EO-WeaSeL")

In [None]:
# One-sided Wilcoxon test per dataset, H1: ProtoSeg scores higher than WeaSeL.
test_two_methods_score("ProtoSeg", "WeaSeL")

In [None]:
# diff_value = 5
# dropped_columns = [
#     "shot",
#     "sparsity_mode",
#     "sparsity_value",
#     "iou_std_err",
#     "iou_low",
#     "iou_high",
# ]
# rename_columns = {
#     "iou_ref": "ref",
#     "iou": "mean",
#     "iou_std": "std",
#     "iou_count": "n",
# }
# disc_df = best_disc_comparison_df.drop(columns=dropped_columns).rename(
#     columns=rename_columns
# )
# cup_df = best_cup_comparison_df.drop(columns=dropped_columns).rename(
#     columns=rename_columns
# )

# simple_disc_df = disc_df[disc_df["method"] == "SL"].drop(columns=["method", "ref"])
# simple_disc_df["object"] = "disc"
# simple_cup_df = cup_df[cup_df["method"] == "SL"].drop(columns=["method", "ref"])
# simple_cup_df["object"] = "cup"
# simple_df = pd.concat([simple_disc_df, simple_cup_df], axis=0)

# meta_disc_df = disc_df.loc[
#     disc_df[disc_df["method"] != "SL"].groupby(["dataset"])["ref"].idxmax()
# ].drop(columns=["method", "ref"])
# meta_disc_df["object"] = "disc"
# meta_cup_df = cup_df.loc[
#     cup_df[cup_df["method"] != "SL"].groupby(["dataset"])["ref"].idxmax()
# ].drop(columns=["method", "ref"])
# meta_cup_df["object"] = "cup"
# meta_df = pd.concat([meta_disc_df, meta_cup_df], axis=0)

# df = pd.merge(
#     simple_df,
#     meta_df,
#     on=["dataset", "object"],
#     suffixes=("_simple", "_meta"),
# )

# var_simple = (df["std_simple"] ** 2) / df["n_simple"]
# var_meta = (df["std_meta"] ** 2) / df["n_meta"]

# df["t_value"] = (df["mean_simple"] - df["mean_meta"] - diff_value) / (
#     var_simple + var_meta
# ) ** 0.5

# df["dof"] = ((var_simple + var_meta) ** 2) / (
#     (var_simple**2) / (df["n_simple"] - 1) + (var_meta**2) / (df["n_meta"] - 1)
# )

# df["p_value"] = stats.t.cdf(df["t_value"], df["dof"]).round(6)

# df

In [None]:
# For each dataset: index tuple (dataset, method, shot, sparsity_mode,
# sparsity_value) of the configuration with the highest mean combined IoU.
best_meta_df = (
    meta_metrics_df.groupby(
        ["dataset", "method", "shot", "sparsity_mode", "sparsity_value"],
        dropna=False,
    )
    .agg(
        iou=("iou", "mean"),
        iou_cup=("iou_cup", "mean"),
        iou_disc=("iou_disc", "mean"),
    )
    .groupby(["dataset"])["iou"]
    .idxmax()
)

# Margin subtracted from the SL scores before testing (0 = plain comparison).
diff_value = 0
stat_test_result = []
for dataset in ["DRISHTI-GS", "REFUGE", "RIM-ONE-3"]:
    # The best configuration does not depend on the structure, so it is looked
    # up once per dataset.
    _, method, shot, sparsity_mode, sparsity_value = best_meta_df.loc[dataset]
    best_meta_rows = meta_metrics_df[
        (meta_metrics_df["dataset"] == dataset)
        & (meta_metrics_df["method"] == method)
        & (meta_metrics_df["shot"] == shot)
        & (meta_metrics_df["sparsity_mode"] == sparsity_mode)
        & (meta_metrics_df["sparsity_value"] == sparsity_value)
    ]

    # The loop variable is named "structure" so it no longer shadows the
    # builtin `object`; the "object" key of the result rows is unchanged.
    for structure in ["disc", "cup"]:
        simple_score = simple_metrics_df[(simple_metrics_df["dataset"] == dataset)][
            f"iou_{structure}"
        ]
        meta_score = best_meta_rows[f"iou_{structure}"]

        # One-sided Mann-Whitney U test, H1: SL scores (minus the margin) are
        # greater than the best meta-learning configuration's scores.
        statistic, pvalue = stats.mannwhitneyu(
            simple_score - diff_value, meta_score, alternative="greater"
        )

        stat_test_result.append(
            {
                "dataset": dataset,
                "object": structure,
                "simple_score": simple_score.mean(),
                "meta_score": meta_score.mean(),
                "meta_method": method,
                "meta_shot": shot,
                "meta_sparsity_mode": sparsity_mode,
                "meta_sparsity_value": sparsity_value,
                "statistic": statistic,
                "p_value": pvalue,
            }
        )

pd.DataFrame(stat_test_result)

# Test Profiles


## Download


In [None]:
# import wandb

# from utils.wandb import wandb_path

# runs = wandb.Api().runs(
#     wandb_path(False),
#     filters={"jobType": "profile-test", "createdAt": {"$gt": "2025-01-01T00:00:00Z"}},
# )

# group_names = {
#     "SL": "SL",
#     "WS-ori": "WeaSeL",
#     "WS-ms": "O-WeaSeL",
#     "WS": "EO-WeaSeL",
#     "PS-ori": "ProtoSeg",
#     "PS-mp": "O-ProtoSeg",
#     "PS": "EO-ProtoSeg",
# }

# for i, run in enumerate(runs):
#     group = group_names[run.group]
#     run_id = run.name.split(" ")[-1]
#     batch_size = run.config["batch_size"]
#     shot = run.config.get("shot", -1)
#     shot_str = f" s{shot}" if shot != -1 else ""
#     run.logged_artifacts()[0].download(
#         f"logs/wandb/1_test_profile/{group} b{batch_size}{shot_str} {run_id}"
#     )
#     print(group, run_id)

In [None]:
# import os

# df_list = []

# wandb_dir = "logs/wandb/1_test_profile"
# for i, dir in enumerate(os.listdir(wandb_dir)):
#     if os.path.isfile(f"{wandb_dir}/{dir}"):
#         continue
#     splitted = dir.split(" ")
#     if len(splitted) == 3:
#         group, batch_str, _ = splitted
#         shot = -1
#     else:
#         group, batch_str, shot_str, _ = splitted
#         shot = int(shot_str[1:])
#     batch_size = int(batch_str[1:])
#     df = read_wandb_table(f"{wandb_dir}/{dir}/test_profile.table.json")
#     df.insert(0, "shot", shot)
#     df.insert(0, "batch_size", batch_size)
#     df.insert(0, "method", group)
#     df.insert(0, "index", i)
#     df_list.append(df)

# test_profile_df = pd.concat(df_list)
# test_profile_df.to_csv("logs/wandb/1_test_profile.csv", index=False)

## Preparation


In [None]:
def calc_confidence_limits(
    data: pd.DataFrame,
    mean_col: str = "Mean (s)",
    std_col: str = "Std (s)",
    ci: Literal[90, 95, 99] = 95,
) -> pd.DataFrame:
    """Add normal-approximation confidence limits to ``data``.

    The frame is modified in place: a "Std Err" column (``std_col`` divided by
    the square root of "Num Calls") and the columns "CL <ci> L" / "CL <ci> U"
    (mean -/+ z * standard error) are added.

    Parameters:
    data: Frame with ``mean_col``, ``std_col`` and a "Num Calls" column.
    mean_col: Name of the column holding the means.
    std_col: Name of the column holding the standard deviations.
    ci: Confidence level in percent; one of 90, 95 or 99.

    Returns:
    The same ``data`` object, with the new columns.

    Raises:
    ValueError: If ``ci`` is not a supported level. Raised before ``data`` is
        touched (previously an unsupported level failed with an
        UnboundLocalError after "Std Err" had already been added).
    """
    z_scores = {90: 1.645, 95: 1.96, 99: 2.576}
    if ci not in z_scores:
        raise ValueError(f"ci must be one of {sorted(z_scores)}, got {ci!r}")
    z = z_scores[ci]

    data["Std Err"] = data[std_col] / (data["Num Calls"]) ** 0.5
    data[f"CL {ci} L"] = data[mean_col] - z * data["Std Err"]
    data[f"CL {ci} U"] = data[mean_col] + z * data["Std Err"]
    return data

In [None]:
def combine_mean(data: pd.DataFrame, mean_col: str, num_items_col: str) -> float:
    """Return the item-count-weighted mean of the per-row means in ``data``."""
    weighted_total = (data[mean_col] * data[num_items_col]).sum()
    item_total = data[num_items_col].sum()
    return weighted_total / item_total


def combine_variance(
    data: pd.DataFrame,
    mean_col: str,
    var_col: str,
    num_items_col: str,
    combined_mean: float | None = None,
) -> float:
    num_items = data[num_items_col]
    variances = data[var_col] ** 2
    means = data[mean_col]
    if combined_mean is None:
        combined_mean = combine_mean(data, mean_col, num_items_col)

    weighted_var = ((num_items - 1) * variances).sum()
    between_var = (num_items * (means - combined_mean) ** 2).sum()

    total_num_items = num_items.sum()
    return (weighted_var + between_var) / total_num_items


def combine_mean_variance(
    data: pd.DataFrame,
    groupby_cols: list[str],
    mean_col: str,
    std_col: str,
    num_items_col: str,
) -> pd.DataFrame:
    """Pool mean, std and count of the rows in every group into one row.

    Args:
        data: Frame with one row per sub-sample.
        groupby_cols: Columns whose value combinations define the groups.
        mean_col: Column with each sub-sample's mean.
        std_col: Column with each sub-sample's standard deviation.
        num_items_col: Column with each sub-sample's size.

    Returns:
        A frame with ``groupby_cols`` plus ``mean_col`` (count-weighted mean),
        ``std_col`` (square root of ``combine_variance``) and
        ``num_items_col`` (summed count), one row per group.
    """

    def agg_func(group: pd.DataFrame) -> pd.Series:
        combined_mean = combine_mean(group, mean_col, num_items_col)
        combined_std = (
            combine_variance(group, mean_col, std_col, num_items_col, combined_mean)
            ** 0.5
        )
        return pd.Series(
            {
                mean_col: combined_mean,
                std_col: combined_std,
                num_items_col: group[num_items_col].sum(),
            }
        )

    # Hand only the statistic columns to ``apply``. The aggregation never reads
    # the grouping columns, and passing them along is deprecated in pandas 2.2+.
    stat_cols = [mean_col, std_col, num_items_col]
    return data.groupby(groupby_cols)[stat_cols].apply(agg_func).reset_index()


In [None]:
# Load the profiling table used by every timing cell below. Those cells read the
# columns "method", "batch_size", "shot", "index", "Action", "Mean (s)",
# "Std (s)", "Num Calls", "Sum (s)" and "Percentage (%)".
# NOTE(review): the commented-out export cell above writes
# "logs/wandb/1_test_profile.csv" -- confirm that this path points at that export.
test_profile_df = pd.read_csv("logs/wandb/test_profile.csv")

test_profile_df

## Simple Learner


In [None]:
# Rows profiled for the Simple Learner ("SL") method.
sl_test_profile_df = test_profile_df[test_profile_df["method"].isin(["SL"])]

# Forward-pass timings only, without the columns that are not needed below.
sl_inf_df = sl_test_profile_df[
    sl_test_profile_df["Action"] == "[Learner]SimpleUnet.forward"
].drop(columns=["index", "Action", "Percentage (%)"])
# Divide the per-call mean and std by the batch size to get per-image values.
sl_inf_df["Mean per Image (s)"] = sl_inf_df["Mean (s)"] / sl_inf_df["batch_size"]
sl_inf_df["Std per Image (s)"] = sl_inf_df["Std (s)"] / sl_inf_df["batch_size"]

# Adds "Std Err", "CL 95 L" and "CL 95 U" (default 95 % level).
# sl_inf_df is reused by the ProtoSeg, WeaSeL and Publication cells further down.
sl_inf_df = calc_confidence_limits(sl_inf_df, "Mean per Image (s)", "Std per Image (s)")

# Mean per-image time against batch size.
line_chart = (
    alt.Chart(sl_inf_df)
    .mark_line()
    .encode(
        x="batch_size",
        y="Mean per Image (s)",
        color="method",
    )
)

# Band between the lower and upper 95 % confidence limits.
error_chart = (
    alt.Chart(sl_inf_df)
    .mark_errorband()
    .encode(
        x="batch_size",
        y=alt.Y("CL 95 U").title("Time per Image CL 95 (s)"),
        y2="CL 95 L",
        color="method",
    )
)

(line_chart + error_chart).properties(width=600, height=400)

## ProtoSeg


In [None]:
# Every method whose name ends with "ProtoSeg" (this also matches prefixed
# variants such as "O-ProtoSeg" and "EO-ProtoSeg").
ps_test_profile_df = test_profile_df[test_profile_df["method"].str.endswith("ProtoSeg")]

# Show the profile rows that carry index 0 as a sample.
ps_test_profile_df[ps_test_profile_df["index"] == 0]

In [None]:
# Timings of ProtosegUnet.get_prototypes with 95 % confidence limits on the
# per-call mean (default "Mean (s)" / "Std (s)" columns).
ps_inf_df = calc_confidence_limits(
    ps_test_profile_df[
        ps_test_profile_df["Action"] == "[Learner]ProtosegUnet.get_prototypes"
    ].drop(columns=["index", "Action", "Percentage (%)"]),
)

# Mean time per call against batch size, one line per method.
line_chart = (
    alt.Chart(ps_inf_df)
    .mark_line()
    .encode(
        x="batch_size",
        y="Mean (s)",
        color="method",
    )
)

# Band between the lower and upper 95 % confidence limits.
error_chart = (
    alt.Chart(ps_inf_df)
    .mark_errorband()
    .encode(
        x="batch_size",
        y=alt.Y("CL 95 U").title("Time CL 95 (s)"),
        y2="CL 95 L",
        color="method",
    )
)

# One facet column per value of "shot".
(line_chart + error_chart).properties(width=300, height=200).facet(column="shot")

In [None]:
# Timings of ProtosegUnet.prediction, normalised to per-image values.
ps_inf_df = ps_test_profile_df[
    ps_test_profile_df["Action"] == "[Learner]ProtosegUnet.prediction"
].drop(columns=["index", "Action", "Percentage (%)"])
ps_inf_df["Mean per Image (s)"] = ps_inf_df["Mean (s)"] / ps_inf_df["batch_size"]
ps_inf_df["Std per Image (s)"] = ps_inf_df["Std (s)"] / ps_inf_df["batch_size"]

# Pool all rows sharing (method, batch_size) into one row. "shot" is not a
# grouping key, so rows that differ only in shot are merged.
ps_inf_df = combine_mean_variance(
    ps_inf_df,
    groupby_cols=["method", "batch_size"],
    mean_col="Mean per Image (s)",
    std_col="Std per Image (s)",
    num_items_col="Num Calls",
)

ps_inf_df = calc_confidence_limits(ps_inf_df, "Mean per Image (s)", "Std per Image (s)")

# Stack the Simple Learner rows (batch size <= 16) on top of the pooled ProtoSeg
# rows; the dropped SL columns do not exist in the pooled frame.
# sl_inf_df is defined in the Simple Learner cell above.
ps_sl_inf_df = pd.concat(
    [
        sl_inf_df[sl_inf_df["batch_size"] <= 16].drop(
            columns=["shot", "Mean (s)", "Std (s)", "Sum (s)"]
        ),
        ps_inf_df,
    ]
)

# Batch size 1 is left out of the plot.
ps_sl_inf_df = ps_sl_inf_df[ps_sl_inf_df["batch_size"] >= 2]

line_chart = (
    alt.Chart(ps_sl_inf_df)
    .mark_line()
    .encode(
        x="batch_size",
        y="Mean per Image (s)",
        color="method",
    )
)

# Semi-transparent band between the 95 % confidence limits.
error_chart = (
    alt.Chart(ps_sl_inf_df)
    .mark_errorband(opacity=0.2)
    .encode(
        x="batch_size",
        y=alt.Y("CL 95 U").title("Time per Image CL 95 (s)"),
        y2="CL 95 L",
        color="method",
    )
)

(line_chart + error_chart).properties(width=600, height=400)

In [None]:
# Timings of ProtosegUnet.post_process, normalised to per-image values.
ps_inf_df = ps_test_profile_df[
    ps_test_profile_df["Action"] == "[Learner]ProtosegUnet.post_process"
].drop(columns=["index", "Action", "Percentage (%)"])
ps_inf_df["Mean per Image (s)"] = ps_inf_df["Mean (s)"] / ps_inf_df["batch_size"]
ps_inf_df["Std per Image (s)"] = ps_inf_df["Std (s)"] / ps_inf_df["batch_size"]

# Pool all rows sharing (method, batch_size); rows differing only in shot merge.
ps_inf_df = combine_mean_variance(
    ps_inf_df,
    groupby_cols=["method", "batch_size"],
    mean_col="Mean per Image (s)",
    std_col="Std per Image (s)",
    num_items_col="Num Calls",
)

ps_inf_df = calc_confidence_limits(ps_inf_df, "Mean per Image (s)", "Std per Image (s)")

# Batch size 1 is left out of the plot.
ps_inf_df = ps_inf_df[ps_inf_df["batch_size"] >= 2]

line_chart = (
    alt.Chart(ps_inf_df)
    .mark_line()
    .encode(
        x="batch_size",
        y="Mean per Image (s)",
        color="method",
    )
)

# Semi-transparent band between the 95 % confidence limits.
error_chart = (
    alt.Chart(ps_inf_df)
    .mark_errorband(opacity=0.2)
    .encode(
        x="batch_size",
        y=alt.Y("CL 95 U").title("Time per Image CL 95 (s)"),
        y2="CL 95 L",
        color="method",
    )
)

(line_chart + error_chart).properties(width=600, height=400)

## WeaSeL


In [None]:
# Every method whose name ends with "WeaSeL". Take an explicit copy: the rows
# are rescaled in place below, and assigning into a filtered slice of
# test_profile_df would raise pandas' SettingWithCopyWarning.
ws_test_profile_df = test_profile_df[
    test_profile_df["method"].str.endswith("WeaSeL")
].copy()

tp_rows = ws_test_profile_df["Action"] == "[Learner]WeaselUnet.tune_process"

# Fold every `calls_per_tune` tune_process calls into one: the call count is
# divided, the mean is multiplied, and the std grows with the square root
# (as for a sum of independent calls).
# NOTE(review): 33 is presumably the number of profiled calls per tuning run -- confirm.
calls_per_tune = 33

ws_test_profile_df.loc[tp_rows, "Num Calls"] = (
    ws_test_profile_df.loc[tp_rows, "Num Calls"] // calls_per_tune
)
ws_test_profile_df.loc[tp_rows, "Mean (s)"] = (
    ws_test_profile_df.loc[tp_rows, "Mean (s)"] * calls_per_tune
)
ws_test_profile_df.loc[tp_rows, "Std (s)"] = (
    ws_test_profile_df.loc[tp_rows, "Std (s)"] * calls_per_tune**0.5
)

# ws_test_profile_df[ws_test_profile_df["index"] == 80]
ws_test_profile_df

In [None]:
# Timings of WeaselUnet.tune_process (already rescaled in the previous cell)
# with 95 % confidence limits on the per-call mean.
ws_inf_df = calc_confidence_limits(
    ws_test_profile_df[
        ws_test_profile_df["Action"] == "[Learner]WeaselUnet.tune_process"
    ].drop(columns=["index", "Action", "Percentage (%)"]),
)

# Mean time per call against batch size, one line per method.
line_chart = (
    alt.Chart(ws_inf_df)
    .mark_line()
    .encode(
        x="batch_size",
        y="Mean (s)",
        color="method",
    )
)

# Band between the lower and upper 95 % confidence limits.
error_chart = (
    alt.Chart(ws_inf_df)
    .mark_errorband()
    .encode(
        x="batch_size",
        y=alt.Y("CL 95 U").title("Time CL 95 (s)"),
        y2="CL 95 L",
        color="method",
    )
)

# One facet column per value of "shot".
(line_chart + error_chart).properties(width=300, height=200).facet(column="shot")

In [None]:
# Timings of WeaselUnet.prediction, normalised to per-image values.
ws_inf_df = ws_test_profile_df[
    ws_test_profile_df["Action"] == "[Learner]WeaselUnet.prediction"
].drop(columns=["index", "Action", "Percentage (%)"])
ws_inf_df["Mean per Image (s)"] = ws_inf_df["Mean (s)"] / ws_inf_df["batch_size"]
ws_inf_df["Std per Image (s)"] = ws_inf_df["Std (s)"] / ws_inf_df["batch_size"]

# Pool all rows sharing (method, batch_size); rows differing only in shot merge.
ws_inf_df = combine_mean_variance(
    ws_inf_df,
    groupby_cols=["method", "batch_size"],
    mean_col="Mean per Image (s)",
    std_col="Std per Image (s)",
    num_items_col="Num Calls",
)

ws_inf_df = calc_confidence_limits(ws_inf_df, "Mean per Image (s)", "Std per Image (s)")
# A time cannot be negative, so the lower limit is floored at zero.
ws_inf_df["CL 95 L"] = ws_inf_df["CL 95 L"].clip(lower=0)

# Stack the Simple Learner rows (batch size <= 16) on top of the pooled WeaSeL
# rows. sl_inf_df is defined in the Simple Learner cell above.
ws_sl_inf_df = pd.concat(
    [
        sl_inf_df[sl_inf_df["batch_size"] <= 16].drop(
            columns=["shot", "Mean (s)", "Std (s)", "Sum (s)"]
        ),
        ws_inf_df,
    ]
)

# Batch size 1 is left out of the plot.
ws_sl_inf_df = ws_sl_inf_df[ws_sl_inf_df["batch_size"] >= 2]

line_chart = (
    alt.Chart(ws_sl_inf_df)
    .mark_line()
    .encode(
        x="batch_size",
        y="Mean per Image (s)",
        color="method",
    )
)

# Semi-transparent band between the 95 % confidence limits.
error_chart = (
    alt.Chart(ws_sl_inf_df)
    .mark_errorband(opacity=0.2)
    .encode(
        x="batch_size",
        y=alt.Y("CL 95 U").title("Time per Image CL 95 (s)"),
        y2="CL 95 L",
        color="method",
    )
)

(line_chart + error_chart).properties(width=600, height=400)

In [None]:
# Timings of WeaselUnet.post_process, normalised to per-image values.
ws_inf_df = ws_test_profile_df[
    ws_test_profile_df["Action"] == "[Learner]WeaselUnet.post_process"
].drop(columns=["index", "Action", "Percentage (%)"])
ws_inf_df["Mean per Image (s)"] = ws_inf_df["Mean (s)"] / ws_inf_df["batch_size"]
ws_inf_df["Std per Image (s)"] = ws_inf_df["Std (s)"] / ws_inf_df["batch_size"]

# Pool all rows sharing (method, batch_size); rows differing only in shot merge.
ws_inf_df = combine_mean_variance(
    ws_inf_df,
    groupby_cols=["method", "batch_size"],
    mean_col="Mean per Image (s)",
    std_col="Std per Image (s)",
    num_items_col="Num Calls",
)

ws_inf_df = calc_confidence_limits(ws_inf_df, "Mean per Image (s)", "Std per Image (s)")
# A time cannot be negative, so the lower limit is floored at zero.
ws_inf_df["CL 95 L"] = ws_inf_df["CL 95 L"].clip(lower=0)

# Batch size 1 is left out of the plot.
ws_inf_df = ws_inf_df[ws_inf_df["batch_size"] >= 2]

line_chart = (
    alt.Chart(ws_inf_df)
    .mark_line()
    .encode(
        x="batch_size",
        y="Mean per Image (s)",
        color="method",
    )
)

# Semi-transparent band between the 95 % confidence limits.
error_chart = (
    alt.Chart(ws_inf_df)
    .mark_errorband(opacity=0.2)
    .encode(
        x="batch_size",
        y=alt.Y("CL 95 U").title("Time per Image CL 95 (s)"),
        y2="CL 95 L",
        color="method",
    )
)

(line_chart + error_chart).properties(width=600, height=400)

## Publication


In [None]:
# Single source of truth for the plotting order of the methods and the colour
# of each one. Insertion order is the legend order.
method_colors = {
    "ProtoSeg": "#ffda03",  # Yellow
    "O-ProtoSeg": "#e85d04",  # Orange
    "EO-ProtoSeg": "#d00000",  # Red
    "SL": "#757575",  # Gray
    "WeaSeL": "#43b0f1",  # Blue
    "O-WeaSeL": "#2ec4b6",  # Turquoise
    "EO-WeaSeL": "#2d6a4f",  # Green
}

ordered_methods = list(method_colors)
color_values = list(method_colors.values())


def _scale_for(methods: list[str]) -> alt.Scale:
    """Build a colour scale restricted to ``methods``, keeping their colours."""
    return alt.Scale(domain=methods, range=[method_colors[m] for m in methods])


# Subsets are selected by name rather than by list position, so adding or
# reordering methods cannot silently shift a colour onto the wrong method.
color_scale = _scale_for(ordered_methods)
color_scale_no_sl = _scale_for([m for m in ordered_methods if m != "SL"])
color_scale_ps = _scale_for([m for m in ordered_methods if m.endswith("ProtoSeg")])

In [None]:
# Size of every facet panel.
width, height = 150, 150

# Shared colour encoding: every method except "SL", legend below the chart.
color = alt.Color(
    "method:N",
    scale=color_scale_no_sl,
    title="Method",
    legend=alt.Legend(
        orient="bottom",
        direction="horizontal",
        titleAnchor="start",
        columns=6,
        symbolOpacity=1,
    ),
)

# --- Top row: ProtosegUnet.get_prototypes time per call ---
ps_inf_df = calc_confidence_limits(
    ps_test_profile_df[
        ps_test_profile_df["Action"] == "[Learner]ProtosegUnet.get_prototypes"
    ].drop(columns=["index", "Action", "Percentage (%)"]),
)
# Turn the shot count into a label such as "5-shot" for the facet headers.
ps_inf_df["shot"] = ps_inf_df["shot"].apply(lambda x: f"{x}-shot")

line_chart = (
    alt.Chart(ps_inf_df).mark_line().encode(x="batch_size", y="Mean (s)", color=color)
)
# The x axis carries no title or tick labels here; the WeaSeL row below shows them.
error_chart = (
    alt.Chart(ps_inf_df)
    .mark_errorband()
    .encode(
        x=alt.X("batch_size")
        .title(None)
        .scale(domain=[1, 16], nice=False)
        .axis(labels=False),
        y=alt.Y("CL 95 U").title("ProtoSeg Time (s)").scale(nice=False),
        y2="CL 95 L",
        color=color,
    )
)
# One panel per shot label, in the explicit order given by `sort`.
ps_chart = (
    (line_chart + error_chart)
    .properties(width=width, height=height)
    .facet(
        column=alt.Column(
            "shot",
            sort=["1-shot", "5-shot", "10-shot", "15-shot", "20-shot"],
            header=alt.Header(title=None),
        ),
        spacing=10,
    )
)

# --- Bottom row: WeaselUnet.tune_process time per call ---
ws_inf_df = calc_confidence_limits(
    ws_test_profile_df[
        ws_test_profile_df["Action"] == "[Learner]WeaselUnet.tune_process"
    ].drop(columns=["index", "Action", "Percentage (%)"]),
)
ws_inf_df["shot"] = ws_inf_df["shot"].apply(lambda x: f"{x}-shot")

line_chart = (
    alt.Chart(ws_inf_df).mark_line().encode(x="batch_size", y="Mean (s)", color=color)
)
error_chart = (
    alt.Chart(ws_inf_df)
    .mark_errorband()
    .encode(
        x=alt.X("batch_size").title(None).scale(domain=[1, 16], nice=False),
        y=alt.Y("CL 95 U").title("WeaSeL Time (s)").scale(nice=False),
        y2="CL 95 L",
        color=color,
    )
)
# The facet header is placed at the bottom with its labels hidden, so its title
# "Batch Size" acts as the common x-axis title of the figure.
ws_chart = (
    (line_chart + error_chart)
    .properties(width=width, height=height)
    .facet(
        column=alt.Column(
            "shot",
            sort=["1-shot", "5-shot", "10-shot", "15-shot", "20-shot"],
            header=alt.Header(title="Batch Size", titleOrient="bottom", labels=False),
        ),
        spacing=10,
    )
)

# Stack the two rows and apply the publication font sizes.
(
    alt.vconcat(ps_chart, ws_chart)
    .configure_axis(labelFontSize=14, titleFontSize=14)
    .configure_header(labelFontSize=14, titleFontSize=14)
    .configure_legend(labelFontSize=14, titleFontSize=16)
)

In [None]:
# Colour encoding over all seven methods (including "SL"), legend below the chart.
color = alt.Color(
    "method:N",
    scale=color_scale,
    title="Method",
    legend=alt.Legend(
        orient="bottom",
        direction="horizontal",
        titleAnchor="start",
        columns=4,
        symbolOpacity=1,
    ),
)

# ProtoSeg prediction time per image, pooled per (method, batch_size).
ps_inf_df = ps_test_profile_df[
    ps_test_profile_df["Action"] == "[Learner]ProtosegUnet.prediction"
].drop(columns=["index", "Action", "Percentage (%)"])
ps_inf_df["Mean per Image (s)"] = ps_inf_df["Mean (s)"] / ps_inf_df["batch_size"]
ps_inf_df["Std per Image (s)"] = ps_inf_df["Std (s)"] / ps_inf_df["batch_size"]
ps_inf_df = combine_mean_variance(
    ps_inf_df,
    groupby_cols=["method", "batch_size"],
    mean_col="Mean per Image (s)",
    std_col="Std per Image (s)",
    num_items_col="Num Calls",
)
ps_inf_df = calc_confidence_limits(ps_inf_df, "Mean per Image (s)", "Std per Image (s)")

# Simple Learner rows (batch size <= 16) plus the pooled ProtoSeg rows.
# sl_inf_df is defined in the Simple Learner cell above.
ps_sl_inf_df = pd.concat(
    [
        sl_inf_df[sl_inf_df["batch_size"] <= 16].drop(
            columns=["shot", "Mean (s)", "Std (s)", "Sum (s)"]
        ),
        ps_inf_df,
    ]
)
# Facet key: these rows go into the "ProtoSeg" panel.
ps_sl_inf_df["method_parent"] = "ProtoSeg"

# WeaSeL prediction time per image, pooled per (method, batch_size).
ws_inf_df = ws_test_profile_df[
    ws_test_profile_df["Action"] == "[Learner]WeaselUnet.prediction"
].drop(columns=["index", "Action", "Percentage (%)"])
ws_inf_df["Mean per Image (s)"] = ws_inf_df["Mean (s)"] / ws_inf_df["batch_size"]
ws_inf_df["Std per Image (s)"] = ws_inf_df["Std (s)"] / ws_inf_df["batch_size"]
ws_inf_df = combine_mean_variance(
    ws_inf_df,
    groupby_cols=["method", "batch_size"],
    mean_col="Mean per Image (s)",
    std_col="Std per Image (s)",
    num_items_col="Num Calls",
)
ws_inf_df = calc_confidence_limits(ws_inf_df, "Mean per Image (s)", "Std per Image (s)")
# A time cannot be negative, so the lower limit is floored at zero.
ws_inf_df["CL 95 L"] = ws_inf_df["CL 95 L"].clip(lower=0)

# The same Simple Learner rows are repeated here so that "SL" appears in both panels.
ws_sl_inf_df = pd.concat(
    [
        sl_inf_df[sl_inf_df["batch_size"] <= 16].drop(
            columns=["shot", "Mean (s)", "Std (s)", "Sum (s)"]
        ),
        ws_inf_df,
    ]
)
# Facet key: these rows go into the "WeaSeL" panel.
ws_sl_inf_df["method_parent"] = "WeaSeL"

# Only batch sizes of 4 and above are plotted.
all_inf_df = pd.concat([ps_sl_inf_df, ws_sl_inf_df])
all_inf_df = all_inf_df[all_inf_df["batch_size"] >= 4]

line_chart = (
    alt.Chart(all_inf_df)
    .mark_line()
    .encode(
        x="batch_size",
        y="Mean per Image (s)",
        color=color,
    )
)

# Semi-transparent band between the 95 % confidence limits.
error_chart = (
    alt.Chart(all_inf_df)
    .mark_errorband(opacity=0.2)
    .encode(
        x=alt.X("batch_size").title("Batch Size"),
        y=alt.Y("CL 95 U").title("Time per Image (s)"),
        y2="CL 95 L",
        color=color,
    )
)

# The band is the first layer and the line the second. One facet row per
# method_parent, each with its own y scale.
(
    (error_chart + line_chart)
    .properties(width=335, height=200)
    .facet(row=alt.Row("method_parent", header=alt.Header(title=None)), spacing=10)
    .resolve_scale(y="independent")
    .configure_axis(labelFontSize=14, titleFontSize=14)
    .configure_header(labelFontSize=14, titleFontSize=14)
    .configure_legend(labelFontSize=14, titleFontSize=16)
)

In [None]:
# Colour encoding restricted to the three ProtoSeg variants.
color = alt.Color(
    "method:N",
    scale=color_scale_ps,
    title="Method",
    legend=alt.Legend(
        orient="bottom",
        direction="horizontal",
        titleAnchor="start",
        columns=4,
        symbolOpacity=1,
    ),
)

# ProtoSeg post_process time per image, pooled per (method, batch_size).
ps_inf_df = ps_test_profile_df[
    ps_test_profile_df["Action"] == "[Learner]ProtosegUnet.post_process"
].drop(columns=["index", "Action", "Percentage (%)"])
ps_inf_df["Mean per Image (s)"] = ps_inf_df["Mean (s)"] / ps_inf_df["batch_size"]
ps_inf_df["Std per Image (s)"] = ps_inf_df["Std (s)"] / ps_inf_df["batch_size"]
ps_inf_df = combine_mean_variance(
    ps_inf_df,
    groupby_cols=["method", "batch_size"],
    mean_col="Mean per Image (s)",
    std_col="Std per Image (s)",
    num_items_col="Num Calls",
)
ps_inf_df = calc_confidence_limits(ps_inf_df, "Mean per Image (s)", "Std per Image (s)")

# Only batch sizes of 4 and above are plotted.
ps_inf_df = ps_inf_df[ps_inf_df["batch_size"] >= 4]

line_chart = (
    alt.Chart(ps_inf_df)
    .mark_line()
    .encode(
        x="batch_size",
        y="Mean per Image (s)",
        color=color,
    )
)

# Semi-transparent band between the 95 % confidence limits.
error_chart = (
    alt.Chart(ps_inf_df)
    .mark_errorband(opacity=0.2)
    .encode(
        x=alt.X("batch_size").title("Batch Size"),
        y=alt.Y("CL 95 U").title("Time per Image (s)"),
        y2="CL 95 L",
        color=color,
    )
)

# Layer line and band and apply the publication font sizes.
(
    (line_chart + error_chart)
    .properties(width=335, height=200)
    .configure_axis(labelFontSize=14, titleFontSize=14)
    .configure_legend(labelFontSize=14, titleFontSize=16)
)

## Hypothesis Testing


In [None]:
# def test_two_methods_time_paired(
#     faster_method: str, slower_method: str, action_name: str
# ) -> tuple[float, float]:
#     df = test_profile_df[test_profile_df["Action"] == action_name]
#     df = df.drop(
#         columns=["index", "Action", "Std (s)", "Num Calls", "Sum (s)", "Percentage (%)"]
#     )
#     slower_df = df[df["method"] == slower_method]
#     faster_df = df[df["method"] == faster_method]

#     merged_df = pd.merge(
#         slower_df, faster_df, on=["batch_size", "shot"], suffixes=(" S", " F")
#     )
#     n = len(merged_df)

#     merged_df["diff"] = merged_df["Mean (s) F"] - merged_df["Mean (s) S"]

#     t_value = merged_df["diff"].mean() / (merged_df["diff"].std() / np.sqrt(n))
#     p_value = stats.t.cdf(t_value, n - 1)
#     assert isinstance(p_value, float)

#     return t_value, p_value

In [None]:
# test_two_methods_time_paired(
#     "EO-ProtoSeg", "ProtoSeg", "[Learner]ProtosegUnet.evaluation_process"
# )

In [None]:
# test_two_methods_time_paired(
#     "EO-WeaSeL", "WeaSeL", "[Learner]WeaselUnet.evaluation_process"
# )

In [None]:
def test_two_methods_time(
    faster_method: str, slower_method: str, action_name: str
) -> tuple[float, float]:
    def combine_variance(df, mean_value):
        within = ((df["Num Calls"] - 1) * df["Std (s)"] ** 2).sum()
        between = (df["Num Calls"] * (df["Mean (s)"] - mean_value) ** 2).sum()
        return (within + between) / df["Num Calls"].sum()

    df = test_profile_df[test_profile_df["Action"] == action_name]
    df = df.drop(columns=["index", "Action", "Percentage (%)"])
    slower_df = df[df["method"] == slower_method]
    faster_df = df[df["method"] == faster_method]

    slower_n = slower_df["Num Calls"].sum()
    faster_n = faster_df["Num Calls"].sum()

    slower_mean = slower_df["Sum (s)"].sum() / slower_n
    faster_mean = faster_df["Sum (s)"].sum() / faster_n

    slower_var = combine_variance(slower_df, slower_mean) / slower_n
    faster_var = combine_variance(faster_df, faster_mean) / faster_n

    t_value = (faster_mean - slower_mean) / ((faster_var + slower_var) ** 0.5)
    dof = ((faster_var + slower_var) ** 2) / (
        (faster_var**2) / (faster_n - 1) + (slower_var**2) / (slower_n - 1)
    )
    p_value = stats.t.cdf(t_value, dof)
    assert isinstance(p_value, float)

    return t_value, p_value

In [None]:
# One-sided Welch test: is EO-ProtoSeg's get_prototypes time lower than ProtoSeg's?
# Returns (t value, lower-tail p-value).
test_two_methods_time("EO-ProtoSeg", "ProtoSeg", "[Learner]ProtosegUnet.get_prototypes")

In [None]:
# One-sided Welch test: is EO-ProtoSeg's prediction time lower than ProtoSeg's?
# Returns (t value, lower-tail p-value).
test_two_methods_time("EO-ProtoSeg", "ProtoSeg", "[Learner]ProtosegUnet.prediction")

In [None]:
# One-sided Welch test: is EO-ProtoSeg's post_process time lower than ProtoSeg's?
# Returns (t value, lower-tail p-value).
test_two_methods_time("EO-ProtoSeg", "ProtoSeg", "[Learner]ProtosegUnet.post_process")

In [None]:
# One-sided Welch test: is EO-WeaSeL's tune_process time lower than WeaSeL's?
# This reads test_profile_df directly, i.e. without the 33-call rescaling that
# the WeaSeL section applies to its own ws_test_profile_df.
test_two_methods_time("EO-WeaSeL", "WeaSeL", "[Learner]WeaselUnet.tune_process")

In [None]:
# One-sided Welch test: is EO-WeaSeL's prediction time lower than WeaSeL's?
# Returns (t value, lower-tail p-value).
test_two_methods_time("EO-WeaSeL", "WeaSeL", "[Learner]WeaselUnet.prediction")

# Others


## Comparison with Other Studies


In [None]:
# Hand-entered comparison table. Each row follows the order of `columns` below:
# method, method type, parameter count, then six scores (OD and OC for the
# Drishti, REFUGE and RIM-ONE columns). None marks a value that is not available.
# NOTE(review): the scores look like percentages and the bracketed numbers like
# citation indices -- confirm against the manuscript.
data = [
    ["CFEA [86]", "UDA", None, 79.78, 70.52, 88.96, 75.86, 60.08, 46.53],
    ["pOSAL [25]", "UDA", "5.8M", 91.42, 72.30, 90.83, 78.31, 76.75, 62.59],
    ["SIFA [182]", "UDA", "43.3M", 83.04, 57.29, 85.69, 69.57, 74.67, 52.84],
    ["WGAN [77]", "UDA", None, 91.20, 72.40, None, None, None, None],
    ["IOSUDA [88]", "UDA", "42.8M", 89.53, 65.56, 91.04, 71.03, 83.26, 60.07],
    ["CADA [87]", "UDA", "9.7M", 80.18, 72.41, 90.44, 77.21, 62.13, 47.1],
    ["SCUDA [90]", "UDA", None, 90.34, 66.61, None, None, 84.89, 61.65],
    ["GrabCut+UNet [65]", "WSS", None, 86.37, None, None, None, None, None],
    ["MERU [176]", "FSS", None, None, None, 83.92, 61.47, None, None],
    ["RDMT [178]", "SSS", None, None, None, None, 70.93, None, None],
    ["EO-ProtoSeg 1s", "FWS", "1.9M", 84.96, 63.69, 88.15, 71.17, 79.92, 44.01],
    ["EO-ProtoSeg 5s", "FWS", "1.9M", 85.30, 68.61, 88.18, 73.11, 80.46, 50.27],
    ["EO-ProtoSeg 10s", "FWS", "1.9M", 85.02, 68.93, 88.18, 73.52, 80.57, 52.42],
    ["EO-ProtoSeg best", "FWS", "1.9M", 86.80, 71.78, 88.21, 73.70, 80.39, 52.65],
]

# The first three columns are descriptive; the score columns start at position 3,
# which the difference cell below relies on.
columns = [
    "method",
    "method_type",
    "params",
    "drishti_od",
    "drishti_oc",
    "refuge_od",
    "refuge_oc",
    "rimone_od",
    "rimone_oc",
]

df = pd.DataFrame(data, columns=columns)
df

In [None]:
# All rows except the EO-ProtoSeg ones, i.e. the methods from other studies.
other_df = df[~df["method"].str.contains("EO-ProtoSeg")]

# Column-wise mean of the numeric score columns, ignoring missing values.
other_df.mean(axis=0, numeric_only=True, skipna=True)


In [None]:
# Score gap between every other method and the last row of `df`
# ("EO-ProtoSeg best"); positive means the other method scores higher.
baseline_row = df.iloc[-1]

other_df_diff = other_df.copy()
for score_col in other_df.columns[3:]:
    other_df_diff[score_col] = other_df_diff[score_col] - baseline_row[score_col]

other_df_diff

## Region Analysis


In [None]:
import pandas as pd
import altair as alt

In [None]:
# import wandb

# from utils.wandb import wandb_path

# runs = wandb.Api().runs(
#     wandb_path(False),
#     filters={"tags": "var_region"},
# )

# for i, run in enumerate(runs):
#     run_id = run.name.split(" ")[-1]
#     group = run.group
#     segments = run.config["region_segments"]
#     compactness = run.config["region_compactness"]
#     compactness = round(np.log10(compactness), 2)
#     artifacts = run.logged_artifacts()
#     selected_artifact = artifacts[len(artifacts) - 2]
#     selected_artifact.download(f"logs/wandb/1_region_metrics/{group} {segments} {compactness} {run_id}")
#     print(group, segments, compactness, run_id)

In [None]:
# import os

# df_list = []

# wandb_dir = "logs/wandb/1_region_metrics"
# for dir in os.listdir(wandb_dir):
#     if os.path.isfile(f"{wandb_dir}/{dir}"):
#         continue
#     _, segments, compactness, _ = dir.split(" ")
#     df = read_wandb_table(f"{wandb_dir}/{dir}/metrics.table.json")
#     assert sum(df["type"] == "TS") == len(df)
#     df.drop(columns=["type", "epoch"], inplace=True)
#     df.insert(0, "compactness", 10**(float(compactness)))
#     df.insert(0, "segments", int(segments))
#     df_list.append(df)

# region_metrics_df = pd.concat(df_list)
# region_metrics_df.to_csv("logs/wandb/1_region_metrics.csv", index=False)

In [None]:
# Load the coverage counts, turn each covered/total pair into a ratio, then
# average the ratios over all rows that share a (segments, compactness) setting.
region_coverage_df = (
    pd.read_csv("logs/region_coverage.csv")
    .assign(
        ratio_segments=lambda d: d["covered_segments"] / d["total_segments"],
        ratio_class_1=lambda d: d["covered_class_1"] / d["total_class_1"],
        ratio_class_2=lambda d: d["covered_class_2"] / d["total_class_2"],
    )
    .groupby(["segments", "compactness"])[
        ["ratio_segments", "ratio_class_1", "ratio_class_2"]
    ]
    .mean()
    .reset_index()
)

print(len(region_coverage_df))
region_coverage_df.head()

In [None]:
# Load the region metrics, average the cup and disc IoU per
# (segments, compactness) setting, and add their plain average as "iou".
region_metrics_df = (
    pd.read_csv("logs/wandb/1_region_metrics.csv")
    .groupby(["segments", "compactness"])[["iou_cup", "iou_disc"]]
    .mean()
    .reset_index()
    .assign(iou=lambda d: (d["iou_cup"] + d["iou_disc"]) / 2)
)

print(len(region_metrics_df))
region_metrics_df.head()

In [None]:
# Keep only the (segments, compactness) settings present in both tables and
# give the class ratios the cup/disc names used by the metrics.
region_df = region_coverage_df.merge(
    region_metrics_df,
    on=["segments", "compactness"],
    how="inner",
).rename(
    columns={
        "ratio_class_1": "coverage_cup",
        "ratio_class_2": "coverage_disc",
    }
)

# Scale these columns by 100 so the plots can label them in percent.
percent_cols = [
    "ratio_segments",
    "coverage_cup",
    "coverage_disc",
    "iou_cup",
    "iou_disc",
    "iou",
]
region_df[percent_cols] = region_df[percent_cols] * 100

print(len(region_df))
region_df.head()

In [None]:
# Exploratory view: disc IoU against the number of segments, coloured by disc
# coverage, one facet column per compactness value. The commented-out
# encodings switch the same plot to the cup metrics.
alt.Chart(region_df).mark_circle(size=60).encode(
    x=alt.X("segments"),
    y=alt.Y("iou_disc", scale=alt.Scale(domain=[81, 87])),
    # y=alt.Y("iou_cup", scale=alt.Scale(domain=[45, 65])),
    color=alt.Color("coverage_disc", scale=alt.Scale(scheme="viridis")),
    # color=alt.Color("coverage_cup", scale=alt.Scale(scheme="viridis"))
).properties(
    width=150,
    height=200,
).facet(column=alt.Column("compactness"))

In [None]:
props = {"height": 200, "width": 200}

color_legend = alt.Legend(orient="bottom", titleOrient="left")


def _region_line_chart(field: str, domain: list[int], title: str) -> alt.Chart:
    """Plot ``field`` against compactness (log x axis), one line per segment count."""
    return (
        alt.Chart(region_df)
        .mark_line()
        .encode(
            x=alt.X("compactness", scale=alt.Scale(type="log")),
            y=alt.Y(field, scale=alt.Scale(domain=domain), title=title),
            color=alt.Color(
                "segments",
                scale=alt.Scale(scheme="yelloworangered"),
                legend=color_legend,
            ),
            strokeWidth=alt.value(2.5),
        )
        .properties(**props)
    )


def _region_scatter_chart(
    x_field: str,
    x_domain: list[int],
    x_title: str,
    y_field: str,
    y_domain: list[int],
    y_title: str,
) -> alt.Chart:
    """Plot gray circles of ``y_field`` against ``x_field``."""
    return (
        alt.Chart(region_df)
        .mark_circle(size=40)
        .encode(
            x=alt.X(x_field, scale=alt.Scale(domain=x_domain), title=x_title),
            y=alt.Y(y_field, scale=alt.Scale(domain=y_domain), title=y_title),
            color=alt.value("gray"),
        )
        .properties(**props)
    )


# (field, axis domain, axis title) of the three coverage quantities. Each one is
# the y axis of a line chart and the x axis of the scatter chart beside it, so
# defining it once keeps the two panels of a row on the same scale and label.
_segments_axis = ("ratio_segments", [40, 90], "Valid Segments (%)")
_disc_axis = ("coverage_disc", [0, 70], "Coverage of OD Pixels (%)")
_cup_axis = ("coverage_cup", [0, 70], "Coverage of OC Pixels (%)")

# Left column: coverage against compactness.
line_segments = _region_line_chart(*_segments_axis)
line_disc = _region_line_chart(*_disc_axis)
line_cup = _region_line_chart(*_cup_axis)

# Right column: IoU against the matching coverage quantity.
scatter_segments = _region_scatter_chart(
    *_segments_axis, "mean(iou)", [50, 80], "Mean IoU (%)"
)
scatter_disc = _region_scatter_chart(
    *_disc_axis, "mean(iou_disc)", [60, 90], "OD IoU (%)"
)
scatter_cup = _region_scatter_chart(
    *_cup_axis, "mean(iou_cup)", [40, 70], "OC IoU (%)"
)

# `&` stacks charts vertically, `|` places the two columns side by side.
(
    (
        (line_segments & line_disc & line_cup)
        | (scatter_segments & scatter_disc & scatter_cup)
    )
    .configure_axis(labelFontSize=12, titleFontSize=14, titleFontWeight=500)
    .configure_legend(labelFontSize=14, titleFontSize=14, titleFontWeight=500)
)