# Initialization


## Imports


In [1]:
from typing import Literal

import numpy as np
import pandas as pd
import altair as alt
from scipy import stats

alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

## Utils


In [2]:
def read_wandb_table(path: str) -> pd.DataFrame:
    """Load a W&B table export (a JSON file) into a DataFrame.

    The file must contain a JSON object with a ``"columns"`` entry (the column
    names) and a ``"data"`` entry (the rows, one list per row).

    Args:
        path: Location of the JSON file.

    Returns:
        A DataFrame built from the rows, labelled with the stored column names.

    Raises:
        KeyError: If ``"columns"`` or ``"data"`` is missing from the JSON object.
    """
    import json

    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # encoding, which can mis-decode non-ASCII cell values.
    with open(path, "r", encoding="utf-8") as file:
        table = json.load(file)
    return pd.DataFrame(table["data"], columns=table["columns"])

# Metrics


## Download


In [3]:
# import wandb

# from utils.wandb import wandb_path

# runs = wandb.Api().runs(
#     wandb_path(False),
#     filters={"jobType": "test"},
# )

# for i, run in enumerate(runs):
#     run_id = run.name.split(" ")[-1]
#     dataset = run.config["test_dataset"].replace("-test", "")
#     group = run.group
#     if group == "SL":
#         pass
#     elif len(group.split("-")) == 1:
#         group += "-new"
#     elif group.split("-")[1] == "b":
#         group = group.replace("-b", "-new-b")
#     run.logged_artifacts()[2].download(f"logs/wandb/metrics/{group} {dataset} {run_id}")
#     print(group, dataset, run_id)

In [4]:
# import os

# df_list = []

# wandb_dir = "logs/wandb/metrics"
# for dir in os.listdir(wandb_dir):
#     if os.path.isfile(f"{wandb_dir}/{dir}"):
#         continue
#     if not dir.startswith("SL"):
#         continue
#     _, dataset, _ = dir.split(" ")
#     df = read_wandb_table(f"{wandb_dir}/{dir}/metrics.table.json")
#     df.drop(columns=["type", "epoch"], inplace=True)
#     df.insert(0, "dataset", dataset)
#     df_list.append(df)

# simple_metrics_df = pd.concat(df_list)
# simple_metrics_df.to_csv("logs/wandb/simple_metrics.csv", index=False)

In [5]:
# import os

# df_list = []

# wandb_dir = "logs/wandb/metrics"
# for dir in os.listdir(wandb_dir):
#     if os.path.isfile(f"{wandb_dir}/{dir}"):
#         continue
#     if dir.startswith("SL"):
#         continue
#     group, dataset, _ = dir.split(" ")
#     df = read_wandb_table(f"{wandb_dir}/{dir}/metrics.table.json")
#     df.drop(columns=["type", "epoch"], inplace=True)
#     df.insert(0, "dataset", dataset)
#     df.insert(0, "method", group)
#     df_list.append(df)

# meta_metrics_df = pd.concat(df_list)
# meta_metrics_df.to_csv("logs/wandb/meta_metrics.csv", index=False)

## Preparation


In [6]:
# Supervised-learning (SL) baseline metrics; the CSV has no method column, so tag it.
simple_metrics_df = pd.read_csv("logs/wandb/simple_metrics.csv")
simple_metrics_df.insert(0, "method", "SL")

# Express both IoU scores as percentages.
for _iou_column in ("iou_cup", "iou_disc"):
    simple_metrics_df[_iou_column] = simple_metrics_df[_iou_column] * 100

# Overall IoU is the plain average of the cup and disc scores.
_iou_total = simple_metrics_df["iou_cup"] + simple_metrics_df["iou_disc"]
simple_metrics_df["iou"] = _iou_total / 2

simple_metrics_df

Unnamed: 0,method,dataset,batch,loss,iou_cup,iou_disc,iou
0,SL,DRISHTI-GS,0,0.108150,80.91,94.73,87.820
1,SL,DRISHTI-GS,1,0.102495,88.08,93.90,90.990
2,SL,DRISHTI-GS,2,0.160526,76.83,93.48,85.155
3,SL,DRISHTI-GS,3,0.204103,78.70,89.42,84.060
4,SL,DRISHTI-GS,4,0.127865,85.46,93.89,89.675
...,...,...,...,...,...,...,...
1417,SL,RIM-ONE-3,55,0.175297,64.91,88.02,76.465
1418,SL,RIM-ONE-3,56,0.152534,54.44,91.86,73.150
1419,SL,RIM-ONE-3,57,0.129055,72.66,95.33,83.995
1420,SL,RIM-ONE-3,58,0.108834,55.66,93.13,74.395


In [7]:
meta_metrics_df = pd.read_csv("logs/wandb/meta_metrics.csv")
# Keep only methods whose name does not end in "-B". The explicit copy makes the
# filtered frame independent of the one read from disk, so the column
# assignments below do not trigger SettingWithCopyWarning or write to a slice.
meta_metrics_df = meta_metrics_df[
    ~meta_metrics_df["method"].str.endswith("-B")
].copy()

# Express both IoU scores as percentages.
meta_metrics_df["iou_cup"] = meta_metrics_df["iou_cup"] * 100
meta_metrics_df["iou_disc"] = meta_metrics_df["iou_disc"] * 100

# Overall IoU is the plain average of the cup and disc scores.
meta_metrics_df["iou"] = (meta_metrics_df["iou_cup"] + meta_metrics_df["iou_disc"]) / 2

meta_metrics_df

Unnamed: 0,method,dataset,batch,shot,sparsity_mode,sparsity_value,loss,iou_cup,iou_disc,iou
0,EO-ProtoSeg,DRISHTI-GS,0,1,point,1.0,1.895624,0.00,39.94,19.970
1,EO-ProtoSeg,DRISHTI-GS,1,1,point,1.0,1.098613,0.00,0.00,0.000
2,EO-ProtoSeg,DRISHTI-GS,2,1,point,1.0,3.035231,0.00,17.13,8.565
3,EO-ProtoSeg,DRISHTI-GS,3,1,point,1.0,1.808312,0.00,39.31,19.655
4,EO-ProtoSeg,DRISHTI-GS,4,1,point,1.0,1.098613,0.00,0.00,0.000
...,...,...,...,...,...,...,...,...,...,...
101995,WeaSeL,RIM-ONE-3,1495,20,region,1.0,0.805449,26.34,54.45,40.395
101996,WeaSeL,RIM-ONE-3,1496,20,region,1.0,0.875089,11.79,54.07,32.930
101997,WeaSeL,RIM-ONE-3,1497,20,region,1.0,0.759501,23.46,63.76,43.610
101998,WeaSeL,RIM-ONE-3,1498,20,region,1.0,0.680005,23.69,75.08,49.385


## Comparison


In [8]:
# Show the prepared meta-learner metrics again as the starting point of the comparison.
meta_metrics_df

Unnamed: 0,method,dataset,batch,shot,sparsity_mode,sparsity_value,loss,iou_cup,iou_disc,iou
0,EO-ProtoSeg,DRISHTI-GS,0,1,point,1.0,1.895624,0.00,39.94,19.970
1,EO-ProtoSeg,DRISHTI-GS,1,1,point,1.0,1.098613,0.00,0.00,0.000
2,EO-ProtoSeg,DRISHTI-GS,2,1,point,1.0,3.035231,0.00,17.13,8.565
3,EO-ProtoSeg,DRISHTI-GS,3,1,point,1.0,1.808312,0.00,39.31,19.655
4,EO-ProtoSeg,DRISHTI-GS,4,1,point,1.0,1.098613,0.00,0.00,0.000
...,...,...,...,...,...,...,...,...,...,...
101995,WeaSeL,RIM-ONE-3,1495,20,region,1.0,0.805449,26.34,54.45,40.395
101996,WeaSeL,RIM-ONE-3,1496,20,region,1.0,0.875089,11.79,54.07,32.930
101997,WeaSeL,RIM-ONE-3,1497,20,region,1.0,0.759501,23.46,63.76,43.610
101998,WeaSeL,RIM-ONE-3,1498,20,region,1.0,0.680005,23.69,75.08,49.385


In [86]:
def compare_metrics(target_column: str, use_best: bool) -> pd.DataFrame:
    """Summarise one IoU column per dataset and method, with a 95% interval.

    Reads the module-level ``simple_metrics_df`` and ``meta_metrics_df``.

    Args:
        target_column: Column to summarise (e.g. ``"iou"``, ``"iou_disc"``,
            ``"iou_cup"``).
        use_best: If False, pool every row of a (dataset, method) pair. If
            True, first aggregate per (dataset, method, shot, sparsity_mode,
            sparsity_value) and keep, for each (dataset, method), only the
            configuration whose mean overall ``iou`` (reported as ``iou_ref``)
            is highest.

    Returns:
        One row per (dataset, method) with ``iou`` (mean of the target),
        ``iou_std``, ``iou_count``, ``iou_std_err`` and the normal-approximation
        bounds ``iou_low`` / ``iou_high`` (mean -/+ 1.96 standard errors). With
        ``use_best`` the selected configuration columns and ``iou_ref`` are
        included as well.
    """
    if use_best:
        config_columns = ["shot", "sparsity_mode", "sparsity_value"]

        def select(frame: pd.DataFrame, extra_columns: list[str]) -> pd.DataFrame:
            # Reference and target get dedicated columns so the function also
            # works when target_column is "iou" itself; selecting "iou" twice
            # and renaming it would leave no target column to aggregate.
            picked = frame[["dataset", "method", *extra_columns]].copy()
            picked["iou_ref"] = frame["iou"]
            picked["_target"] = frame[target_column]
            return picked

        comparison_df = (
            pd.concat(
                [
                    # The SL frame has no configuration columns; they become NaN.
                    select(simple_metrics_df, []),
                    select(meta_metrics_df, config_columns),
                ]
            )
            .groupby(
                ["dataset", "method", *config_columns],
                dropna=False,  # keep the SL rows, whose configuration is NaN
            )
            .agg(
                iou_ref=("iou_ref", "mean"),
                iou=("_target", "mean"),
                iou_std=("_target", "std"),
                iou_count=("_target", "count"),
            )
            .reset_index()
        )
        # Keep the configuration with the highest mean overall IoU per method.
        comparison_df = comparison_df.loc[
            comparison_df.groupby(["dataset", "method"])["iou_ref"].idxmax()
        ]
    else:
        comparison_df = (
            pd.concat(
                [
                    simple_metrics_df[["dataset", "method", target_column]],
                    meta_metrics_df[["dataset", "method", target_column]],
                ]
            )
            .groupby(["dataset", "method"])
            .agg(
                iou=(target_column, "mean"),
                iou_std=(target_column, "std"),
                iou_count=(target_column, "count"),
            )
        ).reset_index()

    comparison_df["iou_std_err"] = (
        comparison_df["iou_std"] / comparison_df["iou_count"] ** 0.5
    )
    comparison_df["iou_low"] = (
        comparison_df["iou"] - 1.96 * comparison_df["iou_std_err"]
    )
    comparison_df["iou_high"] = (
        comparison_df["iou"] + 1.96 * comparison_df["iou_std_err"]
    )

    return comparison_df

In [None]:
# Pooled summaries: every row of a (dataset, method) pair contributes.
comparison_df = compare_metrics("iou", False)
disc_comparison_df = compare_metrics("iou_disc", False)
cup_comparison_df = compare_metrics("iou_cup", False)
# NOTE(review): left disabled by the author; confirm that compare_metrics
# supports target "iou" together with use_best=True before re-enabling.
# best_comparison_df = compare_metrics("iou", True)
# Best-configuration summaries (configuration chosen by mean overall IoU).
best_disc_comparison_df = compare_metrics("iou_disc", True)
best_cup_comparison_df = compare_metrics("iou_cup", True)

In [12]:
def compare_methods_with_sl_by_dataset(df):
    """
    Performs one-tailed Welch t-tests comparing the SL method with every other
    method within each dataset, using summary statistics only.
    H0: μ_SL <= μ_other
    H1: μ_SL > μ_other

    Parameters:
    df: DataFrame containing columns 'dataset', 'method', 'iou', 'iou_std', 'iou_count'
        (one row per dataset/method; if several SL rows exist for a dataset the
        first one is used).

    Returns:
    DataFrame with one row per (dataset, non-SL method), sorted by dataset and
    p-value, numeric columns rounded to 6 decimals. Datasets without an SL row
    are skipped; if nothing can be compared an empty DataFrame with the result
    columns is returned.
    """
    result_columns = [
        "dataset",
        "method_compared",
        "sl_iou",
        "method_iou",
        "iou_diff",
        "t_statistic",
        "degrees_of_freedom",
        "p_value",
    ]
    results = []

    # Process each dataset separately, in order of first appearance
    for dataset, dataset_df in df.groupby("dataset", sort=False):
        # Get SL method statistics for this dataset
        sl_rows = dataset_df[dataset_df["method"] == "SL"]
        if sl_rows.empty:
            # No baseline to compare against in this dataset
            continue
        sl_stats = sl_rows.iloc[0]
        sl_mean = sl_stats["iou"]
        sl_std = sl_stats["iou_std"]
        sl_n = sl_stats["iou_count"]
        sl_var_of_mean = sl_std**2 / sl_n

        # Compare SL with each other method in this dataset
        for _, row in dataset_df[dataset_df["method"] != "SL"].iterrows():
            other_mean = row["iou"]
            other_std = row["iou_std"]
            other_n = row["iou_count"]
            other_var_of_mean = other_std**2 / other_n

            # Unpooled (Welch) standard error of the difference of means
            std_err = np.sqrt(sl_var_of_mean + other_var_of_mean)

            # Calculate t-statistic
            t_stat = (sl_mean - other_mean) / std_err

            # Calculate degrees of freedom using Welch-Satterthwaite equation
            df_num = (sl_var_of_mean + other_var_of_mean) ** 2
            df_denom = (sl_std**4) / (sl_n**2 * (sl_n - 1)) + (other_std**4) / (
                other_n**2 * (other_n - 1)
            )
            df_welch = df_num / df_denom

            # One-tailed p-value; the survival function avoids the precision
            # loss of computing 1 - cdf for large t
            p_value = stats.t.sf(t_stat, df_welch)

            # Store results
            results.append(
                {
                    "dataset": dataset,
                    "method_compared": row["method"],
                    "sl_iou": sl_mean,
                    "method_iou": other_mean,
                    "iou_diff": sl_mean - other_mean,
                    "t_statistic": t_stat,
                    "degrees_of_freedom": df_welch,
                    "p_value": p_value,
                }
            )

    # Create DataFrame (explicit columns keep the schema when there are no rows)
    result_df = pd.DataFrame(results, columns=result_columns)
    if result_df.empty:
        return result_df

    # Sort by dataset and p-value
    result_df = result_df.sort_values(["dataset", "p_value"])

    # Round numerical columns for better readability
    numeric_cols = result_columns[2:]
    result_df[numeric_cols] = result_df[numeric_cols].round(6)

    return result_df


# compare_methods_with_sl_by_dataset(best_comparison_df)

## Visualization


In [11]:
# Display order of the methods and the colour assigned to each of them.
_method_palette = {
    "ProtoSeg": "#ffda03",  # Yellow
    "O-ProtoSeg": "#e85d04",  # Orange
    "EO-ProtoSeg": "#d00000",  # Red
    "SL": "#757575",  # Gray
    "WeaSeL": "#43b0f1",  # Blue
    "O-WeaSeL": "#2ec4b6",  # Turquoise
    "EO-WeaSeL": "#2d6a4f",  # Green
}

ordered_methods = list(_method_palette)

color_scale = alt.Scale(
    domain=ordered_methods,
    range=list(_method_palette.values()),
)

In [12]:
def compose_bar_chart(
    dataframes: list[tuple[pd.DataFrame, str]], scale: tuple[float, float] | None = None
):
    """Build a faceted bar chart of mean IoU per method with error bars.

    Args:
        dataframes: Pairs of (summary frame, label). Each frame is tagged with
            its label in an ``iou_type`` column, which becomes the facet column.
            The frames must provide ``method``, ``dataset``, ``iou``,
            ``iou_low`` and ``iou_high``.
        scale: Optional fixed y-domain ``(low, high)``; values outside it are
            clamped. When omitted, every facet gets an independent y scale.

    Returns:
        The configured Altair chart, faceted by dataset (rows) and label (columns).
    """
    # Tag every frame with its label and stack them into a single table.
    data = pd.concat([entry[0].assign(iou_type=entry[1]) for entry in dataframes])

    fixed_domain = scale is not None
    y_scale = alt.Scale(domain=scale, clamp=True) if fixed_domain else alt.Scale()

    base = alt.Chart(data).encode(
        x=alt.X(
            "method:N",
            title=None,
            sort=ordered_methods,
            axis=alt.Axis(labels=False, ticks=False),
        ),
    )

    # Bars: mean IoU per method, coloured with the shared method palette.
    bars = base.mark_bar().encode(
        y=alt.Y(
            "iou:Q",
            title="Mean IoU",
            scale=y_scale,
        ),
        color=alt.Color(
            "method:N",
            scale=color_scale,
            title="Method",
            legend=alt.Legend(
                orient="bottom",
                direction="horizontal",
                titleAnchor="start",
                columns=4,
            ),
        ),
    )

    # Error bars spanning the precomputed iou_low..iou_high interval.
    error_bars = base.mark_errorbar(
        extent="ci", thickness=2.0, ticks=True, color="black"
    ).encode(
        y=alt.Y(
            "iou_low:Q",
            title=None,
            scale=y_scale,
        ),
        y2="iou_high:Q",
    )

    # Rounded mean printed as text, shifted down by a fixed pixel offset.
    value_labels = base.mark_text(
        align="center", baseline="top", dy=85, fontSize=14
    ).encode(
        text=alt.Text("iou:Q", format=".0f"),
    )

    layered = (bars + error_bars + value_labels).properties(width=200, height=200)  # type: ignore

    chart = (
        layered.facet(
            row=alt.Row("dataset:N", title="Dataset"),
            column=alt.Column("iou_type:N", title="IoU (%)", sort="descending"),
        )
        .configure_axis(labelFontSize=12, titleFontSize=16)
        .configure_header(labelFontSize=12, titleFontSize=16)
        .configure_legend(labelFontSize=14, titleFontSize=16)
    )

    # Without a fixed domain, let each facet choose its own y range.
    if not fixed_domain:
        chart = chart.resolve_scale(y="independent")

    return chart

In [25]:
# Optic-disc IoU per method: pooled mean vs. best configuration, with the
# y-axis fixed to 50-100 (compose_bar_chart clamps values outside the domain).
compose_bar_chart(
    [
        (disc_comparison_df, "Overall Mean - Optic Disc"),
        (best_disc_comparison_df, "Best Mean - Optic Disc"),
    ],
    (50, 100),
)

In [28]:
# Optic-cup IoU per method: pooled mean vs. best configuration, with the
# y-axis fixed to 0-80 (compose_bar_chart clamps values outside the domain).
# NOTE(review): the second label contains two spaces before the dash, unlike
# the optic-disc chart's label; confirm whether that is intended.
compose_bar_chart(
    [
        (cup_comparison_df, "Overall Mean - Optic Cup"),
        (best_cup_comparison_df, "Best Mean  - Optic Cup"),
    ],
    (0, 80),
)

In [15]:
def compose_line_chart(data: pd.DataFrame):
    """Plot mean IoU against sparsity value, faceted by sparsity mode and shot.

    Args:
        data: Per-sample metrics with ``shot``, ``sparsity_mode``,
            ``sparsity_value``, ``iou`` and ``method`` columns. The frame is
            copied; the caller's data is not modified.

    Returns:
        An Altair chart with an error band, a line and points per method. Rows
        are sparsity modes; columns are shot counts in ascending numeric order.
    """

    def shot_label(shot) -> str:
        return f"{shot} shot" if shot == 1 else f"{shot} shots"

    new_data = data.copy()
    # The facet sort must list the exact labels used in the column. Otherwise
    # it matches nothing and the columns fall back to lexical order
    # ("1 shot", "10 shots", ..., "5 shots").
    shot_order = [shot_label(shot) for shot in sorted(new_data["shot"].dropna().unique())]
    new_data["shot"] = new_data["shot"].apply(shot_label)
    new_data["sparsity_mode"] = "IoU " + new_data["sparsity_mode"] + " (%)"
    encodings = {
        "x": alt.X("sparsity_value", title=None),
        "y": alt.Y("mean(iou)", title=None),
        "color": alt.Color(
            "method",
            title="Methods",
            scale=color_scale,
            legend=alt.Legend(
                orient="bottom", direction="horizontal", titleAnchor="start"
            ),
        ),
    }

    # The three layers share the same encodings and differ only in the mark.
    error_band = (
        alt.Chart(new_data).mark_errorband(extent="ci", opacity=0.5).encode(**encodings)
    )
    line = alt.Chart(new_data).mark_line(strokeWidth=1).encode(**encodings)
    point = alt.Chart(new_data).mark_point(size=5).encode(**encodings)

    combined_chart = error_band + line + point  # type: ignore
    combined_chart = (
        combined_chart.properties(width=150, height=150)
        .facet(
            row=alt.Row("sparsity_mode", title=None),
            column=alt.Column(
                "shot",
                sort=shot_order,
                header=alt.Header(title="Sparsity Values", titleOrient="bottom"),
            ),
            spacing=10,
        )
        # Each facet keeps its own x scale.
        .resolve_scale(x="independent")
        .configure_axis(labelFontSize=12)
        .configure_header(labelFontSize=16, titleFontSize=16)
        .configure_legend(labelFontSize=14, titleFontSize=16)
    )

    return combined_chart

In [None]:
# Mean overall IoU of every meta-learning method across sparsity settings and shots.
compose_line_chart(meta_metrics_df)

In [None]:
# meta_metrics_df_with_ref = meta_metrics_df.copy()

# meta_metrics_df_with_ref["method_parent"] = (
#     meta_metrics_df_with_ref["method"].str.split("-").str[-1]
# )
# meta_metrics_df_with_ref["method_child"] = meta_metrics_df_with_ref["method"].apply(
#     lambda x: x.split("-")[0] if "-" in x else "original"
# )

# meta_metrics_df_with_ref = pd.merge(
#     meta_metrics_df_with_ref,
#     simple_metrics_df.groupby(["dataset"])["iou"].mean(),
#     on="dataset",
#     suffixes=("", "_ref"),
# )

# meta_metrics_df_with_ref

In [None]:
# base = alt.Chart(meta_metrics_df_with_ref)

# lines = base.mark_line().encode(
#     x=alt.X("sparsity_value", title=None),
#     y="mean(iou)",
#     color=alt.Color(
#         "method_parent",
#         scale=alt.Scale(domain=["ProtoSeg", "WeaSeL"], range=["#ff4444", "#77aaff"]),
#     ),
#     strokeDash="method_child",
# )

# ref_lines = base.mark_rule(color="#33cc33").encode(y="mean(iou_ref)")

# (lines + ref_lines).properties(width=150, height=150).facet(
#     row="sparsity_mode", column="shot"
# ).resolve_scale(x="independent")

In [None]:
# data = meta_metrics_df[
#     (meta_metrics_df["method"].str.endswith("ProtoSeg"))
#     & (meta_metrics_df["dataset"] == "REFUGE")
# ]
# alt.Chart(data).mark_errorband(extent="ci").encode(
#     x="sparsity_value",
#     y="mean(iou)",
#     color="method",
# ).properties(width=300, height=200).facet(
#     row="sparsity_mode", column="shot"
# ).resolve_scale(
#     x="independent",
# )

In [None]:
# data = meta_metrics_df[
#     (meta_metrics_df["method"].str.endswith("WeaSeL"))
#     & (meta_metrics_df["dataset"] == "REFUGE")
# ]
# alt.Chart(data).mark_errorband(extent="ci").encode(
#     x="sparsity_value",
#     y="mean(iou)",
#     color="method",
# ).properties(width=300, height=200).facet(
#     row="sparsity_mode", column="shot"
# ).resolve_scale(
#     x="independent",
# )

## Tables


In [12]:
# Join the best-configuration disc and cup summaries on their first five
# columns (taken positionally from the disc frame).
_merge_keys = list(best_disc_comparison_df.columns[:5])
best_results_df = best_disc_comparison_df.merge(
    best_cup_comparison_df,
    on=_merge_keys,
    suffixes=("_disc", "_cup"),
)

# Rank methods in presentation order so the table can be sorted by it.
_method_rank = {
    name: rank
    for rank, name in enumerate(
        [
            "ProtoSeg",
            "O-ProtoSeg",
            "EO-ProtoSeg",
            "SL",
            "WeaSeL",
            "O-WeaSeL",
            "EO-WeaSeL",
        ]
    )
}
best_results_df["method_order"] = best_results_df["method"].map(_method_rank)
best_results_df = best_results_df.sort_values(["dataset", "method_order"])

best_results_df

Unnamed: 0,dataset,method,shot,sparsity_mode,sparsity_value,iou_ref_disc,iou_disc,iou_std_disc,iou_count_disc,iou_std_err_disc,iou_low_disc,iou_high_disc,iou_ref_cup,iou_cup,iou_std_cup,iou_count_cup,iou_std_err_cup,iou_low_cup,iou_high_cup,method_order
4,DRISHTI-GS,ProtoSeg,20.0,region,1.0,78.2055,86.387,3.695057,10,1.16848,84.09678,88.67722,78.2055,70.024,4.430107,10,1.400923,67.278191,72.769809,0
2,DRISHTI-GS,O-ProtoSeg,20.0,region,1.0,78.337,86.731,4.15932,10,1.315292,84.153027,89.308973,78.337,69.943,5.516088,10,1.74434,66.524093,73.361907,1
0,DRISHTI-GS,EO-ProtoSeg,15.0,point,50.0,79.289,86.797,2.887009,10,0.912953,85.007613,88.586387,79.289,71.781,3.144224,10,0.994291,69.83219,73.72981,2
5,DRISHTI-GS,SL,,,,85.01848,92.915392,3.263361,102,0.323121,92.282075,93.548709,85.01848,77.121569,12.778533,102,1.265263,74.641653,79.601485,3
6,DRISHTI-GS,WeaSeL,20.0,region,0.5,41.9485,68.125,4.365184,10,1.380392,65.419431,70.830569,41.9485,15.772,4.99153,10,1.57846,12.678218,18.865782,4
3,DRISHTI-GS,O-WeaSeL,1.0,region,0.5,67.7655,80.057,4.397643,10,1.390657,77.331312,82.782688,67.7655,55.474,7.017162,10,2.219021,51.124718,59.823282,5
1,DRISHTI-GS,EO-WeaSeL,5.0,grid,0.5,73.7425,86.965,2.382805,10,0.753509,85.488122,88.441878,73.7425,60.52,7.824163,10,2.474218,55.670533,65.369467,6
11,REFUGE,ProtoSeg,20.0,contour,1.0,68.8805,83.515125,4.466825,80,0.499406,82.536289,84.493961,68.8805,54.245875,9.02428,80,1.008945,52.268342,56.223408,0
9,REFUGE,O-ProtoSeg,20.0,region,0.5,75.925938,87.00025,3.567963,80,0.39891,86.218386,87.782114,75.925938,64.851625,9.909855,80,1.107955,62.680032,67.023218,1
7,REFUGE,EO-ProtoSeg,20.0,region,0.75,80.95525,88.207875,2.375723,80,0.265614,87.687272,88.728478,80.95525,73.702625,5.789343,80,0.647268,72.433979,74.971271,2


In [13]:
# Print one LaTeX table row per (dataset, method) of the best-results table.
for _, row in best_results_df.iterrows():
    if row["method"] == "SL":
        # SL rows get "-" in both the shot and the sparsity cell.
        shot, sparsity = "-", "-"
    else:
        shot = int(row["shot"])
        if row["sparsity_mode"] == "point":
            # Point sparsity values are printed as whole numbers.
            sparsity = "point - " + str(int(row["sparsity_value"]))
        else:
            sparsity = row["sparsity_mode"] + " - " + f"{row['sparsity_value']:.2f}"
    print(
        f"& {row['method']} & {shot} & {sparsity} & {row['iou_disc']:.2f} & {row['iou_low_disc']:.2f}-{row['iou_high_disc']:.2f} & {row['iou_cup']:.2f} & {row['iou_low_cup']:.2f}-{row['iou_high_cup']:.2f} \\\\"
    )

& ProtoSeg & 20 & region - 1.00 & 86.39 & 84.10-88.68 & 70.02 & 67.28-72.77 \\
& O-ProtoSeg & 20 & region - 1.00 & 86.73 & 84.15-89.31 & 69.94 & 66.52-73.36 \\
& EO-ProtoSeg & 15 & point - 50 & 86.80 & 85.01-88.59 & 71.78 & 69.83-73.73 \\
& SL & - & - & 92.92 & 92.28-93.55 & 77.12 & 74.64-79.60 \\
& WeaSeL & 20 & region - 0.50 & 68.12 & 65.42-70.83 & 15.77 & 12.68-18.87 \\
& O-WeaSeL & 1 & region - 0.50 & 80.06 & 77.33-82.78 & 55.47 & 51.12-59.82 \\
& EO-WeaSeL & 5 & grid - 0.50 & 86.97 & 85.49-88.44 & 60.52 & 55.67-65.37 \\
& ProtoSeg & 20 & contour - 1.00 & 83.52 & 82.54-84.49 & 54.25 & 52.27-56.22 \\
& O-ProtoSeg & 20 & region - 0.50 & 87.00 & 86.22-87.78 & 64.85 & 62.68-67.02 \\
& EO-ProtoSeg & 20 & region - 0.75 & 88.21 & 87.69-88.73 & 73.70 & 72.43-74.97 \\
& SL & - & - & 90.18 & 89.91-90.46 & 75.96 & 75.37-76.56 \\
& WeaSeL & 20 & grid - 0.25 & 71.48 & 70.42-72.53 & 29.41 & 27.95-30.87 \\
& O-WeaSeL & 20 & grid - 0.25 & 86.16 & 85.46-86.87 & 70.41 & 68.77-72.04 \\
& EO-WeaSeL & 

In [31]:
# Restrict to EO-ProtoSeg and average the overall IoU per sparsity setting and shot.
_is_eop = meta_metrics_df["method"] == "EO-ProtoSeg"
eop_metrics_df = meta_metrics_df[_is_eop]

_eop_keys = ["sparsity_mode", "sparsity_value", "shot"]
_eop_grouped = eop_metrics_df.groupby(_eop_keys)
agg_eop_metrics_df = (
    _eop_grouped.agg(iou=("iou", "mean")).sort_values(_eop_keys).reset_index()
)

agg_eop_metrics_df

Unnamed: 0,sparsity_mode,sparsity_value,shot,iou
0,contour,0.1,1,72.467206
1,contour,0.1,5,74.821863
2,contour,0.1,10,75.286863
3,contour,0.1,15,75.526275
4,contour,0.1,20,75.279265
...,...,...,...,...
120,skeleton,1.0,1,76.974706
121,skeleton,1.0,5,77.997794
122,skeleton,1.0,10,78.174657
123,skeleton,1.0,15,78.134853


In [32]:
# Nested table for the LaTeX printout: one entry per sparsity mode, holding one
# row per sparsity value of the form [sparsity_value, iou per shot (ascending)].
# Grouping by the actual column values, rather than assuming a fixed
# 5 x 5 x 5 layout of the aggregated rows, keeps each IoU in the right cell
# even if a mode/value/shot combination is missing.
agg_eop_metrics_ls = [
    [
        [sparsity_value, *value_df.sort_values("shot")["iou"].tolist()]
        for sparsity_value, value_df in mode_df.groupby("sparsity_value")
    ]
    for _, mode_df in agg_eop_metrics_df.groupby("sparsity_mode")
]

In [33]:
sparsity_modes = ["contours", "grid", "point", "regions", "skeleton"]

# Print one \multirow group per sparsity mode, each followed by \hline.
for mode_label, mode_rows in zip(sparsity_modes, agg_eop_metrics_ls):
    group_header = "\\multirow{5}{*}{mode}  ".replace("mode", mode_label)
    row_texts = []
    for ls in mode_rows:
        # First cell is the sparsity value, the rest are the IoU means per shot.
        cells = [f"{np.mean(v):.2f}" for v in ls]
        row_texts.append("& " + " & ".join(cells))
    print(group_header, end="")
    print(" \\\\\n".join(row_texts) + " \\\\")
    print("\\hline")

\multirow{5}{*}{contours}  & 0.10 & 72.47 & 74.82 & 75.29 & 75.53 & 75.28 \\
& 0.25 & 72.83 & 75.34 & 75.17 & 75.22 & 75.12 \\
& 0.50 & 74.02 & 75.27 & 74.88 & 74.92 & 75.27 \\
& 0.75 & 74.46 & 75.25 & 75.22 & 75.33 & 75.12 \\
& 1.00 & 74.48 & 75.30 & 75.09 & 75.13 & 75.19 \\
\hline
\multirow{5}{*}{grid}  & 0.10 & 72.04 & 76.96 & 77.33 & 77.25 & 77.91 \\
& 0.25 & 75.50 & 77.54 & 77.80 & 77.64 & 77.68 \\
& 0.50 & 76.94 & 77.62 & 77.79 & 78.00 & 77.79 \\
& 0.75 & 76.54 & 77.75 & 77.74 & 77.86 & 77.82 \\
& 1.00 & 77.37 & 77.69 & 77.72 & 77.87 & 77.89 \\
\hline
\multirow{5}{*}{point}  & 1.00 & 8.41 & 16.90 & 17.73 & 17.62 & 17.95 \\
& 13.00 & 75.37 & 77.27 & 77.71 & 77.83 & 77.93 \\
& 25.00 & 76.64 & 77.76 & 77.69 & 77.54 & 78.08 \\
& 37.00 & 76.62 & 77.52 & 77.76 & 77.84 & 77.95 \\
& 50.00 & 77.13 & 77.83 & 77.82 & 78.23 & 77.94 \\
\hline
\multirow{5}{*}{regions}  & 0.10 & 76.30 & 77.96 & 78.27 & 78.73 & 78.63 \\
& 0.25 & 77.05 & 78.48 & 78.37 & 78.63 & 78.78 \\
& 0.50 & 76.94 & 78.36 & 7

## Hypothesis Testing


In [None]:
def test_two_methods_score_od_oc(
    higher_method: str, lower_method: str
) -> tuple[pd.DataFrame, pd.DataFrame]:
    df = meta_metrics_df.drop(
        columns=["loss", "iou", "shot", "sparsity_mode", "sparsity_value"]
    )
    higher_df = df[df["method"] == higher_method]
    lower_df = df[df["method"] == lower_method]

    merged_df = pd.merge(
        higher_df, lower_df, on=["dataset", "batch"], suffixes=("_h", "_l")
    )
    n = len(merged_df)

    merged_df["iou_cup_diff"] = merged_df["iou_cup_h"] - merged_df["iou_cup_l"]
    merged_df["iou_disc_diff"] = merged_df["iou_disc_h"] - merged_df["iou_disc_l"]

    grouped_df = merged_df.groupby(["dataset"])[["iou_cup_diff", "iou_disc_diff"]]
    t_vals = grouped_df.mean() / (grouped_df.std() / np.sqrt(n))

    p_vals = t_vals.apply(lambda x: stats.t.sf(x, n - 1))

    return t_vals, p_vals

In [None]:
def test_two_methods_score(
    higher_method: str, lower_method: str
) -> tuple[pd.DataFrame, pd.DataFrame]:
    df = meta_metrics_df.drop(
        columns=[
            "loss",
            "iou_cup",
            "iou_disc",
            "shot",
            "sparsity_mode",
            "sparsity_value",
        ]
    )
    higher_df = df[df["method"] == higher_method]
    lower_df = df[df["method"] == lower_method]

    merged_df = pd.merge(
        higher_df, lower_df, on=["dataset", "batch"], suffixes=("_h", "_l")
    )
    n = len(merged_df)

    merged_df["iou_diff"] = merged_df["iou_h"] - merged_df["iou_l"]

    grouped_df = merged_df.groupby(["dataset"])["iou_diff"]
    t_vals = grouped_df.mean() / (grouped_df.std() / np.sqrt(n))

    p_vals = t_vals.apply(lambda x: stats.t.sf(x, n - 1))

    return t_vals, p_vals

In [16]:
# Per dataset: is EO-ProtoSeg's overall IoU higher than ProtoSeg's? Returns (t values, one-sided p-values).
test_two_methods_score("EO-ProtoSeg", "ProtoSeg")

(dataset
 DRISHTI-GS     44.235291
 REFUGE        170.117907
 RIM-ONE-3       1.939907
 Name: iou_diff, dtype: float64,
 dataset
 DRISHTI-GS    0.000000
 REFUGE        0.000000
 RIM-ONE-3     0.026207
 Name: iou_diff, dtype: float64)

In [17]:
# Per dataset: is EO-WeaSeL's overall IoU higher than WeaSeL's? Returns (t values, one-sided p-values).
test_two_methods_score("EO-WeaSeL", "WeaSeL")

(dataset
 DRISHTI-GS    743.767481
 REFUGE        505.085983
 RIM-ONE-3     576.031568
 Name: iou_diff, dtype: float64,
 dataset
 DRISHTI-GS    0.0
 REFUGE        0.0
 RIM-ONE-3     0.0
 Name: iou_diff, dtype: float64)

In [18]:
# Per dataset: is EO-ProtoSeg's overall IoU higher than EO-WeaSeL's? Returns (t values, one-sided p-values).
test_two_methods_score("EO-ProtoSeg", "EO-WeaSeL")

(dataset
 DRISHTI-GS    18.292044
 REFUGE        29.431257
 RIM-ONE-3      2.113864
 Name: iou_diff, dtype: float64,
 dataset
 DRISHTI-GS     4.196030e-74
 REFUGE        1.470543e-184
 RIM-ONE-3      1.727314e-02
 Name: iou_diff, dtype: float64)

In [19]:
# Per dataset: is ProtoSeg's overall IoU higher than WeaSeL's? Returns (t values, one-sided p-values).
test_two_methods_score("ProtoSeg", "WeaSeL")

(dataset
 DRISHTI-GS    245.965474
 REFUGE        158.395030
 RIM-ONE-3     178.832029
 Name: iou_diff, dtype: float64,
 dataset
 DRISHTI-GS    0.0
 REFUGE        0.0
 RIM-ONE-3     0.0
 Name: iou_diff, dtype: float64)

In [127]:
# Welch t-test from summary statistics: is the SL baseline ahead of the best
# meta-learning method by less than `diff_value` IoU points? Run per dataset,
# separately for optic disc and optic cup.
diff_value = 5
# Columns of the best-configuration summaries that are not needed for the test.
dropped_columns = [
    "shot",
    "sparsity_mode",
    "sparsity_value",
    "iou_std_err",
    "iou_low",
    "iou_high",
]
# Shorter names for the statistics used below.
rename_columns = {
    "iou_ref": "ref",
    "iou": "mean",
    "iou_std": "std",
    "iou_count": "n",
}
disc_df = best_disc_comparison_df.drop(columns=dropped_columns).rename(
    columns=rename_columns
)
cup_df = best_cup_comparison_df.drop(columns=dropped_columns).rename(
    columns=rename_columns
)

# SL baseline statistics, tagged with the structure they describe.
simple_disc_df = disc_df[disc_df["method"] == "SL"].drop(columns=["method", "ref"])
simple_disc_df["object"] = "disc"
simple_cup_df = cup_df[cup_df["method"] == "SL"].drop(columns=["method", "ref"])
simple_cup_df["object"] = "cup"
simple_df = pd.concat([simple_disc_df, simple_cup_df], axis=0)

# Per dataset, the non-SL method with the highest "ref" value (mean overall IoU).
meta_disc_df = disc_df.loc[
    disc_df[disc_df["method"] != "SL"].groupby(["dataset"])["ref"].idxmax()
].drop(columns=["method", "ref"])
meta_disc_df["object"] = "disc"
meta_cup_df = cup_df.loc[
    cup_df[cup_df["method"] != "SL"].groupby(["dataset"])["ref"].idxmax()
].drop(columns=["method", "ref"])
meta_cup_df["object"] = "cup"
meta_df = pd.concat([meta_disc_df, meta_cup_df], axis=0)

# One row per (dataset, object) with SL ("_simple") and meta ("_meta") statistics.
df = pd.merge(
    simple_df,
    meta_df,
    on=["dataset", "object"],
    suffixes=("_simple", "_meta"),
)

# Squared standard errors of the two means.
var_simple = (df["std_simple"] ** 2) / df["n_simple"]
var_meta = (df["std_meta"] ** 2) / df["n_meta"]

# t statistic of (SL mean - meta mean) against the margin diff_value.
df["t_value"] = (df["mean_simple"] - df["mean_meta"] - diff_value) / (
    var_simple + var_meta
) ** 0.5

# Welch-Satterthwaite degrees of freedom.
df["dof"] = ((var_simple + var_meta) ** 2) / (
    (var_simple**2) / (df["n_simple"] - 1) + (var_meta**2) / (df["n_meta"] - 1)
)

# Lower-tail probability: small values indicate that SL's lead over the best
# meta-learning method is below diff_value.
df["p_value"] = stats.t.cdf(df["t_value"], df["dof"]).round(6)

df

Unnamed: 0,dataset,mean_simple,std_simple,n_simple,object,mean_meta,std_meta,n_meta,t_value,dof,p_value
0,DRISHTI-GS,92.915392,3.263361,102,disc,86.797,2.887009,10,1.154831,11.380103,0.864083
1,REFUGE,90.184208,4.799636,1200,disc,88.207875,2.375723,80,-10.093038,127.220853,0.0
2,RIM-ONE-3,90.933917,4.30777,120,disc,80.390833,3.769383,12,4.790889,14.038844,0.999857
3,DRISHTI-GS,77.121569,12.778533,102,cup,71.781,3.144224,10,0.211639,50.052488,0.583376
4,REFUGE,75.964983,10.492958,1200,cup,73.702625,5.789343,80,-3.830807,117.021311,0.000103
5,RIM-ONE-3,56.123917,23.561584,120,cup,52.645833,8.510627,12,-0.466089,32.556074,0.322128


# Test Profiles


## Download


In [13]:
# import wandb

# from utils.wandb import wandb_path

# runs = wandb.Api().runs(
#     wandb_path(False),
#     filters={"jobType": "profile-test"},
# )

# group_names = {
#     "SL": "SL",
#     "WS-ori": "WeaSeL",
#     "WS-ms": "O-WeaSeL",
#     "WS": "EO-WeaSeL",
#     "PS-ori": "ProtoSeg",
#     "PS-mp": "O-ProtoSeg",
#     "PS": "EO-ProtoSeg",
# }

# for i, run in enumerate(runs):
#     group = group_names[run.group]
#     if group in ["WeaSeL"]:
#         continue
#     run_id = run.name.split(" ")[-1]
#     batch_size = run.config["batch_size"]
#     shot = run.config.get("shot", -1)
#     shot_str = f" s{shot}" if shot != -1 else ""
#     run.logged_artifacts()[0].download(
#         f"logs/wandb/test_profile/{group} b{batch_size}{shot_str} {run_id}"
#     )
#     print(group, run_id)

In [14]:
# import os

# df_list = []

# wandb_dir = "logs/wandb/test_profile"
# for i, dir in enumerate(os.listdir(wandb_dir)):
#     if os.path.isfile(f"{wandb_dir}/{dir}"):
#         continue
#     splitted = dir.split(" ")
#     if len(splitted) == 3:
#         group, batch_str, _ = splitted
#         shot = -1
#     else:
#         group, batch_str, shot_str, _ = splitted
#         shot = int(shot_str[1:])
#     batch_size = int(batch_str[1:])
#     df = read_wandb_table(f"{wandb_dir}/{dir}/test_profile.table.json")
#     df.insert(0, "shot", shot)
#     df.insert(0, "batch_size", batch_size)
#     df.insert(0, "method", group)
#     df.insert(0, "index", i)
#     df_list.append(df)

# test_profile_df = pd.concat(df_list)
# test_profile_df.to_csv("logs/wandb/test_profile.csv", index=False)

## Preparation


In [15]:
def calc_confidence_limits(
    data: pd.DataFrame,
    mean_col: str = "Mean (s)",
    std_col: str = "Std (s)",
    ci: Literal[90, 95, 99] = 95,
) -> pd.DataFrame:
    """Add a standard error and normal-approximation confidence limits.

    The frame is modified in place and also returned. Three columns are written:
    ``"Std Err"`` (``std_col`` divided by the square root of ``"Num Calls"``),
    ``"CL {ci} L"`` and ``"CL {ci} U"`` (mean -/+ z standard errors).

    Args:
        data: Frame with ``mean_col``, ``std_col`` and a ``"Num Calls"`` column.
        mean_col: Name of the column holding the means.
        std_col: Name of the column holding the standard deviations.
        ci: Confidence level in percent; one of 90, 95 or 99.

    Returns:
        The same ``data`` object with the added columns.

    Raises:
        ValueError: If ``ci`` is not a supported confidence level.
    """
    z_by_level = {90: 1.645, 95: 1.96, 99: 2.576}
    # Validate before touching the frame; previously an unsupported level
    # failed with UnboundLocalError after "Std Err" had already been written.
    if ci not in z_by_level:
        raise ValueError(
            f"Unsupported confidence level {ci!r}; expected one of {sorted(z_by_level)}"
        )
    z = z_by_level[ci]

    data["Std Err"] = data[std_col] / (data["Num Calls"]) ** 0.5
    data[f"CL {ci} L"] = data[mean_col] - z * data["Std Err"]
    data[f"CL {ci} U"] = data[mean_col] + z * data["Std Err"]
    return data

In [16]:
def combine_mean(data: pd.DataFrame, mean_col: str, num_items_col: str) -> float:
    """Return the mean of ``mean_col`` weighted by ``num_items_col``."""
    weights = data[num_items_col]
    weighted_total = (data[mean_col] * weights).sum()
    return weighted_total / weights.sum()


def combine_variance(
    data: pd.DataFrame,
    mean_col: str,
    var_col: str,
    num_items_col: str,
    combined_mean: float | None = None,
) -> float:
    num_items = data[num_items_col]
    variances = data[var_col] ** 2
    means = data[mean_col]
    if combined_mean is None:
        combined_mean = combine_mean(data, mean_col, num_items_col)

    weighted_var = ((num_items - 1) * variances).sum()
    between_var = (num_items * (means - combined_mean) ** 2).sum()

    total_num_items = num_items.sum()
    return (weighted_var + between_var) / total_num_items


def combine_mean_variance(
    data: pd.DataFrame,
    groupby_cols: list[str],
    mean_col: str,
    std_col: str,
    num_items_col: str,
) -> pd.DataFrame:
    """Collapse each group of rows into one pooled mean / std / count row.

    Args:
        data: One row per sub-sample.
        groupby_cols: Columns identifying which rows are pooled together.
        mean_col: Column with each sub-sample's mean.
        std_col: Column with each sub-sample's standard deviation.
        num_items_col: Column with each sub-sample's size.

    Returns:
        A frame with the group columns plus ``mean_col``, ``std_col`` and
        ``num_items_col`` holding the pooled values for each group.
    """

    def _pool_group(group: pd.DataFrame) -> pd.Series:
        # Compute the pooled mean once and reuse it for the variance.
        pooled_mean = combine_mean(group, mean_col, num_items_col)
        pooled_var = combine_variance(
            group, mean_col, std_col, num_items_col, pooled_mean
        )
        return pd.Series(
            {
                mean_col: pooled_mean,
                std_col: pooled_var**0.5,
                num_items_col: group[num_items_col].sum(),
            }
        )

    return data.groupby(groupby_cols).apply(_pool_group).reset_index()


In [17]:
# Load the profiler table written to CSV by the (commented-out) export cell:
# one row per profiled action per test run.
test_profile_df = pd.read_csv("logs/wandb/test_profile.csv")

# Preview the table.
test_profile_df

Unnamed: 0,index,method,batch_size,shot,Action,Mean (s),Std (s),Num Calls,Sum (s),Percentage (%)
0,0,EO-ProtoSeg,1,1,Total,-1.000000,-1.000000,6424,8.871959,100.000000
1,0,EO-ProtoSeg,1,1,[Strategy]SingleDeviceStrategy.test_step,0.020325,0.037696,400,8.130012,91.637160
2,0,EO-ProtoSeg,1,1,[Learner]ProtosegUnet.evaluation_process,0.020257,0.037689,400,8.102893,91.331492
3,0,EO-ProtoSeg,1,1,[Learner]ProtosegUnet.forward,0.015158,0.037529,400,6.063162,68.340736
4,0,EO-ProtoSeg,1,1,[Learner]ProtosegUnet.get_prototypes,0.010070,0.037423,400,4.027890,45.400233
...,...,...,...,...,...,...,...,...,...,...
2882,510,WeaSeL,9,20,[Learner]WeaselUnet.tune_process,0.131911,0.018867,1452,191.534241,94.796746
2883,511,WeaSeL,9,5,Total,-1.000000,-1.000000,10804,53.856575,100.000000
2884,511,WeaSeL,9,5,[Strategy]SingleDeviceStrategy.test_step,1.202386,0.006033,44,52.904991,98.233115
2885,511,WeaSeL,9,5,[Learner]WeaselUnet.evaluation_process,1.202291,0.005994,44,52.900787,98.225310


## Simple Learner


In [6]:
# Profiler rows of the Simple Learner ("SL") runs only.
sl_test_profile_df = test_profile_df.loc[test_profile_df["method"].isin(["SL"])]

# Forward-pass timings, converted from per-batch to per-image by dividing
# the mean and the standard deviation by the batch size.
sl_inf_df = sl_test_profile_df.loc[
    sl_test_profile_df["Action"] == "[Learner]SimpleUnet.forward"
].drop(columns=["index", "Action", "Percentage (%)"])
sl_inf_df = sl_inf_df.assign(
    **{
        "Mean per Image (s)": sl_inf_df["Mean (s)"] / sl_inf_df["batch_size"],
        "Std per Image (s)": sl_inf_df["Std (s)"] / sl_inf_df["batch_size"],
    }
)
sl_inf_df = calc_confidence_limits(
    sl_inf_df, mean_col="Mean per Image (s)", std_col="Std per Image (s)"
)

# Mean per-image time against batch size.
line_chart = alt.Chart(sl_inf_df).mark_line().encode(
    x=alt.X("batch_size"),
    y=alt.Y("Mean per Image (s)"),
    color=alt.Color("method"),
)

# Band between the lower and upper 95% confidence limits.
error_chart = alt.Chart(sl_inf_df).mark_errorband().encode(
    x=alt.X("batch_size"),
    y=alt.Y("CL 95 U", title="Time per Image CL 95 (s)"),
    y2=alt.Y2("CL 95 L"),
    color=alt.Color("method"),
)

alt.layer(line_chart, error_chart).properties(width=600, height=400)

## ProtoSeg


In [7]:
# Profiler rows of every method whose name ends with "ProtoSeg".
ps_test_profile_df = test_profile_df[test_profile_df["method"].str.endswith("ProtoSeg")]

# Show the rows of the run with index 0 to see which actions are profiled.
ps_test_profile_df[ps_test_profile_df["index"] == 0]

Unnamed: 0,index,method,batch_size,shot,Action,Mean (s),Std (s),Num Calls,Sum (s),Percentage (%)
0,0,EO-ProtoSeg,1,1,Total,-1.0,-1.0,6424,8.871959,100.0
1,0,EO-ProtoSeg,1,1,[Strategy]SingleDeviceStrategy.test_step,0.020325,0.037696,400,8.130012,91.63716
2,0,EO-ProtoSeg,1,1,[Learner]ProtosegUnet.evaluation_process,0.020257,0.037689,400,8.102893,91.331492
3,0,EO-ProtoSeg,1,1,[Learner]ProtosegUnet.forward,0.015158,0.037529,400,6.063162,68.340736
4,0,EO-ProtoSeg,1,1,[Learner]ProtosegUnet.get_prototypes,0.01007,0.037423,400,4.02789,45.400233
5,0,EO-ProtoSeg,1,1,[Learner]ProtosegUnet.get_predictions,0.004946,0.00058,400,1.978566,22.301345
6,0,EO-ProtoSeg,1,1,[_EvaluationLoop].test_next,0.000326,0.00408,400,0.130487,1.470781
7,0,EO-ProtoSeg,1,1,[Strategy]SingleDeviceStrategy.batch_to_device,0.000291,5.3e-05,400,0.116565,1.313862
8,0,EO-ProtoSeg,1,1,[LightningModule]ProtosegUnet.transfer_batch_t...,0.000242,5e-05,400,0.096633,1.089191


In [10]:
# Timings of the ProtoSeg evaluation step, with 95% confidence limits
# computed on the default "Mean (s)" / "Std (s)" columns.
ps_inf_df = ps_test_profile_df.loc[
    ps_test_profile_df["Action"] == "[Learner]ProtosegUnet.evaluation_process"
].drop(columns=["index", "Action", "Percentage (%)"])
ps_inf_df = calc_confidence_limits(ps_inf_df)

# Mean step time against batch size, one line per method.
line_chart = alt.Chart(ps_inf_df).mark_line().encode(
    x=alt.X("batch_size"),
    y=alt.Y("Mean (s)"),
    color=alt.Color("method"),
)

# Band between the lower and upper confidence limits.
error_chart = alt.Chart(ps_inf_df).mark_errorband().encode(
    x=alt.X("batch_size"),
    y=alt.Y("CL 95 U", title="Time CL 95 (s)"),
    y2=alt.Y2("CL 95 L"),
    color=alt.Color("method"),
)

# One panel per value of "shot".
alt.layer(line_chart, error_chart).properties(width=300, height=200).facet(column="shot")

In [11]:
# Timings of the ProtoSeg `get_prototypes` action, with 95% confidence
# limits computed on the default "Mean (s)" / "Std (s)" columns.
ps_inf_overhead_df = ps_test_profile_df.loc[
    ps_test_profile_df["Action"] == "[Learner]ProtosegUnet.get_prototypes"
].drop(columns=["index", "Action", "Percentage (%)"])
ps_inf_overhead_df = calc_confidence_limits(ps_inf_overhead_df)

# Mean time against batch size, one line per method.
line_chart = alt.Chart(ps_inf_overhead_df).mark_line().encode(
    x=alt.X("batch_size"),
    y=alt.Y("Mean (s)"),
    color=alt.Color("method"),
)

# Band between the lower and upper confidence limits.
error_chart = alt.Chart(ps_inf_overhead_df).mark_errorband().encode(
    x=alt.X("batch_size"),
    y=alt.Y("CL 95 U", title="Time CL 95 (s)"),
    y2=alt.Y2("CL 95 L"),
    color=alt.Color("method"),
)

# One panel per value of "shot".
alt.layer(line_chart, error_chart).properties(width=300, height=200).facet(column="shot")

In [24]:
# Timings of the ProtoSeg `get_predictions` action, converted from
# per-batch to per-image by dividing by the batch size.
ps_inf_df = ps_test_profile_df.loc[
    ps_test_profile_df["Action"] == "[Learner]ProtosegUnet.get_predictions"
].drop(columns=["index", "Action", "Percentage (%)"])
ps_inf_df = ps_inf_df.assign(
    **{
        "Mean per Image (s)": ps_inf_df["Mean (s)"] / ps_inf_df["batch_size"],
        "Std per Image (s)": ps_inf_df["Std (s)"] / ps_inf_df["batch_size"],
    }
)

# Pool all rows sharing a (method, batch size) pair, then attach the
# confidence limits of the pooled mean.
ps_inf_df = calc_confidence_limits(
    combine_mean_variance(
        ps_inf_df,
        groupby_cols=["method", "batch_size"],
        mean_col="Mean per Image (s)",
        std_col="Std per Image (s)",
        num_items_col="Num Calls",
    ),
    mean_col="Mean per Image (s)",
    std_col="Std per Image (s)",
)

# Stack the SL rows (batch sizes up to 16, per-batch columns removed) on
# top of the pooled ProtoSeg rows, then keep batch sizes of at least 2.
ps_sl_inf_df = pd.concat(
    [
        sl_inf_df.loc[sl_inf_df["batch_size"] <= 16].drop(
            columns=["shot", "Mean (s)", "Std (s)", "Sum (s)"]
        ),
        ps_inf_df,
    ]
)
ps_sl_inf_df = ps_sl_inf_df.loc[ps_sl_inf_df["batch_size"] >= 2]

# Mean per-image time against batch size, one line per method.
line_chart = alt.Chart(ps_sl_inf_df).mark_line().encode(
    x=alt.X("batch_size"),
    y=alt.Y("Mean per Image (s)"),
    color=alt.Color("method"),
)

# Translucent band between the lower and upper confidence limits.
error_chart = alt.Chart(ps_sl_inf_df).mark_errorband(opacity=0.2).encode(
    x=alt.X("batch_size"),
    y=alt.Y("CL 95 U", title="Time per Image CL 95 (s)"),
    y2=alt.Y2("CL 95 L"),
    color=alt.Color("method"),
)

alt.layer(line_chart, error_chart).properties(width=600, height=400)

## WeaSeL


In [8]:
# Profiler rows of every method whose name ends with "WeaSeL".
ws_test_profile_df = test_profile_df[test_profile_df["method"].str.endswith("WeaSeL")]

# Show the rows of the run with index 80 to see which actions are profiled.
ws_test_profile_df[ws_test_profile_df["index"] == 80]

Unnamed: 0,index,method,batch_size,shot,Action,Mean (s),Std (s),Num Calls,Sum (s),Percentage (%)
563,80,EO-WeaSeL,1,1,Total,-1.0,-1.0,98024,135.287432,100.0
564,80,EO-WeaSeL,1,1,[Strategy]SingleDeviceStrategy.test_step,0.336349,0.104027,400,134.539703,99.447304
565,80,EO-WeaSeL,1,1,[Learner]WeaselUnet.evaluation_process,0.336283,0.104023,400,134.51326,99.427757
566,80,EO-WeaSeL,1,1,[Learner]WeaselUnet.tune_process,0.009059,0.017675,13200,119.579399,88.389141


In [15]:
# Timings of the WeaSeL evaluation step, with 95% confidence limits
# computed on the default "Mean (s)" / "Std (s)" columns.
ws_inf_df = ws_test_profile_df.loc[
    ws_test_profile_df["Action"] == "[Learner]WeaselUnet.evaluation_process"
].drop(columns=["index", "Action", "Percentage (%)"])
ws_inf_df = calc_confidence_limits(ws_inf_df)

# Mean step time against batch size, one line per method.
line_chart = alt.Chart(ws_inf_df).mark_line().encode(
    x=alt.X("batch_size"),
    y=alt.Y("Mean (s)"),
    color=alt.Color("method"),
)

# Band between the lower and upper confidence limits.
error_chart = alt.Chart(ws_inf_df).mark_errorband().encode(
    x=alt.X("batch_size"),
    y=alt.Y("CL 95 U", title="Time CL 95 (s)"),
    y2=alt.Y2("CL 95 L"),
    color=alt.Color("method"),
)

# One panel per value of "shot".
alt.layer(line_chart, error_chart).properties(width=300, height=200).facet(column="shot")

In [103]:
# Timings of the WeaSeL `inference` action, converted from per-batch to
# per-image by dividing by the batch size.
ws_inf_df = ws_test_profile_df.loc[
    ws_test_profile_df["Action"] == "[Learner]WeaselUnet.inference"
].drop(columns=["index", "Action", "Percentage (%)"])
ws_inf_df = ws_inf_df.assign(
    **{
        "Mean per Image (s)": ws_inf_df["Mean (s)"] / ws_inf_df["batch_size"],
        "Std per Image (s)": ws_inf_df["Std (s)"] / ws_inf_df["batch_size"],
    }
)

# Pool all rows sharing a (method, batch size) pair, then attach the
# confidence limits of the pooled mean.
ws_inf_df = calc_confidence_limits(
    combine_mean_variance(
        ws_inf_df,
        groupby_cols=["method", "batch_size"],
        mean_col="Mean per Image (s)",
        std_col="Std per Image (s)",
        num_items_col="Num Calls",
    ),
    mean_col="Mean per Image (s)",
    std_col="Std per Image (s)",
)
# Floor the lower confidence limit at zero.
ws_inf_df["CL 95 L"] = ws_inf_df["CL 95 L"].clip(lower=0)

# Stack the SL rows (batch sizes up to 16, per-batch columns removed) on
# top of the pooled WeaSeL rows, then keep batch sizes of at least 2.
ws_sl_inf_df = pd.concat(
    [
        sl_inf_df.loc[sl_inf_df["batch_size"] <= 16].drop(
            columns=["shot", "Mean (s)", "Std (s)", "Sum (s)"]
        ),
        ws_inf_df,
    ]
)
ws_sl_inf_df = ws_sl_inf_df.loc[ws_sl_inf_df["batch_size"] >= 2]

# Mean per-image time against batch size, one line per method.
line_chart = alt.Chart(ws_sl_inf_df).mark_line().encode(
    x=alt.X("batch_size"),
    y=alt.Y("Mean per Image (s)"),
    color=alt.Color("method"),
)

# Translucent band between the lower and upper confidence limits.
error_chart = alt.Chart(ws_sl_inf_df).mark_errorband(opacity=0.2).encode(
    x=alt.X("batch_size"),
    y=alt.Y("CL 95 U", title="Time per Image CL 95 (s)"),
    y2=alt.Y2("CL 95 L"),
    color=alt.Color("method"),
)

alt.layer(line_chart, error_chart).properties(width=600, height=400)

## Publication


In [26]:
# Legend order of the methods; `color_values` is aligned with it by position.
ordered_methods = [
    "ProtoSeg",
    "O-ProtoSeg",
    "EO-ProtoSeg",
    "SL",
    "WeaSeL",
    "O-WeaSeL",
    "EO-WeaSeL",
]
color_values = [
    "#ffda03",  # ProtoSeg: yellow
    "#e85d04",  # O-ProtoSeg: orange
    "#d00000",  # EO-ProtoSeg: red
    "#757575",  # SL: gray
    "#43b0f1",  # WeaSeL: blue
    "#2ec4b6",  # O-WeaSeL: turquoise
    "#2d6a4f",  # EO-WeaSeL: green
]

# Full palette, and the same palette with the "SL" entry left out.
color_scale = alt.Scale(domain=ordered_methods, range=color_values)
color_scale_no_sl = alt.Scale(
    domain=[name for name in ordered_methods if name != "SL"],
    range=[
        hex_code
        for name, hex_code in zip(ordered_methods, color_values)
        if name != "SL"
    ],
)

In [40]:
# Size applied to the layered chart before faceting (pixels).
width, height = 150, 150

# Shared color encoding: palette without "SL", legend laid out
# horizontally below the figure.
color = alt.Color(
    "method:N",
    scale=color_scale_no_sl,
    title="Method",
    legend=alt.Legend(
        orient="bottom",
        direction="horizontal",
        titleAnchor="start",
        columns=6,
    ),
)

# --- Top row: ProtoSeg evaluation-step time ---
ps_inf_df = calc_confidence_limits(
    ps_test_profile_df[
        ps_test_profile_df["Action"] == "[Learner]ProtosegUnet.evaluation_process"
    ].drop(columns=["index", "Action", "Percentage (%)"]),
)
# Turn the shot count into a label such as "1-shot"; the facet sort list
# below refers to these labels.
ps_inf_df["shot"] = ps_inf_df["shot"].apply(lambda x: f"{x}-shot")

line_chart = (
    alt.Chart(ps_inf_df).mark_line().encode(x="batch_size", y="Mean (s)", color=color)
)
# The error-band layer carries the axis configuration: no x title, no x
# tick labels, fixed x domain of 1 to 16, and the y-axis title.
error_chart = (
    alt.Chart(ps_inf_df)
    .mark_errorband()
    .encode(
        x=alt.X("batch_size")
        .title(None)
        .scale(domain=[1, 16], nice=False)
        .axis(labels=False),
        y=alt.Y("CL 95 U").title("ProtoSeg Time (s)").scale(nice=False),
        y2="CL 95 L",
        color=color,
    )
)
# One column per shot label, in the explicit order given by `sort`.
ps_chart = (
    (line_chart + error_chart)
    .properties(width=width, height=height)
    .facet(
        column=alt.Column(
            "shot",
            sort=["1-shot", "5-shot", "10-shot", "15-shot", "20-shot"],
            header=alt.Header(title=None),
        ),
        spacing=10,
    )
)

# --- Bottom row: WeaSeL evaluation-step time ---
ws_inf_df = calc_confidence_limits(
    ws_test_profile_df[
        ws_test_profile_df["Action"] == "[Learner]WeaselUnet.evaluation_process"
    ].drop(columns=["index", "Action", "Percentage (%)"]),
)
ws_inf_df["shot"] = ws_inf_df["shot"].apply(lambda x: f"{x}-shot")

line_chart = (
    alt.Chart(ws_inf_df).mark_line().encode(x="batch_size", y="Mean (s)", color=color)
)
# Unlike the top row, the x tick labels are kept here.
error_chart = (
    alt.Chart(ws_inf_df)
    .mark_errorband()
    .encode(
        x=alt.X("batch_size").title(None).scale(domain=[1, 16], nice=False),
        y=alt.Y("CL 95 U").title("WeaSeL Time (s)").scale(nice=False),
        y2="CL 95 L",
        color=color,
    )
)
# The facet header hides its labels and places the title "Batch Size"
# at the bottom of the row.
ws_chart = (
    (line_chart + error_chart)
    .properties(width=width, height=height)
    .facet(
        column=alt.Column(
            "shot",
            sort=["1-shot", "5-shot", "10-shot", "15-shot", "20-shot"],
            header=alt.Header(title="Batch Size", titleOrient="bottom", labels=False),
        ),
        spacing=10,
    )
)

# Stack the two rows vertically and apply the shared font sizes.
(
    alt.vconcat(ps_chart, ws_chart)
    .configure_axis(labelFontSize=14, titleFontSize=14)
    .configure_header(labelFontSize=14, titleFontSize=14)
    .configure_legend(labelFontSize=14, titleFontSize=16)
)

In [None]:
# Shared color encoding: full palette (including "SL"), legend laid out
# horizontally below the figure.
color = alt.Color(
    "method:N",
    scale=color_scale,
    title="Method",
    legend=alt.Legend(
        orient="bottom",
        direction="horizontal",
        titleAnchor="start",
        columns=4,
    ),
)

# --- ProtoSeg: per-image time of `get_predictions` ---
ps_inf_df = ps_test_profile_df[
    ps_test_profile_df["Action"] == "[Learner]ProtosegUnet.get_predictions"
].drop(columns=["index", "Action", "Percentage (%)"])
# Per-batch mean and std divided by the batch size.
ps_inf_df["Mean per Image (s)"] = ps_inf_df["Mean (s)"] / ps_inf_df["batch_size"]
ps_inf_df["Std per Image (s)"] = ps_inf_df["Std (s)"] / ps_inf_df["batch_size"]
# Pool all rows sharing a (method, batch size) pair into one row.
ps_inf_df = combine_mean_variance(
    ps_inf_df,
    groupby_cols=["method", "batch_size"],
    mean_col="Mean per Image (s)",
    std_col="Std per Image (s)",
    num_items_col="Num Calls",
)
ps_inf_df = calc_confidence_limits(ps_inf_df, "Mean per Image (s)", "Std per Image (s)")

# SL rows (batch sizes up to 16) stacked on top of the ProtoSeg rows; the
# per-batch columns are dropped from the SL rows first.
ps_sl_inf_df = pd.concat(
    [
        sl_inf_df[sl_inf_df["batch_size"] <= 16].drop(
            columns=["shot", "Mean (s)", "Std (s)", "Sum (s)"]
        ),
        ps_inf_df,
    ]
)
# Facet key: these rows go to the "ProtoSeg" panel.
ps_sl_inf_df["method_parent"] = "ProtoSeg"

# --- WeaSeL: per-image time of `inference` ---
ws_inf_df = ws_test_profile_df[
    ws_test_profile_df["Action"] == "[Learner]WeaselUnet.inference"
].drop(columns=["index", "Action", "Percentage (%)"])
ws_inf_df["Mean per Image (s)"] = ws_inf_df["Mean (s)"] / ws_inf_df["batch_size"]
ws_inf_df["Std per Image (s)"] = ws_inf_df["Std (s)"] / ws_inf_df["batch_size"]
ws_inf_df = combine_mean_variance(
    ws_inf_df,
    groupby_cols=["method", "batch_size"],
    mean_col="Mean per Image (s)",
    std_col="Std per Image (s)",
    num_items_col="Num Calls",
)
ws_inf_df = calc_confidence_limits(ws_inf_df, "Mean per Image (s)", "Std per Image (s)")
# Floor the lower confidence limit at zero.
ws_inf_df["CL 95 L"] = ws_inf_df["CL 95 L"].clip(lower=0)

# The same SL rows are repeated here so that SL appears in both panels.
ws_sl_inf_df = pd.concat(
    [
        sl_inf_df[sl_inf_df["batch_size"] <= 16].drop(
            columns=["shot", "Mean (s)", "Std (s)", "Sum (s)"]
        ),
        ws_inf_df,
    ]
)
# Facet key: these rows go to the "WeaSeL" panel.
ws_sl_inf_df["method_parent"] = "WeaSeL"

# Combine both panels' rows and keep batch sizes of at least 4.
all_inf_df = pd.concat([ps_sl_inf_df, ws_sl_inf_df])
all_inf_df = all_inf_df[all_inf_df["batch_size"] >= 4]

line_chart = (
    alt.Chart(all_inf_df)
    .mark_line()
    .encode(
        x="batch_size",
        y="Mean per Image (s)",
        color=color,
    )
)

# The error-band layer carries the axis titles.
error_chart = (
    alt.Chart(all_inf_df)
    .mark_errorband(opacity=0.2)
    .encode(
        x=alt.X("batch_size").title("Batch Size"),
        y=alt.Y("CL 95 U").title("Time per Image (s)"),
        y2="CL 95 L",
        color=color,
    )
)

# The band is listed first in the layer, the line second; one row per
# `method_parent`, each with its own y scale.
(
    (error_chart + line_chart)
    .properties(width=335, height=200)
    .facet(row=alt.Row("method_parent", header=alt.Header(title=None)), spacing=10)
    .resolve_scale(y="independent")
    .configure_axis(labelFontSize=14, titleFontSize=14)
    .configure_header(labelFontSize=14, titleFontSize=14)
    .configure_legend(labelFontSize=14, titleFontSize=16)
)

## Hypothesis Testing


In [None]:
def test_two_methods_time_paired(
    faster_method: str, slower_method: str, action_name: str
) -> tuple[float, float]:
    df = test_profile_df[test_profile_df["Action"] == action_name]
    df = df.drop(
        columns=["index", "Action", "Std (s)", "Num Calls", "Sum (s)", "Percentage (%)"]
    )
    slower_df = df[df["method"] == slower_method]
    faster_df = df[df["method"] == faster_method]

    merged_df = pd.merge(
        slower_df, faster_df, on=["batch_size", "shot"], suffixes=(" S", " F")
    )
    n = len(merged_df)

    merged_df["diff"] = merged_df["Mean (s) F"] - merged_df["Mean (s) S"]

    t_value = merged_df["diff"].mean() / (merged_df["diff"].std() / np.sqrt(n))
    p_value = stats.t.cdf(t_value, n - 1)
    assert isinstance(p_value, float)

    return t_value, p_value

In [None]:
# Paired test: is EO-ProtoSeg's evaluation step faster than ProtoSeg's?
test_two_methods_time_paired(
    "EO-ProtoSeg", "ProtoSeg", "[Learner]ProtosegUnet.evaluation_process"
)

(-10.385751948442765, 9.936293244658502e-17)

In [None]:
# Paired test: is EO-WeaSeL's evaluation step faster than WeaSeL's?
test_two_methods_time_paired(
    "EO-WeaSeL", "WeaSeL", "[Learner]WeaselUnet.evaluation_process"
)

(-4.87778803956851, 2.728479075161107e-06)

In [None]:
def test_two_methods_time(
    faster_method: str, slower_method: str, action_name: str
) -> tuple[float, float]:
    def combine_variance(df, mean_value):
        within = ((df["Num Calls"] - 1) * df["Std (s)"] ** 2).sum()
        between = (df["Num Calls"] * (df["Mean (s)"] - mean_value) ** 2).sum()
        return (within + between) / df["Num Calls"].sum()

    df = test_profile_df[test_profile_df["Action"] == action_name]
    df = df.drop(columns=["index", "Action", "Percentage (%)"])
    slower_df = df[df["method"] == slower_method]
    faster_df = df[df["method"] == faster_method]

    slower_n = slower_df["Num Calls"].sum()
    faster_n = faster_df["Num Calls"].sum()

    slower_mean = slower_df["Sum (s)"].sum() / slower_n
    faster_mean = faster_df["Sum (s)"].sum() / faster_n

    slower_var = combine_variance(slower_df, slower_mean) / slower_n
    faster_var = combine_variance(faster_df, faster_mean) / faster_n

    t_value = (faster_mean - slower_mean) / ((faster_var + slower_var) ** 0.5)
    dof = ((faster_var + slower_var) ** 2) / (
        (faster_var**2) / (faster_n - 1) + (slower_var**2) / (slower_n - 1)
    )
    p_value = stats.t.cdf(t_value, dof)
    assert isinstance(p_value, float)

    return t_value, p_value

In [182]:
# Unpaired test on pooled calls: EO-ProtoSeg vs ProtoSeg evaluation step.
test_two_methods_time(
    "EO-ProtoSeg", "ProtoSeg", "[Learner]ProtosegUnet.evaluation_process"
)

(-6.0824738433666345, 6.077224072465954e-10)

In [183]:
# Unpaired test on pooled calls: EO-WeaSeL vs WeaSeL evaluation step.
test_two_methods_time("EO-WeaSeL", "WeaSeL", "[Learner]WeaselUnet.evaluation_process")

(-0.7862817673489974, 0.21585812798470522)

# Others


In [None]:
# Hand-entered comparison table: one row per method, in the column order
# given by `columns` below. None marks an entry that is not available.
# The last four rows are the EO-ProtoSeg variants.
data = [
    ["CFEA [86]", "UDA", None, 79.78, 70.52, 88.96, 75.86, 60.08, 46.53],
    ["pOSAL [25]", "UDA", "5.8M", 91.42, 72.30, 90.83, 78.31, 76.75, 62.59],
    ["SIFA [182]", "UDA", "43.3M", 83.04, 57.29, 85.69, 69.57, 74.67, 52.84],
    ["WGAN [77]", "UDA", None, 91.20, 72.40, None, None, None, None],
    ["IOSUDA [88]", "UDA", "42.8M", 89.53, 65.56, 91.04, 71.03, 83.26, 60.07],
    ["CADA [87]", "UDA", "9.7M", 80.18, 72.41, 90.44, 77.21, 62.13, 47.1],
    ["SCUDA [90]", "UDA", None, 90.34, 66.61, None, None, 84.89, 61.65],
    ["GrabCut+UNet [65]", "WSS", None, 86.37, None, None, None, None, None],
    ["MERU [176]", "FSS", None, None, None, 83.92, 61.47, None, None],
    ["RDMT [178]", "SSS", None, None, None, None, 70.93, None, None],
    ["EO-ProtoSeg 1s", "FWS", "1.9M", 84.96, 63.69, 88.15, 71.17, 79.92, 44.01],
    ["EO-ProtoSeg 5s", "FWS", "1.9M", 85.30, 68.61, 88.18, 73.11, 80.46, 50.27],
    ["EO-ProtoSeg 10s", "FWS", "1.9M", 85.02, 68.93, 88.18, 73.52, 80.57, 52.42],
    ["EO-ProtoSeg best", "FWS", "1.9M", 86.80, 71.78, 88.21, 73.70, 80.39, 52.65],
]

# The first three columns describe the method; the remaining six hold the
# numeric scores.
columns = [
    "method",
    "method_type",
    "params",
    "drishti_od",
    "drishti_oc",
    "refuge_od",
    "refuge_oc",
    "rimone_od",
    "rimone_oc",
]

df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,method,method_type,params,drishti_od,drishti_oc,refuge_od,refuge_oc,rimone_od,rimone_oc
0,CFEA [86],UDA,,79.78,70.52,88.96,75.86,60.08,46.53
1,pOSAL [25],UDA,5.8M,91.42,72.3,90.83,78.31,76.75,62.59
2,SIFA [182],UDA,43.3M,83.04,57.29,85.69,69.57,74.67,52.84
3,WGAN [77],UDA,,91.2,72.4,,,,
4,IOSUDA [88],UDA,42.8M,89.53,65.56,91.04,71.03,83.26,60.07
5,CADA [87],UDA,9.7M,80.18,72.41,90.44,77.21,62.13,47.1
6,SCUDA [90],UDA,,90.34,66.61,,,84.89,61.65
7,GrabCut+UNet [65],WSS,,86.37,,,,,
8,MERU [176],FSS,,,,83.92,61.47,,
9,RDMT [178],SSS,,,,,70.93,,


In [None]:
# All rows except the EO-ProtoSeg variants.
other_df = df.loc[~df["method"].str.contains("EO-ProtoSeg")]

# Column-wise mean of the numeric columns; missing entries are skipped
# (column-wise and NaN-skipping are the defaults of `DataFrame.mean`).
other_df.mean(numeric_only=True)


drishti_od    86.482500
drishti_oc    68.155714
refuge_od     88.480000
refuge_oc     72.054286
rimone_od     73.630000
rimone_oc     55.130000
dtype: float64

In [None]:
# Difference between each other method and the last row of `df`
# ("EO-ProtoSeg best"), for every column from the fourth onwards.
other_df_diff = other_df.copy()
for col in other_df.columns[3:]:
    other_df_diff[col] -= df.iloc[-1][col]

other_df_diff

Unnamed: 0,method,method_type,params,drishti_od,drishti_oc,refuge_od,refuge_oc,rimone_od,rimone_oc
0,CFEA [86],UDA,,-7.02,-1.26,0.75,2.16,-20.31,-6.12
1,pOSAL [25],UDA,5.8M,4.62,0.52,2.62,4.61,-3.64,9.94
2,SIFA [182],UDA,43.3M,-3.76,-14.49,-2.52,-4.13,-5.72,0.19
3,WGAN [77],UDA,,4.4,0.62,,,,
4,IOSUDA [88],UDA,42.8M,2.73,-6.22,2.83,-2.67,2.87,7.42
5,CADA [87],UDA,9.7M,-6.62,0.63,2.23,3.51,-18.26,-5.55
6,SCUDA [90],UDA,,3.54,-5.17,,,4.5,9.0
7,GrabCut+UNet [65],WSS,,-0.43,,,,,
8,MERU [176],FSS,,,,-4.29,-12.23,,
9,RDMT [178],SSS,,,,,-2.77,,
