## Set Up Dependencies


In [None]:
import datetime
import os

from cliffs_delta import cliffs_delta
import matplotlib as mpl
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pecking
import seaborn as sns
from slugify import slugify
from teeplot import teeplot as tp

from pylib._describe_effect import describe_effect
from pylib._styler_to_pdf import styler_to_pdf


## Data Retrieval and Preprocessing


In [None]:
# Load the dataset from a remote parquet file hosted on OSF (needs network
# access); every later cell derives its data from this frame.
df = pd.read_parquet("https://osf.io/ah7v5/download")


In [None]:
# Expose each raw snake_case column under a title-cased display name.
# The raw columns are kept; aliases are added in the order listed here.
display_name_by_column = {
    "num_reconstructed_inner_nodes": "Num Reconstructed Inner Nodes",
    "num_true_inner_nodes": "Num True Inner Nodes",
    "triplet_distance_raw": "Raw Triplet Distance",
    "sampled_triplet_distance_strict_reconst": (
        "Sampled Triplet Distance Strict Reconst"
    ),
    "sampled_triplet_distance_strict": "Strict Triplet Distance",
    "sampled_triplet_distance_lax": "Lax Triplet Distance",
    "differentia_width_bits": "Differentia Width (bits)",
    "annotation_size_bits": "Annotation Size (bits)",
    "algorithm": "Algorithm",
    "population_size": "Population Size",
    "num_generations": "Num Generations",
    "num_islands": "Num Islands",
    "num_niches": "Num Niches",
    "tournament_size": "Tournament Size",
    "replicate": "Replicate",
    "downsample": "Downsample",
}
for raw_name, display_name in display_name_by_column.items():
    df[display_name] = df[raw_name]


In [None]:
# Ratio of reconstructed to true inner-node counts, and one minus that ratio.
df["Inner Node Resolution"] = df["Num Reconstructed Inner Nodes"].div(
    df["Num True Inner Nodes"]
)
df["Inner Node Loss"] = df["Inner Node Resolution"].rsub(1)


In [None]:
# Compact key combining island count, niche count and tournament size,
# e.g. "islands1-niches1-tsize2".
df["Dynamics"] = (
    df["Num Islands"].astype(str).radd("islands")
    + df["Num Niches"].astype(str).radd("-niches")
    + df["Tournament Size"].astype(str).radd("-tsize")
)


In [None]:
# Compact key combining population size and generation count,
# formatted as "npop<population size>-ngen<num generations>".
df["Scale"] = (
    df["Population Size"].astype(str).radd("npop")
    + df["Num Generations"].astype(str).radd("-ngen")
)


In [None]:
# Short scenario label for each known Dynamics key. Two labels contain a line
# break so they wrap on plot axes. Dynamics keys not listed here map to NaN.
scenario_by_dynamics = {
    "islands1-niches1-tsize2": "plain",
    "islands1-niches1-tsize1": "drift",
    "islands4-niches2-tsize2": "mild\nstructure",
    "islands64-niches8-tsize2": "rich\nstructure",
}
df["Scenario"] = df["Dynamics"].map(scenario_by_dynamics)


In [None]:
# Policy label for each algorithm identifier; the "surf-"/"col-" variants of
# the same policy share one label. Unlisted algorithms map to NaN.
policy_by_algorithm = {
    "surf-tilted": "Tilted",
    "col-tilted": "Tilted",
    "surf-hybrid": "Hybrid",
    "surf-steady": "Steady",
    "col-steady": "Steady",
}
df["Policy"] = df["Algorithm"].map(policy_by_algorithm)


In [None]:
# Implementation label for each algorithm identifier ("surf-*" -> Surface,
# "col-*" -> Column). Unlisted algorithms map to NaN.
implementation_by_algorithm = {
    "surf-tilted": "Surface",
    "col-tilted": "Column",
    "surf-hybrid": "Surface",
    "surf-steady": "Surface",
    "col-steady": "Column",
}
df["Implementation"] = df["Algorithm"].map(implementation_by_algorithm)


In [None]:
# Add an alias for every column in which spaces are replaced by line breaks
# (e.g. "Population Size" -> "Population\nSize"). The name pairs are computed
# up front from the current columns; names without spaces are simply
# reassigned to themselves.
wrapped_names = [(name, name.replace(" ", "\n")) for name in df.columns]
for name, wrapped in wrapped_names:
    df[wrapped] = df[name]


In [None]:
# Columns whose value combinations define one experimental configuration:
# the plotting loop draws one figure per combination, and the stat table
# groups by them (together with Policy and Scenario).
# Names use the newline-wrapped column aliases.
sensitivity_analysis_variables = [
    "Differentia\nWidth\n(bits)",
    "Population\nSize",
    "Downsample",
    "Num\nGenerations",
    "Annotation\nSize\n(bits)",
]


In [None]:
# Reshape to long form: the metric name goes into "variable" and its value
# into "value", while the run-describing columns are carried along unchanged.
run_descriptor_columns = [
    "Algorithm",
    "Annotation Size (bits)",
    "Annotation\nSize\n(bits)",
    "annotation_size_bits",
    "Differentia Width (bits)",
    "Differentia\nWidth\n(bits)",
    "differentia_width_bits",
    "Downsample",
    "downsample",
    "Dynamics",
    "Implementation",
    "Policy",
    "Scenario",
    "Scale",
    "Population Size",
    "Population\nSize",
    "population_size",
    "Num Generations",
    "Num\nGenerations",
    "num_generations",
    "Num Islands",
    "Num Niches",
    "Tournament Size",
    "Replicate",
]
# Only these two metrics are melted.
plotted_metric_columns = [
    "Strict\nTriplet\nDistance",
    "Inner\nNode\nLoss",
]
dfx = pd.melt(
    df,
    id_vars=run_descriptor_columns,
    value_vars=plotted_metric_columns,
)
dfx.head()


## Visualize Main Metrics


In [None]:
# Metric facets, top to bottom. Loop-invariant, so defined once up front.
row_order = [
    "Strict\nTriplet\nDistance",
    "Inner\nNode\nLoss",
]

# One figure per combination of the sensitivity-analysis variables.
for name, group in dfx.groupby(sensitivity_analysis_variables):
    group_variables = dict(zip(sensitivity_analysis_variables, name))

    # Build the plotting frame as a new object instead of writing a column
    # into `group`: assigning on a groupby slice triggers pandas'
    # SettingWithCopyWarning. Only the first line of each scenario label is
    # kept ("mild\nstructure" -> "mild") so values match `order` below; the
    # vectorized `.str` accessor passes missing labels through as NaN rather
    # than raising on them.
    plot_data = (
        group.assign(Scenario=group["Scenario"].str.split("\n").str[0])
        .reset_index(drop=True)
        .rename(columns={"Implementation": "Impl"})
    )

    tp.tee(
        pecking.peckplot,
        data=plot_data,
        score="value",
        x="Scenario",
        y="value",
        col="Policy",
        col_order=[
            "Tilted",
            "Steady",
        ],
        row="variable",
        row_order=row_order,
        hue="Impl",
        hue_order=[
            "Column",
            "Surface",
        ],
        x_group="outer",
        order=[
            "plain",
            "mild",
            "rich",
            "drift",
        ],
        # Mark the lowest-scoring entries as "Best" (both metrics are
        # distances/losses restricted to the 0..1 range below).
        skim_hatches=("*",),
        skim_labels=("Best",),
        skimmers=[pecking.skim_lowest],
        margin_titles=True,
        height=1.7,
        aspect=2,
        facet_kws={
            "ylim": (0, 1),
        },
        # Tag the saved figure with this group's configuration values.
        teeplot_outattrs={
            slugify(k): str(v) for k, v in group_variables.items()
        },
        teeplot_outexclude=["post", "teeplot_postprocess"],
        teeplot_postprocess="teed.figure.subplots_adjust(right=0.72); teed.set_titles(row_template='{row_name}')",
    )
    display(group_variables)


## Make Stat Table


In [None]:

# Maps each delta value returned by `cached_delta` to the effect description
# computed for the same pair of samples.
lookup_effect = {}


def cached_delta(x, y):
    """Return Cliff's delta for samples `x` and `y`, caching its description.

    As a side effect, stores `describe_effect(x, y)` in the module-level
    `lookup_effect` dict under the returned delta, so the description can
    later be recovered from the numeric value alone.

    NOTE(review): the cache key is the delta itself, so two sample pairs that
    yield the same delta share one entry and the most recently computed
    description wins -- confirm this is acceptable for what
    `describe_effect` returns.
    """
    # Only the first element of the `cliffs_delta` result is used.
    delta = cliffs_delta(x, y)[0]
    lookup_effect[delta] = describe_effect(x, y)
    return delta


# Grouping keys for the stat table. Named `group_keys` rather than `vars` so
# the Python builtin `vars` is not shadowed.
group_keys = [
    "Policy",
    "Scenario",
    *sensitivity_analysis_variables,
]

# Output column label (HTML line breaks for the rendered table) -> source
# metric column. Insertion order fixes both the record's column order and
# the order in which `cached_delta` fills `lookup_effect`.
metric_columns = {
    "Strict<br/>Triplet<br/>Distance": "Strict Triplet Distance",
    "Lax<br/>Triplet<br/>Distance": "Lax Triplet Distance",
    "Inner<br/>Node<br/>Loss": "Inner Node Loss",
}

# Only the Steady and Tilted policies are compared; Hybrid rows are excluded.
steady_or_tilted = df[df["Policy"].isin(["Steady", "Tilted"])].reset_index()

records = []
for name, group in steady_or_tilted.groupby(group_keys):
    is_column = group["Implementation"] == "Column"
    is_surface = group["Implementation"] == "Surface"

    record = dict(zip(group_keys, name))
    for label, metric in metric_columns.items():
        # Cliff's delta of Column samples versus Surface samples.
        record[label] = cached_delta(
            group.loc[is_column, metric],
            group.loc[is_surface, metric],
        )
    records.append(record)


dfr = pd.DataFrame.from_records(records)
dfr


In [None]:
# Diverging colormap for shading the table; masked/invalid values are white.
cmap = mpl.colormaps["coolwarm"]
cmap.set_bad("white")


In [None]:
# Add short column names for the table's row index. Differentia widths of
# 1 and 8 bits are labelled "bit" and "byte"; other widths map to NaN.
unit_by_width = {1: "bit", 8: "byte"}
dfr = dfr.assign(
    **{
        "Pop Size": dfr["Population\nSize"],
        "Unit": dfr["Differentia\nWidth\n(bits)"].map(unit_by_width),
        "DSamp": dfr["Downsample"],
        "Size<br/>(bits)": dfr["Annotation\nSize\n(bits)"],
    }
)


In [None]:
# Row index of the summary table; "Policy" (level 0) is pivoted into the
# columns below. Named `table_index_vars` rather than `vars` so the Python
# builtin `vars` is not shadowed.
table_index_vars = [
    "Policy",
    "Scenario",
    "Unit",
    "Pop Size",
    "DSamp",
    "Size<br/>(bits)",
]
# Metrics shown in the table.
table_metrics = [
    "Strict<br/>Triplet<br/>Distance",
    "Inner<br/>Node<br/>Loss",
]
# The table reports only runs with this number of generations.
num_generations_shown = 100000

dfs = (
    dfr[dfr["Num\nGenerations"] == num_generations_shown]
    .set_index(table_index_vars)
    .unstack(level=0)[table_metrics]
    # Put Policy on the outer column level, then sort it descending.
    .swaplevel(axis="columns")
    .sort_index(axis=1, level=0, ascending=False)
)

# Shade cells by delta on a fixed [-1, 1] scale and display the cached effect
# description in place of the number ("n/a" where none is cached).
styled_dfs = dfs.style.background_gradient(
    cmap=cmap, vmin=-1, vmax=1, axis=None
).format(lambda x: lookup_effect.get(x, "n/a"))

# Property lists are layered: `props` -> `moreprops` -> header/body variants.
# NOTE(review): "cellpadding"/"cellspacing" are HTML table attributes rather
# than CSS properties -- confirm whether the renderer honours them here.
props = [
    ("cellpadding", "0px"),
    ("cellspacing", "0px"),
    ("border", "1px solid black"),
    ("border-collapse", "collapse"),
    ("max-width", "80px !important"),
    ("word-wrap", "break-word"),
]
moreprops = props + [
    ("padding-top", "0px"),
    ("padding-bottom", "0px"),
    ("margin", "0px"),
    ("height", "0px"),
]
# Body cells and rows: small centered text.
smallprops = moreprops + [
    ("font-size", "9px"),
    ("text-align", "center"),
    ("width", "75px"),
]
# Header cells: larger white text on a gray background.
bigprops = moreprops + [
    ("font-size", "11px"),
    ("color", "white"),
    ("background-color", "DimGray"),
]

# Header cells, data cells and rows, in that order.
divider_style = [
    {"selector": selector, "props": selector_props}
    for selector, selector_props in (
        ("th", bigprops),
        ("td", smallprops),
        ("tr", smallprops),
    )
]
styled_dfs = styled_dfs.set_table_styles(divider_style)
styled_dfs = styled_dfs.set_table_attributes(
    'style="cellspacing:0;border-collapse:collapse;font-family:sans-serif;"'
)

styled_dfs


In [None]:
# Make sure the output directory exists, then write the styled table to a PDF
# using the project helper `styler_to_pdf`.
os.makedirs("outplots", exist_ok=True)
styler_to_pdf(styled_dfs, "outplots/surf-vs-col-table.pdf")


## Summarize Stat Table


In [None]:
def grade_effect(delta):
    """Classify a Cliff's delta as favoring column, surface, or neither.

    The delta was computed for Column samples versus Surface samples. An
    effect counts only if its cached description in `lookup_effect` contains
    "*"; it is then graded by sign (negative -> "column better", positive ->
    "surface better"). Effects without "*" and zero deltas are "neutral".

    Returns NaN for a missing (NaN) delta, e.g. a table cell with no data, so
    such cells stay ungraded instead of raising. Raises KeyError if a non-NaN
    delta has no cached description (stale `lookup_effect`).
    """
    if pd.isna(delta):
        return np.nan
    if "*" not in lookup_effect[delta]:
        return "neutral"
    return {
        -1: "column better",
        0: "neutral",
        1: "surface better",
    }[int(np.sign(delta))]


# Back to long form: one row per (Policy, configuration), one column per
# metric holding its Cliff's delta.
dfp = dfs.stack(level=0).reset_index().copy()

# Add a categorical grade next to each numeric delta column.
for grade_column, delta_column in (
    ("Strict Triplet Distance", "Strict<br/>Triplet<br/>Distance"),
    ("Inner Node Loss", "Inner<br/>Node<br/>Loss"),
):
    dfp[grade_column] = dfp[delta_column].apply(grade_effect)
dfp


In [None]:
# Give the numeric delta columns explicit "... Numerical" names that pair with
# the like-named categorical grade columns.
dfp = dfp.rename(
    {
        "Strict<br/>Triplet<br/>Distance": "Strict Triplet Distance Numerical",
        "Inner<br/>Node<br/>Loss": "Inner Node Loss Numerical",
    },
    axis="columns",
)


In [None]:
id_vars = [
    "Policy",
    "Scenario",
    "Unit",
    "Pop Size",
    "DSamp",
    "Size<br/>(bits)",
]

# Long form of the categorical grades (becomes variable_x / value_x).
grades_long = dfp.melt(
    id_vars=id_vars,
    value_vars=[
        "Strict Triplet Distance",
        "Inner Node Loss",
    ],
)
# Long form of the numeric deltas (becomes variable_y / value_y).
deltas_long = dfp.melt(
    id_vars=id_vars,
    value_vars=[
        "Strict Triplet Distance Numerical",
        "Inner Node Loss Numerical",
    ],
)

# Joining on the identifying columns pairs every grade row with every delta
# row of the same configuration, including mismatched metric pairs.
dfm = grades_long.merge(deltas_long, on=id_vars)


In [None]:
# Keep only rows where the grade and the numeric delta refer to the same
# metric; the merge also produced the cross-metric combinations.
is_strict_pair = dfm["variable_x"].eq("Strict Triplet Distance") & dfm[
    "variable_y"
].eq("Strict Triplet Distance Numerical")
is_loss_pair = dfm["variable_x"].eq("Inner Node Loss") & dfm[
    "variable_y"
].eq("Inner Node Loss Numerical")

dfm = dfm[is_strict_pair | is_loss_pair].reset_index()


In [None]:
# Keep only the first line of each scenario label ("mild\nstructure" -> "mild").
dfm["Scenario"] = dfm["Scenario"].map(lambda label: label.partition("\n")[0])


In [None]:
# Rename the merged columns to the labels shown on the figure.
catplot_data = dfm.rename(
    columns={
        "variable_x": "Metric",
        "value_y": "Cliff's Delta",
        "value_x": "Relative Grade",
    },
)
# First three colors of an 8-color "hls" palette, in reverse order.
grade_palette = sns.color_palette("hls", 8)[:3][::-1]

# Strip plot of Cliff's delta per scenario, colored by grade; one column per
# metric and one row per policy.
tp.tee(
    sns.catplot,
    catplot_data,
    x="Cliff's Delta",
    row="Policy",
    row_order=["Tilted", "Steady"],
    order=[
        "plain",
        "mild",
        "rich",
        "drift",
    ],
    hue="Relative Grade",
    hue_order=[
        "surface better",
        "neutral",
        "column better",
    ],
    col="Metric",
    y="Scenario",
    margin_titles=True,
    aspect=2,
    height=1.5,
    palette=grade_palette,
    alpha=0.3,
    s=50,
    kind="strip",
    teeplot_outexclude=["post", "teeplot_postprocess"],
    teeplot_postprocess="teed.set_titles(col_template='{col_name}', row_template='{row_name}')",
)
plt.show()


## Reproducibility


In [None]:
# Record when the notebook was executed (naive local time, no UTC offset).
datetime.datetime.now().isoformat()


In [None]:
# Load the `watermark` IPython extension and print its default report for the
# reproducibility record.
%load_ext watermark
%watermark


In [None]:
!pip freeze


In [None]:
# Record the version of the Inkscape executable found on the shell PATH.
# NOTE(review): presumably needed by the PDF export helper -- confirm.
!inkscape --version
