## Set Up Dependencies


In [None]:
import datetime
import os

from cliffs_delta import cliffs_delta
import pandas as pd
import pecking
import matplotlib as mpl
import pecking
from slugify import slugify
from teeplot import teeplot as tp

from pylib._describe_effect import describe_effect
from pylib._styler_to_pdf import styler_to_pdf


## Data Retrieval and Preprocessing


In [None]:
# Download the parquet file published at this OSF URL and load it as `df`,
# the frame every later cell builds on.
df = pd.read_parquet("https://osf.io/ah7v5/download")


In [None]:
# Mirror each raw snake_case column under a human-readable label.  The raw
# columns are left in place, so both spellings stay available downstream.
_raw_column_by_label = {
    "Num Reconstructed Inner Nodes": "num_reconstructed_inner_nodes",
    "Num True Inner Nodes": "num_true_inner_nodes",
    "Raw Triplet Distance": "triplet_distance_raw",
    "Sampled Triplet Distance Strict Reconst": (
        "sampled_triplet_distance_strict_reconst"
    ),
    "Strict Triplet Distance": "sampled_triplet_distance_strict",
    "Lax Triplet Distance": "sampled_triplet_distance_lax",
    "Differentia Width (bits)": "differentia_width_bits",
    "Annotation Size (bits)": "annotation_size_bits",
    "Algorithm": "algorithm",
    "Population Size": "population_size",
    "Num Generations": "num_generations",
    "Num Islands": "num_islands",
    "Num Niches": "num_niches",
    "Tournament Size": "tournament_size",
    "Replicate": "replicate",
    "Downsample": "downsample",
}
for _label, _raw_column in _raw_column_by_label.items():
    df[_label] = df[_raw_column]


In [None]:
# Ratio of reconstructed to true inner-node counts, plus its complement
# (one minus the ratio), stored as two derived columns.
_inner_node_resolution = df["Num Reconstructed Inner Nodes"].div(
    df["Num True Inner Nodes"]
)
df["Inner Node Resolution"] = _inner_node_resolution
df["Inner Node Loss"] = 1 - _inner_node_resolution


In [None]:
# Composite label of the form "islands<I>-niches<N>-tsize<T>", assembled from
# the island count, niche count and tournament size of each row.
_islands_part = "islands" + df["Num Islands"].astype(str)
_niches_part = "-niches" + df["Num Niches"].astype(str)
_tsize_part = "-tsize" + df["Tournament Size"].astype(str)
df["Dynamics"] = _islands_part + _niches_part + _tsize_part


In [None]:
# Composite label of the form "npop<P>-ngen<G>", assembled from the
# population size and generation count of each row.
_npop_part = "npop" + df["Population Size"].astype(str)
_ngen_part = "-ngen" + df["Num Generations"].astype(str)
df["Scale"] = _npop_part + _ngen_part


In [None]:
# Short scenario names keyed by the composite "Dynamics" label.  Series.map
# yields NaN for any dynamics value that is not listed here.
_scenario_by_dynamics = dict(
    [
        ("islands1-niches1-tsize2", "plain"),
        ("islands1-niches1-tsize1", "drift"),
        ("islands4-niches2-tsize2", "mild\nstructure"),
        ("islands64-niches8-tsize2", "rich\nstructure"),
    ]
)
df["Scenario"] = df["Dynamics"].map(_scenario_by_dynamics)


In [None]:
# Policy name for each algorithm identifier.  Algorithms missing from the
# table map to NaN.
_policy_by_algorithm = dict(
    [
        ("surf-tilted", "Tilted"),
        ("col-tilted", "Tilted"),
        ("surf-hybrid", "Hybrid"),
        ("surf-steady", "Steady"),
        ("col-steady", "Steady"),
    ]
)
df["Policy"] = df["Algorithm"].map(_policy_by_algorithm)


In [None]:
# Implementation family ("Surface" or "Column") for each algorithm
# identifier.  Algorithms missing from the table map to NaN.
_implementation_by_algorithm = dict(
    [
        ("surf-tilted", "Surface"),
        ("col-tilted", "Column"),
        ("surf-hybrid", "Surface"),
        ("surf-steady", "Surface"),
        ("col-steady", "Column"),
    ]
)
df["Implementation"] = df["Algorithm"].map(_implementation_by_algorithm)


In [None]:
# Add an alias of every existing column whose name has spaces replaced by
# newlines.  The column list is snapshotted first, so aliases created during
# the loop are not themselves revisited.
for _existing_name in tuple(df.columns):
    _wrapped_name = _existing_name.replace(" ", "\n")
    df[_wrapped_name] = df[_existing_name]


In [None]:
# Columns whose value combinations define the groups used below: the plot
# loops produce one figure per combination, and the stat table includes
# these columns among its grouping keys.
sensitivity_analysis_variables = [
    "Differentia\nWidth\n(bits)",
    "Algorithm",
    # "Num\nGenerations",
    "Annotation\nSize\n(bits)",
]


In [None]:
# Reshape to long form: the three metric columns are stacked into a
# "variable"/"value" pair while the identifier columns below are carried
# through unchanged on every row.
_melt_id_columns = [
    "Algorithm",
    "Annotation Size (bits)",
    "Annotation\nSize\n(bits)",
    "annotation_size_bits",
    "Differentia Width (bits)",
    "Differentia\nWidth\n(bits)",
    "differentia_width_bits",
    "Downsample",
    "downsample",
    "Dynamics",
    "Implementation",
    "Policy",
    "Scenario",
    "Scale",
    "Population Size",
    "Population\nSize",
    "population_size",
    "Num Generations",
    "Num\nGenerations",
    "num_generations",
    "Num Islands",
    "Num Niches",
    "Tournament Size",
    "Replicate",
]
_melt_metric_columns = [
    "Strict\nTriplet\nDistance",
    "Lax\nTriplet\nDistance",
    "Inner\nNode\nLoss",
]
dfx = pd.melt(
    df,
    id_vars=_melt_id_columns,
    value_vars=_melt_metric_columns,
)
dfx.head()


## Visualize Main Metrics -- Population Size


In [None]:
# One figure per combination of the sensitivity-analysis variables, limited
# to downsample == 500 and the two surface algorithms.  Population size is
# the hue, so each panel contrasts population sizes within a scenario.
for name, group in dfx[
    (dfx["downsample"] == 500)
    & (dfx["Algorithm"].isin(["surf-tilted", "surf-hybrid"]))
].groupby(sensitivity_analysis_variables):
    group_variables = dict(zip(sensitivity_analysis_variables, name))

    row_order = [
        "Strict\nTriplet\nDistance",
        "Lax\nTriplet\nDistance",
    ]
    # The inner-node-loss row is only drawn when differentia are wider than
    # a single bit.
    if group_variables["Differentia\nWidth\n(bits)"] > 1:
        row_order.append(
            "Inner\nNode\nLoss",
        )

    tp.tee(
        pecking.peckplot,
        data=group.reset_index(drop=True),
        score="value",
        x="Scenario",
        y="value",
        col="Num Generations",
        row="variable",
        row_order=row_order,
        hue="Population Size",
        x_group="outer",
        order=[
            "plain",
            "mild\nstructure",
            "rich\nstructure",
            "drift",
        ],
        skim_hatches=("*",),
        skim_labels=("Best",),
        skimmers=[pecking.skim_lowest],
        margin_titles=True,
        height=1.7,
        aspect=2.5,
        facet_kws={
            "ylim": (0, 1),
        },
        # slugify is a text function, but some group keys are numeric
        # (differentia width, annotation size); coerce them to str so their
        # values end up in the output attributes.
        teeplot_outattrs={
            slugify(k): slugify(str(v)) for k, v in group_variables.items()
        },
        teeplot_outexclude="teeplot_postprocess",
        teeplot_postprocess="teed.figure.subplots_adjust(right=0.75); teed.set_titles(row_template='{row_name}')",
    )
    display(group_variables)


## Visualize Main Metrics -- Downsample


In [None]:
# One figure per combination of the sensitivity-analysis variables, limited
# to population size 65536 and the two surface algorithms.  Downsample is
# the hue, so each panel contrasts sample sizes within a scenario.
for name, group in dfx[
    (dfx["Population Size"] == 65536)
    & (dfx["Algorithm"].isin(["surf-tilted", "surf-hybrid"]))
].groupby(sensitivity_analysis_variables):
    group_variables = dict(zip(sensitivity_analysis_variables, name))

    row_order = [
        "Strict\nTriplet\nDistance",
        "Lax\nTriplet\nDistance",
    ]
    # The inner-node-loss row is only drawn when differentia are wider than
    # a single bit.
    if group_variables["Differentia\nWidth\n(bits)"] > 1:
        row_order.append(
            "Inner\nNode\nLoss",
        )

    tp.tee(
        pecking.peckplot,
        data=group.reset_index(drop=True),
        score="value",
        x="Scenario",
        y="value",
        col="Num Generations",
        row="variable",
        row_order=row_order,
        hue="Downsample",
        x_group="outer",
        # These labels must match the values written to the "Scenario"
        # column ("plain", "mild\nstructure", "rich\nstructure", "drift");
        # labels the column never contains would not match any data.
        order=[
            "plain",
            "mild\nstructure",
            "rich\nstructure",
            "drift",
        ],
        skim_hatches=("*",),
        skim_labels=("Best",),
        skimmers=[pecking.skim_lowest],
        margin_titles=True,
        height=1.7,
        aspect=2.5,
        facet_kws={
            "ylim": (0, 1),
        },
        # slugify is a text function, but some group keys are numeric
        # (differentia width, annotation size); coerce them to str so their
        # values end up in the output attributes.
        teeplot_outattrs={
            slugify(k): slugify(str(v)) for k, v in group_variables.items()
        },
        teeplot_outexclude="teeplot_postprocess",
        teeplot_postprocess="teed.figure.subplots_adjust(right=0.75); teed.set_titles(row_template='{row_name}')",
    )
    display(group_variables)


## Make Stat Table


In [None]:
# Filled in by `cached_delta`: maps each returned (negated) delta value to
# the effect description computed for the same pair of samples.
lookup_effect = {}


def cached_delta(x, y):
    """Return the negated Cliff's delta of `x` versus `y`.

    As a side effect, the result of `describe_effect(x, y)` is stored in
    `lookup_effect` under the returned value so it can be looked up later
    from the number alone.

    NOTE(review): the lookup is keyed by the float itself, so two
    comparisons that yield the same delta share one entry and the later
    description replaces the earlier one -- confirm that descriptions
    cannot differ for equal deltas.
    """
    delta_result = cliffs_delta(x, y)
    negated_delta = -delta_result[0]
    lookup_effect[negated_delta] = describe_effect(x, y)
    return negated_delta


# Grouping keys for the effect-size records.  `sensitivity_analysis_variables`
# already contains "Algorithm", so the explicit entry would otherwise make it
# a grouping key twice; dict.fromkeys drops repeats while keeping the
# first-seen order.
vars = list(
    dict.fromkeys(
        [
            "Policy",
            "Scenario",
            *sensitivity_analysis_variables,
            "Num Generations",
            "Algorithm",
        ]
    )
)

def _scaling_effect_records(
    data, group_keys, factor_column, first_level, second_level, label
):
    """Build one effect-size record per group for a single scaling factor.

    `data` is grouped by `group_keys`.  Within each group, the rows where
    `factor_column` equals `first_level` are compared against the rows where
    it equals `second_level` by calling `cached_delta` once per metric.
    Inner node loss is only compared when the group's differentia width is
    greater than one bit; otherwise NaN is recorded.  Each record holds the
    group key values, the three metric entries, and `label` under
    "Scaling Factor".

    Returns the list of record dicts in group iteration order.
    """
    # Table header -> source column in `data`, in the order they are computed.
    metric_by_header = {
        "Strict<br/>Triplet<br/>Distance": "Strict Triplet Distance",
        "Lax<br/>Triplet<br/>Distance": "Lax Triplet Distance",
        "Inner<br/>Node<br/>Loss": "Inner Node Loss",
    }
    built = []
    for name, group in data.reset_index().groupby(group_keys):
        group_variables = dict(zip(group_keys, name))
        # Select each side of the comparison once and reuse it per metric.
        first_rows = group[group[factor_column] == first_level]
        second_rows = group[group[factor_column] == second_level]

        record = dict(group_variables)
        for header, metric in metric_by_header.items():
            skip_metric = (
                metric == "Inner Node Loss"
                and not group_variables["Differentia\nWidth\n(bits)"] > 1
            )
            record[header] = (
                float("nan")
                if skip_metric
                else cached_delta(first_rows[metric], second_rows[metric])
            )
        record["Scaling Factor"] = label
        built.append(record)
    return built


records = [
    # Effect of population size (4096 vs 65536) at downsample 500.
    *_scaling_effect_records(
        df[df["downsample"] == 500],
        vars,
        "Population Size",
        4096,
        65536,
        "Population Size",
    ),
    # Effect of sample size (downsample 500 vs 8000) at population 65536.
    *_scaling_effect_records(
        df[df["Population Size"] == 65536],
        vars,
        "Downsample",
        500,
        8000,
        "Sample Size",
    ),
]


dfr = pd.DataFrame.from_records(records)
dfr


In [None]:
# "coolwarm" colormap used for the background gradient of the stat tables;
# its "bad" color (used for masked/invalid values) is set to white.
cmap = mpl.colormaps.get_cmap("coolwarm")
cmap.set_bad(color="white")


In [None]:
# Table-friendly columns: a unit name for the differentia width (widths
# other than 1 and 8 map to NaN) and an HTML-wrapped alias of annotation size.
_unit_by_width = {1: "bit", 8: "byte"}
_width_bits = dfr["Differentia\nWidth\n(bits)"]
dfr["Unit"] = _width_bits.map(_unit_by_width)
dfr["Size<br/>(bits)"] = dfr["Annotation\nSize\n(bits)"]


## Draw Stat Table --- tilted


In [None]:
# Row index levels of the rendered table.  "Scaling Factor" is level 0, so
# `unstack(level=0)` below pivots it into the columns.
vars = [
    "Scaling Factor",
    "Scenario",
    "Unit",
    # "Num Generations",
    "Size<br/>(bits)",
]
_metric_headers = [
    "Strict<br/>Triplet<br/>Distance",
    "Lax<br/>Triplet<br/>Distance",
    "Inner<br/>Node<br/>Loss",
]

# Restrict to the 100000-generation runs of the surf-tilted algorithm.
_row_selector = (dfr["Num Generations"] == 100000) & (
    dfr["Algorithm"].isin(["surf-tilted"])
)
_effect_table = (
    dfr[_row_selector]
    .set_index(vars)
    .unstack(level=0)[_metric_headers]
    .swaplevel(axis="columns")
    .sort_index(axis=1, level=0, ascending=False)
)
# Color cells by delta on a fixed [-1, 1] scale and print the stored effect
# description for each value ("n/a" when no description is stored).
styled_dfs = _effect_table.style.background_gradient(
    cmap=cmap, vmin=-1, vmax=1, axis=None
).format(lambda x: lookup_effect.get(x, "n/a"))

# Style declarations shared by every cell, then specialised for data and
# header cells.
props = [
    ("cellpadding", "0px"),
    ("cellspacing", "0px"),
    ("border", "1px solid black"),
    ("border-collapse", "collapse"),
    ("max-width", "80px !important"),
    ("word-wrap", "break-word"),
]
moreprops = props + [
    ("padding-top", "0px"),
    ("padding-bottom", "0px"),
    ("margin", "0px"),
    ("height", "0px"),
]
smallprops = moreprops + [
    ("font-size", "9px"),
    ("text-align", "center"),
    ("width", "65px"),
]
bigprops = moreprops + [
    ("font-size", "11px"),
    ("color", "white"),
    ("background-color", "DimGray"),
]

divider_style = [
    {"selector": selector, "props": selector_props}
    for selector, selector_props in (
        ("th", bigprops),  # header cell borders
        ("td", smallprops),  # data cell borders
        ("tr", smallprops),  # row borders
    )
]
styled_dfs = styled_dfs.set_table_styles(divider_style)
styled_dfs = styled_dfs.set_table_attributes(
    'style="cellspacing:0;border-collapse:collapse;font-family:sans-serif;"'
)

styled_dfs


In [None]:
# Make sure the output directory exists (no error if it already does), then
# hand the styled tilted table to the project's PDF export helper.
os.makedirs("outplots", exist_ok=True)
styler_to_pdf(styled_dfs, "outplots/dsamp-popsize-scale-tilted.pdf")


## Draw Stat Table --- hybrid


In [None]:
# Row index levels of the rendered table.  "Scaling Factor" is level 0, so
# `unstack(level=0)` below pivots it into the columns.
vars = [
    "Scaling Factor",
    "Scenario",
    "Unit",
    # "Num Generations",
    "Size<br/>(bits)",
]
_metric_headers = [
    "Strict<br/>Triplet<br/>Distance",
    "Lax<br/>Triplet<br/>Distance",
    "Inner<br/>Node<br/>Loss",
]

# Restrict to the 100000-generation runs of the surf-hybrid algorithm.
_row_selector = (dfr["Num Generations"] == 100000) & (
    dfr["Algorithm"].isin(["surf-hybrid"])
)
_effect_table = (
    dfr[_row_selector]
    .set_index(vars)
    .unstack(level=0)[_metric_headers]
    .swaplevel(axis="columns")
    .sort_index(axis=1, level=0, ascending=False)
)
# Color cells by delta on a fixed [-1, 1] scale and print the stored effect
# description for each value ("n/a" when no description is stored).
styled_dfs = _effect_table.style.background_gradient(
    cmap=cmap, vmin=-1, vmax=1, axis=None
).format(lambda x: lookup_effect.get(x, "n/a"))

# Style declarations shared by every cell, then specialised for data and
# header cells.
props = [
    ("cellpadding", "0px"),
    ("cellspacing", "0px"),
    ("border", "1px solid black"),
    ("border-collapse", "collapse"),
    ("max-width", "80px !important"),
    ("word-wrap", "break-word"),
]
moreprops = props + [
    ("padding-top", "0px"),
    ("padding-bottom", "0px"),
    ("margin", "0px"),
    ("height", "0px"),
]
smallprops = moreprops + [
    ("font-size", "9px"),
    ("text-align", "center"),
    ("width", "65px"),
]
bigprops = moreprops + [
    ("font-size", "11px"),
    ("color", "white"),
    ("background-color", "DimGray"),
]

divider_style = [
    {"selector": selector, "props": selector_props}
    for selector, selector_props in (
        ("th", bigprops),  # header cell borders
        ("td", smallprops),  # data cell borders
        ("tr", smallprops),  # row borders
    )
]
styled_dfs = styled_dfs.set_table_styles(divider_style)
styled_dfs = styled_dfs.set_table_attributes(
    'style="cellspacing:0;border-collapse:collapse;font-family:sans-serif;"'
)

styled_dfs


In [None]:
# Make sure the output directory exists (no error if it already does), then
# hand the styled hybrid table to the project's PDF export helper.
os.makedirs("outplots", exist_ok=True)
styler_to_pdf(styled_dfs, "outplots/dsamp-popsize-scale-hybrid.pdf")


## Reproducibility


In [None]:
# Display the current local date and time as an ISO 8601 string, recording
# when this notebook was executed.
datetime.datetime.now().isoformat()


In [None]:
# Load the `watermark` IPython extension and invoke its magic with default
# options (NOTE(review): presumably prints interpreter/machine details --
# confirm the desired flags).
%load_ext watermark
%watermark


In [None]:
!pip freeze


In [None]:
# Record the installed Inkscape version via the shell (NOTE(review):
# presumably Inkscape is used by the PDF export step -- confirm).
!inkscape --version
