In [None]:
import json
import math
import os

from iterpop import iterpop as ip
import pandas as pd
import seaborn as sns
from slugify import slugify
from teeplot import teeplot as tp


# load data from JSON

The source JSON file was manually extracted from a notebook file provided by Emily.
(The notebook can also be found in `assets/`.)

In [None]:
# Read the exported reconstruction-quality table from the assets directory.
with open("assets/reconstruction_quality.json") as json_file:
    data = json.load(json_file)

# Each row maps a column number (stored as a key) to a cell value; the key is
# parsed as an integer and offset by one to index into data['columns'], and the
# first entry of that column's 'label' becomes the field name.
records = []
for raw_row in data['data']:
    records.append({
        data['columns'][int(col_key) - 1]['label'][0] : cell
        for col_key, cell in raw_row.items()
    })

df = pd.DataFrame.from_records(records)


In [None]:
# inspect the table as loaded from JSON
df


In [None]:
# columns holding numbers; entries that fail to parse are coerced to NaN
numeric_cols = ["target", "differentia", "score"]

numeric_values = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# target and differentia are cast to the nullable Int64 dtype; score is left
# as produced by pd.to_numeric
df[numeric_cols] = numeric_values.astype({
    "differentia" : "Int64",
    "target" : "Int64",
})

df


In [None]:
# display names for the dist_fun codes
metric_names = {
    "rf" : "Generalized Robinson-Foulds Similarity",
    "info" : "Mutual Clustering Information",
    "cid" : "Clustering Information Distance",
}

# index the dict directly so that an unrecognized code raises KeyError
df["Tree Comparison Metric"] = df["dist_fun"].apply(
    lambda code: metric_names[code]
)

# descriptive aliases for the target and differentia columns
df["Num Column Bits"] = df["target"]
df["Num Differentia Bits"] = df["differentia"]


In [None]:
# keep every row whose Treatment is not "trueTree"
is_true_tree = df['Treatment'] == "trueTree"
df_results = df[~is_true_tree]

# write a csv copy for inclusion in PDF appendices
os.makedirs("outplots", exist_ok=True)
df_results.to_csv("outplots/reconstruction_quality_results.csv", index=False)

df_results


# plot scores

In [None]:
# one faceted bar chart per tree comparison metric:
# condition on the x axis, policy as hue, bit settings as the facet grid
for metric, df_group in df_results.groupby("Tree Comparison Metric"):
    print(metric)
    # plotting keyword arguments handed to tp.tee together with sns.catplot
    catplot_kwargs = dict(
        x="condition",
        y="score",
        hue="policy",
        col="Num Differentia Bits",
        row="Num Column Bits",
        data=df_group,
        kind="bar",
        margin_titles=True,
    )
    tp.tee(
        sns.catplot,
        **catplot_kwargs,
        teeplot_outattrs={
            "tree-comparison-metric" : slugify(metric),
        },
    )


In [None]:
# one faceted bar chart per tree comparison metric:
# column bits on the x axis, differentia bits as hue, condition/policy as the facet grid
for metric, df_group in df_results.groupby("Tree Comparison Metric"):
    print(metric)
    # plotting keyword arguments handed to tp.tee together with sns.catplot
    catplot_kwargs = dict(
        x="Num Column Bits",
        y="score",
        hue="Num Differentia Bits",
        col="condition",
        row="policy",
        data=df_group,
        kind="bar",
        margin_titles=True,
    )
    tp.tee(
        sns.catplot,
        **catplot_kwargs,
        teeplot_outattrs={
            "tree-comparison-metric" : slugify(metric),
        },
    )


# summarize RPR vs TDPR outcomes

In [None]:
# For each tree comparison metric, tally how the RPR and TDPR scores compare
# within every (condition, differentia bits, column bits) group.
for metric, df_distfun in df_results.groupby("Tree Comparison Metric"):

    num_RPR_greater, num_TDPR_greater, num_equivalent, num_nan = 0, 0, 0, 0

    for __, df_group in df_distfun.groupby([
        "condition",
        "Num Differentia Bits",
        "Num Column Bits",
    ]):
        # one score per policy is expected in each group
        # NOTE(review): presumably ip.popsingleton raises unless exactly one
        # value is present -- confirm against the iterpop documentation
        result = {
            policy : ip.popsingleton(df_subgroup["score"])
            for policy, df_subgroup in df_group.groupby("policy")
        }
        rpr_score, tdpr_score = result["RPR"], result["TDPR"]

        # Comparisons involving NaN are all False, so groups with a missing
        # score only contribute to num_nan.
        # Cast to int so the tallies remain plain Python ints: adding NumPy
        # booleans would turn them into NumPy scalars, which the `{name=}`
        # prints below render as e.g. `np.int64(3)` under NumPy 2.
        num_RPR_greater += int(rpr_score > tdpr_score)
        num_TDPR_greater += int(tdpr_score > rpr_score)
        num_equivalent += int(tdpr_score == rpr_score)
        num_nan += int(math.isnan(rpr_score) or math.isnan(tdpr_score))

    print(f"{metric=}")
    print(f"   {num_RPR_greater=}")
    print(f"   {num_TDPR_greater=}")
    print(f"   {num_equivalent=}")
    print(f"   {num_nan=}")
    

In [None]:
# Reshape so that each (metric, condition, differentia bits, column bits)
# combination is one row with the score of each policy in its own column.
df_policy = df_results.pivot(
    index=[
        "Tree Comparison Metric",
        "condition",
        "Num Differentia Bits",
        "Num Column Bits",
    ],
    columns="policy",
    values="score",
).reset_index()

# "sign" is True where TDPR scored strictly higher than RPR, False otherwise,
# and missing where either score is missing.
# Use the nullable "boolean" dtype and mask the missing rows rather than
# assigning NaN into a plain bool column via .loc, which relies on silent
# upcasting that recent pandas releases deprecate.
score_missing = df_policy["RPR"].isna() | df_policy["TDPR"].isna()
df_policy["sign"] = (
    (df_policy["TDPR"] > df_policy["RPR"])
    .astype("boolean")
    .mask(score_missing)
)

# save as csv for inclusion in PDF appendices
# (missing sign values are written as empty fields)
os.makedirs("outplots", exist_ok=True)
df_policy.to_csv("outplots/reconstruction_quality_results_by_policy.csv", index=False)

df_policy


# summarize differentia bits outcomes

In [None]:
# For each tree comparison metric, tally how the 1-bit and 64-bit differentia
# scores compare within every (condition, policy, column bits) group.
for metric, df_distfun in df_results.groupby("Tree Comparison Metric"):

    num_1bit_greater, num_64bit_greater, num_equivalent, num_nan = 0, 0, 0, 0

    for __, df_group in df_distfun.groupby([
        "condition",
        "policy",
        "Num Column Bits",
    ]):
        # one score per differentia width is expected in each group
        # NOTE(review): presumably ip.popsingleton raises unless exactly one
        # value is present -- confirm against the iterpop documentation
        result = {
            int(num_differentia_bits) : ip.popsingleton(df_subgroup["score"])
            for num_differentia_bits, df_subgroup in df_group.groupby("Num Differentia Bits")
        }
        score_1bit, score_64bit = result[1], result[64]

        # Comparisons involving NaN are all False, so groups with a missing
        # score only contribute to num_nan.
        # Cast to int so the tallies remain plain Python ints: adding NumPy
        # booleans would turn them into NumPy scalars, which the `{name=}`
        # prints below render as e.g. `np.int64(3)` under NumPy 2.
        num_1bit_greater += int(score_1bit > score_64bit)
        num_64bit_greater += int(score_64bit > score_1bit)
        num_equivalent += int(score_64bit == score_1bit)
        num_nan += int(math.isnan(score_64bit) or math.isnan(score_1bit))

    print(f"{metric=}")
    print(f"   {num_1bit_greater=}")
    print(f"   {num_64bit_greater=}")
    print(f"   {num_equivalent=}")
    print(f"   {num_nan=}")
    

In [None]:
# Reshape so that each (metric, condition, policy, column bits) combination is
# one row with the score for each differentia width in its own column.
df_diffbits = df_results.pivot(
    index=[
        "Tree Comparison Metric",
        "condition",
        "policy",
        "Num Column Bits",
    ],
    columns="Num Differentia Bits",
    values="score",
).reset_index()

# "sign" is True where the 64-bit score is strictly higher than the 1-bit
# score, False otherwise, and missing where either score is missing.
# Use the nullable "boolean" dtype and mask the missing rows rather than
# assigning NaN into a plain bool column via .loc, which relies on silent
# upcasting that recent pandas releases deprecate.
score_missing = df_diffbits[64].isna() | df_diffbits[1].isna()
df_diffbits["sign"] = (
    (df_diffbits[64] > df_diffbits[1])
    .astype("boolean")
    .mask(score_missing)
)

# save as csv for inclusion in PDF appendices
# (missing sign values are written as empty fields)
os.makedirs("outplots", exist_ok=True)
df_diffbits.to_csv("outplots/reconstruction_quality_results_by_differentia_bits.csv", index=False)

df_diffbits
