In [None]:
%load_ext watermark


In [None]:
from downstream import dstream
from hstrat import hstrat
from hstrat import _auxiliary_lib as hstrat_aux
import pandas as pd


In [None]:
%watermark -diwmuv -iv


In [None]:
# NOTE(review): presumably names the teeplot output subdirectory for this
# notebook; no visible cell references it -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-quality"
teeplot_subdir


## Prep Data


In [None]:
# Remote inputs, both fetched over HTTP on every run: the reference
# phylogeny is read as CSV and the genome records as parquet.
TRUE_PHYLO_URL = "https://osf.io/ypqvb/download"
RAW_GENOME_URL = "https://osf.io/4x7p8/download"

true_phylo_df = pd.read_csv(TRUE_PHYLO_URL)
raw_genome_df = pd.read_parquet(RAW_GENOME_URL)


In [None]:
# Prepare the reference phylogeny in three passes, applied in this order:
# prune extinct lineages, mark leaves, then collapse unifurcations.
# `.pipe(f)` calls `f(frame)`, so each step receives the previous result.
true_phylo_df = (
    true_phylo_df.pipe(hstrat_aux.alifestd_prune_extinct_lineages_asexual)
    .pipe(hstrat_aux.alifestd_mark_leaves)
    .pipe(hstrat_aux.alifestd_collapse_unifurcations)
)
true_phylo_df


In [None]:
# Keep only genome records whose taxon_label also appears in the prepared
# reference phylogeny; .copy() detaches the result from the unfiltered frame.
in_true_phylo = raw_genome_df["taxon_label"].isin(
    true_phylo_df["taxon_label"],
)
raw_genome_df = raw_genome_df[in_true_phylo].copy()
raw_genome_df


In [None]:
# Deserialization settings are read from the genome table itself. Each
# `.unique().item()` fails unless the column holds exactly one distinct
# value, so mixed settings across rows abort the cell here.
#
# NOTE(review): eval() executes a string taken from the downloaded dataset,
# with only `dstream` supplied as a global. This is acceptable only if that
# file is trusted -- confirm, or replace with an explicit attribute lookup.
algo_spec = raw_genome_df["dstream_algo"].unique().item()
kwargs = {
    "dstream_algo": eval(algo_spec, {"dstream": dstream}),
    **{
        column: raw_genome_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    },
}

# Deserialize every hex string in the `data_hex` column through
# hstrat.surf_from_hex, passing the same `kwargs` settings to each call.
# Row order of raw_genome_df is preserved in the resulting list.
population = list(
    map(
        lambda genome_hex: hstrat.surf_from_hex(genome_hex, **kwargs),
        raw_genome_df["data_hex"],
    ),
)
len(population)


## Naive Reconstruction vs. Ground Truth


In [None]:
# Reconstruct a phylogeny from the deserialized population with
# hstrat.build_tree_trie, labeling taxa from the genome table's
# taxon_label column (same row order as `population`).
# NOTE(review): force_common_ancestry=True presumably makes all taxa share
# a single root -- confirm against the hstrat documentation.
naive_df = hstrat.build_tree_trie(
    population,
    taxon_labels=raw_genome_df["taxon_label"],
    force_common_ancestry=True,
)
naive_df


In [None]:
# Compare the naive reconstruction against the reference phylogeny;
# taxa are presumably matched between frames via their "taxon_label" columns.
# NOTE(review): the function name suggests a sampled estimate; if so, the
# displayed value may vary between runs unless an RNG seed is set -- confirm.
hstrat_aux.alifestd_estimate_triplet_distance_asexual(
    naive_df,
    true_phylo_df,
    taxon_label_key="taxon_label",
)


## Shortcut Reconstruction vs. Ground Truth


In [None]:
# Reconstruct a phylogeny from the same population and taxon labels, this
# time with hstrat.build_tree_searchtable.
# NOTE(review): force_common_ancestry=True presumably makes all taxa share
# a single root -- confirm against the hstrat documentation.
shortcut_df = hstrat.build_tree_searchtable(
    population,
    taxon_labels=raw_genome_df["taxon_label"],
    force_common_ancestry=True,
)
shortcut_df


In [None]:
# Compare the searchtable ("shortcut") reconstruction against the reference
# phylogeny; taxa are presumably matched via their "taxon_label" columns.
# NOTE(review): the function name suggests a sampled estimate; if so, the
# displayed value may vary between runs unless an RNG seed is set -- confirm.
hstrat_aux.alifestd_estimate_triplet_distance_asexual(
    shortcut_df,
    true_phylo_df,
    taxon_label_key="taxon_label",
)


## Naive Reconstruction vs. Shortcut Reconstruction


In [None]:
# Compare the two reconstructions with each other.
# NOTE(review): arguments are passed shortcut-first, the reverse of the
# section heading; harmless if the distance is symmetric -- confirm.
# NOTE(review): the function name suggests a sampled estimate; if so, the
# displayed value may vary between runs unless an RNG seed is set -- confirm.
hstrat_aux.alifestd_estimate_triplet_distance_asexual(
    shortcut_df,
    naive_df,
    taxon_label_key="taxon_label",
)
