In [1]:
# Load the watermark IPython extension, which provides the %watermark magic.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Print a provenance report (timestamp, interpreter, machine details, and the
# versions of the imported packages) so the run environment is recorded.
%watermark -diwmuv -iv


Last updated: 2025-07-13T00:25:25.321965+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1030-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

hstrat                            : 1.20.10
numpy                             : 2.1.2
pandas                            : 2.2.3
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# NOTE(review): presumably the output subdirectory name for teeplot-saved
# figures; nothing in the visible cells reads this value -- confirm it is used.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
# dstream-surface deserialization parameters read from identically named
# columns of the genome dataframe.
_SURFACE_PARAM_COLUMNS = (
    "dstream_S",
    "dstream_storage_bitoffset",
    "dstream_storage_bitwidth",
    "dstream_T_bitoffset",
    "dstream_T_bitwidth",
)


def _resolve_dstream_algo(dotted_name: str):
    """Resolve a dotted name such as "dstream.tilted_algo" by attribute lookup.

    Only names rooted at ``dstream`` are accepted; this replaces an ``eval``
    call so that arbitrary expressions cannot be executed.
    """
    root, _, attr_path = dotted_name.strip().partition(".")
    if root != "dstream" or not attr_path:
        raise ValueError(
            f"dstream_algo must look like 'dstream.<name>', got {dotted_name!r}"
        )
    resolved = dstream
    for attr in attr_path.split("."):
        resolved = getattr(resolved, attr)
    return resolved


def _sorted_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return a tree's pairwise distance table, sorted by label on both axes."""
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): relies on the private `_data` attribute of the data table.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed=None,
) -> np.ndarray:
    """Compare pairwise distances between trie and searchtable reconstructions.

    A random subsample of genomes is deserialized into hstrat surfaces, each
    surface's storage is shuffled in place, and a tree is reconstructed from
    the same population with both ``hstrat.build_tree_trie`` and
    ``hstrat.build_tree_searchtable``. The two pairwise distance matrices are
    then compared element-wise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records; must provide a ``data_hex`` column and the
        ``dstream_*`` columns listed in ``_SURFACE_PARAM_COLUMNS``, each
        holding a single unique value within the subsample. The caller's
        frame is not modified.
    dstream_algo : str
        Dotted name of the dstream algorithm, e.g. ``"dstream.tilted_algo"``.
    sample_size : int, default 100
        Number of genomes to subsample (sized so the dendropy/naive path can
        handle it).
    seed : int, optional
        Seed for subsampling and shuffling. When omitted, NumPy's global
        random state is used, as before.

    Returns
    -------
    np.ndarray
        Square array ``(naive - shortcut) / (naive / 2 + shortcut / 2)`` with
        rows and columns ordered by sorted taxon label. Entries where both
        distances are zero (such as the diagonal) are NaN.

    Raises
    ------
    ValueError
        If ``dstream_algo`` is not rooted at ``dstream``, or if a surface
        parameter column holds more than one unique value.
    """
    if seed is None:
        # keep the historical behavior of drawing from the global state
        sample_state, shuffle = None, np.random.shuffle
    else:
        rng = np.random.default_rng(seed)
        sample_state, shuffle = rng, rng.shuffle

    # `.sample` returns a new frame, so adding a column leaves the input as-is
    sampled_df = raw_genome_df.sample(sample_size, random_state=sample_state)
    sampled_df["taxon_label"] = np.arange(len(sampled_df)).astype(str)

    surface_kwargs = {
        column: sampled_df[column].unique().item()
        for column in _SURFACE_PARAM_COLUMNS
    }
    surface_kwargs["dstream_algo"] = _resolve_dstream_algo(dstream_algo)

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive = _sorted_distance_frame(naive_df).values
    shortcut = _sorted_distance_frame(shortcut_df).values

    # 0 / 0 is expected wherever both distances vanish (e.g. the diagonal);
    # the resulting NaN is intentional, so silence the RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Load the "sample" genome dataset, then report the normalized error matrix
# and its NaN-ignoring summary statistics for each dstream algorithm in turn.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10160.62it/s]
100%|██████████| 100/100 [00:00<00:00, 393.39it/s]
6109it [00:00, 573670.73it/s]
100%|██████████| 100/100 [00:00<00:00, 244423.31it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,-0.087056,-0.292316,0.657395,-0.134938,0.606297,0.284014,-0.479115,0.310926,...,-0.096436,-0.190809,0.362885,-0.074767,-0.056105,-0.433653,0.118569,-0.225436,0.361813,-0.463387
1,0.000000,,0.000000,0.000000,0.000000,0.000000,-0.201210,0.000000,0.000000,-0.310408,...,0.000000,0.000000,-0.119720,0.000000,0.000000,0.000000,-0.398274,0.000000,-0.119138,0.000000
2,-0.087056,0.000000,,0.550273,-0.350023,-0.168884,0.000000,-0.310949,0.103334,0.000000,...,0.000000,0.089387,0.000000,-0.155340,0.000000,-0.110336,0.000000,0.603522,0.000000,-0.092291
3,-0.292316,0.000000,0.550273,,-0.108262,0.135427,0.000000,-0.096285,-0.307741,0.000000,...,0.656459,-0.000007,0.000000,0.057729,0.000000,-0.391314,0.000000,-0.264875,0.000000,-0.313647
4,0.657395,0.000000,-0.350023,-0.108262,,-0.439012,0.719218,-0.362516,-0.152982,0.370609,...,-0.398746,-0.134358,0.409007,-0.290114,-0.067959,0.429122,0.145352,-0.082789,0.407647,0.342359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.433653,0.000000,-0.110336,-0.391314,0.429122,-0.200514,0.678685,1.140113,-0.751941,0.453211,...,-0.125850,-0.268371,0.585405,-0.091313,-0.085013,,0.185040,-0.280082,0.583084,0.000000
96,0.118569,-0.398274,0.000000,0.000000,0.145352,0.000000,0.107958,0.120249,0.000000,0.173401,...,0.000000,0.000000,0.062395,0.000000,0.000000,0.185040,,0.000000,0.062079,0.131243
97,-0.225436,0.000000,0.603522,-0.264875,-0.082789,0.082509,0.000000,-0.075598,-0.178021,0.000000,...,0.995714,-0.000004,0.000000,0.045335,0.000000,-0.280082,0.000000,,0.000000,-0.237915
98,0.361813,-0.119138,0.000000,0.000000,0.407647,0.000000,-0.478605,0.454498,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.583084,0.062079,0.000000,,0.479814


np.nanmean(norm_err)=np.float64(-0.00683177028433345)
    np.nanmean(np.abs(norm_err))=np.float64(0.18087078706043386)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.10028324413717772)
    


100%|██████████| 100/100 [00:00<00:00, 28441.74it/s]
100%|██████████| 100/100 [00:00<00:00, 395.24it/s]
5972it [00:00, 590095.73it/s]
100%|██████████| 100/100 [00:00<00:00, 280180.63it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-4.956537e-07,-4.657802e-07,0.026212,1.056728e-02,-0.003464,-4.285675e-07,-2.504316e-07,0.034118,-4.005798e-07,...,-1.986335e-07,-3.400446e-07,9.607677e-03,-3.718447e-07,-3.080692e-07,-4.015507e-07,1.109213e-02,-2.244117e-07,-1.922497e-07,-2.356545e-07
1,-4.956537e-07,,-4.557492e-07,-0.003198,-3.641522e-07,0.003420,-4.262199e-07,-2.264855e-07,-0.004142,-3.931381e-07,...,-1.981278e-07,-3.294569e-07,-1.657195e-07,-3.083988e-07,-3.074606e-07,-3.966341e-07,-1.910024e-07,-2.524587e-07,-1.917759e-07,-2.305205e-07
2,-4.657802e-07,-4.557492e-07,,0.012154,1.436068e-01,-0.003007,-3.722096e-07,-2.139454e-07,0.015484,-3.787498e-07,...,-1.856079e-07,-4.327029e-07,1.305483e-01,-2.856041e-07,-2.921691e-07,-3.749535e-07,1.507468e-01,-2.079261e-07,-1.800221e-07,-2.206888e-07
3,2.621233e-02,-3.198296e-03,1.215355e-02,,1.008493e-02,0.000000,-1.457339e-02,2.435848e-02,0.101709,1.075657e-02,...,-8.995085e-02,1.623608e-02,9.307123e-03,3.115542e-02,-6.516708e-03,5.392585e-03,1.050068e-02,-2.953411e-03,-5.230070e-03,1.226857e-02
4,1.056728e-02,-3.641522e-07,1.436068e-01,0.010085,,-0.148643,-2.638182e-07,9.875499e-03,0.012276,1.906326e-02,...,-1.540469e-07,1.913946e-01,-1.527895e-07,1.237480e-02,-2.515982e-07,3.809856e-03,-1.740282e-07,-1.691123e-07,-1.501794e-07,2.157082e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-4.015507e-07,-3.966341e-07,-3.749535e-07,0.005393,3.809856e-03,-0.002417,-2.993375e-07,-1.865355e-07,0.006668,-3.315162e-07,...,-1.655149e-07,-2.511986e-07,3.514632e-03,-2.387677e-07,-2.666879e-07,,3.967908e-03,-1.830344e-07,-1.610585e-07,-1.892640e-07
96,1.109213e-02,-1.910024e-07,1.507468e-01,0.010501,-1.740282e-07,-0.160312,0.000000e+00,1.033240e-02,0.012897,1.985401e-02,...,0.000000e+00,2.042906e-01,0.000000e+00,1.310067e-02,-1.299966e-07,3.967908e-03,,0.000000e+00,0.000000e+00,2.258879e-02
97,-2.244117e-07,-2.524587e-07,-2.079261e-07,-0.002953,-1.691123e-07,0.002902,0.000000e+00,0.000000e+00,-0.003740,-1.815445e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.443851e-07,-1.830344e-07,0.000000e+00,,0.000000e+00,0.000000e+00
98,-1.922497e-07,-1.917759e-07,-1.800221e-07,-0.005230,-1.501794e-07,0.002288,0.000000e+00,0.000000e+00,-0.006421,-1.599037e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,4.822172e-02,-1.610585e-07,0.000000e+00,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(-7.077548846772133e-05)
    np.nanmean(np.abs(norm_err))=np.float64(0.00967085150122307)
    np.nanmedian(norm_err)=np.float64(-1.437481344618381e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0009676191508429756)
    


In [7]:
# Load the "tail" genome dataset, then report the normalized error matrix
# and its NaN-ignoring summary statistics for each dstream algorithm in turn.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35137.00it/s]
100%|██████████| 100/100 [00:00<00:00, 418.48it/s]
5937it [00:00, 553540.72it/s]
100%|██████████| 100/100 [00:00<00:00, 338796.77it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31636.02it/s]
100%|██████████| 100/100 [00:00<00:00, 984.04it/s]
5947it [00:00, 565587.18it/s]
100%|██████████| 100/100 [00:00<00:00, 322887.14it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.038680e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036360e-07,0.000000e+00,-1.038406e-07,...,-1.036792e-07,-1.067926e-07,0.000000e+00,0.000000e+00,-1.129436e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036103e-07,-1.066655e-07
1,-1.038680e-07,,-1.039262e-07,-1.097144e-07,-1.039220e-07,-1.037113e-07,-1.037508e-07,-2.193198e-07,-1.166386e-07,-2.136243e-07,...,-2.132828e-07,-2.078989e-07,-1.038865e-07,-1.068570e-07,-2.075899e-07,-1.099525e-07,-1.038793e-07,-1.065891e-07,-2.192622e-07,-2.076581e-07
2,0.000000e+00,-1.039262e-07,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036939e-07,0.000000e+00,-1.038988e-07,...,-1.037372e-07,-1.068541e-07,0.000000e+00,0.000000e+00,-1.097607e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036681e-07,-1.067268e-07
3,0.000000e+00,-1.097144e-07,0.000000e+00,,0.000000e+00,0.000000e+00,0.000000e+00,-1.161192e-07,0.000000e+00,-1.066183e-07,...,-1.064481e-07,-1.037658e-07,0.000000e+00,0.000000e+00,-1.036119e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.277497e-07,-1.036458e-07
4,0.000000e+00,-1.039220e-07,0.000000e+00,0.000000e+00,,0.000000e+00,0.000000e+00,-1.036897e-07,0.000000e+00,-1.038946e-07,...,-1.037330e-07,-1.068496e-07,0.000000e+00,0.000000e+00,-1.130074e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036640e-07,-1.067224e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-1.099525e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.200485e-07,0.000000e+00,-1.068430e-07,...,-1.066722e-07,-1.039787e-07,0.000000e+00,0.000000e+00,-1.038241e-07,,0.000000e+00,0.000000e+00,-1.163534e-07,-1.038582e-07
96,0.000000e+00,-1.038793e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036472e-07,0.000000e+00,-1.038519e-07,...,-1.036904e-07,-1.068045e-07,0.000000e+00,0.000000e+00,-1.097083e-07,0.000000e+00,,0.000000e+00,-1.036215e-07,-1.066774e-07
97,0.000000e+00,-1.065891e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.063448e-07,0.000000e+00,-1.128659e-07,...,-1.094426e-07,-1.037109e-07,0.000000e+00,0.000000e+00,-1.035571e-07,0.000000e+00,0.000000e+00,,-1.063177e-07,-1.035910e-07
98,-1.036103e-07,-2.192622e-07,-1.036681e-07,-1.277497e-07,-1.036640e-07,-1.034544e-07,-1.034936e-07,-2.320518e-07,-1.096283e-07,-2.130792e-07,...,-2.127394e-07,-2.073825e-07,-1.036287e-07,-1.065842e-07,-2.070751e-07,-1.163534e-07,-1.036215e-07,-1.063177e-07,,-2.071429e-07


np.nanmean(norm_err)=np.float64(-1.0030811084184818e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0030811084184818e-07)
    np.nanmedian(norm_err)=np.float64(-1.038890826177626e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.038890826177626e-07)
    
