In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-07-06T00:26:11.454433+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1029-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10
numpy                             : 2.1.2
pandas                            : 2.2.3
downstream                        : 1.14.3

Watermark: 2.4.3



In [4]:
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> pd.DataFrame:
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    # raw_genome_df = raw_genome_df[raw_genome_df["layer"] > 1700].sample(20).copy()
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
        dstream_S=raw_genome_df["dstream_S"].unique().item(),
        dstream_storage_bitoffset=raw_genome_df["dstream_storage_bitoffset"]
        .unique()
        .item(),
        dstream_storage_bitwidth=raw_genome_df["dstream_storage_bitwidth"]
        .unique()
        .item(),
        dstream_T_bitoffset=raw_genome_df["dstream_T_bitoffset"]
        .unique()
        .item(),
        dstream_T_bitwidth=raw_genome_df["dstream_T_bitwidth"].unique().item(),
    )

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = (
        pd.DataFrame(
            apc.RosettaTree(naive_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    shortcut_dist = (
        pd.DataFrame(
            apc.RosettaTree(shortcut_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    return (naive_dist.values - shortcut_dist.values) / (
        naive_dist.values / 2 + shortcut_dist.values / 2
    )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 10781.16it/s]
100%|██████████| 100/100 [00:00<00:00, 431.48it/s]
6108it [00:00, 614846.49it/s]
100%|██████████| 100/100 [00:00<00:00, 243430.30it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000001,0.008923,0.000000,0.008676,0.008340,0.000000,0.107672,0.000000,...,0.000000,0.000001,0.013259,0.000001,0.461892,0.161686,0.0,0.000000,0.013544,0.007357
1,0.000000,,0.000002,0.010103,0.000000,0.009787,0.009362,0.000000,0.130684,0.000000,...,0.000000,0.000002,0.016042,0.000002,0.000000,0.186317,0.0,0.000000,0.016461,0.008141
2,0.000001,0.000002,,0.049806,0.000002,0.734738,0.693260,-0.128337,0.000002,0.000002,...,-0.524013,0.000000,0.078441,0.000000,0.000002,-0.041981,0.0,-0.062640,0.080446,0.580194
3,0.008923,0.010103,0.049806,,0.016845,-0.047799,-0.045288,0.019437,0.014343,0.010775,...,0.009061,0.052241,0.000000,0.051107,0.011103,0.012138,0.0,0.019618,0.000000,-0.038333
4,0.000000,0.000000,0.000002,0.016845,,0.015986,0.014882,0.000000,-0.169562,0.000000,...,0.000000,0.000003,0.044019,0.000003,0.000000,0.451923,0.0,0.000000,0.047328,0.012017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.161686,0.186317,-0.041981,0.012138,0.451923,0.011686,0.011085,-0.090236,0.282635,0.200760,...,-0.038750,-0.044149,0.021864,-0.043139,0.207907,,0.0,-0.301092,0.022650,0.009413
96,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000
97,0.000000,0.000000,-0.062640,0.019618,0.000000,0.018462,0.017005,0.000000,0.000000,0.000000,...,0.000000,-0.067593,0.069833,-0.065252,0.000000,-0.301092,0.0,,0.078547,0.013363
98,0.013544,0.016461,0.080446,0.000000,0.047328,-0.043059,-0.039148,0.075722,0.031762,0.018325,...,0.013866,0.086995,0.000000,0.083896,0.019293,0.022650,0.0,0.078547,,-0.029801


np.nanmean(norm_err)=np.float64(0.009387748127987256)
    np.nanmean(np.abs(norm_err))=np.float64(0.05851757783136589)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.008163914877391754)
    


100%|██████████| 100/100 [00:00<00:00, 33803.22it/s]
100%|██████████| 100/100 [00:00<00:00, 397.55it/s]
5966it [00:00, 624456.42it/s]
100%|██████████| 100/100 [00:00<00:00, 275941.05it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.400448e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.342981e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,4.410844e-02,0.000000e+00,0.000000e+00,-1.593707e-07,-1.457124e-07,-3.813425e-07
1,-1.400448e-07,,-6.484007e-03,5.359658e-02,-1.737569e-03,8.187015e-03,-1.421780e-07,-2.060641e-02,-2.189583e-07,-2.017613e-02,...,-2.037525e-02,-1.349177e-07,7.932112e-03,-1.225372e-07,-1.346249e-07,-4.806635e-03,-5.251169e-03,-2.517449e-07,-1.340892e-02,-4.389709e-07
2,0.000000e+00,-6.484007e-03,,-1.881878e-02,3.326744e-03,-7.135731e-03,0.000000e+00,0.000000e+00,-1.910418e-07,0.000000e+00,...,0.000000e+00,-2.915108e-04,-6.745666e-03,0.000000e+00,0.000000e+00,-1.436841e-01,0.000000e+00,-2.533768e-04,-2.172836e-07,-1.523442e-06
3,0.000000e+00,5.359658e-02,-1.881878e-02,,-1.163150e-01,5.939024e-02,0.000000e+00,-1.039752e-02,-1.981658e-07,-9.630637e-03,...,-9.974972e-03,0.000000e+00,5.591437e-02,0.000000e+00,0.000000e+00,-6.083829e-03,-7.814273e-03,-2.592910e-07,-3.461112e-03,-2.135691e-06
4,0.000000e+00,-1.737569e-03,3.326744e-03,-1.163150e-01,,-1.830049e-03,0.000000e+00,1.772826e-02,-1.090408e-07,1.735998e-02,...,1.753042e-02,-1.374652e-04,-1.775629e-03,0.000000e+00,0.000000e+00,1.491150e-02,2.697237e-03,-1.284174e-04,1.155698e-02,-2.177305e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-4.806635e-03,-1.436841e-01,-6.083829e-03,1.491150e-02,-5.155659e-03,0.000000e+00,1.813699e-01,-1.440304e-07,4.839288e-02,...,1.115157e-01,-1.961215e-04,-4.948899e-03,0.000000e+00,0.000000e+00,,-1.175039e-01,-1.781407e-04,5.879496e-02,-4.228458e-07
96,0.000000e+00,-5.251169e-03,0.000000e+00,-7.814273e-03,2.697237e-03,-5.670570e-03,0.000000e+00,0.000000e+00,-1.566444e-07,0.000000e+00,...,0.000000e+00,-2.196406e-04,-5.421446e-03,0.000000e+00,0.000000e+00,-1.175039e-01,,-1.973132e-04,-1.738614e-07,-5.537600e-07
97,-1.593707e-07,-2.517449e-07,-2.533768e-04,-2.592910e-07,-1.284174e-04,-1.333822e-07,-1.621391e-07,-2.177872e-04,-2.418887e-07,-2.124765e-04,...,-2.149295e-04,-1.607614e-07,-1.289575e-07,-1.370822e-07,-1.523890e-07,-1.781407e-04,-1.973132e-04,,-1.338147e-04,-5.419735e-07
98,-1.457124e-07,-1.340892e-02,-2.172836e-07,-3.461112e-03,1.155698e-02,-1.415575e-02,-1.480232e-07,-2.039608e-07,-2.258249e-07,-1.916757e-07,...,-2.040839e-07,-1.436641e-04,-1.371633e-02,-1.268545e-07,-1.398541e-07,5.879496e-02,-1.738614e-07,-1.338147e-04,,-4.674676e-07


np.nanmean(norm_err)=np.float64(-0.001178122375979573)
    np.nanmean(np.abs(norm_err))=np.float64(0.008399473999735979)
    np.nanmedian(norm_err)=np.float64(-1.824831920794567e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(3.792611061075128e-07)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 34357.01it/s]
100%|██████████| 100/100 [00:00<00:00, 477.35it/s]
5922it [00:00, 675239.05it/s]
100%|██████████| 100/100 [00:00<00:00, 368568.01it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33085.94it/s]
100%|██████████| 100/100 [00:00<00:00, 1021.42it/s]
5959it [00:00, 676259.03it/s]
100%|██████████| 100/100 [00:00<00:00, 408403.51it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.067091e-07,-1.096287e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.064500e-07,0.000000e+00,-1.036449e-07,0.000000e+00,0.000000e+00,-1.064830e-07
1,-1.067091e-07,,-2.135285e-07,-1.039621e-07,-1.038592e-07,-1.068165e-07,-1.067934e-07,-1.097243e-07,-1.036718e-07,-1.482170e-07,...,-1.066041e-07,-1.037982e-07,-1.099500e-07,-1.066255e-07,-2.479559e-07,-1.038303e-07,-2.076646e-07,-1.099112e-07,-1.068979e-07,-2.260022e-07
2,-1.096287e-07,-2.135285e-07,,-1.038263e-07,-1.037238e-07,-1.097421e-07,-1.129668e-07,-1.065137e-07,-1.035368e-07,-1.065998e-07,...,-1.127549e-07,-1.036629e-07,-1.067263e-07,-1.095405e-07,-2.130097e-07,-1.036949e-07,-2.073938e-07,-1.066898e-07,-1.098279e-07,-2.130758e-07
3,0.000000e+00,-1.039621e-07,-1.038263e-07,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.037161e-07,0.000000e+00,-1.130245e-07,0.000000e+00,0.000000e+00,-1.037474e-07
4,0.000000e+00,-1.038592e-07,-1.037238e-07,0.000000e+00,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036138e-07,0.000000e+00,-1.129030e-07,0.000000e+00,0.000000e+00,-1.036450e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-1.038303e-07,-1.036949e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.035850e-07,,-1.065629e-07,0.000000e+00,0.000000e+00,-1.036162e-07
96,-1.036449e-07,-2.076646e-07,-2.073938e-07,-1.130245e-07,-1.129030e-07,-1.037463e-07,-1.037245e-07,-1.035953e-07,-1.094485e-07,-1.036768e-07,...,-1.035458e-07,-1.065290e-07,-1.037964e-07,-1.035661e-07,-2.071739e-07,-1.065629e-07,,-1.037619e-07,-1.038230e-07,-2.072364e-07
97,0.000000e+00,-1.099112e-07,-1.066898e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.096364e-07,0.000000e+00,-1.037619e-07,,0.000000e+00,-1.096714e-07
98,0.000000e+00,-1.068979e-07,-1.098279e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.066378e-07,0.000000e+00,-1.038230e-07,0.000000e+00,,-1.066709e-07


np.nanmean(norm_err)=np.float64(-8.12189324519958e-08)
    np.nanmean(np.abs(norm_err))=np.float64(8.12189324519958e-08)
    np.nanmedian(norm_err)=np.float64(-1.0363489550683863e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0363489550683863e-07)
    
