In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record run provenance in the cell output: last-updated timestamp,
# interpreter/compiler/machine details, and versions of the imported packages.
%watermark -diwmuv -iv


Last updated: 2025-09-28T00:23:26.817635+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
pandas                            : 2.2.3
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Dated identifier for this notebook's outputs.
# NOTE(review): presumably the teeplot output subdirectory, but no visible cell
# reads this name -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression so the notebook echoes the value


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_table(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the taxon-by-taxon distance table for a reconstructed tree.

    The tree dataframe is converted to dendropy through
    `alifedata_phyloinformatics_convert`, and dendropy's phylogenetic distance
    matrix is exported as a table. Rows and columns are both sorted by label
    so tables built from different reconstructions line up elementwise.
    """
    return (
        pd.DataFrame(
            apc.RosettaTree(tree_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            # NOTE(review): `_data` is a private attribute of the dendropy
            # data table; verify it still exists when upgrading dendropy.
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> np.ndarray:
    """Compare trie and searchtable tree reconstructions on sampled genomes.

    Samples 100 rows from `raw_genome_df`, deserializes them into hstrat
    surfaces, shuffles each surface's storage, reconstructs a tree with both
    `hstrat.build_tree_trie` ("naive") and `hstrat.build_tree_searchtable`
    ("shortcut"), and returns the elementwise normalized difference between
    the two trees' pairwise distance matrices.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns read below: `data_hex`,
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`. Each `dstream_*`
        column must hold a single unique value within the sample. The caller's
        frame is not modified; `.sample` returns a new frame.
    dstream_algo : str
        Python expression evaluated with only the name `dstream` bound, e.g.
        `"dstream.tilted_algo"`.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / ((naive + shortcut) / 2)`, with
        rows and columns in sorted taxon-label order. Entries where both
        distances are zero (the diagonal) are NaN.

    Raises
    ------
    ValueError
        If `raw_genome_df` has fewer than 100 rows, or if a `dstream_*`
        column holds more than one distinct value in the sample.

    Notes
    -----
    Sampling and shuffling are unseeded, so results differ from run to run.
    """
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    # SECURITY NOTE(review): `eval` executes an arbitrary expression. Globals
    # are limited to `dstream`, which is not a sandbox. Only pass trusted
    # algorithm names.
    surface_kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
    )
    # `.unique().item()` asserts each setting is constant across the sample:
    # it raises unless exactly one distinct value is present.
    for column in (
        "dstream_S",
        "dstream_storage_bitoffset",
        "dstream_storage_bitwidth",
        "dstream_T_bitoffset",
        "dstream_T_bitwidth",
    ):
        surface_kwargs[column] = raw_genome_df[column].unique().item()

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **surface_kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        # Scramble each surface's underlying storage in place
        # (original note: "ensure synthetic data").
        np.random.shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive = _pairwise_distance_table(naive_df).values
    shortcut = _pairwise_distance_table(shortcut_df).values

    # Zero-over-zero entries (e.g. the diagonal) are expected and left as NaN
    # for the nan-aware summaries callers compute. Silence only that
    # "invalid value" RuntimeWarning; a nonzero/0 division still warns.
    with np.errstate(invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the identical consistency report once per dstream algorithm instead of
# repeating the cell body for each one.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # Signed and absolute normalized error, summarized with NaN-aware
    # reductions because the matrix holds NaN entries (see displayed table).
    # The f-string's continuation lines are part of the printed text, so their
    # indentation is intentionally left as-is.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10291.00it/s]
100%|██████████| 100/100 [00:00<00:00, 383.73it/s]
6135it [00:00, 591786.37it/s]
100%|██████████| 100/100 [00:00<00:00, 168108.38it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.000000,0.000000,-0.161777,0.000000,0.0,-0.465122,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,-0.064964
1,0.000000,,0.000000,0.000000,0.000000,0.000000,0.0,0.155872,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000
2,0.000000,0.0,,0.000000,-0.038830,0.000000,0.0,-0.184758,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,-0.149135,0.298196,0.0,-0.144968,-0.316242,-0.028600
3,0.000000,0.0,0.000000,,-0.059474,0.000000,0.0,-0.254948,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,-0.223687,0.447209,0.0,-0.214441,-0.489072,-0.038423
4,-0.161777,0.0,-0.038830,-0.059474,,0.000000,0.0,-0.227377,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,-0.073848,-0.064318,0.0,-0.069870,-0.071348,-0.157665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.0,0.298196,0.447209,-0.064318,0.000000,0.0,-0.269448,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,,0.0,0.000000,0.000000,-0.040388
96,0.000000,0.0,0.000000,0.000000,0.000000,0.087706,0.0,0.056842,0.0,0.159513,...,-0.547109,0.0,0.134401,0.0,0.000000,0.000000,,0.000000,0.000000,0.000000
97,0.000000,0.0,-0.144968,-0.214441,-0.069870,0.000000,0.0,-0.285277,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,,0.000000,-0.042509
98,0.000000,0.0,-0.316242,-0.489072,-0.071348,0.000000,0.0,-0.289357,0.0,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,,-0.043052


np.nanmean(norm_err)=np.float64(0.018992680474001024)
    np.nanmean(np.abs(norm_err))=np.float64(0.10146223608398992)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32868.15it/s]
100%|██████████| 100/100 [00:00<00:00, 463.12it/s]
5964it [00:00, 573616.20it/s]
100%|██████████| 100/100 [00:00<00:00, 204900.05it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.581334e-07,-4.141501e-07,-4.295709e-07,-3.457147e-07,-3.636360e-07,-3.646480e-07,-1.913857e-07,-1.665795e-07,-0.001906,...,-1.028518e-06,-3.512365e-07,-2.242598e-07,-2.307740e-07,-3.474149e-07,-4.813898e-07,-0.039628,-1.484917e-07,-6.060343e-07,-1.929203e-07
1,-2.581334e-07,,-1.855759e-07,-1.917445e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.001686,...,-3.993597e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-2.121295e-07,-0.036446,0.000000e+00,-2.505840e-07,0.000000e+00
2,-4.141501e-07,-1.855759e-07,,-3.461850e-07,-2.491276e-07,-2.583011e-07,-2.588113e-07,-1.575672e-07,-1.403590e-07,-0.001512,...,-6.523172e-07,-2.519823e-07,-1.791934e-07,-1.790265e-07,-2.629335e-07,-3.744182e-07,-0.016178,-1.272940e-07,-4.043771e-07,-1.586059e-07
3,-4.295709e-07,-1.917445e-07,-3.461850e-07,,-1.268117e-01,-3.267794e-02,-2.758678e-07,-1.650587e-07,-1.462728e-07,0.017726,...,-7.220855e-07,2.260993e-02,-1.872074e-07,-1.847607e-07,-2.613359e-07,-3.869772e-07,0.072662,-1.530399e-02,-4.190657e-07,1.374584e-02
4,-3.457147e-07,0.000000e+00,-2.491276e-07,-1.268117e-01,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.002437,...,-9.935375e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-2.936847e-07,0.001464,0.000000e+00,-3.323065e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-4.813898e-07,-2.121295e-07,-3.744182e-07,-3.869772e-07,-2.936847e-07,-3.065175e-07,-3.072362e-07,-1.742918e-07,-1.534780e-07,-0.001703,...,-8.140011e-07,-2.976599e-07,-2.011438e-07,-2.036148e-07,-2.949107e-07,,-0.027096,-1.379912e-07,-4.682361e-07,-1.755636e-07
96,-3.962760e-02,-3.644573e-02,-1.617827e-02,7.266217e-02,1.464417e-03,1.502716e-03,-1.047219e-02,-1.032640e-03,-9.404136e-04,0.011772,...,-4.301598e-07,1.636323e-02,-7.972693e-03,-1.127401e-01,-2.176914e-02,-2.709594e-02,,8.681341e-04,-3.891881e-02,1.148888e-02
97,-1.484917e-07,0.000000e+00,-1.272940e-07,-1.530399e-02,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.001137,...,-2.062566e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.379912e-07,0.000868,,-1.459620e-07,0.000000e+00
98,-6.060343e-07,-2.505840e-07,-4.043771e-07,-4.190657e-07,-3.323065e-07,-3.488314e-07,-3.497626e-07,-1.872041e-07,-1.634027e-07,-0.001856,...,-9.702817e-07,-3.374051e-07,-2.185398e-07,-2.247213e-07,-3.338771e-07,-4.682361e-07,-0.038919,-1.459620e-07,,-1.886721e-07


np.nanmean(norm_err)=np.float64(-0.0001398790826301162)
    np.nanmean(np.abs(norm_err))=np.float64(0.009700615774699075)
    np.nanmedian(norm_err)=np.float64(-2.3421845057133363e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(6.10112571994655e-07)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the identical consistency report once per dstream algorithm instead of
# repeating the cell body for each one.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # Signed and absolute normalized error, summarized with NaN-aware
    # reductions because the matrix holds NaN entries (see displayed table).
    # The f-string's continuation lines are part of the printed text, so their
    # indentation is intentionally left as-is.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35741.83it/s]
100%|██████████| 100/100 [00:00<00:00, 472.99it/s]
5961it [00:00, 630637.29it/s]
100%|██████████| 100/100 [00:00<00:00, 386215.84it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 29135.20it/s]
100%|██████████| 100/100 [00:00<00:00, 1011.30it/s]
5959it [00:00, 579500.52it/s]
100%|██████████| 100/100 [00:00<00:00, 341000.33it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.071125e-07,-1.065337e-07,-2.074907e-07,-2.126857e-07,-2.068730e-07,-1.238269e-07,-1.065876e-07,-2.072789e-07,-1.065234e-07,...,-1.034785e-07,-1.065058e-07,-1.094402e-07,-2.074352e-07,-1.034838e-07,-1.128555e-07,-1.034520e-07,-1.162957e-07,-2.132847e-07,-1.065619e-07
1,-2.071125e-07,,-1.036411e-07,-2.132893e-07,-2.069211e-07,-2.187329e-07,-1.036432e-07,-1.036922e-07,-2.130655e-07,-1.036315e-07,...,-1.063627e-07,-1.036147e-07,-1.035033e-07,-2.193615e-07,-1.126505e-07,-1.036576e-07,-1.093838e-07,-1.036573e-07,-2.074880e-07,-1.036679e-07
2,-1.065337e-07,-1.036411e-07,,-1.038306e-07,-1.161548e-07,-1.035212e-07,0.000000e+00,0.000000e+00,-1.037245e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.038028e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.098046e-07,0.000000e+00
3,-2.074907e-07,-2.132893e-07,-1.038306e-07,,-2.072986e-07,-2.130353e-07,-1.038326e-07,-1.038818e-07,-2.196103e-07,-1.038209e-07,...,-1.128680e-07,-1.038041e-07,-1.036922e-07,-2.136316e-07,-1.065678e-07,-1.038471e-07,-1.065341e-07,-1.038467e-07,-2.078676e-07,-1.038574e-07
4,-2.126857e-07,-2.069211e-07,-1.161548e-07,-2.072986e-07,,-2.066821e-07,-1.064346e-07,-1.127828e-07,-2.070872e-07,-1.127110e-07,...,-1.033829e-07,-1.197674e-07,-1.062870e-07,-2.072432e-07,-1.033882e-07,-1.064497e-07,-1.033565e-07,-1.064494e-07,-2.192039e-07,-1.095170e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.128555e-07,-1.036576e-07,0.000000e+00,-1.038471e-07,-1.064497e-07,-1.035376e-07,0.000000e+00,0.000000e+00,-1.037410e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.038193e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-1.067498e-07,0.000000e+00
96,-1.034520e-07,-1.093838e-07,0.000000e+00,-1.065341e-07,-1.033565e-07,-1.158881e-07,0.000000e+00,0.000000e+00,-1.064224e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.162411e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.036394e-07,0.000000e+00
97,-1.162957e-07,-1.036573e-07,0.000000e+00,-1.038467e-07,-1.064494e-07,-1.035373e-07,0.000000e+00,0.000000e+00,-1.037407e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.038190e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.067495e-07,0.000000e+00
98,-2.132847e-07,-2.074880e-07,-1.098046e-07,-2.078676e-07,-2.192039e-07,-2.072477e-07,-1.067346e-07,-1.098620e-07,-2.076551e-07,-1.097938e-07,...,-1.036660e-07,-1.097750e-07,-1.065862e-07,-2.078119e-07,-1.036713e-07,-1.067498e-07,-1.036394e-07,-1.067495e-07,,-1.130908e-07


np.nanmean(norm_err)=np.float64(-9.810754856048882e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.810754856048882e-08)
    np.nanmedian(norm_err)=np.float64(-1.0380137772460093e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0380137772460093e-07)
    
