In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record run provenance for this notebook: timestamp, interpreter and machine
# details, and the versions of the imported packages (see the cell output).
%watermark -diwmuv -iv


Last updated: 2025-08-24T00:23:13.034147+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3
numpy                             : 2.1.2
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Name of the output subdirectory associated with this notebook.
# NOTE(review): presumably consumed by teeplot when saving figures; no such
# call is visible in this section -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree) -> pd.DataFrame:
    """Tabulate pairwise phylogenetic distances for a reconstructed tree.

    The tree is converted to dendropy through `apc.RosettaTree`, and the
    resulting distance table is sorted by label along both axes so tables
    built from different reconstructions can be compared position by position.
    """
    distance_table = (
        apc.RosettaTree(tree)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the distance table and
    # may change between dendropy releases; presumably a label -> label ->
    # distance mapping -- confirm if upgrading dendropy.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_sample: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled so the data is synthetic, and two trees are built
    from the same population: one with `hstrat.build_tree_trie` and one with
    `hstrat.build_tree_searchtable`. The pairwise tip distances of the two
    trees are then differenced and normalized by their mean.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`; each `dstream_*`
        column must hold a single distinct value within the sampled rows.
    dstream_algo : str
        Expression naming the dstream algorithm, e.g. `"dstream.tilted_algo"`.
        It is evaluated with only `dstream` in scope.
    n_sample : int, default 100
        Number of genomes to draw; kept small so the dendropy/naive path
        stays tractable.
    seed : int, optional
        Seed for both the row sampling and the storage shuffle. When omitted,
        NumPy's global random state is used, as before.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / ((naive + shortcut) / 2)`.
        Entries where both distances are zero (e.g. the diagonal) are NaN.
        Rows and columns follow the sorted order of the distance tables'
        labels (presumably the string taxon labels, so lexicographic rather
        than numeric order -- confirm before indexing by taxon).

    Raises
    ------
    ValueError
        If `n_sample` exceeds the number of rows (raised by pandas), if a
        `dstream_*` column is not single-valued (raised by NumPy's `.item()`),
        or if the two reconstructions disagree on their taxon labels.
    """
    # A dedicated generator keeps seeded runs reproducible; `None` falls back
    # to NumPy's global state for backward compatibility.
    rng = None if seed is None else np.random.RandomState(seed)

    sampled_df = raw_genome_df.sample(n_sample, random_state=rng)
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    def sole_value(column: str):
        # `.item()` raises unless the column holds exactly one distinct value.
        return sampled_df[column].unique().item()

    kwargs = dict(
        # SECURITY: `eval` executes arbitrary expressions; only pass trusted,
        # hard-coded algorithm names here, never user-supplied strings.
        dstream_algo=eval(dstream_algo, {"dstream": dstream}),
        dstream_S=sole_value("dstream_S"),
        dstream_storage_bitoffset=sole_value("dstream_storage_bitoffset"),
        dstream_storage_bitwidth=sole_value("dstream_storage_bitwidth"),
        dstream_T_bitoffset=sole_value("dstream_T_bitoffset"),
        dstream_T_bitwidth=sole_value("dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]

    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_tree = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_tree = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_tree)
    shortcut_dist = _pairwise_distance_frame(shortcut_tree)

    # The matrices are differenced positionally below, so their labels must
    # line up exactly; otherwise the result would be silently meaningless.
    labels_agree = naive_dist.index.equals(
        shortcut_dist.index
    ) and naive_dist.columns.equals(shortcut_dist.columns)
    if not labels_agree:
        raise ValueError(
            "naive and shortcut reconstructions disagree on taxon labels",
        )

    naive_values = naive_dist.values
    shortcut_values = shortcut_dist.values
    # 0 / 0 on the diagonal is expected and yields NaN; silence the warning
    # locally rather than emitting a RuntimeWarning on every call.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_values - shortcut_values) / (
            naive_values / 2 + shortcut_values / 2
        )


In [6]:
# Genome records labelled "sample" by the original inline note, fetched from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")

# Run the same consistency report once per dstream algorithm: show the
# normalized error matrix, then its signed/absolute mean and median
# (NaN entries are ignored by the nan-aware reductions).
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10439.82it/s]
100%|██████████| 100/100 [00:00<00:00, 387.75it/s]
6130it [00:00, 636554.77it/s]
100%|██████████| 100/100 [00:00<00:00, 246144.60it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.204108,0.608263,0.192651,0.215774,0.207374,0.271901,0.166352,-0.244818,0.128257,...,0.000000,0.424172,0.027991,-0.359068,0.242814,0.244605,0.440405,0.268117,-0.172982,0.024388
1,0.204108,,0.000000,0.292553,0.349407,0.327902,-0.148894,-0.108638,0.027373,-0.078275,...,0.205453,0.000000,0.000000,0.022445,0.426271,0.620373,0.000000,-0.510907,0.163303,0.000000
2,0.608263,0.000000,,0.000000,0.000000,0.000000,0.000000,-1.212215,0.052927,-0.582218,...,0.614804,0.000000,0.000000,0.037155,0.000000,0.000000,0.000000,0.000000,0.255056,0.000000
3,0.192651,0.292553,0.000000,,0.000000,0.000000,0.000000,-0.099217,0.024980,-0.073263,...,0.193849,0.000000,0.000000,0.020811,0.000000,0.000000,0.000000,0.000000,0.152424,0.000000
4,0.215774,0.349407,0.000000,0.000000,,0.000000,0.000000,-0.118902,0.029985,-0.083467,...,0.217278,0.000000,0.000000,0.024172,0.000000,0.000000,0.000000,0.000000,0.174635,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.244605,0.620373,0.000000,0.000000,0.000000,0.000000,0.000000,-0.147674,0.037328,-0.096691,...,0.246539,0.000000,0.000000,0.028728,0.000000,,0.000000,0.000000,0.203798,0.000000
96,0.440405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.592949,0.030588,-0.387729,...,0.443824,0.000000,0.000000,0.024562,0.000000,0.000000,,0.000000,0.177178,0.000000
97,0.268117,-0.510907,0.000000,0.000000,0.000000,0.000000,0.000000,-0.175555,0.044477,-0.107913,...,0.270442,0.000000,0.000000,0.032783,0.000000,0.000000,0.000000,,0.228885,0.000000
98,-0.172982,0.163303,0.255056,0.152424,0.174635,0.166449,0.233039,0.220482,-0.360635,0.158202,...,-0.174434,0.267419,0.041828,-0.885692,0.201937,0.203798,0.177178,0.228885,,0.034264


np.nanmean(norm_err)=np.float64(0.05178132705549374)
    np.nanmean(np.abs(norm_err))=np.float64(0.12165681153878141)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.028550552286878167)
    


100%|██████████| 100/100 [00:00<00:00, 34030.86it/s]
100%|██████████| 100/100 [00:00<00:00, 385.45it/s]
5966it [00:00, 547266.59it/s]
100%|██████████| 100/100 [00:00<00:00, 170361.66it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-3.426647e-01,-2.685504e-04,0.000219,-8.747541e-02,1.421918e-02,-1.251861e-03,-3.048749e-04,0.000000e+00,-2.323887e-04,...,-7.375284e-04,-3.774144e-04,0.000000e+00,0.000234,-8.878543e-04,-1.164448e-07,-2.637825e-04,0.000000e+00,0.002265,2.302137e-02
1,-3.426647e-01,,1.001432e-03,0.001547,0.000000e+00,-1.037652e-06,0.000000e+00,1.285793e-03,0.000000e+00,7.762227e-04,...,-2.085116e-07,0.000000e+00,0.000000e+00,0.001735,-2.954040e-07,-1.978300e-07,0.000000e+00,0.000000e+00,0.000525,-1.000916e-03
2,-2.685504e-04,1.001432e-03,,0.022148,1.208991e-03,-3.104298e-07,5.442232e-02,-2.086283e-07,0.000000e+00,2.424285e-02,...,2.289321e-02,-2.965294e-03,0.000000e+00,0.014317,3.468254e-02,-1.367527e-07,-1.895853e-03,0.000000e+00,0.000324,-1.641854e-01
3,2.190283e-04,1.546507e-03,2.214801e-02,,1.787319e-03,4.410495e-04,3.501509e-02,2.523925e-02,-4.613902e-03,1.908411e-02,...,5.767421e-02,7.519419e-04,-4.566521e-02,-0.072826,2.447536e-02,-2.756188e-02,5.209442e-04,-3.609119e-03,-0.097530,3.529075e-04
4,-8.747541e-02,0.000000e+00,1.208991e-03,0.001787,,-3.823821e-06,0.000000e+00,1.649475e-03,0.000000e+00,8.953710e-04,...,-2.442778e-07,0.000000e+00,0.000000e+00,0.002044,-3.727173e-07,-2.297452e-07,0.000000e+00,0.000000e+00,0.000650,-1.523501e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.164448e-07,-1.978300e-07,-1.367527e-07,-0.027562,-2.297452e-07,-4.782464e-07,-2.175672e-07,-3.121522e-07,2.892192e-02,-1.176587e-07,...,-2.493681e-07,-1.955900e-07,-2.388063e-07,-0.029579,-3.025916e-07,,-1.342226e-07,-2.307347e-07,-0.033954,-3.797676e-07
96,-2.637825e-04,0.000000e+00,-1.895853e-03,0.000521,0.000000e+00,-2.976915e-07,0.000000e+00,-2.203840e-03,0.000000e+00,-6.105204e-02,...,-1.389765e-07,0.000000e+00,0.000000e+00,0.000563,-1.728681e-07,-1.342226e-07,,0.000000e+00,0.000317,-3.231639e-03
97,0.000000e+00,0.000000e+00,0.000000e+00,-0.003609,0.000000e+00,-3.831117e-06,0.000000e+00,-4.022192e-07,0.000000e+00,0.000000e+00,...,-2.439413e-07,0.000000e+00,0.000000e+00,-0.004129,-3.719346e-07,-2.307347e-07,0.000000e+00,,-0.005534,-7.432433e-07
98,2.265046e-03,5.248096e-04,3.242153e-04,-0.097530,6.498693e-04,5.868810e-03,6.002423e-04,3.827059e-04,-5.925884e-03,2.699746e-04,...,2.893700e-04,5.166133e-04,-6.636648e-02,-0.129311,3.678175e-04,-3.395436e-02,3.168216e-04,-5.533568e-03,,4.217155e-03


np.nanmean(norm_err)=np.float64(0.002514799507034193)
    np.nanmean(np.abs(norm_err))=np.float64(0.01238500095437321)
    np.nanmedian(norm_err)=np.float64(-2.0399410384828303e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.00044225958612532126)
    


In [7]:
# Genome records labelled "tail" by the original inline note, fetched from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")

# Run the same consistency report once per dstream algorithm: show the
# normalized error matrix, then its signed/absolute mean and median
# (NaN entries are ignored by the nan-aware reductions).
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35699.24it/s]
100%|██████████| 100/100 [00:00<00:00, 468.15it/s]
5935it [00:00, 637143.44it/s]
100%|██████████| 100/100 [00:00<00:00, 386571.80it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33180.16it/s]
100%|██████████| 100/100 [00:00<00:00, 1014.88it/s]
5940it [00:00, 639579.14it/s]
100%|██████████| 100/100 [00:00<00:00, 342671.90it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.034931e-07,-1.034896e-07,0.000000e+00,0.000000e+00,-1.037908e-07,0.000000e+00,-1.034712e-07,-1.066672e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.239070e-07,-1.097622e-07,-1.034855e-07,0.000000e+00,0.000000e+00,-1.064855e-07
1,-1.034931e-07,,-2.124786e-07,-1.036377e-07,-1.034461e-07,-2.257238e-07,-1.126943e-07,-2.318035e-07,-2.072779e-07,-1.034914e-07,...,-1.035683e-07,-1.036876e-07,-1.035203e-07,-1.063430e-07,-2.071418e-07,-2.073252e-07,-2.124700e-07,-1.095711e-07,-1.033546e-07,-2.069349e-07
2,-1.034896e-07,-2.124786e-07,,-1.036343e-07,-1.034427e-07,-2.131061e-07,-1.064036e-07,-2.124326e-07,-2.072710e-07,-1.034880e-07,...,-1.035648e-07,-1.036841e-07,-1.035169e-07,-1.160440e-07,-2.071349e-07,-2.073182e-07,-2.185488e-07,-1.065081e-07,-1.033511e-07,-2.069279e-07
3,0.000000e+00,-1.036377e-07,-1.036343e-07,,0.000000e+00,-1.039363e-07,0.000000e+00,-1.036158e-07,-1.283928e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.067486e-07,-1.068460e-07,-1.036301e-07,0.000000e+00,0.000000e+00,-1.281296e-07
4,0.000000e+00,-1.034461e-07,-1.034427e-07,0.000000e+00,,-1.037436e-07,0.000000e+00,-1.034243e-07,-1.066173e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.128491e-07,-1.097093e-07,-1.034386e-07,0.000000e+00,0.000000e+00,-1.064358e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.097622e-07,-2.073252e-07,-2.073182e-07,-1.068460e-07,-1.097093e-07,-2.079225e-07,-1.038190e-07,-2.072813e-07,-2.136946e-07,-1.066905e-07,...,-1.165595e-07,-1.167106e-07,-1.067212e-07,-1.037578e-07,-2.196994e-07,,-2.073099e-07,-1.039184e-07,-1.162889e-07,-2.133299e-07
96,-1.034855e-07,-2.124700e-07,-2.185488e-07,-1.036301e-07,-1.034386e-07,-2.130974e-07,-1.063993e-07,-2.124239e-07,-2.072627e-07,-1.034839e-07,...,-1.035607e-07,-1.036799e-07,-1.035128e-07,-1.093841e-07,-2.071266e-07,-2.073099e-07,,-1.065037e-07,-1.033470e-07,-2.069197e-07
97,0.000000e+00,-1.095711e-07,-1.065081e-07,0.000000e+00,0.000000e+00,-1.099049e-07,0.000000e+00,-1.095466e-07,-1.038947e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.038263e-07,-1.039184e-07,-1.065037e-07,,0.000000e+00,-1.037223e-07
98,0.000000e+00,-1.033546e-07,-1.033511e-07,0.000000e+00,0.000000e+00,-1.036514e-07,0.000000e+00,-1.033327e-07,-1.065201e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.095039e-07,-1.162889e-07,-1.033470e-07,0.000000e+00,,-1.063389e-07


np.nanmean(norm_err)=np.float64(-1.0033433251065463e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0033433251065463e-07)
    np.nanmedian(norm_err)=np.float64(-1.038459179967977e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.038459179967977e-07)
    
