In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record execution provenance: ISO-8601 "last updated" timestamp (-d -i -u),
# machine details (-m), Python/IPython versions (-v), the watermark package's
# own version (-w), and the versions of all imported packages (-iv).
%watermark -diwmuv -iv


Last updated: 2025-09-14T00:22:31.013236+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

numpy                             : 2.1.2
hstrat                            : 1.20.10
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3

Watermark: 2.4.3



In [4]:
# Label for this analysis; presumably the output subdirectory for teeplot
# figures -- TODO confirm, since teeplot is neither imported nor called in
# the cells shown here.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
# Bare expression so the value is echoed as the cell output.
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
) -> np.ndarray:
    """Compare pairwise distances from two reconstructions of the same taxa.

    Randomly samples `sample_size` genomes, deserializes each as an hstrat
    surface, shuffles every surface's storage in place, and then reconstructs
    a tree twice: once with `hstrat.build_tree_trie` ("naive") and once with
    `hstrat.build_tree_searchtable` ("shortcut"). The pairwise phylogenetic
    distances of the two trees are compared entry by entry.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns "data_hex", "dstream_S",
        "dstream_storage_bitoffset", "dstream_storage_bitwidth",
        "dstream_T_bitoffset" and "dstream_T_bitwidth". Each `dstream_*`
        column must hold exactly one distinct value among the sampled rows.
        The input frame itself is not modified.
    dstream_algo : str
        Python expression naming the dstream algorithm, for example
        "dstream.tilted_algo". It is evaluated with `dstream` supplied as a
        global.
    sample_size : int, default 100
        Number of genomes to sample; the default is a size that the
        dendropy-based naive comparison can handle.

    Returns
    -------
    np.ndarray
        Square array whose entry (i, j) is
        `(naive - shortcut) / ((naive + shortcut) / 2)` for the i-th and j-th
        taxon labels in sorted order (labels are strings, so the sort is
        lexicographic). Entries where both distances are zero, such as the
        diagonal, are NaN.

    Raises
    ------
    ValueError
        If `raw_genome_df` has fewer than `sample_size` rows, if a
        `dstream_*` column holds more than one distinct value in the sample,
        or if the two reconstructions do not yield identical taxon labels.

    Notes
    -----
    Sampling and shuffling both draw from NumPy's global random state, so
    results differ between calls unless the caller seeds it beforehand.
    """
    # `sample` returns a new frame, so adding a column below leaves the
    # caller's frame untouched.
    sampled_df = raw_genome_df.sample(sample_size)
    sampled_df["taxon_label"] = np.arange(len(sampled_df)).astype(str)

    # `.unique().item()` doubles as validation: it raises unless the sampled
    # rows agree on a single value for the parameter.
    surface_kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(security): `eval` executes arbitrary Python; only pass trusted,
    # hard-coded algorithm names here.
    surface_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        # Scramble each surface's underlying storage in place so that the
        # reconstructions run on synthetic data.
        np.random.shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The arithmetic below works on raw `.values`, which ignores labels, so
    # confirm that both tables list the same taxa in the same order first.
    labels_match = naive_dist.index.equals(
        shortcut_dist.index
    ) and naive_dist.columns.equals(shortcut_dist.columns)
    if not labels_match:
        raise ValueError(
            "naive and shortcut distance matrices have mismatched taxon labels"
        )

    mean_dist = naive_dist.values / 2 + shortcut_dist.values / 2
    # 0/0 is expected wherever both distances are zero (e.g. the diagonal);
    # keep the resulting NaN but suppress the RuntimeWarning it would emit.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist.values - shortcut_dist.values) / mean_dist


def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return a tree's pairwise distance table, sorted by label on both axes."""
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # `_data` is a private attribute of the data table; it is used here as the
    # nested label -> label -> distance mapping that `pd.DataFrame` ingests.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


In [6]:
# Load the dataset once, then run the identical consistency check under each
# dstream algorithm in a loop rather than repeating the code per algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # Summary statistics skip NaN entries (the matrix diagonal is NaN).
    # The leading spaces on the continuation lines are part of the printed
    # text and are kept so the output format stays unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10415.71it/s]
100%|██████████| 100/100 [00:00<00:00, 386.70it/s]
6128it [00:00, 599004.75it/s]
100%|██████████| 100/100 [00:00<00:00, 177950.95it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.078681,-0.034810,-0.041705,-0.093335,-0.152586,0.303674,0.311172,0.000000,-0.054110,...,-0.452577,-0.135619,-0.062341,-0.038385,-0.038265,-0.048347,-0.069532,0.0,0.276417,-0.073567
1,-0.078681,,0.062470,0.112070,0.000000,-0.030021,-0.067172,-0.068080,-0.094477,0.141012,...,0.085185,-0.385960,0.159273,0.104014,0.067974,0.083418,0.461227,0.0,-0.063698,0.118423
2,-0.034810,0.062470,,-0.035000,0.071364,0.049487,-0.030227,-0.030594,-0.040856,-0.043334,...,0.073321,0.045773,-0.048456,-0.032632,0.000000,0.000000,-0.108242,0.0,-0.028812,0.174533
3,-0.041705,0.112070,-0.035000,,0.131699,0.064682,-0.035293,-0.035795,-0.050692,0.226571,...,0.935724,0.058480,0.000000,-0.121992,-0.038493,-0.048707,0.000000,0.0,-0.033380,0.000000
4,-0.093335,0.000000,0.071364,0.131699,,-0.039477,-0.077569,-0.078782,-0.116426,0.173562,...,0.102626,-0.519215,0.202078,0.120713,0.078637,0.100071,0.625719,0.0,-0.072973,0.155053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.048347,0.083418,0.000000,-0.048707,0.100071,0.082185,-0.039936,-0.040580,-0.060854,-0.066506,...,0.103962,0.072427,-0.079385,-0.044240,0.000000,,-0.191615,0.0,-0.037503,0.327895
96,-0.069532,0.461227,-0.108242,0.000000,0.625719,0.170398,-0.053368,-0.054523,-0.098709,0.000000,...,0.241167,0.133190,0.000000,0.000000,-0.125905,-0.191615,,0.0,-0.049110,0.000000
97,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000
98,0.276417,-0.063698,-0.028812,-0.033380,-0.072973,-0.104786,0.000000,0.000000,0.361280,-0.040880,...,-0.289804,-0.096496,-0.045411,-0.031218,-0.031139,-0.037503,-0.049110,0.0,,-0.051089


np.nanmean(norm_err)=np.float64(-0.014658018280737069)
    np.nanmean(np.abs(norm_err))=np.float64(0.1310930820185011)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.07351631011856796)
    


100%|██████████| 100/100 [00:00<00:00, 32683.74it/s]
100%|██████████| 100/100 [00:00<00:00, 393.54it/s]
5981it [00:00, 555027.48it/s]
100%|██████████| 100/100 [00:00<00:00, 265462.28it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,7.257014e-04,-2.170553e-07,1.128931e-03,0.000000e+00,-1.202982e-02,0.007363,-2.663544e-07,-1.108338e-02,0.000000e+00,...,-4.129767e-03,0.009260,-1.813657e-07,0.008307,0.000000e+00,0.000000e+00,0.000000e+00,-0.009131,0.002737,-0.012052
1,7.257014e-04,,1.079535e-03,-1.577697e-07,1.575748e-03,1.675231e-03,0.000695,-9.909085e-03,1.558722e-03,2.633365e-03,...,6.799487e-04,0.000848,9.265770e-04,0.000772,1.188046e-03,1.270082e-03,6.995541e-04,0.000706,0.000721,0.000896
2,-2.170553e-07,1.079535e-03,,2.304214e-03,-1.886732e-07,-1.212055e-02,0.038088,-2.043913e-06,-1.067703e-02,-6.074395e-07,...,6.538531e-02,0.118362,-7.218635e-07,0.046390,-8.465447e-07,-9.850523e-07,-2.086596e-07,-0.007575,0.004351,-0.012214
3,1.128931e-03,-1.577697e-07,2.304214e-03,,2.360431e-03,2.590929e-03,0.001056,-2.724687e-02,2.322428e-03,5.925693e-03,...,1.022024e-03,0.001455,1.703783e-03,0.001245,2.861660e-03,3.389012e-03,1.066886e-03,0.001081,0.001118,0.001602
4,0.000000e+00,1.575748e-03,-1.886732e-07,2.360431e-03,,1.108876e-02,-0.000970,-2.277851e-07,1.027963e-02,0.000000e+00,...,-1.156836e-07,-0.078208,-1.602683e-07,-0.025213,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,-0.136738,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,1.270082e-03,-9.850523e-07,3.389012e-03,0.000000e+00,-5.255697e-03,0.004902,-7.962639e-06,-4.514615e-03,0.000000e+00,...,-2.293963e-07,0.007931,-5.116287e-07,0.006220,0.000000e+00,,0.000000e+00,0.007136,0.014950,0.013216
96,0.000000e+00,6.995541e-04,-2.086596e-07,1.066886e-03,0.000000e+00,-7.329667e-03,0.065533,-2.456494e-07,-6.775686e-03,0.000000e+00,...,1.598013e-02,0.029241,-1.705631e-07,0.054570,1.328684e-01,0.000000e+00,,-0.004632,0.002626,-0.006033
97,-9.130588e-03,7.056006e-04,-7.574891e-03,1.081016e-03,0.000000e+00,-2.885122e-02,0.002545,-2.502765e-07,-1.470706e-02,0.000000e+00,...,-1.252143e-02,0.003174,-4.090420e-02,0.002859,-8.478689e-03,7.136059e-03,-4.631700e-03,,0.006375,-0.005583
98,2.736801e-03,7.210931e-04,4.350594e-03,1.117916e-03,-1.367384e-01,-1.317785e-07,0.001564,-5.253322e-07,-1.215341e-07,-4.301123e-02,...,2.543523e-03,-0.082524,3.626793e-03,-0.024410,4.889867e-03,1.495027e-02,2.625758e-03,0.006375,,0.008386


np.nanmean(norm_err)=np.float64(0.0003990899816046996)
    np.nanmean(np.abs(norm_err))=np.float64(0.013189405783855846)
    np.nanmedian(norm_err)=np.float64(0.0008043706702327111)
    np.nanmedian(np.abs(norm_err))=np.float64(0.002222385043612779)
    


In [7]:
# Load the dataset once, then run the identical consistency check under each
# dstream algorithm in a loop rather than repeating the code per algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # Summary statistics skip NaN entries (the matrix diagonal is NaN).
    # The leading spaces on the continuation lines are part of the printed
    # text and are kept so the output format stays unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 36730.92it/s]
100%|██████████| 100/100 [00:00<00:00, 461.82it/s]
5964it [00:00, 654649.95it/s]
100%|██████████| 100/100 [00:00<00:00, 388361.48it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 35380.04it/s]
100%|██████████| 100/100 [00:00<00:00, 1009.97it/s]
5934it [00:00, 653941.14it/s]
100%|██████████| 100/100 [00:00<00:00, 432402.47it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.132900e-07,-2.131691e-07,-2.126781e-07,-1.126284e-07,-2.253185e-07,-2.071415e-07,-1.035606e-07,-1.035812e-07,-2.130720e-07,...,-2.190106e-07,-1.095892e-07,-2.126493e-07,-1.064266e-07,-2.073944e-07,-2.072196e-07,-1.037443e-07,-2.194920e-07,-2.075458e-07,-1.064043e-07
1,-2.132900e-07,,-2.262159e-07,-2.256631e-07,-1.065392e-07,-2.131336e-07,-2.075031e-07,-1.037414e-07,-1.037621e-07,-2.195985e-07,...,-2.132811e-07,-1.067201e-07,-2.475980e-07,-1.096831e-07,-2.077569e-07,-2.075815e-07,-1.039257e-07,-2.137376e-07,-2.079088e-07,-1.163486e-07
2,-2.131691e-07,-2.262159e-07,,-2.474740e-07,-1.064788e-07,-2.130128e-07,-2.073887e-07,-1.036841e-07,-1.037048e-07,-2.194703e-07,...,-2.131602e-07,-1.066596e-07,-2.254954e-07,-1.096191e-07,-2.076422e-07,-2.074669e-07,-1.038683e-07,-2.136161e-07,-2.077939e-07,-1.128372e-07
3,-2.126781e-07,-2.256631e-07,-2.474740e-07,,-1.062338e-07,-2.125226e-07,-2.069239e-07,-1.034518e-07,-1.034724e-07,-2.189500e-07,...,-2.126693e-07,-1.064137e-07,-2.249461e-07,-1.093595e-07,-2.071763e-07,-2.070019e-07,-1.036351e-07,-2.131231e-07,-2.073273e-07,-1.125622e-07
4,-1.126284e-07,-1.065392e-07,-1.064788e-07,-1.062338e-07,,-1.195979e-07,-1.034709e-07,0.000000e+00,0.000000e+00,-1.064304e-07,...,-1.093937e-07,0.000000e+00,-1.062195e-07,0.000000e+00,-1.035972e-07,-1.035099e-07,0.000000e+00,-1.096339e-07,-1.036727e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.072196e-07,-2.075815e-07,-2.074669e-07,-2.070019e-07,-1.035099e-07,-2.070719e-07,-2.130135e-07,-1.095545e-07,-1.128182e-07,-2.073750e-07,...,-2.072112e-07,-1.036807e-07,-2.069746e-07,-1.035839e-07,-2.132809e-07,,-1.066902e-07,-2.076420e-07,-2.195841e-07,-1.035627e-07
96,-1.037443e-07,-1.039257e-07,-1.038683e-07,-1.036351e-07,0.000000e+00,-1.036703e-07,-1.200770e-07,0.000000e+00,0.000000e+00,-1.038222e-07,...,-1.037401e-07,0.000000e+00,-1.036215e-07,0.000000e+00,-1.098581e-07,-1.066902e-07,,-1.039560e-07,-1.068632e-07,0.000000e+00
97,-2.194920e-07,-2.137376e-07,-2.136161e-07,-2.131231e-07,-1.096339e-07,-2.193263e-07,-2.075636e-07,-1.037716e-07,-1.037923e-07,-2.135187e-07,...,-2.328817e-07,-1.130812e-07,-2.130942e-07,-1.066495e-07,-2.078176e-07,-2.076420e-07,-1.039560e-07,,-2.079695e-07,-1.066271e-07
98,-2.075458e-07,-2.079088e-07,-2.077939e-07,-2.073273e-07,-1.036727e-07,-2.073976e-07,-2.133582e-07,-1.281724e-07,-1.097600e-07,-2.077016e-07,...,-2.075374e-07,-1.038440e-07,-2.073000e-07,-1.037469e-07,-2.136265e-07,-2.195841e-07,-1.068632e-07,-2.079695e-07,,-1.037257e-07


np.nanmean(norm_err)=np.float64(-1.1750298170548979e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1750298170548979e-07)
    np.nanmedian(norm_err)=np.float64(-1.0654198129658507e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0654198129658507e-07)
    
