In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-08-17T00:24:49.810919+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

hstrat                            : 1.20.10
numpy                             : 2.1.2
downstream                        : 1.14.3
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> pd.DataFrame:
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    # raw_genome_df = raw_genome_df[raw_genome_df["layer"] > 1700].sample(20).copy()
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
        dstream_S=raw_genome_df["dstream_S"].unique().item(),
        dstream_storage_bitoffset=raw_genome_df["dstream_storage_bitoffset"]
        .unique()
        .item(),
        dstream_storage_bitwidth=raw_genome_df["dstream_storage_bitwidth"]
        .unique()
        .item(),
        dstream_T_bitoffset=raw_genome_df["dstream_T_bitoffset"]
        .unique()
        .item(),
        dstream_T_bitwidth=raw_genome_df["dstream_T_bitwidth"].unique().item(),
    )

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = (
        pd.DataFrame(
            apc.RosettaTree(naive_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    shortcut_dist = (
        pd.DataFrame(
            apc.RosettaTree(shortcut_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    return (naive_dist.values - shortcut_dist.values) / (
        naive_dist.values / 2 + shortcut_dist.values / 2
    )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 10420.63it/s]
100%|██████████| 100/100 [00:00<00:00, 378.62it/s]
6131it [00:00, 632866.83it/s]
100%|██████████| 100/100 [00:00<00:00, 224294.33it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.000000,-0.395636,0.0,0.000000,0.000000,-0.303931,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,-0.583238,0.000000,0.000000,0.000000,-0.350311
1,0.000000,,0.000000,0.000000,-0.194143,0.0,0.000000,0.000000,-0.220293,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,-0.230530,0.000000,0.000000,0.000000,-0.243678
2,0.000000,0.000000,,0.000000,0.000000,0.0,0.000000,0.000000,0.587623,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.667342
3,0.000000,0.000000,0.000000,,0.000000,0.0,0.000000,0.000000,0.149638,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.162742
4,-0.395636,-0.194143,0.000000,0.000000,,0.0,0.167771,0.151171,-0.419248,0.148161,...,-0.187386,0.000000,0.156246,0.000000,0.000000,0.000000,0.000000,-0.314850,0.000000,-0.483144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.583238,-0.230530,0.000000,0.000000,0.000000,0.0,0.217133,0.190114,-0.475088,0.185377,...,-0.221065,0.000000,0.198211,0.000000,0.000000,,0.000000,-0.423172,0.000000,-0.558838
96,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,-0.073850,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,-0.085387
97,0.000000,0.000000,0.000000,0.000000,-0.314850,0.0,0.000000,0.000000,-0.277851,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,-0.423172,0.000000,,0.000000,-0.316112
98,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.246476,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0.267255


np.nanmean(norm_err)=np.float64(0.0017702470646567788)
    np.nanmean(np.abs(norm_err))=np.float64(0.06061884890596845)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32750.09it/s]
100%|██████████| 100/100 [00:00<00:00, 425.49it/s]
5968it [00:00, 582170.99it/s]
100%|██████████| 100/100 [00:00<00:00, 278506.24it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,3.581923e-03,-1.816341e-02,2.208951e-03,0.158849,0.000000,5.056360e-03,0.022734,-5.137129e-07,-1.481177e-02,...,2.277382e-03,-6.329008e-07,-2.214816e-02,-7.391308e-07,2.567515e-03,1.668504e-03,1.979297e-03,7.123811e-03,2.581007e-03,4.391260e-03
1,0.003582,,-4.147382e-07,-2.215346e-07,0.001732,0.002163,-3.860583e-07,0.002379,2.560102e-03,-1.773969e-07,...,1.735433e-03,2.897576e-03,-2.395542e-07,2.812413e-03,-2.343817e-07,-3.357170e-07,1.575250e-01,-8.855047e-07,-5.015544e-07,-7.063426e-07
2,-0.018163,-4.147382e-07,,-1.607407e-07,-0.002696,-0.003186,-2.400129e-07,-0.003412,-3.593538e-03,-4.120784e-02,...,-3.272721e-07,-3.910471e-03,-5.351125e-02,-1.551314e-02,-1.750813e-07,-2.701795e-07,-3.010208e-07,-5.400024e-07,-3.511954e-07,-4.537568e-07
3,0.002209,-2.215346e-07,-1.607407e-07,,0.001332,0.001573,0.000000e+00,0.001684,1.772606e-03,0.000000e+00,...,1.338800e-03,1.928070e-03,0.000000e+00,1.889993e-03,0.000000e+00,-1.359383e-07,-1.206470e-01,-2.734123e-07,-2.414700e-07,-2.363860e-07
4,0.158849,1.732062e-03,-2.696485e-03,1.331815e-03,,-0.016767,2.016574e-03,-0.007245,-9.057319e-02,-2.375265e-03,...,1.356323e-03,-1.362253e-01,-3.021033e-03,7.959572e-02,1.454280e-03,1.114149e-03,1.244671e-03,2.280377e-03,1.458521e-03,1.901554e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.001669,-3.357170e-07,-2.701795e-07,-1.359383e-07,0.001114,0.001278,-1.886251e-07,0.001350,1.406848e-03,-1.216953e-07,...,1.122002e-03,1.503046e-03,-1.480475e-07,1.479802e-03,-1.474670e-07,,-2.570979e-07,-4.190034e-07,-2.928294e-07,-3.608332e-07
96,0.001979,1.575250e-01,-3.010208e-07,-1.206470e-01,0.001245,0.001453,-2.266534e-07,0.001547,1.621566e-03,-1.340695e-07,...,1.252098e-03,1.750717e-03,-1.667734e-07,1.719262e-03,-1.642499e-07,-2.570979e-07,,-4.901554e-07,-1.323017e-01,-4.298092e-07
97,0.007124,-8.855047e-07,-5.400024e-07,-2.734123e-07,0.002280,0.003092,-6.239383e-07,0.003552,3.971612e-03,-2.213151e-07,...,2.274589e-03,4.847463e-03,-3.272478e-07,4.613738e-03,-3.909169e-07,-4.190034e-07,-4.901554e-07,,-6.387501e-07,-1.084642e-06
98,0.002581,-5.015544e-07,-3.511954e-07,-2.414700e-07,0.001459,0.001753,-2.887838e-07,0.001892,2.004454e-03,-1.536194e-07,...,1.464645e-03,2.205580e-03,-1.981400e-07,2.155888e-03,-1.945881e-07,-2.928294e-07,-1.323017e-01,-6.387501e-07,,-5.399564e-07


np.nanmean(norm_err)=np.float64(0.0006736998981920841)
    np.nanmean(np.abs(norm_err))=np.float64(0.010141433246726693)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0013754989813654346)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 33375.54it/s]
100%|██████████| 100/100 [00:00<00:00, 423.72it/s]
5949it [00:00, 599128.74it/s]
100%|██████████| 100/100 [00:00<00:00, 308404.71it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 30801.97it/s]
100%|██████████| 100/100 [00:00<00:00, 1013.12it/s]
5954it [00:00, 536370.76it/s]
100%|██████████| 100/100 [00:00<00:00, 334740.94it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.256359e-07,-1.037427e-07,-2.071740e-07,-1.065330e-07,-2.259189e-07,-2.074798e-07,-1.035705e-07,-1.066134e-07,-2.256260e-07,...,-2.068252e-07,-1.065453e-07,-1.037217e-07,-2.325302e-07,-1.095370e-07,-1.036225e-07,-1.097325e-07,-1.064578e-07,-2.194402e-07,-1.325087e-07
1,-2.256359e-07,,-1.038496e-07,-2.073872e-07,-1.066458e-07,-2.330822e-07,-2.076937e-07,-1.036770e-07,-1.067264e-07,-2.400957e-07,...,-2.070377e-07,-1.066581e-07,-1.038286e-07,-2.259057e-07,-1.096562e-07,-1.037292e-07,-1.098521e-07,-1.065704e-07,-2.196795e-07,-1.130267e-07
2,-1.037427e-07,-1.038496e-07,,-1.097847e-07,0.000000e+00,-1.039695e-07,-1.068758e-07,0.000000e+00,0.000000e+00,-1.038454e-07,...,-1.065285e-07,0.000000e+00,0.000000e+00,-1.038567e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.039779e-07,0.000000e+00
3,-2.071740e-07,-2.073872e-07,-1.097847e-07,,-1.037083e-07,-2.076263e-07,-2.134213e-07,-1.279747e-07,-1.037845e-07,-2.073789e-07,...,-2.127287e-07,-1.037199e-07,-1.097613e-07,-2.074014e-07,-1.036575e-07,-1.065864e-07,-1.038326e-07,-1.036370e-07,-2.076431e-07,-1.037629e-07
4,-1.065330e-07,-1.066458e-07,0.000000e+00,-1.037083e-07,,-1.067722e-07,-1.038615e-07,0.000000e+00,0.000000e+00,-1.066413e-07,...,-1.035335e-07,0.000000e+00,0.000000e+00,-1.066533e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.067811e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.036225e-07,-1.037292e-07,0.000000e+00,-1.065864e-07,0.000000e+00,-1.038488e-07,-1.130768e-07,0.000000e+00,0.000000e+00,-1.037250e-07,...,-1.126881e-07,0.000000e+00,0.000000e+00,-1.037362e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-1.038572e-07,0.000000e+00
96,-1.097325e-07,-1.098521e-07,0.000000e+00,-1.038326e-07,0.000000e+00,-1.099863e-07,-1.039862e-07,0.000000e+00,0.000000e+00,-1.098474e-07,...,-1.036574e-07,0.000000e+00,0.000000e+00,-1.098601e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.243365e-07,0.000000e+00
97,-1.064578e-07,-1.065704e-07,0.000000e+00,-1.036370e-07,0.000000e+00,-1.066966e-07,-1.037900e-07,0.000000e+00,0.000000e+00,-1.065659e-07,...,-1.034624e-07,0.000000e+00,0.000000e+00,-1.065779e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.067055e-07,0.000000e+00
98,-2.194402e-07,-2.196795e-07,-1.039779e-07,-2.076431e-07,-1.067811e-07,-2.199477e-07,-2.079503e-07,-1.038049e-07,-1.068619e-07,-2.196701e-07,...,-2.072927e-07,-1.067934e-07,-1.039569e-07,-2.196954e-07,-1.165061e-07,-1.038572e-07,-1.243365e-07,-1.067055e-07,,-1.099175e-07


np.nanmean(norm_err)=np.float64(-8.320683407236476e-08)
    np.nanmean(np.abs(norm_err))=np.float64(8.320683407236476e-08)
    np.nanmedian(norm_err)=np.float64(-1.0366561081915348e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0366561081915348e-07)
    
