In [1]:
# Load the `watermark` IPython extension so that the `%watermark` line magic
# used later in this notebook is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record the execution environment (timestamp, interpreter, machine details,
# and versions of the imported packages) alongside the notebook outputs.
# Keep this comment on its own line: text after the magic is parsed as arguments.
%watermark -diwmuv -iv


Last updated: 2025-06-08T00:23:54.174839+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1029-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
downstream                        : 1.14.3
hstrat                            : 1.20.10
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Subdirectory name associated with this notebook's outputs.
# NOTE(review): presumably consumed by teeplot when saving figures; no use is
# visible in this section of the notebook -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance table of a reconstructed tree.

    The tree dataframe is converted to a dendropy tree via
    `apc.RosettaTree`, and its phylogenetic distance matrix is loaded into a
    DataFrame whose rows and columns are both sorted by label, so that tables
    built from different reconstructions of the same taxa line up
    element-wise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the data table object;
    # it is read here exactly as the original code did.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: int | None = None,
) -> np.ndarray:
    """Compare naive-trie and searchtable tree reconstructions pairwise.

    A random subset of genomes is deserialized into surfaces using
    `dstream_algo`, each surface's storage is shuffled, and two trees are
    built from the same population with `hstrat.build_tree_trie` and
    `hstrat.build_tree_searchtable`. The pairwise tip-to-tip distances of the
    two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column and the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`, each holding a
        single unique value across the sampled rows. The caller's frame is
        not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with only the
        `dstream` module in scope (e.g., "dstream.tilted_algo").
    sample_size : int, default 100
        Maximum number of genomes to sample; clamped to the number of rows
        available. Kept small so the dendropy/naive reconstruction remains
        tractable.
    seed : int, optional
        If given, seeds a dedicated generator used for both the row sampling
        and the storage shuffling, making the result reproducible. If
        omitted, numpy's global random state is used, as before.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / ((naive + shortcut) / 2)` for
        each pair of taxa, with rows and columns in sorted label order.
        Entries where the mean distance is zero (including the diagonal) are
        NaN.

    Raises
    ------
    ValueError
        If any of the dstream configuration columns holds more than one
        distinct value among the sampled rows (raised by `.item()`).
    """
    rng = None if seed is None else np.random.default_rng(seed)
    shuffle_in_place = np.random.shuffle if rng is None else rng.shuffle

    # `sample` returns a new frame, so adding a column below does not touch
    # the caller's data. Clamp the size so small inputs do not raise.
    sampled_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=rng,
    )
    sampled_df["taxon_label"] = np.arange(len(sampled_df)).astype(str)

    def unique_value(column: str):
        # `.item()` raises unless exactly one distinct value is present,
        # which doubles as a consistency check on the configuration.
        return sampled_df[column].unique().item()

    kwargs = dict(
        # NOTE(review): `eval` on a caller-supplied string; the namespace is
        # restricted to the `dstream` module, but only pass trusted
        # algorithm names here.
        dstream_algo=eval(dstream_algo, {"dstream": dstream}),
        dstream_S=unique_value("dstream_S"),
        dstream_storage_bitoffset=unique_value("dstream_storage_bitoffset"),
        dstream_storage_bitwidth=unique_value("dstream_storage_bitwidth"),
        dstream_T_bitoffset=unique_value("dstream_T_bitoffset"),
        dstream_T_bitwidth=unique_value("dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        # ensure synthetic data: scramble stored differentia in place
        shuffle_in_place(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    # Both tables are label-sorted, so positions correspond as long as the
    # two reconstructions carry the same set of taxon labels.
    naive_dist = _pairwise_distance_frame(naive_df).values
    shortcut_dist = _pairwise_distance_frame(shortcut_df).values

    # Zero mean distance (e.g., the diagonal) yields 0/0 -> NaN by design;
    # silence the resulting RuntimeWarning rather than emitting it per call.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
# Fetch the genome records for this comparison, then report the normalized
# reconstruction error under each dstream algorithm in turn.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # Signed and absolute mean/median of the error matrix, skipping NaNs.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10477.12it/s]
100%|██████████| 100/100 [00:00<00:00, 416.46it/s]
6113it [00:00, 660002.58it/s]
100%|██████████| 100/100 [00:00<00:00, 250705.56it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.042050,0.000000,-0.059333,0.000000,-0.036903,0.000000,0.000000,0.000000,-0.034445,...,0.000000,0.000000,-0.019686,0.000000,0.000000,-0.248843,-0.147777,-0.020201,0.039179,-0.061432
1,-0.042050,,-0.008115,0.000000,-0.010569,-0.040681,-0.048316,-0.177101,-0.010105,-0.039440,...,-0.068503,-0.010955,0.354657,-0.049126,-0.008497,0.000000,0.457070,0.563908,-0.008213,0.000000
2,0.000000,-0.008115,,-0.009761,-0.286806,0.408724,0.000000,0.000000,-0.266069,0.391532,...,0.000000,0.305031,-0.135867,0.000000,0.320580,-0.015640,-0.013854,-0.137728,-0.123796,-0.009929
3,-0.059333,0.000000,-0.009761,,-0.013544,-0.048960,-0.060457,-0.247042,-0.012791,-0.047173,...,-0.095771,-0.014185,0.445957,-0.061731,-0.010319,0.000000,0.650011,0.727961,-0.009903,0.000000
4,0.000000,-0.010569,-0.286806,-0.013544,,0.167902,0.000000,0.000000,0.000000,0.159283,...,0.000000,0.000000,-0.193827,0.000000,0.000000,-0.028307,-0.022950,-0.197637,-0.175640,-0.013868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.248843,0.000000,-0.015640,0.000000,-0.028307,-0.078593,-0.113131,-0.276025,-0.025204,-0.074089,...,-0.364924,-0.031258,0.068706,-0.117673,-0.017122,,0.000000,0.070267,-0.016008,0.000000
96,-0.147777,0.457070,-0.013854,0.650011,-0.022950,-0.484418,-0.472529,-0.189686,-0.020868,-0.455022,...,-0.227826,-0.024853,0.713382,-0.491394,-0.015004,0.000000,,0.732140,-0.014141,0.673655
97,-0.020201,0.563908,-0.137728,0.727961,-0.197637,-0.009798,-0.012100,-0.017680,-0.185383,-0.009440,...,-0.019176,-0.208203,0.330277,-0.012355,-0.358667,0.070267,0.732140,,-0.139909,0.746006
98,0.039179,-0.008213,-0.123796,-0.009903,-0.175640,0.000000,0.878548,0.225955,-0.165130,0.000000,...,0.075911,-0.679458,-0.714183,1.121821,-0.131296,-0.016008,-0.014141,-0.139909,,-0.010076


np.nanmean(norm_err)=np.float64(0.003371061381580662)
    np.nanmean(np.abs(norm_err))=np.float64(0.14582984264460538)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.014791920085556767)
    


100%|██████████| 100/100 [00:00<00:00, 33721.69it/s]
100%|██████████| 100/100 [00:00<00:00, 722.08it/s]
5937it [00:00, 637390.78it/s]
100%|██████████| 100/100 [00:00<00:00, 262472.09it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.258910e-02,-3.999247e-07,-1.759702e-02,-1.631533e-07,-3.714229e-07,-0.009535,4.685741e-02,-3.264003e-07,-3.804752e-07,...,6.772761e-02,3.473256e-02,-1.208711e-02,-1.055719e-01,4.236233e-02,-1.225794e-02,-2.003676e-07,-8.950040e-07,4.065670e-02,-9.782080e-07
1,-1.258910e-02,,5.079322e-02,2.348490e-02,1.946871e-02,-1.278097e-02,0.020784,-4.874025e-07,4.064291e-02,-6.218099e-03,...,-3.557042e-07,-3.592513e-07,1.580679e-02,1.024225e-01,-2.198609e-07,1.604021e-02,2.413881e-02,1.607240e-02,-2.108414e-07,6.366215e-02
2,-3.999247e-07,5.079322e-02,,5.856076e-02,0.000000e+00,0.000000e+00,0.037126,-2.503144e-07,-2.517001e-02,-1.939194e-07,...,0.000000e+00,-1.831930e-07,2.576438e-02,3.212080e-02,0.000000e+00,6.673119e-02,0.000000e+00,1.697946e-02,0.000000e+00,-6.183013e-07
3,-1.759702e-02,2.348490e-02,5.856076e-02,,3.769384e-02,-3.592235e-02,0.018970,-3.813819e-07,3.314003e-02,-1.026268e-02,...,-2.530347e-07,-2.981588e-07,-3.191924e-07,1.027643e-02,-1.757768e-07,1.064273e-02,1.672518e-01,1.029094e-02,-1.699638e-07,4.465955e-02
4,-1.631533e-07,1.946871e-02,0.000000e+00,3.769384e-02,,0.000000e+00,0.017062,-1.311697e-07,-4.273507e-02,-1.138237e-07,...,0.000000e+00,-1.100417e-07,1.503161e-02,1.592122e-02,0.000000e+00,2.283869e-02,0.000000e+00,5.879370e-03,0.000000e+00,-1.906189e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.225794e-02,1.604021e-02,6.673119e-02,1.064273e-02,2.283869e-02,-5.040161e-02,0.013797,-2.914171e-07,2.350786e-02,-8.183814e-03,...,-1.795025e-07,-2.401894e-07,-1.676309e-02,7.677628e-03,-1.368371e-07,,2.632180e-02,6.812848e-03,-1.332883e-07,2.877225e-02
96,-2.003676e-07,2.413881e-02,0.000000e+00,1.672518e-01,0.000000e+00,0.000000e+00,0.020545,-1.541941e-07,-2.589679e-02,-1.307679e-07,...,0.000000e+00,-1.258007e-07,1.729132e-02,1.891365e-02,0.000000e+00,2.632180e-02,,7.396228e-03,0.000000e+00,-2.434458e-07
97,-8.950040e-07,1.607240e-02,1.697946e-02,1.029094e-02,5.879370e-03,-4.554505e-07,0.011408,-1.112741e-01,1.313131e-02,-4.089196e-07,...,-3.847020e-01,-7.867481e-02,6.707251e-03,9.766281e-03,-2.079651e-01,6.812848e-03,7.396228e-03,,-4.600890e-02,2.225886e-02
98,4.065670e-02,-2.108414e-07,0.000000e+00,-1.699638e-07,0.000000e+00,1.346018e-02,0.000000,2.126648e-02,0.000000e+00,6.233121e-02,...,2.690749e-02,1.718509e-02,-1.319619e-07,-1.651811e-07,7.118643e-02,-1.332883e-07,0.000000e+00,-4.600890e-02,,-2.393158e-07


np.nanmean(norm_err)=np.float64(0.004277414756886063)
    np.nanmean(np.abs(norm_err))=np.float64(0.015063057282870386)
    np.nanmedian(norm_err)=np.float64(-1.5532672246655337e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(8.446841802179622e-07)
    


In [7]:
# Fetch the genome records for this comparison, then report the normalized
# reconstruction error under each dstream algorithm in turn.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # Signed and absolute mean/median of the error matrix, skipping NaNs.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35356.18it/s]
100%|██████████| 100/100 [00:00<00:00, 428.21it/s]
5958it [00:00, 627502.59it/s]
100%|██████████| 100/100 [00:00<00:00, 265798.73it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32809.01it/s]
100%|██████████| 100/100 [00:00<00:00, 1007.56it/s]
5946it [00:00, 635947.87it/s]
100%|██████████| 100/100 [00:00<00:00, 384798.53it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.256338e-07,-2.324691e-07,-2.069521e-07,-1.063591e-07,-2.131774e-07,-1.035787e-07,-2.072848e-07,-1.035103e-07,-2.132827e-07,...,-2.070297e-07,-2.074352e-07,-1.063991e-07,-2.191823e-07,-2.072365e-07,-1.036330e-07,-2.070312e-07,-1.035553e-07,-2.073859e-07,-1.065324e-07
1,-2.256338e-07,,-2.257581e-07,-2.070891e-07,-1.064315e-07,-2.133228e-07,-1.036473e-07,-2.074223e-07,-1.035789e-07,-2.134282e-07,...,-2.071668e-07,-2.075729e-07,-1.064716e-07,-2.193360e-07,-2.073739e-07,-1.037018e-07,-2.071684e-07,-1.036240e-07,-2.075235e-07,-1.066050e-07
2,-2.324691e-07,-2.257581e-07,,-2.070566e-07,-1.064143e-07,-2.132883e-07,-1.036310e-07,-2.073896e-07,-1.035626e-07,-2.133937e-07,...,-2.071343e-07,-2.075402e-07,-1.064544e-07,-2.192995e-07,-2.073413e-07,-1.036855e-07,-2.071358e-07,-1.036077e-07,-2.074909e-07,-1.065878e-07
3,-2.069521e-07,-2.070891e-07,-2.070566e-07,,-1.033957e-07,-2.072254e-07,-1.094355e-07,-2.254862e-07,-1.125869e-07,-2.073248e-07,...,-2.126326e-07,-2.559460e-07,-1.034336e-07,-2.071157e-07,-2.254292e-07,-1.064410e-07,-2.126342e-07,-1.063590e-07,-2.130084e-07,-1.035595e-07
4,-1.063591e-07,-1.064315e-07,-1.064143e-07,-1.033957e-07,,-1.198927e-07,0.000000e+00,-1.035618e-07,0.000000e+00,-1.163020e-07,...,-1.034344e-07,-1.036369e-07,0.000000e+00,-1.064455e-07,-1.035377e-07,0.000000e+00,-1.034352e-07,0.000000e+00,-1.036123e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.036330e-07,-1.037018e-07,-1.036855e-07,-1.064410e-07,0.000000e+00,-1.037701e-07,0.000000e+00,-1.066170e-07,0.000000e+00,-1.038200e-07,...,-1.095397e-07,-1.066966e-07,0.000000e+00,-1.037151e-07,-1.065915e-07,,-1.095405e-07,0.000000e+00,-1.164384e-07,0.000000e+00
96,-2.070312e-07,-2.071684e-07,-2.071358e-07,-2.126342e-07,-1.034352e-07,-2.073047e-07,-1.064254e-07,-2.129854e-07,-1.063533e-07,-2.074043e-07,...,-2.394170e-07,-2.131442e-07,-1.034731e-07,-2.071949e-07,-2.129345e-07,-1.095405e-07,,-1.094537e-07,-2.192150e-07,-1.035991e-07
97,-1.035553e-07,-1.036240e-07,-1.036077e-07,-1.063590e-07,0.000000e+00,-1.036922e-07,0.000000e+00,-1.065347e-07,0.000000e+00,-1.037420e-07,...,-1.094528e-07,-1.066142e-07,0.000000e+00,-1.036372e-07,-1.065092e-07,0.000000e+00,-1.094537e-07,,-1.128972e-07,0.000000e+00
98,-2.073859e-07,-2.075235e-07,-2.074909e-07,-2.130084e-07,-1.036123e-07,-2.076604e-07,-1.066129e-07,-2.133608e-07,-1.065405e-07,-2.077602e-07,...,-2.192132e-07,-2.135202e-07,-1.036503e-07,-2.075502e-07,-2.133097e-07,-1.164384e-07,-2.192150e-07,-1.128972e-07,,-1.037767e-07


np.nanmean(norm_err)=np.float64(-1.0884946966234275e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0884946966234275e-07)
    np.nanmedian(norm_err)=np.float64(-1.0634123979355233e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0634123979355233e-07)
    
