In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance for this run: the output below lists the last-updated
# timestamp, Python/IPython versions, machine details, the versions of the
# imported packages, and the watermark version itself.
%watermark -diwmuv -iv


Last updated: 2025-08-10T00:24:41.059777+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
numpy                             : 2.1.2
hstrat                            : 1.20.10
alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3

Watermark: 2.4.3



In [4]:
# Subdirectory name tagged with this notebook's date and topic.
# NOTE(review): presumably the output folder for teeplot-saved figures; it is
# not referenced by any of the visible cells -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression so the cell echoes the value


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _unique_item(frame: pd.DataFrame, column: str):
    """Return the single distinct value stored in ``column`` of ``frame``.

    ``.item()`` raises ``ValueError`` unless exactly one distinct value is
    present, which guards against mixing differently-configured genomes.
    """
    return frame[column].unique().item()


def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build a label-sorted pairwise distance matrix for a reconstructed tree.

    The tree table is converted to dendropy through ``apc.RosettaTree`` and its
    phylogenetic distance matrix is exported as a data table.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): ``_data`` is a private attribute of the exported table;
    # kept because the original relied on it -- revisit if dendropy changes.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    *,
    sample_size: int = 100,
    seed=None,
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled, and two trees are reconstructed from the same
    population: one with ``hstrat.build_tree_trie`` ("naive") and one with
    ``hstrat.build_tree_searchtable`` ("shortcut"). Pairwise distances from
    both trees are then compared entry by entry.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide ``data_hex`` plus the columns
        ``dstream_S``, ``dstream_storage_bitoffset``,
        ``dstream_storage_bitwidth``, ``dstream_T_bitoffset`` and
        ``dstream_T_bitwidth``, each holding a single distinct value.
        The frame itself is not modified.
    dstream_algo : str
        Expression naming the algorithm, evaluated with only ``dstream`` in
        scope (e.g. ``"dstream.tilted_algo"``).
    sample_size : int, default 100
        Maximum number of genomes to compare; capped at the number of rows
        available. Kept small because the dendropy distance matrix and the
        trie builder scale poorly.
    seed : optional
        Passed to ``np.random.default_rng``; drives both the row sampling and
        the storage shuffling. ``None`` (default) gives a non-reproducible run.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / ((naive + shortcut) / 2)``.
        Rows and columns follow the *string-sorted* taxon labels
        ("0", "1", "10", ...). Entries whose two distances are both zero --
        including the diagonal -- are NaN.

    Raises
    ------
    ValueError
        If a configuration column holds more than one distinct value, or if
        the two reconstructions do not yield the same set of taxon labels.
    """
    rng = np.random.default_rng(seed)

    # ``sample`` returns a new frame, so the label column added below never
    # touches the caller's data.
    sampled_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=rng,
    )
    sampled_df["taxon_label"] = np.arange(len(sampled_df)).astype(str)

    # Keyword names expected by ``surf_from_hex`` match the column names.
    surface_kwargs = {
        column: _unique_item(sampled_df, column)
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # SECURITY NOTE: ``eval`` runs arbitrary code. Callers in this notebook
    # pass hard-coded literals; never forward untrusted text here.
    surface_kwargs["dstream_algo"] = eval(
        str(dstream_algo),
        {"dstream": dstream},
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        # ensure synthetic data (reaches into private surface storage)
        rng.shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The subtraction below is positional, so insist both matrices are laid
    # out over identical labels instead of silently comparing mismatched taxa.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions disagree on taxon labels"
        )

    naive_values = naive_dist.values
    shortcut_values = shortcut_dist.values
    # 0/0 on the diagonal is expected and intentionally left as NaN (callers
    # use nan-aware reductions); silence the resulting RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_values - shortcut_values) / (
            naive_values / 2 + shortcut_values / 2
        )


In [6]:
# "Sample" dataset: run the reconstruction comparison once per dstream
# algorithm, showing the error matrix followed by its summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The literal is deliberately not re-indented with the loop body: its
    # leading spaces are part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10509.67it/s]
100%|██████████| 100/100 [00:00<00:00, 350.82it/s]
6141it [00:00, 679377.02it/s]
100%|██████████| 100/100 [00:00<00:00, 232758.27it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.274405,0.000000,0.089621,-0.000002,-0.561579,-0.095507,0.102952,0.095472,0.000000,...,0.271267,-0.261479,0.398111,0.256230,-0.424097,-0.014907,-0.235269,0.000000,0.192267,-0.000003
1,0.274405,,0.294792,0.148850,0.068075,0.055989,0.058247,-0.419532,0.273929,0.312198,...,0.000000,0.074332,0.114342,0.000000,0.055847,-0.015963,0.067492,0.310823,0.000000,0.078912
2,0.000000,0.294792,,0.097614,-0.000003,-0.625794,-0.104636,0.109277,0.104596,0.000000,...,0.291174,-0.296948,0.436608,0.273919,-0.469583,-0.016734,-0.263598,0.000000,0.215569,-0.000003
3,0.089621,0.148850,0.097614,,-0.648139,-0.000003,-0.000003,0.663109,0.089438,0.104615,...,0.146795,-0.000004,0.195355,0.137084,-0.000003,-0.024580,-0.000004,0.104056,0.563480,-0.228962
4,-0.000002,0.068075,-0.000003,-0.648139,,0.186075,0.202373,0.058113,-0.000002,-0.000003,...,0.066961,0.381764,0.094954,0.061770,0.185099,-0.043941,0.288053,-0.000003,0.182623,-0.469201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.014907,-0.015963,-0.016734,-0.024580,-0.043941,-0.025878,-0.028430,-0.013287,-0.014866,-0.018429,...,-0.015657,-0.060376,-0.023916,-0.014254,-0.025726,,-0.042747,-0.018290,-0.060657,-0.079045
96,-0.235269,0.067492,-0.263598,-0.000004,0.288053,0.000000,0.248732,0.057688,-0.234637,-0.289783,...,0.066397,0.000000,0.093824,0.061290,0.000000,-0.042747,,-0.287642,0.178489,0.000000
97,0.000000,0.310823,0.000000,0.104056,-0.000003,-1.101381,-0.112074,0.114145,0.378482,0.000000,...,0.306803,-0.327816,0.468025,0.287707,-0.507358,-0.018290,-0.287642,,0.235372,-0.000003
98,0.192267,0.000000,0.215569,0.563480,0.182623,0.115651,0.125719,0.122556,0.191747,0.237136,...,0.000000,0.235896,0.000000,0.000000,0.115047,-0.060657,0.178489,0.235372,,0.289157


np.nanmean(norm_err)=np.float64(0.06532922824438943)
    np.nanmean(np.abs(norm_err))=np.float64(0.14967845816262706)
    np.nanmedian(norm_err)=np.float64(3.859732931575711e-06)
    np.nanmedian(np.abs(norm_err))=np.float64(0.09425838473672099)
    


100%|██████████| 100/100 [00:00<00:00, 33274.92it/s]
100%|██████████| 100/100 [00:00<00:00, 447.64it/s]
5975it [00:00, 572835.18it/s]
100%|██████████| 100/100 [00:00<00:00, 287872.61it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.873876e-07,-8.775866e-03,5.462920e-02,-1.417911e-07,0.000000e+00,1.249775e-01,5.335653e-02,3.664838e-02,-3.290447e-07,...,0.000000e+00,-3.148726e-03,5.435523e-03,3.699244e-03,0.000000e+00,3.376340e-02,-2.253687e-07,-3.684768e-03,-2.945734e-02,5.510286e-03
1,-1.873876e-07,,-5.091918e-07,-4.181835e-07,-2.778778e-07,-1.979786e-07,-2.972079e-07,-1.349167e-07,-1.430655e-07,-8.341734e-07,...,-1.323560e-02,-1.881694e-07,-1.605190e-02,-2.199621e-07,-1.437078e-07,-2.445935e-07,-4.928780e-07,-2.191301e-07,-2.835631e-07,-1.643698e-07
2,-8.775866e-03,-5.091918e-07,,5.111990e-03,-3.543657e-07,-9.502369e-03,9.169508e-03,2.808187e-03,3.027252e-03,-1.226722e-06,...,-1.691215e-07,-4.331001e-03,7.364180e-02,5.444968e-03,-6.118162e-03,1.295772e-02,-6.604075e-07,-5.414327e-03,-3.370821e-02,4.433018e-02
3,5.462920e-02,-4.181835e-07,5.111990e-03,,-3.077547e-07,5.833083e-02,-4.159889e-07,-1.550089e-07,-1.734027e-07,-8.047781e-07,...,-1.477605e-07,-1.427143e-02,5.987803e-03,4.232707e-03,4.015208e-02,3.938938e-02,-5.150353e-07,-8.480547e-03,-5.764087e-03,6.078641e-03
4,-1.417911e-07,-2.778778e-07,-3.543657e-07,-3.077547e-07,,-1.477728e-07,-1.968216e-07,-1.095522e-07,-1.148647e-07,-4.081652e-07,...,1.074852e-02,2.834231e-02,-2.537679e-07,-1.339711e-01,-1.152783e-07,-2.803989e-03,-3.175587e-07,2.591260e-03,7.789831e-02,-2.085908e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3.376340e-02,-2.445935e-07,1.295772e-02,3.938938e-02,-2.803989e-03,3.644631e-02,6.868876e-02,2.199338e-02,2.367197e-02,-5.583521e-07,...,0.000000e+00,0.000000e+00,6.861994e-03,0.000000e+00,2.380643e-02,,-3.135724e-07,0.000000e+00,0.000000e+00,0.000000e+00
96,-2.253687e-07,-4.928780e-07,-6.604075e-07,-5.150353e-07,-3.175587e-07,-2.408657e-07,-4.056320e-07,-1.535481e-07,-1.641916e-07,-1.136127e-06,...,-1.506186e-02,-2.265005e-07,-1.881926e-02,-2.742071e-07,-1.650382e-07,-3.135724e-07,,-2.729154e-07,-3.806345e-07,-1.928833e-07
97,-3.684768e-03,-2.191301e-07,-5.414327e-03,-8.480547e-03,2.591260e-03,-3.939776e-03,-1.342761e-02,-5.038096e-03,-5.389329e-03,-4.412933e-07,...,2.484829e-03,-3.747398e-03,-1.905740e-07,0.000000e+00,-2.694020e-03,0.000000e+00,-2.729154e-07,,0.000000e+00,0.000000e+00
98,-2.945734e-02,-2.835631e-07,-3.370821e-02,-5.764087e-03,7.789831e-02,-3.221710e-02,-1.159400e-02,-2.982516e-03,-3.232996e-03,-8.135899e-07,...,1.797171e-02,5.539841e-02,-2.375094e-07,0.000000e+00,-1.981386e-02,0.000000e+00,-3.806345e-07,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(-0.0015526722601040131)
    np.nanmean(np.abs(norm_err))=np.float64(0.010968059200699687)
    np.nanmedian(norm_err)=np.float64(-1.7422934596998777e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(4.88975035783631e-07)
    


In [7]:
# "Tail" dataset: run the reconstruction comparison once per dstream
# algorithm, showing the error matrix followed by its summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The literal is deliberately not re-indented with the loop body: its
    # leading spaces are part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34163.92it/s]
100%|██████████| 100/100 [00:00<00:00, 460.21it/s]
5946it [00:00, 591848.96it/s]
100%|██████████| 100/100 [00:00<00:00, 327680.00it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 30972.56it/s]
100%|██████████| 100/100 [00:00<00:00, 1002.59it/s]
5975it [00:00, 647419.63it/s]
100%|██████████| 100/100 [00:00<00:00, 388721.41it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.034913e-07,-1.034379e-07,-1.128064e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.127178e-07,0.000000e+00,-1.095876e-07,0.000000e+00,-1.163701e-07,0.000000e+00,0.000000e+00,-1.066114e-07,-1.063214e-07
1,-1.034913e-07,,-2.126122e-07,-2.072064e-07,-1.062896e-07,-1.126370e-07,-1.036689e-07,-1.037314e-07,-1.065352e-07,-1.037298e-07,...,-1.037260e-07,-2.070568e-07,-1.066248e-07,-2.072444e-07,-1.034026e-07,-2.074067e-07,-1.064035e-07,-1.035236e-07,-2.074035e-07,-2.068547e-07
2,-1.034379e-07,-2.126122e-07,,-2.070994e-07,-1.234183e-07,-1.062998e-07,-1.036154e-07,-1.036778e-07,-1.095361e-07,-1.036762e-07,...,-1.036723e-07,-2.069500e-07,-1.128747e-07,-2.071374e-07,-1.033494e-07,-2.072995e-07,-1.093969e-07,-1.034702e-07,-2.072963e-07,-2.067481e-07
3,-1.128064e-07,-2.072064e-07,-2.070994e-07,,-1.035340e-07,-1.035972e-07,-1.066955e-07,-1.067616e-07,-1.037671e-07,-1.098338e-07,...,-1.098295e-07,-2.476829e-07,-1.038520e-07,-2.194263e-07,-1.127012e-07,-2.261170e-07,-1.036421e-07,-1.162848e-07,-2.134605e-07,-2.128792e-07
4,0.000000e+00,-1.062896e-07,-1.234183e-07,-1.035340e-07,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.034593e-07,0.000000e+00,-1.035530e-07,0.000000e+00,-1.036340e-07,0.000000e+00,0.000000e+00,-1.036324e-07,-1.033584e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.163701e-07,-2.074067e-07,-2.072995e-07,-2.261170e-07,-1.036340e-07,-1.036974e-07,-1.068017e-07,-1.068680e-07,-1.038676e-07,-1.099464e-07,...,-1.099421e-07,-2.259389e-07,-1.039527e-07,-2.196510e-07,-1.162581e-07,,-1.037424e-07,-1.129638e-07,-2.136731e-07,-2.130906e-07
96,0.000000e+00,-1.064035e-07,-1.093969e-07,-1.036421e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.035673e-07,0.000000e+00,-1.036611e-07,0.000000e+00,-1.037424e-07,,0.000000e+00,-1.037407e-07,-1.034661e-07
97,0.000000e+00,-1.035236e-07,-1.034702e-07,-1.162848e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.161906e-07,0.000000e+00,-1.096239e-07,0.000000e+00,-1.129638e-07,0.000000e+00,,-1.066458e-07,-1.063556e-07
98,-1.066114e-07,-2.074035e-07,-2.072963e-07,-2.134605e-07,-1.036324e-07,-1.036958e-07,-1.098762e-07,-1.099464e-07,-1.038659e-07,-1.068646e-07,...,-1.068605e-07,-2.133018e-07,-1.039511e-07,-2.135008e-07,-1.065174e-07,-2.136731e-07,-1.037407e-07,-1.066458e-07,,-2.192097e-07


np.nanmean(norm_err)=np.float64(-1.0237676073327664e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0237676073327664e-07)
    np.nanmedian(norm_err)=np.float64(-1.0388626573304283e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0388626573304283e-07)
    
