In [1]:
# Load the `watermark` IPython extension, which provides the %watermark magic.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record run provenance: timestamp, interpreter, machine details, and the
# versions of the imported packages (see the output below).
%watermark -diwmuv -iv


Last updated: 2025-09-07T00:22:29.833339+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

numpy                             : 2.1.2
hstrat                            : 1.20.10
downstream                        : 1.14.3
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Subdirectory name for this notebook.
# NOTE(review): presumably consumed by teeplot output calls; it is not used in
# the cells visible here --- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression so the notebook echoes the value


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise distance table of a reconstructed tree.

    Converts `tree_df` to dendropy through `apc.RosettaTree`, takes its
    phylogenetic distance matrix, and sorts both axes by label so that two
    tables built over the same labels line up element-for-element.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the table object;
    # presumably a label -> {label -> distance} mapping --- confirm on upgrade.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_sample: int = 100,
    seed=None,
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled, and a tree is built from the same population with
    both `hstrat.build_tree_trie` and `hstrat.build_tree_searchtable`. The
    pairwise distance tables of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide `data_hex` plus the configuration columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`; each configuration
        column must hold exactly one distinct value among the sampled rows.
        The caller's frame is not modified.
    dstream_algo : str
        Expression naming the algorithm, e.g. `"dstream.tilted_algo"`. It is
        evaluated with only `dstream` in scope.
    n_sample : int, default 100
        Upper bound on the number of genomes sampled (a size dendropy/naive
        can handle). Frames with fewer rows are used in full.
    seed : int, optional
        When given, sampling and shuffling draw from a dedicated
        `np.random.RandomState(seed)` so the result is repeatable. When
        omitted, NumPy's global random state is used.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / ((naive + shortcut) / 2)` per
        pair, where `naive` comes from the trie build and `shortcut` from the
        searchtable build. Rows and columns follow the sorted labels of the
        distance tables (presumably the string taxon labels, hence
        lexicographic order --- confirm). Entries where both distances are
        zero, such as the diagonal, are NaN.

    Raises
    ------
    ValueError
        If a configuration column has more than one distinct value, or if the
        two distance tables do not carry identical labels.
    """
    rng = None if seed is None else np.random.RandomState(seed)

    # Sample down to a size dendropy/naive can handle; clamp so that small
    # inputs are used whole instead of making `sample` raise.
    sampled_df = raw_genome_df.sample(
        min(n_sample, len(raw_genome_df)),
        random_state=rng,
    )
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    def unique_item(column: str):
        # `.item()` raises unless the column has exactly one distinct value.
        return sampled_df[column].unique().item()

    surface_kwargs = dict(
        # NOTE(security): `eval` runs arbitrary code; only pass trusted,
        # hard-coded algorithm names here, never external input.
        dstream_algo=eval(str(dstream_algo), {"dstream": dstream}),
        dstream_S=unique_item("dstream_S"),
        dstream_storage_bitoffset=unique_item("dstream_storage_bitoffset"),
        dstream_storage_bitwidth=unique_item("dstream_storage_bitwidth"),
        dstream_T_bitoffset=unique_item("dstream_T_bitoffset"),
        dstream_T_bitwidth=unique_item("dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]

    # Scramble each surface's stored values in place to ensure synthetic
    # data. This reaches into private attributes of the surface object.
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The arithmetic below is positional, so make sure both tables describe
    # the same labels in the same order before dropping to raw arrays.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "distance matrices of the two reconstructions have different "
            "labels and cannot be compared element-wise"
        )

    naive_values = naive_dist.values
    shortcut_values = shortcut_dist.values

    # 0 / 0 is expected wherever both distances are zero (e.g. the diagonal);
    # keep the resulting NaN but silence the floating-point warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_values - shortcut_values) / (
            naive_values / 2 + shortcut_values / 2
        )


In [6]:
# Seed NumPy's global RNG so the random sampling and shuffling steps inside
# make_norm_err_matrix are repeatable when this cell is re-run.
np.random.seed(1)

# "sample" dataset
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")

# Run the same consistency check for each dstream algorithm under test.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # The f-string's continuation lines are part of the printed text, so
    # their 4-space indent is kept exactly as-is.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10481.83it/s]
100%|██████████| 100/100 [00:00<00:00, 376.97it/s]
6132it [00:00, 673161.26it/s]
100%|██████████| 100/100 [00:00<00:00, 233926.60it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.049680,0.000000,-0.520823,0.000000,-0.451383,0.116568,-0.319432,0.000000,-0.406372,...,0.046837,0.126484,0.000000,0.000000,0.053297,-0.333015,0.000000,0.063278,-0.321421,0.048687
1,0.049680,,0.046358,0.038502,-0.124099,0.529723,0.000000,0.426372,0.055531,0.497394,...,-0.414740,0.000000,0.048805,0.053932,0.000000,0.031858,0.055529,-0.275392,0.614720,0.000000
2,0.000000,0.046358,,0.000000,0.000000,-0.058024,0.099789,-0.044004,0.000000,-0.053457,...,0.043873,0.106968,0.000000,0.000000,0.049492,0.000000,0.000000,0.057985,-0.044230,0.045492
3,-0.520823,0.038502,0.000000,,0.000000,-0.313588,0.069336,-0.243662,0.000000,-0.291181,...,0.036772,0.072728,-0.326786,0.000000,0.040639,0.000000,0.222701,0.046195,-0.244818,0.037903
4,0.000000,-0.124099,0.000000,0.000000,,0.000000,-0.437743,0.000000,0.000000,0.000000,...,-0.115357,-0.513279,0.000000,0.000000,-0.135587,0.000000,0.000000,-0.169609,0.000000,-0.121015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.333015,0.031858,0.000000,0.000000,0.000000,-0.244425,0.050405,-0.199745,0.000000,-0.230595,...,0.030665,0.052174,-0.217063,0.000000,0.033307,,0.138941,0.036950,-0.200521,0.031447
96,0.000000,0.055529,0.000000,0.222701,0.000000,-0.151821,0.154834,-0.107157,0.000000,-0.136559,...,0.052001,0.172832,0.000000,0.000000,0.060086,0.138941,,0.073082,-0.107828,0.054291
97,0.063278,-0.275392,0.057985,0.046195,-0.169609,0.336043,0.000000,0.265938,0.073086,0.313858,...,0.268842,0.000000,0.061865,0.070340,-0.297781,0.036950,0.073082,,0.267118,-0.269301
98,-0.321421,0.614720,-0.044230,-0.244818,0.000000,-0.239502,0.195263,-0.659644,-0.107832,-0.846676,...,0.220469,0.096082,-0.285303,-0.051069,0.648746,-0.200521,-0.107828,0.267118,,0.605173


np.nanmean(norm_err)=np.float64(0.02079239980805022)
    np.nanmean(np.abs(norm_err))=np.float64(0.13685628409972908)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.05493143890399118)
    


100%|██████████| 100/100 [00:00<00:00, 33613.59it/s]
100%|██████████| 100/100 [00:00<00:00, 365.45it/s]
5982it [00:00, 559614.73it/s]
100%|██████████| 100/100 [00:00<00:00, 187078.68it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,-1.314768e-06,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.356486e-02,-1.890106e-02,-9.112151e-02,...,-0.000738,-1.836505e-06,-2.119666e-07,0.000000e+00,-0.000251,0.000000e+00,-8.066373e-07,-2.255846e-07,-0.000845,-2.089015e-07
1,0.000000e+00,,-5.438452e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.929698e-07,-2.411416e-07,-5.739512e-07,...,0.012955,-6.289654e-07,-1.735170e-07,0.000000e+00,0.006184,0.000000e+00,-4.376148e-07,-1.825375e-07,0.013957,-1.704997e-07
2,-1.314768e-06,-5.438452e-07,,-2.539328e-07,-5.935047e-07,-2.613918e-07,-3.745144e-07,-4.730881e-03,-5.349921e-07,-1.499542e-06,...,-0.011507,-1.647293e-06,-3.712418e-07,-1.321705e-06,-0.027807,-5.799547e-07,-1.047496e-06,-3.919626e-07,-0.189988,-3.708472e-07
3,0.000000e+00,0.000000e+00,-2.539328e-07,,4.017579e-02,2.634572e-02,0.000000e+00,-1.373353e-07,-1.600965e-07,-2.603082e-07,...,0.014231,4.541893e-02,2.105765e-02,0.000000e+00,0.018103,3.979168e-02,1.880596e-01,-1.046643e-01,0.007258,-1.255587e-07
4,0.000000e+00,0.000000e+00,-5.935047e-07,4.017579e-02,,0.000000e+00,0.000000e+00,-1.988741e-07,-2.504327e-07,-6.295419e-07,...,0.028295,-8.287170e-07,-1.858772e-07,0.000000e+00,0.026376,0.000000e+00,-5.082792e-07,-1.878120e-07,0.014970,-1.750927e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,0.000000e+00,-5.799547e-07,3.979168e-02,0.000000e+00,0.000000e+00,0.000000e+00,-1.973292e-07,-2.479879e-07,-6.143176e-07,...,0.027814,-8.470883e-07,-2.433953e-07,0.000000e+00,0.026168,,-4.983086e-07,-1.864336e-07,0.014697,-1.738941e-07
96,-8.066373e-07,-4.376148e-07,-1.047496e-06,1.880596e-01,-5.082792e-07,-2.434162e-07,-3.050387e-07,-3.807555e-07,-4.742149e-07,-1.103226e-06,...,0.025759,-1.336337e-06,-3.527436e-07,-8.312866e-07,0.025228,-4.983086e-07,,-3.604329e-07,0.013537,-3.369460e-07
97,-2.255846e-07,-1.825375e-07,-3.919626e-07,-1.046643e-01,-1.878120e-07,-1.339541e-07,-1.545241e-07,-2.368005e-07,-2.698797e-07,-3.995144e-07,...,0.005555,-4.120584e-07,-2.215602e-07,-2.274709e-07,0.003780,-1.864336e-07,-3.604329e-07,,0.035846,-2.190849e-07
98,-8.453089e-04,1.395690e-02,-1.899882e-01,7.257782e-03,1.496954e-02,7.449180e-03,-4.081095e-02,-4.517184e-03,-2.448479e-04,-5.745566e-04,...,0.003300,1.924092e-02,5.451212e-03,2.662560e-02,-0.019229,1.469670e-02,1.353652e-02,3.584578e-02,,-1.097777e-02


np.nanmean(norm_err)=np.float64(2.0409441501224713e-05)
    np.nanmean(np.abs(norm_err))=np.float64(0.009266210551958743)
    np.nanmedian(norm_err)=np.float64(-2.7615082488880397e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(6.111019002497433e-07)
    


In [7]:
# Seed NumPy's global RNG so the random sampling and shuffling steps inside
# make_norm_err_matrix are repeatable when this cell is re-run.
np.random.seed(1)

# "tail" dataset
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")

# Run the same consistency check for each dstream algorithm under test.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # The f-string's continuation lines are part of the printed text, so
    # their 4-space indent is kept exactly as-is.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34746.95it/s]
100%|██████████| 100/100 [00:00<00:00, 454.28it/s]
5962it [00:00, 646512.07it/s]
100%|██████████| 100/100 [00:00<00:00, 344359.93it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 30008.61it/s]
100%|██████████| 100/100 [00:00<00:00, 951.38it/s]
5930it [00:00, 42845.76it/s]
100%|██████████| 100/100 [00:00<00:00, 379231.83it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.097233e-07,-1.126996e-07,-1.065781e-07,-2.133465e-07,-2.076094e-07,-2.133057e-07,-1.067102e-07,-1.035005e-07,-2.071525e-07,...,-1.094536e-07,-2.073112e-07,-2.193753e-07,-2.192182e-07,-1.065414e-07,-2.129671e-07,-2.131432e-07,-1.036012e-07,-1.036815e-07,-1.035758e-07
1,-1.097233e-07,,0.000000e+00,0.000000e+00,-1.068054e-07,-1.039298e-07,-1.067849e-07,0.000000e+00,0.000000e+00,-1.037008e-07,...,0.000000e+00,-1.037804e-07,-1.202101e-07,-1.326459e-07,0.000000e+00,-1.066152e-07,-1.067034e-07,0.000000e+00,0.000000e+00,0.000000e+00
2,-1.126996e-07,0.000000e+00,,0.000000e+00,-1.065611e-07,-1.036985e-07,-1.065408e-07,0.000000e+00,0.000000e+00,-1.034705e-07,...,0.000000e+00,-1.035498e-07,-1.095691e-07,-1.094908e-07,0.000000e+00,-1.063719e-07,-1.064597e-07,0.000000e+00,0.000000e+00,0.000000e+00
3,-1.065781e-07,0.000000e+00,0.000000e+00,,-1.130537e-07,-1.038563e-07,-1.130309e-07,0.000000e+00,0.000000e+00,-1.036276e-07,...,0.000000e+00,-1.037070e-07,-1.066763e-07,-1.066020e-07,0.000000e+00,-1.095987e-07,-1.096920e-07,0.000000e+00,0.000000e+00,0.000000e+00
4,-2.133465e-07,-1.068054e-07,-1.065611e-07,-1.130537e-07,,-2.078931e-07,-2.331917e-07,-1.099398e-07,-1.036416e-07,-2.074349e-07,...,-1.065498e-07,-2.075941e-07,-2.135430e-07,-2.133942e-07,-1.097606e-07,-2.193986e-07,-2.195854e-07,-1.037425e-07,-1.038230e-07,-1.037171e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.129671e-07,-1.066152e-07,-1.063719e-07,-1.095987e-07,-2.193986e-07,-2.075329e-07,-2.193555e-07,-1.201035e-07,-1.034625e-07,-2.070763e-07,...,-1.063605e-07,-2.072350e-07,-2.131630e-07,-2.130147e-07,-1.127995e-07,,-2.256667e-07,-1.035631e-07,-1.036433e-07,-1.035378e-07
96,-2.131432e-07,-1.067034e-07,-1.064597e-07,-1.096920e-07,-2.195854e-07,-2.077000e-07,-2.195423e-07,-1.130879e-07,-1.035456e-07,-2.072427e-07,...,-1.064484e-07,-2.074016e-07,-2.133394e-07,-2.131908e-07,-1.163415e-07,-2.256667e-07,,-1.036463e-07,-1.037267e-07,-1.036210e-07
97,-1.036012e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.037425e-07,-1.201099e-07,-1.037232e-07,0.000000e+00,0.000000e+00,-1.161562e-07,...,0.000000e+00,-1.065174e-07,-1.036939e-07,-1.036237e-07,0.000000e+00,-1.035631e-07,-1.036463e-07,,0.000000e+00,0.000000e+00
98,-1.036815e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.038230e-07,-1.098338e-07,-1.038037e-07,0.000000e+00,0.000000e+00,-1.095781e-07,...,0.000000e+00,-1.066023e-07,-1.037743e-07,-1.037040e-07,0.000000e+00,-1.036433e-07,-1.037267e-07,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.0891749626605205e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0891749626605205e-07)
    np.nanmedian(norm_err)=np.float64(-1.0632123888553772e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0632123888553772e-07)
    
