In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-08-19T19:59:16.817782+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
pandas                            : 2.2.3
numpy                             : 2.1.2
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> pd.DataFrame:
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    # raw_genome_df = raw_genome_df[raw_genome_df["layer"] > 1700].sample(20).copy()
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
        dstream_S=raw_genome_df["dstream_S"].unique().item(),
        dstream_storage_bitoffset=raw_genome_df["dstream_storage_bitoffset"]
        .unique()
        .item(),
        dstream_storage_bitwidth=raw_genome_df["dstream_storage_bitwidth"]
        .unique()
        .item(),
        dstream_T_bitoffset=raw_genome_df["dstream_T_bitoffset"]
        .unique()
        .item(),
        dstream_T_bitwidth=raw_genome_df["dstream_T_bitwidth"].unique().item(),
    )

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = (
        pd.DataFrame(
            apc.RosettaTree(naive_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    shortcut_dist = (
        pd.DataFrame(
            apc.RosettaTree(shortcut_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    return (naive_dist.values - shortcut_dist.values) / (
        naive_dist.values / 2 + shortcut_dist.values / 2
    )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 10660.32it/s]
100%|██████████| 100/100 [00:00<00:00, 367.76it/s]
6129it [00:00, 624787.68it/s]
100%|██████████| 100/100 [00:00<00:00, 183157.38it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.107293,-0.088785,0.000000,0.000000,0.155988,0.000000,0.00000,0.000000,0.000000,...,-0.109272,0.000000,-0.136804,0.555007,0.000000,0.000000,0.000000,0.0,0.298261,0.000000
1,-0.107293,,-0.112325,-0.211691,-0.099569,-0.269181,0.250253,0.39513,-0.091163,-0.161839,...,0.000000,-0.096986,0.000000,-0.157808,-0.263000,-0.383510,-0.111005,0.0,-0.209420,0.151438
2,-0.088785,-0.112325,,-0.189209,-0.088920,-0.820031,0.144687,0.13894,-0.082155,-0.135469,...,0.384059,-0.086854,-0.127980,-0.132634,-0.229170,-0.315579,-0.097930,0.0,-0.177115,0.123623
3,0.000000,-0.211691,-0.189209,,0.641674,-0.183550,0.000000,0.00000,0.464322,0.000000,...,-0.094433,0.500873,-0.241368,-0.057045,0.000000,0.000000,0.042471,0.0,-0.109857,0.000000
4,0.000000,-0.099569,-0.088920,0.641674,,-0.046330,0.000000,0.00000,0.000000,-0.309895,...,-0.323229,0.000000,-0.113653,-0.130526,0.684266,0.224637,-0.782478,0.0,-0.058832,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,-0.383510,-0.315579,0.000000,0.224637,-0.404537,0.000000,0.00000,0.190044,0.000000,...,-0.165169,0.213416,-0.493419,-0.177769,0.000000,,0.085912,0.0,-0.317417,0.000000
96,0.000000,-0.111005,-0.097930,0.042471,-0.782478,-0.052999,0.000000,0.00000,-0.532875,-0.392488,...,-0.365290,-0.586946,-0.128800,-0.158651,0.054463,0.085912,,0.0,-0.070022,0.000000
97,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000
98,0.298261,-0.209420,-0.177115,-0.109857,-0.058832,0.299091,0.000000,0.00000,-0.051471,-0.145910,...,-0.149697,-0.056498,-0.258143,0.076713,-0.153591,-0.317417,-0.070022,0.0,,0.000000


np.nanmean(norm_err)=np.float64(-0.06326592053732814)
    np.nanmean(np.abs(norm_err))=np.float64(0.1543593357260121)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0861265988780331)
    


100%|██████████| 100/100 [00:00<00:00, 31571.73it/s]
100%|██████████| 100/100 [00:00<00:00, 412.52it/s]
5978it [00:00, 566198.84it/s]
100%|██████████| 100/100 [00:00<00:00, 277034.61it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.922469e-03,3.008145e-03,-0.057194,-5.841489e-04,2.556383e-03,-3.404281e-03,1.874917e-03,0.273346,-3.022159e-03,...,-4.267834e-02,0.032839,0.036541,1.748174e-03,-5.444375e-04,3.049058e-03,0.001945,4.452041e-02,0.002647,3.063808e-02
1,-0.001922,,-4.663441e-04,-0.002975,-1.741246e-07,-6.073609e-03,-5.394603e-07,-4.237094e-03,-0.005006,-4.639180e-07,...,-2.136488e-03,-0.004972,-0.005622,-3.915125e-03,-3.205157e-07,-4.733259e-04,-0.004418,-2.010642e-07,-0.006332,-2.617759e-07
2,0.003008,-4.663441e-04,,0.009417,-5.922946e-02,9.537712e-03,-8.220924e-04,9.302073e-02,0.003771,-7.306595e-04,...,7.088345e-03,0.003749,0.004161,8.650515e-02,-1.348353e-02,-1.310580e-07,0.131165,5.400079e-03,0.020516,3.746721e-03
3,-0.057194,-2.974561e-03,9.417245e-03,,-1.044180e-03,3.404454e-02,-9.110164e-03,8.576684e-03,0.003590,-6.806935e-03,...,-4.329251e-07,0.003555,0.004255,7.722986e-03,-9.235784e-04,9.605198e-03,0.009078,1.084414e-02,0.015447,5.749424e-03
4,-0.000584,-1.741246e-07,-5.922946e-02,-0.001044,,-1.067342e-03,-4.847689e-07,-6.560209e-04,-0.000815,-3.750183e-07,...,-6.674281e-04,-0.000808,-0.000956,-5.937868e-04,-2.235890e-07,-6.039001e-02,-0.000692,0.000000e+00,-0.001134,-1.675918e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.003049,-4.733259e-04,-1.310580e-07,0.009605,-6.039001e-02,9.730552e-03,-8.442064e-04,9.445339e-02,0.003835,-7.480544e-04,...,7.194338e-03,0.003812,0.004240,8.774282e-02,-1.372382e-02,,0.133300,5.516683e-03,0.020948,3.802523e-03
96,0.001945,-4.418292e-03,1.311646e-01,0.009078,-6.920531e-04,-1.884110e-07,-8.805065e-03,-1.445958e-07,-0.035999,-4.646610e-02,...,6.503175e-03,-0.035751,-0.114007,-1.324627e-07,-6.369668e-04,1.333000e-01,,6.606333e-03,0.000000,4.290300e-03
97,0.044520,-2.010642e-07,5.400079e-03,0.010844,0.000000e+00,1.114942e-02,-7.731784e-07,6.208813e-03,0.066191,-5.271313e-07,...,6.333280e-03,0.135387,0.080759,5.541025e-03,-2.633502e-07,5.516683e-03,0.006606,,0.012054,-2.027810e-07
98,0.002647,-6.332262e-03,2.051571e-02,0.015447,-1.134135e-03,-3.252205e-07,-2.214317e-02,-1.873192e-07,-0.023381,-6.455210e-02,...,9.229728e-03,-0.023137,-0.028164,-1.674496e-07,-9.932100e-04,2.094759e-02,0.000000,1.205437e-02,,6.072773e-03


np.nanmean(norm_err)=np.float64(-0.00162899772525488)
    np.nanmean(np.abs(norm_err))=np.float64(0.013836995265801108)
    np.nanmedian(norm_err)=np.float64(-6.612706257370819e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.003599620366056754)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 36049.02it/s]
100%|██████████| 100/100 [00:00<00:00, 451.96it/s]
5920it [00:00, 661964.27it/s]
100%|██████████| 100/100 [00:00<00:00, 309542.73it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32456.12it/s]
100%|██████████| 100/100 [00:00<00:00, 1012.16it/s]
5935it [00:00, 663747.71it/s]
100%|██████████| 100/100 [00:00<00:00, 361577.93it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.196619e-07,0.000000e+00,-1.036170e-07,-1.035422e-07,-1.034835e-07,0.000000e+00,-1.062344e-07,0.000000e+00,0.000000e+00,...,-1.035940e-07,-1.034656e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.064626e-07,0.000000e+00,0.000000e+00,-1.096432e-07
1,-1.196619e-07,,-1.036727e-07,-2.073955e-07,-2.072455e-07,-2.071280e-07,-1.036113e-07,-2.126386e-07,-1.037657e-07,-1.036124e-07,...,-2.073493e-07,-2.070921e-07,-1.063170e-07,-1.064096e-07,-1.035682e-07,-1.037476e-07,-2.130957e-07,-1.126963e-07,-1.036970e-07,-2.194672e-07
2,0.000000e+00,-1.036727e-07,,-1.130851e-07,-1.066762e-07,-1.129261e-07,0.000000e+00,-1.035901e-07,0.000000e+00,0.000000e+00,...,-1.067311e-07,-1.065949e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.038071e-07,0.000000e+00,0.000000e+00,-1.039185e-07
3,-1.036170e-07,-2.073955e-07,-1.130851e-07,,-2.134055e-07,-2.401327e-07,-1.097604e-07,-2.072303e-07,-1.166574e-07,-1.097617e-07,...,-2.135155e-07,-2.132428e-07,-1.036130e-07,-1.037009e-07,-1.164078e-07,-1.068351e-07,-2.076645e-07,-1.037004e-07,-1.067814e-07,-2.078875e-07
4,-1.035422e-07,-2.072455e-07,-1.066762e-07,-2.134055e-07,,-2.131223e-07,-1.066112e-07,-2.070806e-07,-1.067746e-07,-1.066124e-07,...,-2.480386e-07,-2.256911e-07,-1.035381e-07,-1.036259e-07,-1.065655e-07,-1.165397e-07,-2.075141e-07,-1.036254e-07,-1.164758e-07,-2.077368e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-1.037476e-07,0.000000e+00,-1.068351e-07,-1.165397e-07,-1.066932e-07,0.000000e+00,-1.036649e-07,0.000000e+00,0.000000e+00,...,-1.166054e-07,-1.129936e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.038822e-07,0.000000e+00,0.000000e+00,-1.039938e-07
96,-1.064626e-07,-2.130957e-07,-1.038071e-07,-2.076645e-07,-2.075141e-07,-2.073963e-07,-1.037456e-07,-2.190341e-07,-1.039003e-07,-1.037467e-07,...,-2.076182e-07,-2.073603e-07,-1.095146e-07,-1.199532e-07,-1.037023e-07,-1.038822e-07,,-1.065507e-07,-1.038315e-07,-2.136151e-07
97,0.000000e+00,-1.126963e-07,0.000000e+00,-1.037004e-07,-1.036254e-07,-1.035667e-07,0.000000e+00,-1.063221e-07,0.000000e+00,0.000000e+00,...,-1.036773e-07,-1.035487e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.065507e-07,,0.000000e+00,-1.097366e-07
98,0.000000e+00,-1.036970e-07,0.000000e+00,-1.067814e-07,-1.164758e-07,-1.066396e-07,0.000000e+00,-1.036144e-07,0.000000e+00,0.000000e+00,...,-1.165414e-07,-1.129336e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.038315e-07,0.000000e+00,,-1.039429e-07


np.nanmean(norm_err)=np.float64(-9.394898319933301e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.394898319933301e-08)
    np.nanmedian(norm_err)=np.float64(-1.0373878201233708e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0373878201233708e-07)
    
