In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Print the execution timestamp, interpreter, machine, and imported-package
# versions so the recorded outputs can be traced back to an environment.
%watermark -diwmuv -iv


Last updated: 2025-05-10T17:47:32.459975+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
hstrat                            : 1.20.10
numpy                             : 2.1.2
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Output-subdirectory name for this notebook. Presumably consumed by teeplot
# when saving figures -- no use is visible in this part of the notebook; confirm.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance table of a reconstructed tree.

    Parameters
    ----------
    tree_df
        Tree as returned by one of the `hstrat.build_tree_*` calls.

    Returns
    -------
    pd.DataFrame
        Distance table with rows and columns sorted by label, so that tables
        built from trees over the same labels line up element-wise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the table object; this
    # relies on it being a nested mapping that `pd.DataFrame` can consume.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_sample: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled in place, and a tree is built from the resulting
    population twice: with `hstrat.build_tree_trie` and with
    `hstrat.build_tree_searchtable`. The pairwise distance matrices of the two
    trees are then compared element-wise.

    Parameters
    ----------
    raw_genome_df
        Genome records. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`. Each of the
        `dstream_*` columns must hold a single distinct value among the
        sampled rows. The caller's frame is not modified.
    dstream_algo
        Expression naming the dstream algorithm, evaluated with only the name
        `dstream` bound (e.g. `"dstream.tilted_algo"`).
    n_sample
        Number of genomes to draw; keeps the problem at a size the distance
        matrix computation can handle.
    seed
        Optional seed making both the row sampling and the storage shuffling
        reproducible. When `None`, NumPy's global random state is used, as
        before this parameter existed.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / mean(naive, shortcut)` per pair
        of taxa. Entries where both distances are zero (e.g. the diagonal)
        are NaN.

    Raises
    ------
    ValueError
        If `n_sample` exceeds the number of available rows, or if a
        `dstream_*` configuration column is not uniform across the sample.
    """
    rng = None if seed is None else np.random.default_rng(seed)
    shuffle = np.random.shuffle if rng is None else rng.shuffle

    # `sample` and `assign` both return new frames, so the input is untouched.
    sampled_df = raw_genome_df.sample(n_sample, random_state=rng)
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    # NOTE(review): `eval` runs arbitrary expressions; only pass trusted,
    # hard-coded algorithm names here.
    surface_kwargs = {
        "dstream_algo": eval(str(dstream_algo), {"dstream": dstream}),
    }
    # `.unique().item()` doubles as a check that the sampled genomes share a
    # single storage configuration: it raises if more than one value occurs.
    surface_kwargs.update(
        {
            column: sampled_df[column].unique().item()
            for column in (
                "dstream_S",
                "dstream_storage_bitoffset",
                "dstream_storage_bitwidth",
                "dstream_T_bitoffset",
                "dstream_T_bitwidth",
            )
        },
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df).values
    shortcut_dist = _pairwise_distance_frame(shortcut_df).values

    # Zero-over-zero entries (every taxon against itself) are expected and are
    # reported as NaN; silence the warning NumPy would otherwise emit for them.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
# Reconstruction consistency on the "sample" dataset: run the comparison once
# per dstream algorithm and report the error matrix plus summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # nan-aware reductions skip the NaN entries of the error matrix; the
    # continuation lines' leading spaces are part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 11542.79it/s]
100%|██████████| 100/100 [00:00<00:00, 399.69it/s]
6131it [00:00, 635400.11it/s]
100%|██████████| 100/100 [00:00<00:00, 238991.68it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.436474,0.294261,-0.100508,-0.052992,0.000000,0.000000,-0.204469,0.000000,-0.118185,...,0.000000,0.000000,0.000000,-0.040628,0.318957,-0.051909,0.000006,0.000000,0.000000,0.000000
1,0.436474,,0.000000,-0.082444,-0.047504,0.379197,0.100111,0.691596,0.000006,-0.093974,...,0.327254,0.166962,0.229576,-0.037323,0.000000,-0.046632,-0.057106,0.000007,0.000011,-0.073665
2,0.294261,0.000000,,-0.056598,-0.037608,0.267065,0.078389,0.524289,0.000004,-0.061803,...,0.240212,0.114189,0.140372,-0.030929,0.000000,-0.037060,-0.043388,0.000005,0.000007,-0.052326
3,-0.100508,-0.082444,-0.056598,,0.000000,-0.085612,-0.058907,-0.051681,-0.081650,0.000000,...,-0.072597,-0.111450,-0.175322,0.000000,-0.061154,0.000000,-0.071391,-0.107848,-0.222498,-0.099304
4,-0.052992,-0.047504,-0.037608,0.000000,,-0.048539,-0.038614,-0.035372,-0.047240,0.000000,...,-0.044061,-0.055885,-0.068376,0.000000,-0.039567,0.000000,-0.043614,-0.054964,-0.074539,-0.052655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.051909,-0.046632,-0.037060,0.000000,0.000000,-0.047629,-0.038036,-0.034886,-0.046377,0.000000,...,-0.043309,-0.054681,-0.066583,0.000000,-0.038960,,-0.042877,-0.053800,-0.072414,-0.051586
96,0.000006,-0.057106,-0.043388,-0.071391,-0.043614,0.000005,0.000004,0.000004,0.248000,-0.079878,...,0.000005,0.000006,0.000008,-0.034878,-0.046015,-0.042877,,0.984667,0.000009,-0.669280
97,0.000000,0.000007,0.000005,-0.107848,-0.054964,0.000000,0.000000,0.000000,0.000000,-0.128467,...,0.000000,0.000000,0.000000,-0.041777,0.000005,-0.053800,0.984667,,0.000000,0.000000
98,0.000000,0.000011,0.000007,-0.222498,-0.074539,0.000000,0.000000,0.000000,0.000000,-0.332636,...,0.000000,0.000000,0.000000,-0.052196,0.000007,-0.072414,0.000009,0.000000,,0.000000


np.nanmean(norm_err)=np.float64(-0.01809076281022333)
    np.nanmean(np.abs(norm_err))=np.float64(0.08152419998078567)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.04128616241982842)
    


100%|██████████| 100/100 [00:00<00:00, 35876.35it/s]
100%|██████████| 100/100 [00:00<00:00, 408.66it/s]
5974it [00:00, 632523.15it/s]
100%|██████████| 100/100 [00:00<00:00, 257952.28it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.004301,4.425825e-04,0.020279,6.428974e-04,-0.088422,-2.350680e-07,-0.028729,8.959864e-04,-0.003287,...,-5.549652e-03,0.000000e+00,-0.004115,-4.416802e-03,-0.011052,0.000492,-5.592073e-02,0.037449,1.128435e-03,4.983058e-04
1,-0.004301,,9.002573e-03,-0.005609,7.267412e-03,0.000000,-3.856060e-07,-0.005756,5.402247e-04,0.000000,...,-3.652484e-07,0.000000e+00,-0.094140,-2.090332e-07,-0.003938,0.000000,-7.513003e-03,0.000000,7.227752e-04,1.044214e-02
2,0.000443,0.009003,,0.000535,3.635182e-03,0.006281,-1.657556e-03,0.000545,-6.415455e-03,0.014096,...,1.134414e-02,-1.537372e-03,0.000600,1.855250e-02,0.000414,0.000444,-1.507322e-07,0.000677,-5.284477e-04,1.646532e-01
3,0.020279,-0.005609,5.353822e-04,,8.593728e-04,-0.088619,-7.408080e-07,-0.037654,1.083032e-03,-0.004000,...,-7.937886e-03,-3.247510e-07,-0.005563,-5.807431e-03,-0.013393,0.000605,-6.043016e-02,0.053680,1.442169e-03,6.191587e-04
4,0.000643,0.007267,3.635182e-03,0.000859,,0.048451,-3.200125e-02,0.000884,-6.011726e-04,0.004984,...,1.091088e-02,-1.126909e-01,0.001040,7.561010e-03,0.000584,0.000645,-2.601731e-07,0.001295,-2.590960e-02,-2.892237e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000492,0.000000,4.435680e-04,0.000605,6.449788e-04,0.000000,-2.361727e-07,0.039653,4.475933e-02,0.000000,...,-1.937583e-07,0.000000e+00,0.010347,-1.541726e-07,0.000458,,-6.609132e-04,-0.076693,9.220544e-03,4.995554e-04
96,-0.055921,-0.007513,-1.507322e-07,-0.060430,-2.601731e-07,-0.155146,3.447358e-02,-0.046360,-2.996541e-07,-0.005099,...,-1.147452e-02,2.872722e-02,-0.008936,-7.827686e-03,-0.017414,-0.000661,,-0.001360,-4.148036e-07,-1.775344e-07
97,0.037449,0.000000,6.773792e-04,0.053680,1.294883e-03,0.000000,-8.699189e-07,0.001154,1.368251e-03,0.000000,...,-4.816003e-07,0.000000e+00,0.001421,-2.939810e-07,0.002102,-0.076693,-1.360385e-03,,1.996187e-03,8.172525e-04
98,0.001128,0.000723,-5.284477e-04,0.001442,-2.590960e-02,0.001166,-2.164478e-02,0.012048,7.387282e-03,0.000526,...,9.946394e-04,-3.887162e-02,0.020694,7.463470e-04,0.001039,0.009221,-4.148036e-07,0.001996,,-1.898044e-02


np.nanmean(norm_err)=np.float64(-0.0005192446635769421)
    np.nanmean(np.abs(norm_err))=np.float64(0.013763949877814577)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0012953939759138473)
    


In [7]:
# Reconstruction consistency on the "tail" dataset: run the comparison once
# per dstream algorithm and report the error matrix plus summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # nan-aware reductions skip the NaN entries of the error matrix; the
    # continuation lines' leading spaces are part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 28059.30it/s]
100%|██████████| 100/100 [00:00<00:00, 459.44it/s]
5972it [00:00, 620762.40it/s]
100%|██████████| 100/100 [00:00<00:00, 297047.03it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32431.02it/s]
100%|██████████| 100/100 [00:00<00:00, 907.99it/s]
5944it [00:00, 46512.43it/s]
100%|██████████| 100/100 [00:00<00:00, 320665.44it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.481107e-07,-1.164098e-07,-1.066069e-07,-1.039185e-07,-2.077800e-07,-1.068246e-07,-2.191460e-07,-1.066214e-07,-2.077251e-07,...,-2.072345e-07,-2.074946e-07,-2.191211e-07,-2.072960e-07,-1.065406e-07,-1.036163e-07,-2.071723e-07,-2.259891e-07,-1.163097e-07,-2.130608e-07
1,-1.481107e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.038029e-07,0.000000e+00,-1.094761e-07,0.000000e+00,-1.037755e-07,...,-1.035306e-07,-1.036604e-07,-1.094637e-07,-1.035613e-07,0.000000e+00,0.000000e+00,-1.034995e-07,-1.128915e-07,0.000000e+00,-1.064388e-07
2,-1.164098e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-1.037638e-07,0.000000e+00,-1.094327e-07,0.000000e+00,-1.037364e-07,...,-1.034917e-07,-1.036215e-07,-1.094202e-07,-1.035224e-07,0.000000e+00,0.000000e+00,-1.034607e-07,-1.128453e-07,0.000000e+00,-1.063977e-07
3,-1.066069e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.037263e-07,0.000000e+00,-1.063415e-07,0.000000e+00,-1.036989e-07,...,-1.034544e-07,-1.035841e-07,-1.063297e-07,-1.034851e-07,0.000000e+00,0.000000e+00,-1.034234e-07,-1.065024e-07,0.000000e+00,-1.094087e-07
4,-1.039185e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.068445e-07,0.000000e+00,-1.036663e-07,0.000000e+00,-1.202882e-07,...,-1.199594e-07,-1.066936e-07,-1.036552e-07,-1.065886e-07,0.000000e+00,0.000000e+00,-1.065231e-07,-1.038192e-07,0.000000e+00,-1.036823e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.036163e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.065250e-07,0.000000e+00,-1.033656e-07,0.000000e+00,-1.127940e-07,...,-1.125048e-07,-1.063750e-07,-1.033545e-07,-1.062706e-07,0.000000e+00,,-1.062056e-07,-1.035176e-07,0.000000e+00,-1.033815e-07
96,-2.071723e-07,-1.034995e-07,-1.034607e-07,-1.034234e-07,-1.065231e-07,-2.191029e-07,-1.036283e-07,-2.066711e-07,-1.034371e-07,-2.129286e-07,...,-2.124132e-07,-2.187856e-07,-2.066490e-07,-2.185648e-07,-1.033610e-07,-1.062056e-07,,-2.069750e-07,-1.033816e-07,-2.067029e-07
97,-2.259891e-07,-1.128915e-07,-1.128453e-07,-1.065024e-07,-1.038192e-07,-2.075815e-07,-1.067197e-07,-2.189252e-07,-1.065169e-07,-2.075267e-07,...,-2.070371e-07,-2.072967e-07,-2.189004e-07,-2.070984e-07,-1.064363e-07,-1.035176e-07,-2.069750e-07,,-1.127512e-07,-2.128521e-07
98,-1.163097e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036842e-07,0.000000e+00,-1.093442e-07,0.000000e+00,-1.036569e-07,...,-1.034126e-07,-1.035422e-07,-1.093318e-07,-1.034432e-07,0.000000e+00,0.000000e+00,-1.033816e-07,-1.127512e-07,,-1.063141e-07


np.nanmean(norm_err)=np.float64(-1.3017599436930262e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.3017599436930262e-07)
    np.nanmedian(norm_err)=np.float64(-1.0680521725431409e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0680521725431409e-07)
    
