In [1]:
# Load the watermark IPython extension so the %watermark magic is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance for this run: timestamp, interpreter and machine details,
# and the versions of the imported packages.
%watermark -diwmuv -iv


Last updated: 2025-09-14T00:22:09.143185+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3
numpy                             : 2.1.2
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Name of the output subdirectory for this notebook. Presumably consumed by
# teeplot when saving figures; it is not referenced in the visible cells
# (TODO confirm it is still needed).
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _sole_value(frame: pd.DataFrame, column: str):
    """Return the one distinct value stored in `column` of `frame`.

    Raises ValueError (from `ndarray.item`) unless the column holds exactly
    one distinct value.
    """
    return frame[column].unique().item()


def _pairwise_distance_table(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build the pairwise phylogenetic distance table for a reconstructed tree.

    The tree dataframe is converted to dendropy via `apc.RosettaTree`; the
    resulting distance table is sorted by label along both axes so that two
    tables built over the same taxa line up cell-for-cell.
    """
    return (
        pd.DataFrame(
            apc.RosettaTree(tree_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str, n_sample: int = 100
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstructions.

    A random subset of genomes is deserialized into hstrat surfaces, each
    surface's storage is shuffled in place, and a tree is reconstructed twice:
    once with `hstrat.build_tree_trie` and once with
    `hstrat.build_tree_searchtable`. Pairwise tip distances from the two
    trees are then compared cell-by-cell.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`; each `dstream_*`
        column must hold a single distinct value within the sampled rows.
        The caller's frame is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, e.g. `"dstream.tilted_algo"`.
        It is passed to `eval` with only `dstream` supplied as a global, so
        pass trusted literals only.
    n_sample : int, default 100
        Maximum number of genomes to sample; clamped to the number of rows
        available.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / mean(naive, shortcut)` pairwise
        distances. Cells where both distances are zero (e.g. the diagonal)
        are NaN. Rows/columns follow the label-sorted order of the distance
        tables; labels are strings here, so presumably row `i` does not
        generally correspond to taxon label `str(i)`.

    Raises
    ------
    ValueError
        If no genomes can be sampled, if a `dstream_*` column is not
        single-valued, or if the two reconstructions do not cover the same
        set of labels.

    Notes
    -----
    Both the row sampling and the storage shuffle draw from NumPy's global
    random state; seed it (`np.random.seed`) beforehand for repeatable output.
    """
    sample_size = min(n_sample, len(raw_genome_df))
    if sample_size < 1:
        raise ValueError("need at least one genome to sample")

    # sample to size dendropy/naive can handle; `sample` returns a new frame,
    # and `assign` keeps the caller's frame untouched
    sampled_df = raw_genome_df.sample(sample_size)
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    surface_kwargs = dict(
        # NOTE: eval on a caller-supplied string; globals are restricted to
        # `dstream`, but builtins remain reachable -- trusted input only.
        dstream_algo=eval(str(dstream_algo), {"dstream": dstream}),
        dstream_S=_sole_value(sampled_df, "dstream_S"),
        dstream_storage_bitoffset=_sole_value(
            sampled_df, "dstream_storage_bitoffset"
        ),
        dstream_storage_bitwidth=_sole_value(
            sampled_df, "dstream_storage_bitwidth"
        ),
        dstream_T_bitoffset=_sole_value(sampled_df, "dstream_T_bitoffset"),
        dstream_T_bitwidth=_sole_value(sampled_df, "dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_table(naive_df)
    shortcut_dist = _pairwise_distance_table(shortcut_df)

    # The arithmetic below works on raw `.values`, so the two tables must be
    # laid out identically or cells would be compared across different taxa.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance tables cover different labels"
        )

    naive = naive_dist.values
    shortcut = shortcut_dist.values
    # 0/0 (both distances zero, e.g. the diagonal) is meant to yield NaN;
    # silence only that warning so any other anomaly still surfaces.
    with np.errstate(invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Reconstruction-consistency check on the dataset the author labels "sample":
# one pass per dstream algorithm, each showing the normalized error matrix
# followed by its NaN-aware summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 9325.65it/s]
100%|██████████| 100/100 [00:00<00:00, 409.73it/s]
6130it [00:00, 619917.63it/s]
100%|██████████| 100/100 [00:00<00:00, 240223.60it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.106316,-0.256212,0.379251,0.000000,0.147660,0.000000,-0.216049,-0.658217,-0.086330,...,-0.242401,0.000000,0.000000,0.000000,-0.169542,-0.122689,-0.113454,0.106706,-1.046245,-0.089313
1,-0.106316,,-0.106330,-0.145530,0.968471,-0.101305,-0.110278,-0.092114,-0.150243,-0.435829,...,-0.101527,-0.088757,-0.097428,-0.134089,-0.316180,-0.696184,0.507713,-1.340383,-0.126945,-0.455000
2,-0.256212,-0.106330,,0.000000,0.000000,-0.241802,-0.267803,-0.484510,-0.395519,-0.086340,...,0.000000,0.098365,-0.230843,0.000000,-0.169577,-0.122707,-0.113470,0.358407,-0.318578,-0.089322
3,0.379251,-0.145530,0.000000,,0.000000,0.348506,0.405211,0.000000,0.000000,-0.110510,...,0.000000,0.000000,0.326186,0.000000,-0.297286,-0.178055,-0.159245,0.146247,0.000000,-0.115445
4,0.000000,0.968471,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.853207,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.122689,-0.696184,-0.122707,-0.178055,0.000000,-0.116063,-0.127995,-0.104157,-0.185162,0.000000,...,-0.116355,-0.099885,-0.111003,-0.161225,0.000000,,-0.150257,-0.595808,-0.151007,0.000000
96,-0.113454,0.507713,-0.113470,-0.159245,0.853207,-0.107765,-0.117977,-0.097425,-0.164905,-0.099128,...,-0.108017,-0.093678,-0.103389,-1.027861,-0.227128,-0.150257,,-0.112630,-0.137257,-0.103080
97,0.106706,-1.340383,0.358407,0.146247,0.000000,0.101659,0.110696,0.092409,0.151006,-0.394248,...,0.340324,0.186360,0.097757,0.000000,-0.262596,-0.595808,-0.112630,,0.127495,-0.409870
98,-1.046245,-0.126945,-0.318578,0.000000,0.000000,-0.296551,-1.125831,0.000000,0.000000,-0.099454,...,-0.297501,0.000000,-0.887017,0.000000,-0.228846,-0.151007,-0.137257,0.127495,,-0.103433


np.nanmean(norm_err)=np.float64(-0.028243133914402536)
    np.nanmean(np.abs(norm_err))=np.float64(0.11725712310225532)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33041.63it/s]
100%|██████████| 100/100 [00:00<00:00, 352.96it/s]
5986it [00:00, 523926.96it/s]
100%|██████████| 100/100 [00:00<00:00, 246723.76it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-3.841930e-07,-3.598110e-07,1.312022e-02,-7.404706e-07,0.017045,2.633097e-02,-4.154797e-07,3.520472e-02,8.473296e-02,...,2.722992e-03,-1.271082e-01,-3.286931e-07,-2.695529e-07,-2.588964e-07,-3.349868e-03,-3.179431e-07,2.716073e-02,1.338297e-02,-1.787089e-07
1,-3.841930e-07,,-1.498012e-03,-3.544648e-07,1.709033e-02,-0.008532,-2.660511e-02,-4.166761e-07,-3.386431e-07,-3.260779e-07,...,-3.335154e-07,-4.799108e-07,-1.571380e-02,-2.746025e-07,-2.683764e-07,5.451712e-04,-3.323608e-07,-2.745248e-02,-3.615339e-07,8.962896e-03
2,-3.598110e-07,-1.498012e-03,,-3.109619e-07,-4.305251e-06,-0.002413,-6.159931e-03,-4.213346e-07,-2.874024e-07,-2.697583e-07,...,0.000000e+00,-5.743956e-07,2.270358e-03,-8.278550e-04,-2.942973e-03,3.360176e-03,-6.073025e-03,-6.747196e-03,-3.220089e-07,-1.310748e-03
3,1.312022e-02,-3.544648e-07,-3.109619e-07,,-6.374346e-07,0.015844,2.357079e-02,-1.561890e-03,1.037689e-02,1.001596e-02,...,-2.459941e-01,-4.331015e-07,-3.066876e-07,-2.545732e-07,-2.326046e-07,-1.069366e-01,-2.791886e-07,2.423352e-02,3.914553e-02,-1.657747e-07
4,-7.404706e-07,1.709033e-02,-4.305251e-06,-6.374346e-07,,-0.001215,-3.218072e-03,-8.713993e-07,-5.880293e-07,-5.511508e-07,...,-2.025807e-06,-1.202856e-06,-5.653325e-07,9.289096e-03,3.484374e-02,8.317689e-04,7.825764e-02,-3.545773e-03,-6.606650e-07,1.488830e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-3.349868e-03,5.451712e-04,3.360176e-03,-1.069366e-01,8.317689e-04,0.000000,-1.711468e-07,-4.738667e-03,-4.082987e-03,-3.969959e-03,...,-1.669774e-01,-3.905559e-03,6.644827e-02,4.178946e-04,6.689044e-04,,7.610233e-04,-1.745494e-07,-4.283366e-03,5.175440e-04
96,-3.179431e-07,-3.323608e-07,-6.073025e-03,-2.791886e-07,7.825764e-02,-0.013301,-1.243218e-01,-3.650447e-07,-2.600494e-07,-2.455191e-07,...,0.000000e+00,-4.746219e-07,-4.954376e-02,-1.976709e-07,0.000000e+00,7.610233e-04,,-8.733717e-02,-2.880612e-07,6.374315e-02
97,2.716073e-02,-2.745248e-02,-6.747196e-03,2.423352e-02,-3.545773e-03,0.019796,-9.680697e-07,7.531166e-03,2.275290e-02,2.161290e-02,...,2.266599e-02,9.354010e-03,-2.949978e-02,-2.267891e-02,-5.907316e-02,-1.745494e-07,-8.733717e-02,,2.491196e-02,-1.228383e-02
98,1.338297e-02,-3.615339e-07,-3.220089e-07,3.914553e-02,-6.606650e-07,0.016131,2.421214e-02,-1.444803e-02,-2.690184e-02,-3.125066e-07,...,-3.002768e-07,-4.437019e-07,-3.119652e-07,-2.581991e-07,-2.387309e-07,-4.283366e-03,-2.880612e-07,2.491196e-02,,-1.688630e-07


np.nanmean(norm_err)=np.float64(-0.0003414553817186287)
    np.nanmean(np.abs(norm_err))=np.float64(0.01243288334990608)
    np.nanmedian(norm_err)=np.float64(-3.440304555096932e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0016376167618829397)
    


In [7]:
# Reconstruction-consistency check on the dataset the author labels "tail":
# one pass per dstream algorithm, each showing the normalized error matrix
# followed by its NaN-aware summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 32747.53it/s]
100%|██████████| 100/100 [00:00<00:00, 397.85it/s]
5946it [00:00, 596064.33it/s]
100%|██████████| 100/100 [00:00<00:00, 339894.98it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 28673.12it/s]
100%|██████████| 100/100 [00:00<00:00, 987.46it/s]
5920it [00:00, 581750.61it/s]
100%|██████████| 100/100 [00:00<00:00, 358794.18it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.325153e-07,-2.073196e-07,-1.096364e-07,-2.071719e-07,-2.134173e-07,-2.563056e-07,-2.072223e-07,-1.161073e-07,-1.037311e-07,...,-2.191620e-07,-2.074317e-07,-2.134712e-07,-1.038177e-07,-2.075265e-07,-1.200209e-07,-1.128646e-07,-2.070752e-07,-1.036898e-07,-1.035324e-07
1,-2.325153e-07,,-2.072694e-07,-1.096083e-07,-2.071217e-07,-2.133641e-07,-2.327759e-07,-2.071721e-07,-1.197187e-07,-1.037059e-07,...,-2.191058e-07,-2.073814e-07,-2.134179e-07,-1.037925e-07,-2.074762e-07,-1.163283e-07,-1.128348e-07,-2.070251e-07,-1.036647e-07,-1.035073e-07
2,-2.073196e-07,-2.072694e-07,,-1.036865e-07,-2.255614e-07,-2.076289e-07,-2.075268e-07,-2.130219e-07,-1.035152e-07,-1.129528e-07,...,-2.072738e-07,-2.193747e-07,-2.076799e-07,-1.067292e-07,-2.133434e-07,-1.037160e-07,-1.036729e-07,-2.128664e-07,-1.065941e-07,-1.127172e-07
3,-1.096364e-07,-1.096083e-07,-1.036865e-07,,-1.036125e-07,-1.067369e-07,-1.097523e-07,-1.036378e-07,0.000000e+00,0.000000e+00,...,-1.128535e-07,-1.037425e-07,-1.067638e-07,0.000000e+00,-1.037899e-07,0.000000e+00,0.000000e+00,-1.035642e-07,0.000000e+00,0.000000e+00
4,-2.071719e-07,-2.071217e-07,-2.255614e-07,-1.036125e-07,,-2.074807e-07,-2.073788e-07,-2.128659e-07,-1.034416e-07,-1.163062e-07,...,-2.071261e-07,-2.192093e-07,-2.075316e-07,-1.066509e-07,-2.131869e-07,-1.036420e-07,-1.035990e-07,-2.127107e-07,-1.065159e-07,-1.196981e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.200209e-07,-1.163283e-07,-1.037160e-07,0.000000e+00,-1.036420e-07,-1.067682e-07,-1.201598e-07,-1.036672e-07,0.000000e+00,0.000000e+00,...,-1.096437e-07,-1.037720e-07,-1.067951e-07,0.000000e+00,-1.038195e-07,,0.000000e+00,-1.035936e-07,0.000000e+00,0.000000e+00
96,-1.128646e-07,-1.128348e-07,-1.036729e-07,0.000000e+00,-1.035990e-07,-1.067225e-07,-1.129874e-07,-1.036242e-07,0.000000e+00,0.000000e+00,...,-1.095956e-07,-1.037289e-07,-1.067495e-07,0.000000e+00,-1.037764e-07,0.000000e+00,,-1.035506e-07,0.000000e+00,0.000000e+00
97,-2.070752e-07,-2.070251e-07,-2.128664e-07,-1.035642e-07,-2.127107e-07,-2.073837e-07,-2.072819e-07,-2.253316e-07,-1.033933e-07,-1.065084e-07,...,-2.070294e-07,-2.129845e-07,-2.074346e-07,-1.163540e-07,-2.192068e-07,-1.035936e-07,-1.035506e-07,,-1.095215e-07,-1.062989e-07
98,-1.036898e-07,-1.036647e-07,-1.065941e-07,0.000000e+00,-1.065159e-07,-1.038445e-07,-1.037934e-07,-1.096038e-07,0.000000e+00,0.000000e+00,...,-1.036668e-07,-1.066533e-07,-1.038700e-07,0.000000e+00,-1.130265e-07,0.000000e+00,0.000000e+00,-1.095215e-07,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.090220538106774e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.090220538106774e-07)
    np.nanmedian(norm_err)=np.float64(-1.0628807139236533e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0628807139236533e-07)
    
