In [1]:
# Load the `watermark` IPython extension so the %watermark magic is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Print an environment report for this run: timestamp, Python/IPython build,
# machine details, and the versions of the imported packages.
%watermark -diwmuv -iv


Last updated: 2025-10-05T00:23:06.995286+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
hstrat                            : 1.20.10
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Subdirectory name associated with this notebook's outputs.
# NOTE(review): no cell visible here reads this name; presumably it is meant
# for teeplot-style figure saving elsewhere -- confirm or remove.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression so the value is echoed as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build the pairwise phylogenetic distance table for a reconstructed tree.

    The tree is converted to dendropy via `apc.RosettaTree`, and the resulting
    distance table is returned with rows and columns sorted by label so that
    tables from different reconstructions line up elementwise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # presumably a nested mapping keyed by taxon label -- confirm on upgrade.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    random_state=None,
) -> np.ndarray:
    """Compare pairwise distances from the trie and searchtable reconstructions.

    A random subset of genomes is deserialized into surfaces, their storage is
    shuffled, and a tree is built from the same population with both
    `hstrat.build_tree_trie` ("naive") and `hstrat.build_tree_searchtable`
    ("shortcut"). The pairwise distance matrices of the two trees are then
    compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column plus the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`, each of which must
        hold exactly one distinct value among the sampled rows. The frame is
        not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with only the name
        `dstream` in scope (e.g. `"dstream.tilted_algo"`).
    sample_size : int, default 100
        Maximum number of genomes to sample; kept small so the dendropy/naive
        path stays tractable. Frames with fewer rows are used in full.
    random_state : optional
        Seed (or anything accepted by `np.random.default_rng`) used for both
        the row sampling and the storage shuffle. The default `None` draws
        from NumPy's global random state, as before.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / mean(naive, shortcut)` distances,
        ordered by sorted (string) taxon label. Entries where both distances
        are zero, such as the diagonal, are NaN.

    Raises
    ------
    ValueError
        If one of the surface-parameter columns does not hold exactly one
        distinct value (raised by `.item()`).
    """
    rng = None if random_state is None else np.random.default_rng(random_state)

    # sample to size dendropy/naive can handle; clamp so that frames smaller
    # than `sample_size` are used whole instead of making `sample` raise
    genome_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=rng,
    )
    genome_df = genome_df.assign(
        taxon_label=np.arange(len(genome_df)).astype(str),
    )

    # each surface parameter must be uniform across the sample; `.item()`
    # raises if a column holds more than one distinct value
    surf_kwargs = {
        field: genome_df[field].unique().item()
        for field in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # SECURITY: `eval` executes arbitrary expressions; only pass trusted,
    # hard-coded algorithm names here, never externally supplied text.
    surf_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)
    # the comparison below is positional, so both tables must share labels
    assert naive_dist.index.equals(
        shortcut_dist.index
    ) and naive_dist.columns.equals(
        shortcut_dist.columns
    ), "naive and shortcut distance tables cover different taxa"

    difference = naive_dist.values - shortcut_dist.values
    mean_dist = naive_dist.values / 2 + shortcut_dist.values / 2
    # 0/0 (e.g. each taxon against itself) is intentionally left as NaN for
    # the nan-aware summaries downstream; silence only that warning
    with np.errstate(invalid="ignore"):
        return difference / mean_dist


In [6]:
# Load the "sample" genome dataset, then report the normalized reconstruction
# error once per dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # nan-aware statistics: the matrix holds NaN entries (see its diagonal)
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10081.98it/s]
100%|██████████| 100/100 [00:00<00:00, 383.06it/s]
6136it [00:00, 618868.11it/s]
100%|██████████| 100/100 [00:00<00:00, 214323.15it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.113345,0.144852,0.493295,0.080803,0.363269,0.109649,0.136492,0.000000,0.131315,...,0.000000,0.095623,0.133883,0.156417,0.000000,0.237434,0.237060,0.000000,0.000000,0.170907
1,0.113345,,0.020007,0.000000,0.056614,0.000000,0.000000,0.079265,-0.143497,0.000000,...,-0.207592,0.000000,0.000000,0.000000,-0.058693,0.105240,0.105166,-0.185828,-0.056444,0.021846
2,0.144852,0.020007,,0.033780,0.016491,0.031467,0.019656,0.021980,-0.065505,0.021569,...,-0.052795,0.018219,0.021775,0.023421,-0.177974,0.027659,0.027644,-0.047809,-0.171085,0.000000
3,0.493295,0.000000,0.033780,,0.091994,0.000000,0.000000,0.171743,-0.534001,0.000000,...,-0.524833,0.000000,0.000000,0.000000,-0.146136,0.369148,0.368245,-0.404934,-0.132945,0.039378
4,0.080803,0.056614,0.016491,0.091994,,0.086240,-0.018217,0.000000,-0.103805,0.060766,...,-0.182326,-0.016976,0.061310,0.065639,-0.044706,0.000000,0.000000,-0.365101,-0.043389,0.017721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.237434,0.105240,0.027659,0.369148,0.000000,0.291181,-0.032891,0.000000,-0.284827,0.120552,...,-0.412584,-0.029056,0.122712,0.141373,-0.098812,,0.000000,-0.827407,-0.092599,0.031303
96,0.237060,0.105166,0.027644,0.368245,0.000000,0.290619,-0.032869,0.000000,-0.284423,0.120455,...,-0.412160,-0.029039,0.122612,0.141241,-0.098714,0.000000,,-0.561867,-0.092514,0.031283
97,0.000000,-0.185828,-0.047809,-0.404934,-0.365101,-0.359654,-0.210187,-0.557891,-0.116288,-0.205593,...,-0.097335,-0.190141,-0.208273,-0.230398,0.854202,-0.827407,-0.561867,,0.506425,-0.053160
98,0.000000,-0.056444,-0.171085,-0.132945,-0.043389,-0.116137,-0.055057,-0.064632,0.000000,-0.062871,...,0.212793,-0.049579,-0.063750,-0.071063,0.000000,-0.092599,-0.092514,0.506425,,-0.194422


np.nanmean(norm_err)=np.float64(-0.030659279512760432)
    np.nanmean(np.abs(norm_err))=np.float64(0.12112816639465591)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.04961557089168035)
    


100%|██████████| 100/100 [00:00<00:00, 33346.35it/s]
100%|██████████| 100/100 [00:00<00:00, 385.94it/s]
5990it [00:00, 570077.40it/s]
100%|██████████| 100/100 [00:00<00:00, 265462.28it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,6.422980e-03,7.556682e-03,-5.200417e-03,-2.521563e-07,-5.119422e-03,6.231356e-03,-1.034438e-01,3.286499e-03,-6.750187e-03,...,-7.468969e-02,-1.640637e-02,4.986832e-03,1.834997e-03,0.001784,-1.302711e-02,5.007036e-03,2.037931e-02,5.010758e-03,5.097148e-03
1,6.422980e-03,,0.000000e+00,0.000000e+00,-1.668389e-02,2.681778e-02,0.000000e+00,1.162976e-01,-1.044161e-06,-1.145732e-06,...,-1.952487e-07,1.334011e-01,0.000000e+00,-2.053879e-07,-0.003204,3.322781e-03,0.000000e+00,-4.033493e-07,0.000000e+00,0.000000e+00
2,7.556682e-03,0.000000e+00,,0.000000e+00,-4.335973e-03,5.398164e-03,0.000000e+00,5.477649e-03,-2.198718e-07,-2.240544e-07,...,-1.147833e-07,5.756574e-03,0.000000e+00,-1.182140e-07,-0.005658,1.916125e-03,0.000000e+00,-1.656487e-07,0.000000e+00,0.000000e+00
3,-5.200417e-03,0.000000e+00,0.000000e+00,,6.359738e-03,-8.941535e-03,0.000000e+00,-9.160982e-03,-4.610788e-07,-5.971616e-07,...,-1.591367e-07,-9.969635e-03,0.000000e+00,-1.644749e-07,0.000000,-5.370172e-03,0.000000e+00,-2.697785e-07,0.000000e+00,0.000000e+00
4,-2.521563e-07,-1.668389e-02,-4.335973e-03,6.359738e-03,,-3.769466e-07,-2.512998e-01,-1.916353e-07,-4.277223e-03,8.842026e-03,...,2.193608e-01,-4.057300e-07,-7.530842e-02,-2.107827e-03,-0.004102,-2.586250e-07,-1.838198e-02,-6.123619e-03,-1.839866e-02,-1.878733e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.302711e-02,3.322781e-03,1.916125e-03,-5.370172e-03,-2.586250e-07,-7.914124e-03,3.219913e-03,-1.883047e-02,1.385192e-02,-7.038996e-03,...,-1.481838e-02,-1.153472e-01,2.557584e-03,6.237504e-02,0.092240,,2.568258e-03,2.585930e-03,2.570224e-03,2.615894e-03
96,5.007036e-03,0.000000e+00,0.000000e+00,0.000000e+00,-1.838198e-02,2.543776e-02,0.000000e+00,1.726564e-02,-4.176132e-07,-4.329644e-07,...,-1.524733e-07,1.869062e-02,0.000000e+00,-1.585869e-07,-0.002501,2.568258e-03,,-2.553565e-07,0.000000e+00,0.000000e+00
97,2.037931e-02,-4.033493e-07,-1.656487e-07,-2.697785e-07,-6.123619e-03,8.479890e-03,-3.794363e-07,8.677549e-03,-8.507275e-07,-8.826020e-07,...,-3.069888e-07,9.399207e-03,-2.536461e-07,-3.193838e-07,-0.017762,2.585930e-03,-2.553565e-07,,-2.556726e-07,-2.630979e-07
98,5.010758e-03,0.000000e+00,0.000000e+00,0.000000e+00,-1.839866e-02,2.546972e-02,0.000000e+00,1.728774e-02,-4.184593e-07,-4.338740e-07,...,-1.525859e-07,1.871652e-02,0.000000e+00,-1.587088e-07,-0.002503,2.570224e-03,0.000000e+00,-2.556726e-07,,0.000000e+00


np.nanmean(norm_err)=np.float64(0.0008336420020222427)
    np.nanmean(np.abs(norm_err))=np.float64(0.011217085263641749)
    np.nanmedian(norm_err)=np.float64(-1.748074404083253e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0016978575317190899)
    


In [7]:
# Load the "tail" genome dataset, then report the normalized reconstruction
# error once per dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # nan-aware statistics: the matrix holds NaN entries (see its diagonal)
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 32788.49it/s]
100%|██████████| 100/100 [00:00<00:00, 396.04it/s]
5957it [00:00, 630277.71it/s]
100%|██████████| 100/100 [00:00<00:00, 302837.83it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 34698.08it/s]
100%|██████████| 100/100 [00:00<00:00, 984.83it/s]
5938it [00:00, 558438.02it/s]
100%|██████████| 100/100 [00:00<00:00, 270949.87it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.321602e-07,-2.133455e-07,-1.036154e-07,-2.070605e-07,-2.072066e-07,-1.035778e-07,-1.161670e-07,-2.257888e-07,-2.130517e-07,...,-1.036863e-07,-2.128379e-07,-2.072744e-07,-2.191878e-07,-2.075428e-07,-1.063702e-07,-1.066037e-07,-1.035825e-07,-2.132902e-07,-2.070463e-07
1,-2.321602e-07,,-2.130930e-07,-1.034963e-07,-2.068227e-07,-2.069684e-07,-1.034588e-07,-1.419137e-07,-2.255060e-07,-2.127999e-07,...,-1.035670e-07,-2.125867e-07,-2.070361e-07,-2.189213e-07,-2.073039e-07,-1.062447e-07,-1.064776e-07,-1.034635e-07,-2.130379e-07,-2.068086e-07
2,-2.133455e-07,-2.130930e-07,,-1.037832e-07,-2.073955e-07,-2.075420e-07,-1.037454e-07,-1.066197e-07,-2.135264e-07,-2.260524e-07,...,-1.038543e-07,-2.326991e-07,-2.076101e-07,-2.134212e-07,-2.078793e-07,-1.096084e-07,-1.098564e-07,-1.037501e-07,-2.484295e-07,-2.073813e-07
3,-1.036154e-07,-1.034963e-07,-1.037832e-07,,-1.064283e-07,-1.065055e-07,0.000000e+00,0.000000e+00,-1.037007e-07,-1.036441e-07,...,0.000000e+00,-1.035429e-07,-1.199407e-07,-1.036511e-07,-1.097525e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.037570e-07,-1.094749e-07
4,-2.070605e-07,-2.068227e-07,-2.073955e-07,-1.064283e-07,,-2.189387e-07,-1.063886e-07,-1.034803e-07,-2.072309e-07,-2.071178e-07,...,-1.095620e-07,-2.069159e-07,-2.129027e-07,-2.071319e-07,-2.131858e-07,-1.034118e-07,-1.036325e-07,-1.063936e-07,-2.073433e-07,-2.126620e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.063702e-07,-1.062447e-07,-1.096084e-07,0.000000e+00,-1.034118e-07,-1.034847e-07,0.000000e+00,0.000000e+00,-1.064601e-07,-1.094533e-07,...,0.000000e+00,-1.093405e-07,-1.035185e-07,-1.064079e-07,-1.036524e-07,,0.000000e+00,0.000000e+00,-1.095793e-07,-1.034048e-07
96,-1.066037e-07,-1.064776e-07,-1.098564e-07,0.000000e+00,-1.036325e-07,-1.037057e-07,0.000000e+00,0.000000e+00,-1.066940e-07,-1.097006e-07,...,0.000000e+00,-1.095873e-07,-1.037397e-07,-1.066415e-07,-1.038741e-07,0.000000e+00,,0.000000e+00,-1.098271e-07,-1.036254e-07
97,-1.035825e-07,-1.034635e-07,-1.037501e-07,0.000000e+00,-1.063936e-07,-1.064707e-07,0.000000e+00,0.000000e+00,-1.036678e-07,-1.036112e-07,...,0.000000e+00,-1.035101e-07,-1.095656e-07,-1.036182e-07,-1.164119e-07,0.000000e+00,0.000000e+00,,-1.037240e-07,-1.236246e-07
98,-2.132902e-07,-2.130379e-07,-2.484295e-07,-1.037570e-07,-2.073433e-07,-2.074897e-07,-1.037193e-07,-1.065921e-07,-2.134710e-07,-2.259904e-07,...,-1.038281e-07,-2.326334e-07,-2.075578e-07,-2.133659e-07,-2.078269e-07,-1.095793e-07,-1.098271e-07,-1.037240e-07,,-2.073291e-07


np.nanmean(norm_err)=np.float64(-1.1952629551047025e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1952629551047025e-07)
    np.nanmedian(norm_err)=np.float64(-1.0653310540510621e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0653310540510621e-07)
    
