In [1]:
# Load the `watermark` IPython extension so the `%watermark` magic is available below.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record execution provenance (timestamp, interpreter, machine details, and the
# versions of imported packages) so the recorded outputs can be tied to an environment.
%watermark -diwmuv -iv


Last updated: 2025-08-24T00:23:05.381669+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
numpy                             : 2.1.2
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Subdirectory name for this notebook's artifacts; presumably consumed by teeplot
# when saving figures. NOTE(review): no visible cell reads it — confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build a label-sorted pairwise phylogenetic distance matrix for a tree.

    The tree is converted to dendropy via `apc.RosettaTree`, and dendropy's
    phylogenetic distance matrix is exported as a data table. Rows and columns
    are sorted by label so two matrices built from the same taxon labels can
    be compared element-wise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # this may break on dendropy upgrades.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into hstrat surfaces using
    `dstream_algo`, their storage is shuffled so the data is synthetic, and
    two trees are reconstructed from the same population with
    `hstrat.build_tree_trie` and `hstrat.build_tree_searchtable`. The result is
    the element-wise difference of the two pairwise distance matrices,
    normalized by their mean.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`; each `dstream_*`
        column must hold exactly one distinct value among the sampled rows.
        The frame is not modified.
    dstream_algo : str
        Python expression naming a dstream algorithm, evaluated with only
        `dstream` in scope (e.g. ``"dstream.tilted_algo"``).
    sample_size : int, default 100
        Maximum number of genomes to sample; capped at the number of rows
        available.
    seed : int, optional
        Seed for both the row sampling and the storage shuffle. When None
        (default), sampling is unseeded and the shuffle draws from NumPy's
        global random state.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / ((naive + shortcut) / 2)``,
        with rows and columns ordered by sorted taxon label. Entries where
        both distances are zero (e.g. the diagonal) are NaN.

    Raises
    ------
    ValueError
        If a `dstream_*` column does not hold exactly one distinct value, or
        if the two reconstructions yield distance matrices with different
        labels.
    """
    sampled_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=seed,
    )  # sample to size dendropy/naive can handle
    sampled_df["taxon_label"] = np.arange(len(sampled_df)).astype(str)

    # `.unique().item()` doubles as a check that the sampled genomes share a
    # single surface configuration; it raises ValueError otherwise.
    surface_kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(review): eval executes arbitrary Python (builtins remain reachable);
    # only pass trusted, hard-coded algorithm names.
    surface_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    shuffle = (
        np.random.shuffle
        if seed is None
        else np.random.default_rng(seed).shuffle
    )
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)
    # The subtraction below is positional, so the label order must agree.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance matrices have mismatched labels"
        )

    naive, shortcut = naive_dist.values, shortcut_dist.values
    # Self-distances are 0 in both matrices, so the diagonal is 0/0 -> NaN by
    # design; silence the expected floating-point warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same reconstruction-consistency report for each dstream algorithm
# instead of repeating the cell body once per algorithm.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # The f-string's continuation lines keep their original four-space indent
    # so the printed summary is unchanged; NaN-aware reductions skip the
    # undefined diagonal entries.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10669.81it/s]
100%|██████████| 100/100 [00:00<00:00, 378.63it/s]
6123it [00:00, 649972.75it/s]
100%|██████████| 100/100 [00:00<00:00, 218339.61it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.480585,0.0,-0.170925,-0.154630,-0.029661,-0.006356,0.285301,-0.408693,0.000000,...,-0.006508,-0.134665,0.348057,0.254914,-0.007520,0.399389,-0.006812,-0.184073,0.000000,0.290059
1,-0.480585,,0.0,-0.094262,-0.089463,0.082969,0.133311,0.000000,0.000000,-0.286161,...,0.267527,0.107208,-0.101822,-0.060398,0.299378,-0.076547,0.818488,-0.052983,-0.506495,-0.064991
2,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,-0.170925,-0.094262,0.0,,0.000000,-0.016131,-0.005388,0.015668,-0.084205,-0.171652,...,-0.005496,-0.248257,-0.348934,-0.166854,-0.006201,-0.228756,-0.005712,-0.090187,-0.114280,-0.183485
4,-0.154630,-0.089463,0.0,0.000000,,-0.014411,-0.005181,0.014041,-0.080355,-0.155225,...,-0.005281,-0.233470,-0.319479,-0.159808,-0.005929,-0.215717,-0.005480,-0.079568,-0.106759,-0.175001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.399389,-0.076547,0.0,-0.228756,-0.215717,-0.010649,-0.004597,0.098104,-0.069780,0.182171,...,-0.004676,-0.796714,-0.000005,0.000000,-0.005177,,-0.004831,-0.057237,1.157731,0.000000
96,-0.006812,0.818488,0.0,-0.005712,-0.005480,0.000000,0.000000,0.000000,0.395898,-0.006826,...,0.113461,0.150217,-0.006067,-0.003962,0.125755,-0.004831,,-0.007842,-0.005507,-0.004216
97,-0.184073,-0.052983,0.0,-0.090187,-0.079568,-0.069211,-0.007243,0.000000,-0.045109,-0.186110,...,-0.007440,-0.061523,-0.110617,-0.037660,-0.008793,-0.057237,-0.007842,,-0.080695,-0.042533
98,0.000000,-0.506495,0.0,-0.114280,-0.106759,-0.014596,-0.005205,0.135656,-0.445558,0.000000,...,-0.005306,-0.098984,0.198095,0.748066,-0.005960,1.157731,-0.005507,-0.080695,,0.848651


np.nanmean(norm_err)=np.float64(0.009413751730682578)
    np.nanmean(np.abs(norm_err))=np.float64(0.14067988127354805)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.045351296969808746)
    


100%|██████████| 100/100 [00:00<00:00, 32288.71it/s]
100%|██████████| 100/100 [00:00<00:00, 424.46it/s]
5999it [00:00, 621136.78it/s]
100%|██████████| 100/100 [00:00<00:00, 268178.01it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.027713,-1.158861e-06,-7.111541e-06,-6.884049e-07,0.014919,-2.082867e-07,-2.138750e-02,-2.423497e-07,-4.654704e-07,...,-2.790618e-06,-1.117865e-06,6.920150e-03,-3.915395e-07,-3.262981e-07,0.037893,2.794864e-02,-1.163571e-06,-8.580785e-07,-3.618979e-07
1,2.771341e-02,,-2.369635e-03,-3.574983e-03,1.545127e-01,0.002539,-3.603016e-03,-1.357905e-07,-1.448352e-03,-3.815570e-03,...,-2.439359e-03,-2.355107e-03,-1.114441e-07,-4.791673e-03,-4.457478e-03,0.003196,1.949494e-02,-2.030985e-03,-6.018739e-03,-3.794160e-02
2,-1.158861e-06,-0.002370,,-1.625238e-06,-2.956459e-07,-0.004778,7.979602e-04,9.540748e-03,0.000000e+00,-2.159459e-07,...,-9.706208e-07,0.000000e+00,-2.396501e-03,1.419249e-03,0.000000e+00,-0.009983,0.000000e+00,-1.002659e-02,0.000000e+00,0.000000e+00
3,-7.111541e-06,-0.003575,-1.625238e-06,,-7.140967e-07,-0.016719,-9.001617e-04,4.266031e-03,-2.578219e-07,-4.944362e-07,...,-4.263801e-06,-1.545737e-06,-3.620899e-03,-1.777941e-03,-3.554968e-07,-0.052154,-4.399320e-07,2.786594e-03,-1.094479e-06,-3.981696e-07
4,-6.884049e-07,0.154513,-2.956459e-07,-7.140967e-07,,0.006933,-1.366012e-07,-1.172636e-02,-1.504715e-07,-2.934170e-07,...,-6.180097e-07,-2.929055e-07,4.510433e-03,-1.971015e-07,-1.790770e-07,0.009653,-1.347294e-02,-4.718812e-07,-2.713776e-07,-1.892965e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3.789293e-02,0.003196,-9.982754e-03,-5.215440e-02,9.653182e-03,0.000000,-3.153395e-03,0.000000e+00,-2.712756e-03,-3.495671e-03,...,-1.134609e-02,-9.729891e-03,2.284839e-02,-5.591113e-03,-4.755637e-03,,1.143883e-02,-5.862665e-03,-1.070528e-02,-5.215751e-03
96,2.794864e-02,0.019495,0.000000e+00,-4.399320e-07,-1.347294e-02,0.007809,0.000000e+00,-1.295144e-02,0.000000e+00,-1.590186e-07,...,-3.692036e-07,0.000000e+00,4.865392e-03,0.000000e+00,0.000000e+00,0.011439,,-2.694915e-07,0.000000e+00,0.000000e+00
97,-1.163571e-06,-0.002031,-1.002659e-02,2.786594e-03,-4.718812e-07,-0.003576,-1.669152e-07,2.835404e-03,-7.709515e-04,3.739262e-03,...,-2.832364e-02,-9.873623e-03,-2.050663e-03,-2.670931e-07,2.042303e-02,-0.005863,-2.694915e-07,,8.144081e-02,2.199763e-02
98,-8.580785e-07,-0.006019,0.000000e+00,-1.094479e-06,-2.713776e-07,-0.005714,-6.802503e-03,3.256001e-03,0.000000e+00,-3.340744e-03,...,-7.389746e-07,0.000000e+00,-1.841232e-02,-2.241431e-01,1.350411e-02,-0.010705,0.000000e+00,8.144081e-02,,1.958343e-02


np.nanmean(norm_err)=np.float64(-0.00032093019012684643)
    np.nanmean(np.abs(norm_err))=np.float64(0.007615646977221634)
    np.nanmedian(norm_err)=np.float64(-2.752312026492754e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0010155162223726777)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same reconstruction-consistency report for each dstream algorithm
# instead of repeating the cell body once per algorithm.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # The f-string's continuation lines keep their original four-space indent
    # so the printed summary is unchanged; NaN-aware reductions skip the
    # undefined diagonal entries.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 28667.24it/s]
100%|██████████| 100/100 [00:00<00:00, 476.33it/s]
5953it [00:00, 676841.74it/s]
100%|██████████| 100/100 [00:00<00:00, 292693.93it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32860.42it/s]
100%|██████████| 100/100 [00:00<00:00, 439.15it/s]
5939it [00:00, 677085.39it/s]
100%|██████████| 100/100 [00:00<00:00, 390895.06it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.037792e-07,-1.035709e-07,-2.195688e-07,-2.070015e-07,-2.072967e-07,-1.097775e-07,-2.072014e-07,-1.097497e-07,-2.073330e-07,...,-2.076074e-07,-1.097697e-07,-2.132822e-07,-1.037508e-07,-1.036270e-07,-1.064128e-07,-1.096873e-07,-2.072339e-07,-2.075080e-07,-1.129070e-07
1,-1.037792e-07,,0.000000e+00,-1.039582e-07,-1.065407e-07,-1.130195e-07,0.000000e+00,-1.066466e-07,0.000000e+00,-1.067163e-07,...,-1.099416e-07,0.000000e+00,-1.038898e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.066639e-07,-1.098858e-07,0.000000e+00
2,-1.035709e-07,0.000000e+00,,-1.037492e-07,-1.125977e-07,-1.064770e-07,0.000000e+00,-1.094811e-07,0.000000e+00,-1.237732e-07,...,-1.066409e-07,0.000000e+00,-1.036810e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.094993e-07,-1.065885e-07,0.000000e+00
3,-2.195688e-07,-1.039582e-07,-1.037492e-07,,-2.073577e-07,-2.076539e-07,-1.285014e-07,-2.075583e-07,-1.166758e-07,-2.076903e-07,...,-2.079657e-07,-1.132344e-07,-2.136604e-07,-1.039298e-07,-1.038055e-07,-1.066011e-07,-1.202819e-07,-2.075909e-07,-2.078659e-07,-1.098612e-07
4,-2.070015e-07,-1.065407e-07,-1.125977e-07,-2.073577e-07,,-2.128057e-07,-1.036727e-07,-2.188055e-07,-1.036479e-07,-2.254214e-07,...,-2.131331e-07,-1.036657e-07,-2.072214e-07,-1.065108e-07,-1.197367e-07,-1.033952e-07,-1.035922e-07,-2.188417e-07,-2.130283e-07,-1.035690e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.064128e-07,0.000000e+00,0.000000e+00,-1.066011e-07,-1.033952e-07,-1.035425e-07,0.000000e+00,-1.034950e-07,0.000000e+00,-1.035606e-07,...,-1.036975e-07,0.000000e+00,-1.095894e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.035112e-07,-1.036479e-07,0.000000e+00
96,-1.096873e-07,0.000000e+00,0.000000e+00,-1.202819e-07,-1.035922e-07,-1.037401e-07,0.000000e+00,-1.036924e-07,0.000000e+00,-1.037582e-07,...,-1.038957e-07,0.000000e+00,-1.067382e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.037086e-07,-1.038459e-07,0.000000e+00
97,-2.072339e-07,-1.066639e-07,-1.094993e-07,-2.075909e-07,-2.188417e-07,-2.130514e-07,-1.037893e-07,-2.324119e-07,-1.037644e-07,-2.192123e-07,...,-2.133796e-07,-1.037823e-07,-2.074544e-07,-1.066339e-07,-1.095620e-07,-1.035112e-07,-1.037086e-07,,-2.132745e-07,-1.036854e-07
98,-2.075080e-07,-1.098858e-07,-1.065885e-07,-2.078659e-07,-2.130283e-07,-2.194783e-07,-1.039268e-07,-2.132401e-07,-1.039019e-07,-2.133794e-07,...,-2.332690e-07,-1.039198e-07,-2.077290e-07,-1.241555e-07,-1.066479e-07,-1.036479e-07,-1.038459e-07,-2.132745e-07,,-1.038226e-07


np.nanmean(norm_err)=np.float64(-9.399668228646521e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.399668228646521e-08)
    np.nanmedian(norm_err)=np.float64(-1.0381340371378088e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0381340371378088e-07)
    
