In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-08-18T21:53:25.074289+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

hstrat                            : 1.20.10
downstream                        : 1.14.3
numpy                             : 2.1.2
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> pd.DataFrame:
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    # raw_genome_df = raw_genome_df[raw_genome_df["layer"] > 1700].sample(20).copy()
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
        dstream_S=raw_genome_df["dstream_S"].unique().item(),
        dstream_storage_bitoffset=raw_genome_df["dstream_storage_bitoffset"]
        .unique()
        .item(),
        dstream_storage_bitwidth=raw_genome_df["dstream_storage_bitwidth"]
        .unique()
        .item(),
        dstream_T_bitoffset=raw_genome_df["dstream_T_bitoffset"]
        .unique()
        .item(),
        dstream_T_bitwidth=raw_genome_df["dstream_T_bitwidth"].unique().item(),
    )

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = (
        pd.DataFrame(
            apc.RosettaTree(naive_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    shortcut_dist = (
        pd.DataFrame(
            apc.RosettaTree(shortcut_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    return (naive_dist.values - shortcut_dist.values) / (
        naive_dist.values / 2 + shortcut_dist.values / 2
    )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 10396.87it/s]
100%|██████████| 100/100 [00:00<00:00, 388.04it/s]
6136it [00:00, 644776.38it/s]
100%|██████████| 100/100 [00:00<00:00, 256375.55it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.000000,-0.107093,0.852466,0.000000,0.000000,0.000000,0.000000,0.000000,-0.098533
1,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.000000,0.715234,-0.121978,0.000000,0.000000,0.000000,0.000000,0.000000,0.554471
2,0.000000,0.000000,,0.000000,0.000000,0.401858,-0.133523,0.000000,0.000000,0.0,...,0.00000,0.000000,-0.043925,0.177293,0.000000,-0.112273,-0.138047,0.000000,0.000000,-0.042414
3,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.000000,0.054723,-0.911032,0.000000,0.000000,0.000000,0.000000,0.156572,0.053536
4,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.000000,0.061689,-1.032259,0.000000,0.000000,0.000000,0.000000,0.170322,0.060184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,-0.112273,0.000000,0.000000,-0.179839,0.000000,0.000000,0.000000,0.0,...,0.00000,0.000000,-0.031850,0.141287,0.000000,,0.000000,0.000000,0.000000,-0.031047
96,0.000000,0.000000,-0.138047,0.000000,0.000000,-0.901758,0.000000,0.000000,0.000000,0.0,...,0.00000,0.000000,-0.040422,0.167532,0.000000,0.000000,,0.000000,0.000000,-0.039138
97,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.000000,0.096713,-0.255863,0.000000,0.000000,0.000000,,0.000000,0.093064
98,0.000000,0.000000,0.000000,0.156572,0.170322,0.000000,0.000000,0.000000,0.000000,0.0,...,0.00000,0.000000,0.052539,-0.358391,0.180353,0.000000,0.000000,0.000000,,0.051444


np.nanmean(norm_err)=np.float64(-0.0009325736282948147)
    np.nanmean(np.abs(norm_err))=np.float64(0.07973202042233037)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 28930.22it/s]
100%|██████████| 100/100 [00:00<00:00, 431.58it/s]
5977it [00:00, 587008.10it/s]
100%|██████████| 100/100 [00:00<00:00, 202721.31it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.528371e-02,0.000000e+00,-4.275254e-07,-1.227229e-02,1.242797e-02,0.000000e+00,-1.970878e-07,-3.288015e-07,-2.441591e-07,...,1.350689e-03,-1.191918e-02,-4.622541e-07,1.565100e-03,1.273656e-02,-4.284166e-07,1.289096e-02,9.678206e-03,0.000000,-3.438978e-07
1,-1.528371e-02,,-1.776355e-07,-8.028546e-07,-4.279804e-07,2.596018e-02,-2.411011e-07,-3.827056e-07,-6.262884e-07,-4.708365e-07,...,1.316957e-03,-4.139658e-07,-8.637887e-07,1.520022e-03,1.237004e-02,-8.044260e-07,2.690179e-02,9.465068e-03,0.029065,-6.536142e-07
2,0.000000e+00,-1.776355e-07,,-2.817133e-07,-1.496621e-07,-1.696623e-07,0.000000e+00,-1.582984e-07,-2.485931e-07,-1.870189e-07,...,-1.386210e-07,-1.462005e-07,-2.935466e-07,0.000000e+00,-3.640865e-02,-2.795265e-07,0.000000e+00,-4.070747e-03,0.000000,-7.988304e-03
3,-4.275254e-07,-8.028546e-07,-2.817133e-07,,-5.644419e-07,-7.257771e-07,-4.760820e-07,-6.291704e-07,-1.795943e-06,-9.055268e-07,...,-4.907348e-07,-5.403173e-07,-7.456538e-06,-3.068644e-07,-1.002287e-02,-4.554927e-06,-3.860961e-07,-6.922610e-02,-0.000003,2.539095e-02
4,-1.227229e-02,-4.279804e-07,-1.496621e-07,-5.644419e-07,,2.123882e-02,-1.923132e-07,-3.185646e-07,-4.710727e-07,-3.773606e-07,...,1.123714e-03,-3.476775e-07,-5.938960e-07,1.268313e-03,1.031685e-02,-5.652181e-07,2.186498e-02,8.214225e-03,0.019817,-4.863670e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-4.284166e-07,-8.044260e-07,-2.795265e-07,-4.554927e-06,-5.652181e-07,-7.270610e-07,-5.265551e-07,-6.333782e-07,-1.752248e-06,-9.075263e-07,...,-4.000956e-03,-5.410285e-07,-8.093791e-06,-5.007430e-03,-6.120316e-07,,-3.868229e-07,-4.089557e-07,-0.000003,-1.984357e-06
96,1.289096e-02,2.690179e-02,0.000000e+00,-3.860961e-07,2.186498e-02,2.865443e-02,0.000000e+00,-1.877981e-07,-3.037358e-07,-6.018271e-02,...,1.296166e-03,2.126631e-02,-4.141990e-07,1.492367e-03,1.214304e-02,-3.868229e-07,,9.331626e-03,0.051579,-3.165733e-07
97,9.678206e-03,9.465068e-03,-4.070747e-03,-6.922610e-02,8.214225e-03,9.116043e-03,-1.526275e-07,-2.621160e-07,-5.847680e-03,1.210628e-03,...,9.454516e-04,8.054188e-03,-4.237610e-07,1.045787e-03,2.225117e-01,-4.089557e-07,9.331626e-03,,0.013843,-5.990986e-03
98,0.000000e+00,2.906520e-02,0.000000e+00,-3.270427e-06,1.981723e-02,4.822592e-02,0.000000e+00,-3.288815e-07,-9.919882e-07,-4.848676e-07,...,2.048910e-03,1.891246e-02,-7.689912e-06,2.586186e-03,2.108558e-02,-3.323314e-06,5.157937e-02,1.384340e-02,,-1.143421e-06


np.nanmean(norm_err)=np.float64(-0.0013132435382836782)
    np.nanmean(np.abs(norm_err))=np.float64(0.011662235780904699)
    np.nanmedian(norm_err)=np.float64(-2.0500272431353108e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(7.489308795915357e-07)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 36821.21it/s]
100%|██████████| 100/100 [00:00<00:00, 468.58it/s]
5932it [00:00, 616299.11it/s]
100%|██████████| 100/100 [00:00<00:00, 269903.73it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33942.74it/s]
100%|██████████| 100/100 [00:00<00:00, 1016.46it/s]
5934it [00:00, 652005.34it/s]
100%|██████████| 100/100 [00:00<00:00, 386571.80it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.065194e-07,-2.190621e-07,-1.036883e-07,-1.066394e-07,-2.128963e-07,-1.037433e-07,-2.070952e-07,-2.196725e-07,-2.133237e-07,...,-1.066874e-07,-1.038068e-07,-2.259426e-07,-1.066313e-07,-2.077468e-07,-1.036790e-07,-2.073076e-07,-1.098383e-07,-1.066892e-07,-1.096696e-07
1,-1.065194e-07,,-1.063394e-07,0.000000e+00,0.000000e+00,-1.125893e-07,0.000000e+00,-1.034204e-07,-1.066270e-07,-1.095871e-07,...,0.000000e+00,0.000000e+00,-1.065193e-07,0.000000e+00,-1.037454e-07,0.000000e+00,-1.035263e-07,0.000000e+00,0.000000e+00,0.000000e+00
2,-2.190621e-07,-1.063394e-07,,-1.035177e-07,-1.064589e-07,-2.125367e-07,-1.035725e-07,-2.067549e-07,-2.399830e-07,-2.129627e-07,...,-1.065068e-07,-1.036358e-07,-2.190618e-07,-1.064509e-07,-2.074044e-07,-1.035084e-07,-2.069666e-07,-1.238910e-07,-1.065086e-07,-1.127135e-07
3,-1.036883e-07,0.000000e+00,-1.035177e-07,,0.000000e+00,-1.034934e-07,0.000000e+00,-1.197175e-07,-1.037903e-07,-1.036954e-07,...,0.000000e+00,0.000000e+00,-1.036882e-07,0.000000e+00,-1.130327e-07,0.000000e+00,-1.064772e-07,0.000000e+00,0.000000e+00,0.000000e+00
4,-1.066394e-07,0.000000e+00,-1.064589e-07,0.000000e+00,,-1.094880e-07,0.000000e+00,-1.035335e-07,-1.067473e-07,-1.200745e-07,...,0.000000e+00,0.000000e+00,-1.066393e-07,0.000000e+00,-1.038592e-07,0.000000e+00,-1.036397e-07,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.036790e-07,0.000000e+00,-1.035084e-07,0.000000e+00,0.000000e+00,-1.034841e-07,0.000000e+00,-1.126360e-07,-1.037809e-07,-1.036861e-07,...,0.000000e+00,0.000000e+00,-1.036789e-07,0.000000e+00,-1.282168e-07,,-1.064674e-07,0.000000e+00,0.000000e+00,0.000000e+00
96,-2.073076e-07,-1.035263e-07,-2.069666e-07,-1.064772e-07,-1.036397e-07,-2.069180e-07,-1.065352e-07,-2.126577e-07,-2.075114e-07,-2.073218e-07,...,-1.036850e-07,-1.066021e-07,-2.073074e-07,-1.036320e-07,-2.133448e-07,-1.064674e-07,,-1.037576e-07,-1.036867e-07,-1.036070e-07
97,-1.098383e-07,0.000000e+00,-1.238910e-07,0.000000e+00,0.000000e+00,-1.065575e-07,0.000000e+00,-1.036511e-07,-1.203603e-07,-1.067717e-07,...,0.000000e+00,0.000000e+00,-1.098382e-07,0.000000e+00,-1.039776e-07,0.000000e+00,-1.037576e-07,,0.000000e+00,0.000000e+00
98,-1.066892e-07,0.000000e+00,-1.065086e-07,0.000000e+00,0.000000e+00,-1.095405e-07,0.000000e+00,-1.035804e-07,-1.067971e-07,-1.130189e-07,...,0.000000e+00,0.000000e+00,-1.066891e-07,0.000000e+00,-1.039064e-07,0.000000e+00,-1.036867e-07,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(-9.186072419402867e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.186072419402867e-08)
    np.nanmedian(norm_err)=np.float64(-1.0376698542356688e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0376698542356688e-07)
    
