In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record run timestamp, interpreter, machine, and imported-package versions for provenance.
%watermark -diwmuv -iv


Last updated: 2025-05-11T16:34:38.686384+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

numpy                             : 2.1.2
hstrat                            : 1.20.10
downstream                        : 1.14.3
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# NOTE(review): presumably the output subdirectory handed to teeplot when
# saving figures; no use of this name is visible in this section — confirm.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
# Bare expression so the cell echoes the value.
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance table for `tree_df` as a
    DataFrame with both axes sorted by label."""
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the returned table;
    # presumably a nested mapping keyed by taxon label — confirm it stays
    # stable across dendropy upgrades.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_samples: int = 100,
    seed=None,
) -> np.ndarray:
    """Compare pairwise distances between two tree reconstructions.

    A subsample of genomes is deserialized into surfaces, each surface's
    stored contents are shuffled, and trees are built from the same
    population with `hstrat.build_tree_trie` ("naive") and
    `hstrat.build_tree_searchtable` ("shortcut"). The result is the
    elementwise difference of the two pairwise distance matrices,
    normalized by their elementwise mean.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Must provide a `data_hex` column plus `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth` columns, each
        holding exactly one distinct value among the sampled rows. The
        caller's frame is not modified.
    dstream_algo : str
        Python expression naming the algorithm, evaluated with `dstream`
        as the only supplied global (e.g. ``"dstream.tilted_algo"``).
    n_samples : int, default 100
        Number of genomes to subsample; capped at ``len(raw_genome_df)``.
        Kept small so the tree builders and distance computation stay
        tractable.
    seed : int or None, default None
        Seed for the subsample and the storage shuffle. With None, the
        global NumPy random state is used, as before.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / ((naive + shortcut) / 2)``.
        Entries where both distances are zero (e.g. the diagonal) are NaN.

    Raises
    ------
    ValueError
        Raised by `.item()` if a `dstream_*` parameter column does not
        hold exactly one distinct value.
    """
    # A dedicated generator is only created when a seed is requested, so the
    # default path keeps drawing from the global NumPy state.
    rng = None if seed is None else np.random.default_rng(seed)

    # `sample` returns a new frame, so adding a column leaves the input alone.
    sampled_df = raw_genome_df.sample(
        min(n_samples, len(raw_genome_df)),
        random_state=rng,
    ).assign(
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    def unique_value(column: str):
        # `.item()` only succeeds on a size-1 array, so this doubles as a
        # check that the parameter is homogeneous across sampled rows.
        return sampled_df[column].unique().item()

    kwargs = dict(
        # NOTE(review): `eval` executes arbitrary code; only pass trusted,
        # hard-coded algorithm names here.
        dstream_algo=eval(dstream_algo, {"dstream": dstream}),
        **{
            column: unique_value(column)
            for column in (
                "dstream_S",
                "dstream_storage_bitoffset",
                "dstream_storage_bitwidth",
                "dstream_T_bitoffset",
                "dstream_T_bitwidth",
            )
        },
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]

    # Scramble each surface's storage in place so the reconstructions run on
    # synthetic data (reaches into private attributes of the surface).
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df).values
    shortcut_dist = _pairwise_distance_frame(shortcut_df).values

    # 0/0 entries are expected and handled downstream with nan-aware
    # reductions, so silence the RuntimeWarning instead of emitting it on
    # every call.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
# Load the "sample" genome dataset and report reconstruction consistency
# for each dstream algorithm in turn.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # Continuation lines of the f-string keep their four-space indent so the
    # printed summary is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10285.70it/s]
100%|██████████| 100/100 [00:00<00:00, 388.97it/s]
6132it [00:00, 619238.99it/s]
100%|██████████| 100/100 [00:00<00:00, 246144.60it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.474491,0.088705,0.000000,-0.083543,0.023325,0.000000,0.000000,0.000000,...,-0.224039,-0.119835,0.00000,0.085472,0.096935,0.000000,0.000000,-0.058202,-0.052031,-0.185710
1,0.000000,,-0.024514,0.198159,0.000000,-0.174130,-0.041349,0.000000,0.000000,0.000000,...,0.000000,-0.557863,0.00000,-0.023909,0.129968,0.000000,0.000000,-0.127604,-0.101230,0.000000
2,0.474491,-0.024514,,0.079467,-0.021573,-0.075299,-0.551023,0.000000,-0.018185,-0.025381,...,0.186957,-0.130840,0.00000,0.064592,0.088502,0.121478,-0.021523,-0.052224,-0.047203,0.162134
3,0.088705,0.198159,0.079467,,0.144913,0.000000,0.141889,0.000000,0.102299,0.218345,...,0.113266,0.000000,0.00000,0.077366,0.166276,0.171365,0.144167,-0.122960,0.075672,0.088610
4,0.000000,0.000000,-0.021573,0.144913,,-0.131629,-0.033617,0.000000,0.000000,0.000000,...,0.000000,-0.462235,0.00000,-0.021102,0.104730,0.000000,0.000000,-0.094166,-0.078989,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.121478,0.171365,0.000000,-0.153094,0.037656,0.000000,0.000000,0.000000,...,0.000000,-0.177749,0.00000,0.469805,0.160313,,0.000000,-0.110853,-0.090398,0.000000
96,0.000000,0.000000,-0.021523,0.144167,0.000000,-0.131012,-0.033496,0.000000,0.000000,0.000000,...,0.000000,-0.460713,0.00000,-0.021054,0.104339,0.000000,,-0.093693,-0.078656,0.000000
97,-0.058202,-0.127604,-0.052224,-0.122960,-0.094166,-0.205802,-0.092249,0.485475,-0.066963,-0.140122,...,-0.074002,-0.078783,0.19323,-0.050861,-0.107655,-0.110853,-0.093693,,-0.156890,-0.058140
98,-0.052031,-0.101230,-0.047203,0.075672,-0.078989,-0.375839,-0.077636,0.000000,-0.058920,-0.108948,...,-0.064299,-0.100117,0.00000,-0.046087,-0.184681,-0.090398,-0.078656,-0.156890,,-0.051982


np.nanmean(norm_err)=np.float64(-0.02113555150885352)
    np.nanmean(np.abs(norm_err))=np.float64(0.0955111037364902)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0389600035962423)
    


100%|██████████| 100/100 [00:00<00:00, 33251.18it/s]
100%|██████████| 100/100 [00:00<00:00, 441.57it/s]
5956it [00:00, 635850.00it/s]
100%|██████████| 100/100 [00:00<00:00, 287675.17it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.627333e-03,-1.415992e-07,-1.438983e-07,-4.319927e-02,-3.333993e-07,-1.772665e-07,-4.754019e-07,-1.494741e-07,-4.448051e-07,...,-5.679309e-07,-2.600809e-07,-1.748047e-07,-2.906164e-07,-1.760914e-07,-2.780383e-07,-1.012064e-02,-3.395629e-07,1.156044e-02,1.944437e-02
1,-1.627333e-03,,5.927631e-03,-1.473558e-03,-4.304548e-07,2.980572e-02,7.989742e-03,1.231307e-02,2.575331e-02,4.529807e-02,...,-3.946863e-03,5.320787e-03,5.864229e-03,-1.492133e-03,-7.901816e-03,1.195777e-02,1.231121e-01,-1.827701e-03,-7.341155e-07,-3.802891e-07
2,-1.415992e-07,5.927631e-03,,0.000000e+00,1.818924e-03,-1.531413e-07,0.000000e+00,-2.125261e-07,0.000000e+00,1.627927e-03,...,-2.409232e-07,-1.246197e-07,0.000000e+00,-1.331368e-07,0.000000e+00,0.000000e+00,6.026000e-03,-1.533961e-07,-5.075599e-03,-8.922322e-03
3,-1.438983e-07,-1.473558e-03,0.000000e+00,,-1.501783e-02,-1.534890e-07,0.000000e+00,-2.117173e-07,0.000000e+00,-1.994947e-07,...,-2.793986e-07,-1.218587e-07,0.000000e+00,-1.558799e-07,0.000000e+00,0.000000e+00,-9.145575e-03,-1.627579e-07,1.030576e-02,8.980151e-03
4,-4.319927e-02,-4.304548e-07,1.818924e-03,-1.501783e-02,,2.392944e-03,2.677536e-03,5.113668e-03,1.985649e-03,4.320354e-03,...,-7.053330e-02,1.593028e-03,2.608689e-03,-1.525424e-02,-2.637492e-03,8.346865e-03,-1.851118e-02,-1.984578e-02,-6.106137e-07,-2.396652e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.780383e-07,1.195777e-02,0.000000e+00,0.000000e+00,8.346865e-03,-3.254623e-07,0.000000e+00,-7.808190e-07,0.000000e+00,-3.720275e-02,...,-1.459550e-06,-2.099229e-07,0.000000e+00,-2.471876e-07,0.000000e+00,,1.250770e-02,-3.274917e-07,-6.186994e-02,-6.291481e-03
96,-1.012064e-02,1.231121e-01,6.026000e-03,-9.145575e-03,-1.851118e-02,3.042823e-02,8.169496e-03,1.274526e-02,2.621673e-02,4.675167e-02,...,-2.537002e-02,5.399915e-03,5.993460e-03,-9.262281e-03,-1.989029e-03,1.250770e-02,,-1.139874e-02,-7.591594e-07,-3.869009e-07
97,-3.395629e-07,-1.827701e-03,-1.533961e-07,-1.627579e-07,-1.984578e-02,-3.665891e-07,-1.961511e-07,-5.458734e-07,-1.626808e-07,-5.059145e-07,...,-7.362965e-07,-2.798455e-07,-1.931414e-07,-3.291279e-07,-1.947134e-07,-3.274917e-07,-1.139874e-02,,1.325873e-02,1.050865e-02
98,1.156044e-02,-7.341155e-07,-5.075599e-03,1.030576e-02,-6.106137e-07,-6.514054e-03,-7.199827e-03,-1.246632e-02,-5.500531e-03,-1.229211e-01,...,3.689261e-02,-4.490522e-03,-2.373453e-02,1.045375e-02,2.336540e-03,-6.186994e-02,-7.591594e-07,1.325873e-02,,5.425997e-03


np.nanmean(norm_err)=np.float64(-0.002427246360398379)
    np.nanmean(np.abs(norm_err))=np.float64(0.00964928942605245)
    np.nanmedian(norm_err)=np.float64(-2.556628623949844e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(8.502116750523517e-07)
    


In [7]:
# Load the "tail" genome dataset and report reconstruction consistency
# for each dstream algorithm in turn.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # Continuation lines of the f-string keep their four-space indent so the
    # printed summary is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35907.06it/s]
100%|██████████| 100/100 [00:00<00:00, 476.91it/s]
5937it [00:00, 692691.95it/s]
100%|██████████| 100/100 [00:00<00:00, 386215.84it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 35842.63it/s]
100%|██████████| 100/100 [00:00<00:00, 1028.75it/s]
5948it [00:00, 692031.07it/s]
100%|██████████| 100/100 [00:00<00:00, 391991.03it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.038175e-07,-1.067537e-07,-1.066715e-07,-2.332640e-07,-1.037965e-07,-1.036938e-07,-1.039684e-07,-2.134387e-07,-2.078967e-07,...,-2.079644e-07,-2.073337e-07,-2.075778e-07,-1.129234e-07,-2.195148e-07,-1.038406e-07,-2.074977e-07,-1.097406e-07,-2.137757e-07,-1.067487e-07
1,-1.038175e-07,,0.000000e+00,0.000000e+00,-1.037677e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036606e-07,-1.066936e-07,...,-1.165084e-07,-1.126828e-07,-1.128270e-07,0.000000e+00,-1.036308e-07,0.000000e+00,-1.064834e-07,0.000000e+00,-1.038196e-07,0.000000e+00
2,-1.067537e-07,0.000000e+00,,0.000000e+00,-1.067011e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.128967e-07,-1.038235e-07,...,-1.038573e-07,-1.035426e-07,-1.036644e-07,0.000000e+00,-1.065562e-07,0.000000e+00,-1.036245e-07,0.000000e+00,-1.130853e-07,0.000000e+00
3,-1.066715e-07,0.000000e+00,0.000000e+00,,-1.066190e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.095649e-07,-1.037458e-07,...,-1.037795e-07,-1.034653e-07,-1.035869e-07,0.000000e+00,-1.064744e-07,0.000000e+00,-1.035471e-07,0.000000e+00,-1.097425e-07,0.000000e+00
4,-2.332640e-07,-1.037677e-07,-1.067011e-07,-1.066190e-07,,-1.037467e-07,-1.036441e-07,-1.039185e-07,-2.133334e-07,-2.077968e-07,...,-2.078645e-07,-2.072344e-07,-2.074783e-07,-1.128645e-07,-2.194035e-07,-1.037908e-07,-2.073983e-07,-1.096849e-07,-2.136702e-07,-1.066961e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.038406e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.037908e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036836e-07,-1.097894e-07,...,-1.067537e-07,-1.064213e-07,-1.065499e-07,0.000000e+00,-1.036538e-07,,-1.162445e-07,0.000000e+00,-1.038427e-07,0.000000e+00
96,-2.074977e-07,-1.064834e-07,-1.036245e-07,-1.035471e-07,-2.073983e-07,-1.095177e-07,-1.094034e-07,-1.097091e-07,-2.071842e-07,-2.193736e-07,...,-2.133134e-07,-2.126498e-07,-2.129066e-07,-1.034905e-07,-2.071247e-07,-1.162445e-07,,-1.035474e-07,-2.075018e-07,-1.036198e-07
97,-1.097406e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.096849e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.065061e-07,-1.037461e-07,...,-1.037798e-07,-1.034656e-07,-1.035872e-07,0.000000e+00,-1.127699e-07,0.000000e+00,-1.035474e-07,,-1.066740e-07,0.000000e+00
98,-2.137757e-07,-1.038196e-07,-1.130853e-07,-1.097425e-07,-2.136702e-07,-1.037985e-07,-1.036958e-07,-1.039704e-07,-2.481554e-07,-2.079007e-07,...,-2.079685e-07,-2.073377e-07,-2.075819e-07,-1.066137e-07,-2.133798e-07,-1.038427e-07,-2.075018e-07,-1.066740e-07,,-1.130797e-07


np.nanmean(norm_err)=np.float64(-1.0250666472219867e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0250666472219867e-07)
    np.nanmedian(norm_err)=np.float64(-1.038511592726407e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.038511592726407e-07)
    
