In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from hstrat import _auxiliary_lib as hstrat_aux
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance for this run: timestamp, Python/IPython versions,
# machine details, and the versions of the imported packages.
%watermark -diwmuv -iv


Last updated: 2025-05-09T18:54:11.196665+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

hstrat                            : 1.20.10
downstream                        : 1.14.3
numpy                             : 2.1.2
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Subdirectory name, presumably for teeplot figure output.
# NOTE(review): no cell visible here reads this variable -- confirm or remove.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

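## Prep Data

Define a helper that reconstructs a phylogeny from a sample of genomes with both `hstrat.build_tree_trie` and `hstrat.build_tree_searchtable`, then returns the normalized difference between the two trees' pairwise distance matrices.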


In [5]:
def _pairwise_distance_matrix(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance table for a tree.

    The tree is converted to dendropy through `apc.RosettaTree`; both axes of
    the resulting table are sorted by label so that tables built from
    different trees over the same labels line up element-wise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private dendropy attribute; presumably a
    # nested mapping keyed by taxon label -- confirm against dendropy.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_samples: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and searchtable tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled in place, and a tree is reconstructed from the
    resulting population twice: once with `hstrat.build_tree_trie` and once
    with `hstrat.build_tree_searchtable`. The pairwise distance matrices of
    the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column plus `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth` columns, each holding
        a single unique value among the sampled rows.
    dstream_algo : str
        Expression naming the algorithm, evaluated with only `dstream` in
        scope (e.g. `"dstream.tilted_algo"`).
    n_samples : int, default 100
        Number of genomes to sample; capped at `len(raw_genome_df)`. Kept
        small so that the dendropy/naive reconstruction stays tractable.
    seed : int, optional
        Seed for the row sampling and the storage shuffle. If None, both use
        NumPy's global random state, so results vary between calls.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / ((naive + shortcut) / 2)` for
        each taxon pair, with rows and columns ordered by sorted taxon label.
        Entries where both distances are zero (e.g. the diagonal) are NaN.

    Raises
    ------
    ValueError
        If a configuration column holds more than one distinct value, or if
        the two reconstructions do not cover the same taxon labels.
    """
    # sample to a size dendropy/naive can handle
    n_samples = min(n_samples, len(raw_genome_df))
    genome_df = raw_genome_df.sample(n_samples, random_state=seed)
    genome_df["taxon_label"] = np.arange(len(genome_df)).astype(str)

    def unique_config_value(column: str):
        # `.item()` raises unless exactly one distinct value is present
        return genome_df[column].unique().item()

    surface_kwargs = dict(
        # NOTE: `eval` runs with only `dstream` supplied as a global; pass
        # trusted expression strings only.
        dstream_algo=eval(dstream_algo, {"dstream": dstream}),
        dstream_S=unique_config_value("dstream_S"),
        dstream_storage_bitoffset=unique_config_value(
            "dstream_storage_bitoffset"
        ),
        dstream_storage_bitwidth=unique_config_value(
            "dstream_storage_bitwidth"
        ),
        dstream_T_bitoffset=unique_config_value("dstream_T_bitoffset"),
        dstream_T_bitwidth=unique_config_value("dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]

    if seed is None:
        shuffle = np.random.shuffle  # global RNG, matching unseeded sampling
    else:
        shuffle = np.random.default_rng(seed).shuffle
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_matrix(naive_df)
    shortcut_dist = _pairwise_distance_matrix(shortcut_df)

    # the element-wise comparison below is positional, so labels must agree
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and searchtable distance matrices have differing labels"
        )

    naive_values = naive_dist.values
    shortcut_values = shortcut_dist.values
    # self-distances are 0 / 0; let them become NaN without a RuntimeWarning
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_values - shortcut_values) / (
            naive_values / 2 + shortcut_values / 2
        )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same comparison for each algorithm instead of repeating the stanza.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # NaN-aware reductions skip the undefined self-comparison entries.
    # The f-string's continuation lines keep their original leading
    # whitespace so the printed text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10589.54it/s]
100%|██████████| 100/100 [00:00<00:00, 394.30it/s]
6147it [00:00, 649299.55it/s]
100%|██████████| 100/100 [00:00<00:00, 225986.21it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.680912,0.419802,0.000000,-0.577712,-0.047193,0.045984,-0.556484,0.026420,0.129905,...,0.000000,-0.428095,-0.357469,0.113148,-0.051443,0.050910,-0.460198,-0.767247,0.083426,0.077410
1,0.680912,,0.000000,-0.034942,0.085571,0.000000,0.000000,0.099668,0.000000,-0.158685,...,0.726943,0.078567,0.066509,-0.129419,0.000000,-0.059913,0.069983,0.109077,1.052516,0.000000
2,0.419802,0.000000,,-0.092384,0.129335,0.000000,0.000000,0.164502,0.000000,-0.539624,...,0.456644,0.113978,0.090243,-0.305047,0.000000,-0.078514,0.096760,0.191811,0.349975,0.000000
3,0.000000,-0.034942,-0.092384,,0.000000,0.062184,-0.014770,0.000000,-0.168260,-0.116509,...,0.000000,0.000000,0.000000,-0.070012,0.074321,-0.017172,0.000000,0.000000,0.000000,-0.035746
4,-0.577712,0.085571,0.129335,0.000000,,-0.052282,0.077414,0.000000,0.031583,0.138240,...,-0.616454,0.000000,0.000000,0.117640,-0.057549,0.380527,0.000000,0.000000,-0.409093,0.138083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.050910,-0.059913,-0.078514,-0.017172,0.380527,0.000000,0.021702,0.678070,0.000000,-0.120207,...,0.052626,0.560006,0.842542,-0.259980,0.000000,,0.326609,0.452857,0.047110,0.033128
96,-0.460198,0.069983,0.096760,0.000000,0.000000,-0.043418,0.068536,0.000000,0.023054,0.101660,...,-0.484450,0.000000,0.000000,0.090062,-0.046991,0.326609,,0.000000,-0.337265,0.112166
97,-0.767247,0.109077,0.191811,0.000000,0.000000,-0.065148,0.088507,0.000000,0.049190,0.212072,...,-0.837118,0.000000,0.000000,0.167166,-0.073536,0.452857,0.000000,,-0.515264,0.177840
98,0.083426,1.052516,0.349975,0.000000,-0.409093,-0.042434,0.042861,-0.830093,0.022233,0.109604,...,0.088764,-0.642606,-0.707489,0.097430,-0.045839,0.047110,-0.337265,-0.515264,,0.068954


np.nanmean(norm_err)=np.float64(0.00478605578327123)
    np.nanmean(np.abs(norm_err))=np.float64(0.09605732974529671)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.04157329267823881)
    


100%|██████████| 100/100 [00:00<00:00, 28978.20it/s]
100%|██████████| 100/100 [00:00<00:00, 431.33it/s]
5974it [00:00, 541709.48it/s]
100%|██████████| 100/100 [00:00<00:00, 254508.74it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,5.799650e-02,0.000000e+00,0.000000e+00,-2.749248e-07,0.000000e+00,-1.978921e-07,-3.626281e-07,-1.677729e-07,-2.348632e-07,...,0.000000e+00,-3.510464e-07,-2.592992e-07,-2.448887e-07,-3.439312e-07,-3.629294e-07,-2.144328e-07,-2.411483e-07,0.000000e+00,-4.943983e-07
1,5.799650e-02,,-5.184314e-02,4.369760e-02,-6.867177e-07,-2.324362e-07,-4.620746e-07,-9.839164e-07,-3.134701e-03,-5.661160e-07,...,-2.054097e-07,-3.933427e-02,-6.386522e-07,-5.954977e-07,-9.163362e-07,-9.850261e-07,-5.078059e-07,-5.844752e-07,-4.724254e-07,-1.541180e-06
2,0.000000e+00,-5.184314e-02,,0.000000e+00,-4.852257e-07,0.000000e+00,-2.876211e-07,-8.466078e-07,-2.289591e-07,-3.729483e-07,...,0.000000e+00,-1.040725e-06,-4.385799e-07,-3.988789e-07,-7.512603e-07,-8.482523e-07,-3.239386e-07,-3.890499e-07,0.000000e+00,-2.241155e-06
3,0.000000e+00,4.369760e-02,0.000000e+00,,-2.148597e-07,0.000000e+00,-1.647419e-07,-2.649366e-07,-1.433223e-07,-1.895864e-07,...,0.000000e+00,-2.587009e-07,-2.051960e-07,-1.960657e-07,-2.548160e-07,-2.650974e-07,-1.760468e-07,-1.936608e-07,0.000000e+00,-3.290012e-07
4,-2.749248e-07,-6.867177e-07,-4.852257e-07,-2.148597e-07,,-2.116057e-07,-4.223359e-07,-8.142636e-07,-3.524073e-07,-5.055153e-07,...,-1.895526e-07,-7.802019e-07,-5.651542e-07,-5.288139e-07,-7.674247e-07,-8.204573e-07,-4.585024e-07,-5.201037e-07,-3.961967e-07,-1.173042e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-3.629294e-07,-9.850261e-07,-8.482523e-07,-2.650974e-07,-8.204573e-07,-2.601613e-07,-5.305025e-02,-1.270483e-06,-4.172535e-07,-6.505425e-07,...,-3.063247e-02,-1.189459e-06,-5.155995e-02,-6.896440e-07,-1.160014e-06,,-5.747085e-07,-6.749039e-07,-6.278077e-07,-2.531008e-06
96,-2.144328e-07,-5.078059e-07,-3.239386e-07,-1.760468e-07,-4.585024e-07,-1.800050e-07,-3.461449e-07,-6.210596e-07,-2.984469e-07,-4.178863e-07,...,-1.582794e-07,-5.571734e-07,-4.365652e-07,-4.336813e-07,-5.556187e-07,-5.747085e-07,,-4.278057e-07,-2.803981e-07,-7.279798e-07
97,-2.411483e-07,-5.844752e-07,-3.890499e-07,-1.936608e-07,-5.201037e-07,-2.183401e-07,-3.801351e-07,-7.222305e-07,-3.233777e-07,-4.832927e-07,...,-1.723751e-07,-6.508495e-07,-4.920561e-07,-5.045448e-07,-6.487291e-07,-6.749039e-07,-4.278057e-07,,-3.278991e-07,-8.965840e-07
98,0.000000e+00,-4.724254e-07,0.000000e+00,0.000000e+00,-3.961967e-07,0.000000e+00,8.457391e-03,-6.022143e-07,-2.049356e-07,-3.163866e-07,...,7.423020e-03,-5.656843e-07,1.223721e-02,-3.348535e-07,-5.523487e-07,-6.278077e-07,-2.803981e-07,-3.278991e-07,,-1.140859e-06


np.nanmean(norm_err)=np.float64(0.0007044518072666919)
    np.nanmean(np.abs(norm_err))=np.float64(0.006095795445926595)
    np.nanmedian(norm_err)=np.float64(-2.2799152555702175e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(3.0542726086977617e-07)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same comparison for each algorithm instead of repeating the stanza.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # NaN-aware reductions skip the undefined self-comparison entries.
    # The f-string's continuation lines keep their original leading
    # whitespace so the printed text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 33495.48it/s]
100%|██████████| 100/100 [00:00<00:00, 440.25it/s]
5945it [00:00, 657242.87it/s]
100%|██████████| 100/100 [00:00<00:00, 258429.08it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 29137.23it/s]
100%|██████████| 100/100 [00:00<00:00, 1034.19it/s]
5969it [00:00, 640678.67it/s]
100%|██████████| 100/100 [00:00<00:00, 283207.56it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.034169e-07,0.000000e+00,-1.062621e-07,0.000000e+00,0.000000e+00,-1.033622e-07,0.000000e+00,0.000000e+00,-1.033419e-07,...,-1.062391e-07,-1.036233e-07,0.000000e+00,-1.036074e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.034122e-07,-1.126841e-07,0.000000e+00
1,-1.034169e-07,,-1.128180e-07,-2.069853e-07,-1.036981e-07,-1.197982e-07,-2.321239e-07,-1.035711e-07,-1.163071e-07,-2.126660e-07,...,-2.069417e-07,-2.132617e-07,-1.065163e-07,-2.132282e-07,-1.036281e-07,-1.035772e-07,-1.160290e-07,-2.322500e-07,-2.072435e-07,-1.066501e-07
2,0.000000e+00,-1.128180e-07,,-1.035922e-07,0.000000e+00,0.000000e+00,-1.127530e-07,0.000000e+00,0.000000e+00,-1.064381e-07,...,-1.035704e-07,-1.067366e-07,0.000000e+00,-1.067198e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.128125e-07,-1.037215e-07,0.000000e+00
3,-1.062621e-07,-2.069853e-07,-1.035922e-07,,-1.280146e-07,-1.035083e-07,-2.068759e-07,-1.064249e-07,-1.036326e-07,-2.068352e-07,...,-2.187345e-07,-2.073987e-07,-1.035910e-07,-2.073670e-07,-1.064851e-07,-1.161536e-07,-1.034118e-07,-2.069760e-07,-2.129568e-07,-1.037175e-07
4,0.000000e+00,-1.036981e-07,0.000000e+00,-1.280146e-07,,0.000000e+00,-1.036431e-07,0.000000e+00,0.000000e+00,-1.036227e-07,...,-1.095967e-07,-1.039056e-07,0.000000e+00,-1.038897e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036934e-07,-1.066958e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-1.035772e-07,0.000000e+00,-1.161536e-07,0.000000e+00,0.000000e+00,-1.035224e-07,0.000000e+00,0.000000e+00,-1.035021e-07,...,-1.094617e-07,-1.037843e-07,0.000000e+00,-1.037684e-07,0.000000e+00,,0.000000e+00,-1.035726e-07,-1.065679e-07,0.000000e+00
96,0.000000e+00,-1.160290e-07,0.000000e+00,-1.034118e-07,0.000000e+00,0.000000e+00,-1.195958e-07,0.000000e+00,0.000000e+00,-1.062476e-07,...,-1.033900e-07,-1.065450e-07,0.000000e+00,-1.065283e-07,0.000000e+00,0.000000e+00,,-1.276726e-07,-1.035407e-07,0.000000e+00
97,-1.034122e-07,-2.322500e-07,-1.128125e-07,-2.069760e-07,-1.036934e-07,-1.161447e-07,-2.393954e-07,-1.035664e-07,-1.199584e-07,-2.126561e-07,...,-2.069324e-07,-2.132518e-07,-1.065114e-07,-2.132183e-07,-1.036234e-07,-1.035726e-07,-1.276726e-07,,-2.072341e-07,-1.066451e-07
98,-1.126841e-07,-2.072435e-07,-1.037215e-07,-2.129568e-07,-1.066958e-07,-1.036374e-07,-2.071337e-07,-1.096237e-07,-1.037620e-07,-2.070930e-07,...,-2.129107e-07,-2.076579e-07,-1.037204e-07,-2.076261e-07,-1.096875e-07,-1.065679e-07,-1.035407e-07,-2.072341e-07,,-1.038472e-07


np.nanmean(norm_err)=np.float64(-1.0658259913314038e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0658259913314038e-07)
    np.nanmedian(norm_err)=np.float64(-1.0509848016076969e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0509848016076969e-07)
    
