In [1]:
# Load the `watermark` IPython extension so the `%watermark` magic used in a
# later cell is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance for this run: timestamp, Python/IPython versions, machine
# details, and the versions of the imported packages.
%watermark -diwmuv -iv


Last updated: 2025-09-28T00:23:29.212885+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3
hstrat                            : 1.20.10
numpy                             : 2.1.2
downstream                        : 1.14.3

Watermark: 2.4.3



In [4]:
# Date-prefixed slug naming this notebook's output subdirectory.
# NOTE(review): presumably consumed by teeplot when saving figures; no use of
# it appears in the cells visible here -- confirm against the rest of the file.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance table of a reconstructed tree.

    The tree is converted to dendropy through `apc.RosettaTree`, and its
    phylogenetic distance matrix is loaded into a DataFrame. Rows and columns
    are sorted by label so that tables built from different trees over the
    same taxa can be compared element-for-element.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private dendropy attribute; it is read here
    # exactly as before, but may change between dendropy releases.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
) -> np.ndarray:
    """Compare pairwise distances from two tree reconstruction routines.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled, and a tree is reconstructed twice: once with
    `hstrat.build_tree_trie` ("naive") and once with
    `hstrat.build_tree_searchtable` ("shortcut"). The pairwise phylogenetic
    distances of both trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column plus the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`, each holding a single
        distinct value across the sampled rows. The frame is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with only the
        `dstream` module in scope (e.g. "dstream.tilted_algo").
    sample_size : int, default 100
        Maximum number of genomes to compare; clamped to the number of rows
        available. The default keeps the problem at a size dendropy and the
        naive reconstruction can handle.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / mean(naive, shortcut)` with rows
        and columns in sorted taxon-label order. Entries where both distances
        are zero (e.g. the diagonal) are NaN.

    Raises
    ------
    ValueError
        If a configuration column holds zero or several distinct values, or
        if the two reconstructions do not yield identically labelled distance
        tables.

    Notes
    -----
    Row sampling and storage shuffling are unseeded here; seed NumPy's global
    RNG (`np.random.seed`) before calling if repeatable results are needed.
    """
    # sample to size dendropy/naive can handle; `sample` returns a new frame,
    # so the caller's data is left untouched
    num_taxa = min(sample_size, len(raw_genome_df))
    genome_df = raw_genome_df.sample(num_taxa).assign(
        taxon_label=np.arange(num_taxa).astype(str),
    )
    taxon_labels = genome_df["taxon_label"]

    # `.unique().item()` doubles as a check that each setting is uniform
    # across the sampled genomes (it raises ValueError otherwise)
    surf_kwargs = {
        column: genome_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(review): `eval` runs arbitrary code; the namespace is limited to
    # `dstream`, but only trusted, hard-coded strings should be passed here.
    surf_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        # in-place shuffle of each surface's underlying storage
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=taxon_labels,
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=taxon_labels,
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The arithmetic below is positional, so make sure both tables describe
    # the same taxa in the same order before comparing them.
    labels_match = naive_dist.index.equals(
        shortcut_dist.index
    ) and naive_dist.columns.equals(shortcut_dist.columns)
    if not labels_match:
        raise ValueError(
            "naive and shortcut distance matrices have mismatched taxon labels"
        )

    naive = naive_dist.values
    shortcut = shortcut_dist.values
    # 0/0 (identical taxa, e.g. the diagonal) is expected and intentionally
    # left as NaN for nan-aware reductions downstream; silence the warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Genome records for the "sample" dataset, fetched from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same consistency report for each dstream algorithm in turn.
for algo_expr in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_expr)
    display(pd.DataFrame(norm_err))
    # nan-aware summaries: entries where both distances are zero are NaN
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10837.71it/s]
100%|██████████| 100/100 [00:00<00:00, 372.54it/s]
6136it [00:00, 635557.10it/s]
100%|██████████| 100/100 [00:00<00:00, 234318.66it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,-0.213516,0.000000,0.000000,0.072418,0.000000,-0.100499,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,-0.140857,-0.771471,-1.056360,0.000000,0.000000
1,0.000000,,0.056581,0.000000,0.000000,-0.099931,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.028914,0.026975,0.038118,0.000000,0.000000
2,-0.213516,0.056581,,-0.051096,-0.152903,0.000000,-0.162701,-0.880259,-0.043806,-0.044456,...,0.382588,-0.130668,-0.054868,0.467510,-0.159225,0.000000,0.194428,0.263944,-0.217990,-0.025694
3,0.000000,0.000000,-0.051096,,0.000000,0.084004,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,-0.027411,-0.025662,-0.035547,0.000000,0.000000
4,0.000000,0.000000,-0.152903,0.000000,,0.070687,0.000000,-0.063349,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,-0.102445,-0.097706,-0.122389,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.140857,0.028914,0.000000,-0.027411,-0.102445,0.000000,-0.106753,-0.249470,-0.025164,-0.025377,...,0.198818,-0.091961,-0.028460,0.248879,-0.083910,,0.582643,0.752280,-0.128064,-0.017911
96,-0.771471,0.026975,0.194428,-0.025662,-0.097706,0.000000,-0.101616,-0.105085,-0.023683,-0.023871,...,0.185705,-0.088123,-0.026580,0.232870,-0.078455,0.582643,,0.000000,-0.120742,-0.017148
97,-1.056360,0.038118,0.263944,-0.035547,-0.122389,0.000000,-0.128587,-0.125501,-0.031859,-0.032201,...,0.260637,-0.107717,-0.037333,0.323607,-0.109482,0.752280,0.000000,,-0.160824,-0.021061
98,0.000000,0.000000,-0.217990,0.000000,0.000000,0.110853,0.000000,-0.079170,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,-0.128064,-0.120742,-0.160824,,0.000000


np.nanmean(norm_err)=np.float64(-0.03551596853082712)
    np.nanmean(np.abs(norm_err))=np.float64(0.1072746533249554)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.019600198439765917)
    


100%|██████████| 100/100 [00:00<00:00, 33303.99it/s]
100%|██████████| 100/100 [00:00<00:00, 423.85it/s]
5974it [00:00, 614543.25it/s]
100%|██████████| 100/100 [00:00<00:00, 307275.02it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,8.967097e-03,9.804530e-03,0.005883,9.610858e-03,5.749081e-03,7.356076e-02,0.000000e+00,8.820346e-03,0.006696,...,9.665315e-02,1.398493e-02,0.000000e+00,1.309678e-02,6.252803e-03,1.494023e-02,8.219510e-03,-1.100609e-02,7.578536e-03,0.009914
1,0.008967,,-1.619169e-07,0.007499,-1.584677e-07,1.210213e-02,-1.981969e-07,1.228559e-03,3.268857e-02,0.012086,...,-5.306775e-07,-4.782523e-07,4.705121e-03,-4.445396e-07,8.002655e-03,-2.575378e-07,8.553116e-02,3.993583e-03,-2.458439e-07,0.037120
2,0.009805,-1.619169e-07,,0.008289,7.204149e-02,1.752269e-02,0.000000e+00,1.357623e-03,1.805697e-02,0.017497,...,-3.178744e-07,-3.833253e-07,5.740675e-03,-2.601864e-07,8.908745e-03,0.000000e+00,1.995608e-02,4.715634e-03,5.399081e-02,0.020802
3,0.005883,7.498982e-03,8.288987e-03,,8.104757e-03,5.253438e-03,1.031879e-02,-1.318120e-03,-4.987497e-02,-0.062938,...,1.411101e-02,1.250576e-02,2.630424e-03,1.156964e-02,-3.780968e-07,1.353867e-02,7.836079e-03,2.196374e-03,6.226099e-03,-0.057228
4,0.009611,-1.584677e-07,7.204149e-02,0.008105,,1.721304e-02,0.000000e+00,1.327564e-03,1.762128e-02,0.017189,...,-3.048481e-07,1.135123e-01,5.480446e-03,-5.067869e-02,8.696289e-03,-1.968672e-02,1.942527e-02,4.538607e-03,-1.692020e-07,0.020226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.014940,-2.575378e-07,0.000000e+00,0.013539,-1.968672e-02,1.051919e-01,0.000000e+00,2.212000e-03,3.122809e-02,0.091241,...,-1.172591e-06,-8.109179e-07,3.021144e-02,-6.735011e-07,1.527423e-02,,3.738041e-02,1.409144e-02,-1.271636e-02,0.040463
96,0.008220,8.553116e-02,1.995608e-02,0.007836,1.942527e-02,-1.465545e-02,1.733703e-02,-1.904782e-07,4.008200e-02,0.013227,...,2.620559e-02,3.355967e-02,-4.672835e-07,3.027713e-02,8.505548e-03,3.738041e-02,,-3.629223e-07,1.427182e-02,0.047302
97,-0.011006,3.993583e-03,4.715634e-03,0.002196,4.538607e-03,-1.877941e-07,7.118336e-03,0.000000e+00,2.499889e-03,0.001534,...,1.614768e-02,1.118989e-02,0.000000e+00,9.184280e-03,2.468712e-03,1.409144e-02,-3.629223e-07,,3.006302e-03,0.003200
98,0.007579,-2.458439e-07,5.399081e-02,0.006226,-1.692020e-07,1.388265e-02,-1.566943e-07,1.020633e-03,1.327348e-02,0.013867,...,-3.917626e-07,7.434727e-02,3.392507e-03,-3.446679e-02,6.569383e-03,-1.271636e-02,1.427182e-02,3.006302e-03,,0.014699


np.nanmean(norm_err)=np.float64(0.001930944173278736)
    np.nanmean(np.abs(norm_err))=np.float64(0.015810264991934175)
    np.nanmedian(norm_err)=np.float64(0.0012222308238368802)
    np.nanmedian(np.abs(norm_err))=np.float64(0.003488676501270476)
    


In [7]:
# Genome records for the "tail" dataset, fetched from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same consistency report for each dstream algorithm in turn.
for algo_expr in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_expr)
    display(pd.DataFrame(norm_err))
    # nan-aware summaries: entries where both distances are zero are NaN
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34269.99it/s]
100%|██████████| 100/100 [00:00<00:00, 471.45it/s]
5965it [00:00, 683075.96it/s]
100%|██████████| 100/100 [00:00<00:00, 406424.81it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33193.29it/s]
100%|██████████| 100/100 [00:00<00:00, 473.10it/s]
5959it [00:00, 608019.50it/s]
100%|██████████| 100/100 [00:00<00:00, 396437.05it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.263323e-07,-1.038638e-07,-2.130126e-07,-2.136572e-07,-1.038666e-07,-1.038676e-07,-2.132515e-07,-2.077838e-07,-1.037351e-07,...,-1.066027e-07,-2.072851e-07,-1.066885e-07,-2.076586e-07,-2.077386e-07,-2.073551e-07,-1.240087e-07,-1.036137e-07,-1.067717e-07,-1.068003e-07
1,-2.263323e-07,,-1.039258e-07,-2.131429e-07,-2.137883e-07,-1.039286e-07,-1.039296e-07,-2.133821e-07,-2.079078e-07,-1.037969e-07,...,-1.066680e-07,-2.074084e-07,-1.067539e-07,-2.077824e-07,-2.078625e-07,-2.074786e-07,-1.130628e-07,-1.036754e-07,-1.068372e-07,-1.068658e-07
2,-1.038638e-07,-1.039258e-07,,-1.036214e-07,-1.039265e-07,0.000000e+00,0.000000e+00,-1.037345e-07,-1.068061e-07,0.000000e+00,...,0.000000e+00,-1.096037e-07,0.000000e+00,-1.098126e-07,-1.067822e-07,-1.065795e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
3,-2.130126e-07,-2.131429e-07,-1.036214e-07,,-2.326426e-07,-1.036242e-07,-1.036252e-07,-2.188429e-07,-2.072987e-07,-1.034933e-07,...,-1.126271e-07,-2.068023e-07,-1.094875e-07,-2.071741e-07,-2.072537e-07,-2.068720e-07,-1.064147e-07,-1.033725e-07,-1.162538e-07,-1.162877e-07
4,-2.136572e-07,-2.137883e-07,-1.039265e-07,-2.326426e-07,,-1.039292e-07,-1.039302e-07,-2.195233e-07,-2.079092e-07,-1.037976e-07,...,-1.129876e-07,-2.074098e-07,-1.098282e-07,-2.077838e-07,-2.078639e-07,-2.074800e-07,-1.067365e-07,-1.036761e-07,-1.428433e-07,-1.242739e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.073551e-07,-2.074786e-07,-1.065795e-07,-2.068720e-07,-2.074800e-07,-1.163335e-07,-1.128919e-07,-2.070973e-07,-2.193483e-07,-1.064440e-07,...,-1.035269e-07,-2.126930e-07,-1.036078e-07,-2.130863e-07,-2.326738e-07,,-1.035908e-07,-1.093642e-07,-1.036863e-07,-1.037133e-07
96,-1.240087e-07,-1.130628e-07,0.000000e+00,-1.064147e-07,-1.067365e-07,0.000000e+00,0.000000e+00,-1.065340e-07,-1.038048e-07,0.000000e+00,...,0.000000e+00,-1.035558e-07,0.000000e+00,-1.037423e-07,-1.037822e-07,-1.035908e-07,,0.000000e+00,0.000000e+00,0.000000e+00
97,-1.036137e-07,-1.036754e-07,0.000000e+00,-1.033725e-07,-1.036761e-07,0.000000e+00,0.000000e+00,-1.034850e-07,-1.238346e-07,0.000000e+00,...,0.000000e+00,-1.062793e-07,0.000000e+00,-1.064757e-07,-1.095776e-07,-1.093642e-07,0.000000e+00,,0.000000e+00,0.000000e+00
98,-1.067717e-07,-1.068372e-07,0.000000e+00,-1.162538e-07,-1.428433e-07,0.000000e+00,0.000000e+00,-1.097016e-07,-1.039007e-07,0.000000e+00,...,0.000000e+00,-1.036513e-07,0.000000e+00,-1.038381e-07,-1.038781e-07,-1.036863e-07,0.000000e+00,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.1094451264530143e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1094451264530143e-07)
    np.nanmedian(norm_err)=np.float64(-1.0640243112533744e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0640243112533744e-07)
    
