In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-08-18T21:39:50.362101+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

numpy                             : 2.1.2
pandas                            : 2.2.3
hstrat                            : 1.20.10
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> pd.DataFrame:
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    # raw_genome_df = raw_genome_df[raw_genome_df["layer"] > 1700].sample(20).copy()
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
        dstream_S=raw_genome_df["dstream_S"].unique().item(),
        dstream_storage_bitoffset=raw_genome_df["dstream_storage_bitoffset"]
        .unique()
        .item(),
        dstream_storage_bitwidth=raw_genome_df["dstream_storage_bitwidth"]
        .unique()
        .item(),
        dstream_T_bitoffset=raw_genome_df["dstream_T_bitoffset"]
        .unique()
        .item(),
        dstream_T_bitwidth=raw_genome_df["dstream_T_bitwidth"].unique().item(),
    )

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = (
        pd.DataFrame(
            apc.RosettaTree(naive_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    shortcut_dist = (
        pd.DataFrame(
            apc.RosettaTree(shortcut_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    return (naive_dist.values - shortcut_dist.values) / (
        naive_dist.values / 2 + shortcut_dist.values / 2
    )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 10581.79it/s]
100%|██████████| 100/100 [00:00<00:00, 375.77it/s]
6116it [00:00, 640844.47it/s]
100%|██████████| 100/100 [00:00<00:00, 199349.05it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.152354,-0.173517,0.000000,0.000000,0.000000,0.879620,-0.446640,0.060757,0.086453,...,-0.011675,0.371369,0.000000,-0.090961,-0.101995,0.276683,-0.253697,0.394044,0.399110,-0.308109
1,-0.152354,,-0.171186,0.148682,0.149003,0.644710,0.000000,-0.164696,0.000000,0.000000,...,-0.007703,0.000000,0.163195,-0.060627,-0.117152,0.000000,0.000000,0.000000,0.000000,0.000000
2,-0.173517,-0.171186,,0.000000,0.000000,0.000000,0.082881,-0.198991,0.406739,0.675603,...,-0.014651,0.060642,0.000000,-0.113295,0.077264,0.045415,-0.310594,0.064265,0.065073,-0.988355
3,0.000000,0.148682,0.000000,,-0.301987,-0.478923,-0.067941,0.000000,-0.059011,-0.082961,...,-0.011165,-0.052237,-0.410523,-0.400945,0.000000,-0.040531,0.243671,-0.054904,-0.055493,0.293423
4,0.000000,0.149003,0.000000,-0.301987,,-0.818481,-0.068142,0.000000,-0.059163,-0.083261,...,-0.011209,-0.052356,0.000000,-0.402699,0.000000,-0.040603,0.244535,-0.055035,-0.055627,0.294676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.276683,0.000000,0.045415,-0.040531,-0.040603,-0.031536,0.000000,0.355140,-0.083500,-0.104932,...,-0.006334,0.264007,-0.043710,-0.102637,0.033220,,0.000000,-0.281298,0.000000,0.000000
96,-0.253697,0.000000,-0.310594,0.243671,0.244535,0.550474,0.000000,-0.289868,0.000000,0.000000,...,-0.015007,0.000000,0.285246,-0.115944,-0.169092,0.000000,,0.000000,0.000000,0.000000
97,0.394044,0.000000,0.064265,-0.054904,-0.055035,-0.039602,0.000000,0.528237,-0.114329,-0.158714,...,-0.009433,0.000000,-0.060904,-0.153541,0.042295,-0.281298,0.000000,,0.000000,0.000000
98,0.399110,0.000000,0.065073,-0.055493,-0.055627,-0.039907,0.000000,0.536055,-0.115606,-0.161186,...,-0.009573,0.000000,-0.061630,-0.155854,0.042644,0.000000,0.000000,0.000000,,0.000000


np.nanmean(norm_err)=np.float64(-0.00924100910479344)
    np.nanmean(np.abs(norm_err))=np.float64(0.12636158176829257)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.043730306508602025)
    


100%|██████████| 100/100 [00:00<00:00, 32650.66it/s]
100%|██████████| 100/100 [00:00<00:00, 450.22it/s]
5972it [00:00, 599415.71it/s]
100%|██████████| 100/100 [00:00<00:00, 260031.25it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,6.920254e-03,-2.147089e-07,5.536778e-03,-1.556900e-07,-2.771384e-02,8.667528e-03,-1.428488e-07,0.000000e+00,0.000000e+00,...,-2.184633e-07,-1.890083e-07,-1.595748e-07,0.000000e+00,-2.055332e-07,-2.801329e-07,-2.626865e-07,-2.666966e-02,8.168036e-03,0.000000e+00
1,6.920254e-03,,-2.784204e-02,2.582181e-02,-1.855253e-02,-1.637890e-07,-1.019121e-02,-3.375976e-02,1.086085e-02,-3.552766e-02,...,8.879609e-03,-2.360283e-02,-1.556084e-03,6.695564e-03,8.303073e-03,-8.223601e-02,1.176633e-02,-1.566665e-07,-2.487476e-02,5.023614e-03
2,-2.147089e-07,-2.784204e-02,,-1.697500e-03,-6.266915e-02,2.698247e-03,-7.406723e-02,-1.556341e-02,-3.393999e-07,-3.254465e-02,...,-5.529856e-07,-5.541021e-07,1.130393e-02,-2.076529e-07,-5.165384e-07,-3.819288e-02,-7.366350e-07,2.579365e-03,-3.499327e-02,-1.553284e-07
3,5.536778e-03,2.582181e-02,-1.697500e-03,,-1.237473e-03,-2.745885e-07,3.210608e-03,-1.136729e-03,7.801573e-03,-2.644846e-03,...,6.723828e-03,-1.497778e-03,-2.557903e-03,5.392002e-03,6.387948e-03,-2.201768e-03,8.258003e-03,-2.645084e-07,3.042914e-02,4.252268e-03
4,-1.556900e-07,-1.855253e-02,-6.266915e-02,-1.237473e-03,,1.034127e-03,-4.082608e-02,-2.662898e-07,-2.122273e-07,-2.858024e-07,...,-3.715895e-07,-5.574431e-02,-2.912492e-07,-1.519461e-07,-3.547684e-07,-4.902032e-07,-4.463687e-07,9.985567e-04,-2.147728e-02,-1.218988e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.801329e-07,-8.223601e-02,-3.819288e-02,-2.201768e-03,-4.902032e-07,1.631573e-03,-1.857787e-02,-5.805662e-07,-5.380271e-07,-1.548864e-06,...,-7.908274e-07,-3.011078e-02,-4.982952e-07,-2.682407e-07,-7.183405e-07,,-1.229018e-06,1.544755e-03,-1.177484e-01,-1.869075e-07
96,-2.626865e-07,1.176633e-02,-7.366350e-07,8.258003e-03,-4.463687e-07,-1.254766e-02,9.753599e-02,-3.954141e-07,-4.771615e-07,-9.866540e-07,...,-7.230446e-07,-5.973086e-07,-4.625098e-07,-2.522018e-07,-6.619713e-07,-1.229018e-06,,-1.191328e-02,1.589551e-02,-2.303012e-07
97,-2.666966e-02,-1.566665e-07,2.579365e-03,-2.645084e-07,9.985567e-04,-3.033427e-02,-2.167056e-07,9.318801e-04,-5.859099e-02,1.750977e-03,...,-4.163211e-02,2.343760e-03,2.054574e-03,-2.170716e-02,-2.481242e-02,1.544755e-03,-1.191328e-02,,-3.532054e-07,-7.099129e-03
98,8.168036e-03,-2.487476e-02,-3.499327e-02,3.042914e-02,-2.147728e-02,-3.714119e-07,-1.147740e-02,-3.853039e-02,1.428627e-02,-5.837791e-02,...,1.104472e-02,-2.854879e-02,-1.807724e-03,7.856827e-03,1.016663e-02,-1.177484e-01,1.589551e-02,-3.532054e-07,,5.650144e-03


np.nanmean(norm_err)=np.float64(-0.0025676315510592663)
    np.nanmean(np.abs(norm_err))=np.float64(0.009873829086437613)
    np.nanmedian(norm_err)=np.float64(-2.2249256725014945e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(4.3960636091151606e-07)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 33909.81it/s]
100%|██████████| 100/100 [00:00<00:00, 437.83it/s]
5939it [00:00, 655129.04it/s]
100%|██████████| 100/100 [00:00<00:00, 261327.35it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 35469.80it/s]
100%|██████████| 100/100 [00:00<00:00, 1034.97it/s]
5933it [00:00, 640766.44it/s]
100%|██████████| 100/100 [00:00<00:00, 325644.72it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,0.000000e+00,0.000000e+00,-1.033331e-07,0.000000e+00,0.000000e+00,-1.034507e-07,-1.063436e-07,0.000000e+00,...,-1.093102e-07,-1.065307e-07,0.000000e+00,0.000000e+00,-1.036130e-07,-1.034092e-07,0.000000e+00,0.000000e+00,-1.065496e-07,0.000000e+00
1,0.000000e+00,,0.000000e+00,0.000000e+00,-1.036628e-07,0.000000e+00,0.000000e+00,-1.037811e-07,-1.097627e-07,0.000000e+00,...,-1.066139e-07,-1.099621e-07,0.000000e+00,0.000000e+00,-1.039445e-07,-1.037394e-07,0.000000e+00,0.000000e+00,-1.285073e-07,0.000000e+00
2,0.000000e+00,0.000000e+00,,0.000000e+00,-1.093631e-07,0.000000e+00,0.000000e+00,-1.064396e-07,-1.035618e-07,0.000000e+00,...,-1.034875e-07,-1.037393e-07,0.000000e+00,0.000000e+00,-1.066115e-07,-1.126813e-07,0.000000e+00,0.000000e+00,-1.037572e-07,0.000000e+00
3,0.000000e+00,0.000000e+00,0.000000e+00,,-1.124957e-07,0.000000e+00,0.000000e+00,-1.063545e-07,-1.034813e-07,0.000000e+00,...,-1.034071e-07,-1.036585e-07,0.000000e+00,0.000000e+00,-1.065261e-07,-1.093584e-07,0.000000e+00,0.000000e+00,-1.036764e-07,0.000000e+00
4,-1.033331e-07,-1.036628e-07,-1.093631e-07,-1.124957e-07,,-1.034632e-07,-1.062738e-07,-2.126276e-07,-2.068856e-07,-1.063695e-07,...,-2.067372e-07,-2.072397e-07,-1.128117e-07,-1.095934e-07,-2.129706e-07,-2.186307e-07,-1.064488e-07,-1.035860e-07,-2.072755e-07,-1.033647e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.034092e-07,-1.037394e-07,-1.126813e-07,-1.093584e-07,-2.186307e-07,-1.035395e-07,-1.063543e-07,-2.127888e-07,-2.070381e-07,-1.064501e-07,...,-2.068895e-07,-2.073927e-07,-1.096569e-07,-1.163707e-07,-2.131323e-07,,-1.065295e-07,-1.036625e-07,-2.074286e-07,-1.034409e-07
96,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.064488e-07,0.000000e+00,0.000000e+00,-1.096365e-07,-1.036886e-07,0.000000e+00,...,-1.036141e-07,-1.038665e-07,0.000000e+00,0.000000e+00,-1.098189e-07,-1.065295e-07,,0.000000e+00,-1.038845e-07,0.000000e+00
97,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.035860e-07,0.000000e+00,0.000000e+00,-1.037041e-07,-1.066114e-07,0.000000e+00,...,-1.162741e-07,-1.067995e-07,0.000000e+00,0.000000e+00,-1.038673e-07,-1.036625e-07,0.000000e+00,,-1.068185e-07,0.000000e+00
98,-1.065496e-07,-1.285073e-07,-1.037572e-07,-1.036764e-07,-2.072755e-07,-1.066879e-07,-1.037178e-07,-2.075120e-07,-2.194692e-07,-1.038090e-07,...,-2.131747e-07,-2.198678e-07,-1.039447e-07,-1.039645e-07,-2.078386e-07,-2.074286e-07,-1.038845e-07,-1.068185e-07,,-1.065832e-07


np.nanmean(norm_err)=np.float64(-1.0888461398902597e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0888461398902597e-07)
    np.nanmedian(norm_err)=np.float64(-1.0629313277553901e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0629313277553901e-07)
    
