In [1]:
# Register the `watermark` IPython extension so the %watermark magic used below is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record when and where this notebook ran (timestamp, interpreter, machine) and
# the versions of the imported packages, for provenance.
%watermark -diwmuv -iv


Last updated: 2025-09-21T00:23:10.670058+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10
downstream                        : 1.14.3
pandas                            : 2.2.3
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# NOTE(review): presumably the output subdirectory handed to teeplot; no cell
# visible here references this name -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _sole_value(frame: pd.DataFrame, column: str):
    """Return the one distinct value stored in ``frame[column]``.

    ``.item()`` fails unless exactly one distinct value is present, so this
    doubles as a check that the column is homogeneous across all rows.
    """
    return frame[column].unique().item()


def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise distance table of ``tree_df`` as a DataFrame.

    The tree is converted to dendropy via ``apc.RosettaTree`` and its
    phylogenetic distance matrix is exported as a data table. Rows and
    columns are sorted by label so that tables built from different trees
    over the same labels line up position by position.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the exported table;
    # presumably a nested mapping keyed by taxon label -- confirm on upgrade.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed=None,
) -> np.ndarray:
    """Compare pairwise distances between two reconstructions of one sample.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled, and a tree is built from the same population with
    both ``hstrat.build_tree_trie`` and ``hstrat.build_tree_searchtable``.
    The pairwise distance matrices of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a ``data_hex`` column plus the columns
        ``dstream_S``, ``dstream_storage_bitoffset``,
        ``dstream_storage_bitwidth``, ``dstream_T_bitoffset`` and
        ``dstream_T_bitwidth``, each holding a single distinct value within
        the sampled rows.
    dstream_algo : str
        Expression naming the algorithm, e.g. ``"dstream.tilted_algo"``. It
        is evaluated with only the name ``dstream`` bound.
    sample_size : int, default 100
        Number of rows to sample. The original note gives the reason for
        sampling as keeping the problem at a size dendropy and the trie-based
        build can handle.
    seed : optional
        Anything accepted by ``np.random.default_rng``. When given, the row
        sampling and the storage shuffle performed here draw from that
        generator. When ``None`` (default), both use NumPy's global random
        state, as before. Randomness inside the tree builders, if any, is
        not controlled by this argument.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / ((naive + shortcut) / 2)``,
        elementwise, with rows and columns in sorted-label order. Entries
        where both distances are zero (e.g. the diagonal) are NaN.

    Raises
    ------
    ValueError
        If the two distance tables do not carry identical row and column
        labels. Also raised by pandas if ``sample_size`` exceeds the number
        of rows available.
    """
    rng = None if seed is None else np.random.default_rng(seed)

    # `random_state=None` makes pandas fall back to the global NumPy state.
    sampled_df = raw_genome_df.sample(sample_size, random_state=rng).assign(
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    surface_kwargs = dict(
        # NOTE(review): `eval` on a caller-supplied string. Only `dstream` is
        # bound explicitly, but builtins stay reachable -- never pass
        # untrusted text as `dstream_algo`.
        dstream_algo=eval(dstream_algo, {"dstream": dstream}),
        **{
            column: _sole_value(sampled_df, column)
            for column in (
                "dstream_S",
                "dstream_storage_bitoffset",
                "dstream_storage_bitwidth",
                "dstream_T_bitoffset",
                "dstream_T_bitwidth",
            )
        },
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]

    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        # Shuffled in place through private attributes; the original author's
        # note gives the intent as "ensure synthetic data".
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The arithmetic below is positional, so refuse to compare tables whose
    # labels differ instead of silently pairing up unrelated taxa.
    labels_match = naive_dist.index.equals(
        shortcut_dist.index
    ) and naive_dist.columns.equals(shortcut_dist.columns)
    if not labels_match:
        raise ValueError(
            "distance matrices of the two reconstructions have different "
            "taxon labels",
        )

    naive_values = naive_dist.values
    shortcut_values = shortcut_dist.values
    # 0 / 0 where both distances vanish is expected and yields NaN; silence
    # only that "invalid value" warning so other anomalies still surface.
    with np.errstate(invalid="ignore"):
        return (naive_values - shortcut_values) / (
            naive_values / 2 + shortcut_values / 2
        )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Produce the same report for each algorithm: show the normalized error
# matrix, then print its mean/median, both signed and absolute (NaNs skipped).
for algo_spec in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_spec)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 9299.80it/s]
100%|██████████| 100/100 [00:00<00:00, 400.24it/s]
6126it [00:00, 619812.00it/s]
100%|██████████| 100/100 [00:00<00:00, 220752.84it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.039183,-0.036120,-0.036956,0.000000,0.187967,-0.333360,0.0,0.000000,...,0.045918,0.0,0.000000,0.057348,0.0,-0.043717,0.0,0.0,0.0,0.0
1,0.000000,,0.000000,0.000000,0.000000,0.300965,-0.969645,0.978440,0.0,-0.274596,...,0.000000,0.0,-0.209931,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.039183,0.0,,0.000000,0.000000,0.000000,0.149577,-0.229086,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
3,-0.036120,0.0,0.000000,,0.000000,0.000000,0.143760,-0.215716,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,-0.184843,0.0,0.0,0.0,0.0
4,-0.036956,0.0,0.000000,0.000000,,0.000000,0.145397,-0.219423,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,-0.335194,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.043717,0.0,0.000000,-0.184843,-0.335194,0.000000,0.157368,-0.247880,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,,0.0,0.0,0.0,0.0
96,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,,0.0,0.0,0.0
97,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.209093,-0.406136,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,,0.0,0.0
98,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.206697,-0.397192,0.0,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(-0.029742847370744674)
    np.nanmean(np.abs(norm_err))=np.float64(0.09319401051771721)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 34155.57it/s]
100%|██████████| 100/100 [00:00<00:00, 363.24it/s]
5962it [00:00, 558167.02it/s]
100%|██████████| 100/100 [00:00<00:00, 212908.83it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000474,-2.274947e-07,-3.865543e-07,0.000506,0.000797,-0.000002,-2.202930e-07,-9.117412e-07,-6.249783e-07,...,-1.100252e-06,-1.355549e-02,-2.995590e-02,-1.685697e-06,0.001263,-4.119745e-07,0.000632,-4.141852e-02,-3.912141e-06,-1.993680e-02
1,4.737411e-04,,1.587110e-02,1.010220e-02,0.000000,0.000000,0.013227,1.559939e-02,1.075551e-02,9.228082e-03,...,1.363674e-02,1.527908e-02,4.410836e-02,-2.200079e-02,-0.013722,7.481515e-03,-0.004771,2.416231e-02,5.531359e-03,3.740193e-02
2,-2.274947e-07,0.015871,,6.009049e-03,0.016385,0.019805,0.007748,3.065385e-02,-1.561877e-07,-1.349699e-07,...,7.971493e-03,2.936360e-02,2.028945e-02,0.000000e+00,0.004232,-1.103312e-07,0.008994,4.557457e-02,-1.739317e-01,1.737864e-02
3,-3.865543e-07,0.010102,6.009049e-03,,0.010523,0.013525,-0.039404,5.884983e-03,3.781941e-02,3.096726e-02,...,8.422368e-02,8.882765e-03,1.385230e-02,0.000000e+00,0.006161,1.470401e-02,0.024113,1.588535e-02,-3.387088e-07,1.124012e-02
4,5.061528e-04,0.000000,1.638518e-02,1.052292e-02,,0.000000,0.013958,1.609575e-02,1.123371e-02,9.577891e-03,...,1.441471e-02,1.575495e-02,4.611801e-02,-2.335563e-02,-0.014417,7.709803e-03,-0.004958,2.537434e-02,5.875528e-03,3.883698e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-4.119745e-07,0.007482,-1.103312e-07,1.470401e-02,0.007710,0.009207,0.030287,-1.086093e-07,-1.013191e-01,-8.767613e-02,...,1.913827e-02,6.743151e-03,9.266883e-03,-1.878067e-07,0.003920,,0.051947,1.013459e-02,-7.877885e-03,8.019999e-03
96,6.317787e-04,-0.004771,8.994177e-03,2.411285e-02,-0.004958,-0.006266,0.050798,8.819883e-03,2.599571e-02,2.166626e-02,...,3.489664e-02,8.615436e-03,1.300956e-02,-1.618743e-01,-0.039891,5.194743e-02,,1.472794e-02,7.172624e-03,1.073515e-02
97,-4.141852e-02,0.024162,4.557457e-02,1.588535e-02,0.025374,0.034635,0.025736,4.445519e-02,1.761923e-02,1.377218e-02,...,2.738278e-02,-1.754374e-07,-5.774454e-07,-4.914999e-07,0.008299,1.013459e-02,0.014728,,1.160340e-02,-4.416332e-07
98,-3.912141e-06,0.005531,-1.739317e-01,-3.387088e-07,0.005876,0.008791,-0.000001,-2.195095e-02,-1.613595e-02,-1.155467e-02,...,-7.847369e-07,4.423934e-03,8.911000e-03,-1.019176e-06,0.034948,-7.877885e-03,0.007173,1.160340e-02,,6.269411e-03


np.nanmean(norm_err)=np.float64(0.006508970752671018)
    np.nanmean(np.abs(norm_err))=np.float64(0.016741409312191553)
    np.nanmedian(norm_err)=np.float64(0.006699046582532599)
    np.nanmedian(np.abs(norm_err))=np.float64(0.009164702323676898)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Produce the same report for each algorithm: show the normalized error
# matrix, then print its mean/median, both signed and absolute (NaNs skipped).
for algo_spec in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_spec)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34155.57it/s]
100%|██████████| 100/100 [00:00<00:00, 426.78it/s]
5917it [00:00, 638758.83it/s]
100%|██████████| 100/100 [00:00<00:00, 375497.22it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31429.78it/s]
100%|██████████| 100/100 [00:00<00:00, 1007.75it/s]
5949it [00:00, 559874.22it/s]
100%|██████████| 100/100 [00:00<00:00, 324134.78it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.035735e-07,-1.036994e-07,-2.325077e-07,-1.097448e-07,-2.189077e-07,-2.076000e-07,-1.038161e-07,-2.130278e-07,-1.130873e-07,...,-2.128647e-07,-1.097135e-07,-1.129006e-07,-2.071900e-07,-2.074517e-07,-2.130844e-07,-1.067213e-07,-1.036980e-07,-2.560426e-07,-1.130184e-07
1,-1.035735e-07,,0.000000e+00,-1.034738e-07,0.000000e+00,-1.033656e-07,-1.096043e-07,0.000000e+00,-1.034722e-07,0.000000e+00,...,-1.033953e-07,0.000000e+00,0.000000e+00,-1.063272e-07,-1.064650e-07,-1.034990e-07,0.000000e+00,0.000000e+00,-1.035192e-07,0.000000e+00
2,-1.036994e-07,0.000000e+00,,-1.035994e-07,0.000000e+00,-1.034910e-07,-1.066763e-07,0.000000e+00,-1.035978e-07,0.000000e+00,...,-1.035207e-07,0.000000e+00,0.000000e+00,-1.095162e-07,-1.096624e-07,-1.036246e-07,0.000000e+00,0.000000e+00,-1.036449e-07,0.000000e+00
3,-2.325077e-07,-1.034738e-07,-1.035994e-07,,-1.096328e-07,-2.186850e-07,-2.073997e-07,-1.037159e-07,-2.128169e-07,-1.129685e-07,...,-2.126541e-07,-1.096016e-07,-1.127821e-07,-2.069905e-07,-2.072517e-07,-2.128734e-07,-1.066154e-07,-1.035981e-07,-2.323708e-07,-1.128997e-07
4,-1.097448e-07,0.000000e+00,0.000000e+00,-1.096328e-07,,-1.127481e-07,-1.038517e-07,0.000000e+00,-1.065684e-07,0.000000e+00,...,-1.064867e-07,0.000000e+00,0.000000e+00,-1.036465e-07,-1.037775e-07,-1.065967e-07,0.000000e+00,0.000000e+00,-1.096838e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.130844e-07,-1.034990e-07,-1.036246e-07,-2.128734e-07,-1.065967e-07,-2.126445e-07,-2.074502e-07,-1.037412e-07,-2.189799e-07,-1.066784e-07,...,-2.252682e-07,-1.065672e-07,-1.065122e-07,-2.070408e-07,-2.073022e-07,,-1.129577e-07,-1.036233e-07,-2.129694e-07,-1.066171e-07
96,-1.067213e-07,0.000000e+00,0.000000e+00,-1.066154e-07,0.000000e+00,-1.065006e-07,-1.038948e-07,0.000000e+00,-1.096791e-07,0.000000e+00,...,-1.162735e-07,0.000000e+00,0.000000e+00,-1.036894e-07,-1.038205e-07,-1.129577e-07,,0.000000e+00,-1.066636e-07,0.000000e+00
97,-1.036980e-07,0.000000e+00,0.000000e+00,-1.035981e-07,0.000000e+00,-1.034896e-07,-1.066749e-07,0.000000e+00,-1.035965e-07,0.000000e+00,...,-1.035194e-07,0.000000e+00,0.000000e+00,-1.198356e-07,-1.239089e-07,-1.036233e-07,0.000000e+00,,-1.036436e-07,0.000000e+00
98,-2.560426e-07,-1.035192e-07,-1.036449e-07,-2.323708e-07,-1.096838e-07,-2.187864e-07,-2.074908e-07,-1.037615e-07,-2.129128e-07,-1.130225e-07,...,-2.127500e-07,-1.096525e-07,-1.128360e-07,-2.070812e-07,-2.073427e-07,-2.129694e-07,-1.066636e-07,-1.036436e-07,,-1.129537e-07


np.nanmean(norm_err)=np.float64(-1.08783964586695e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.08783964586695e-07)
    np.nanmedian(norm_err)=np.float64(-1.0632396326840586e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0632396326840586e-07)
    
