In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-09-09T00:25:12.572711+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
pandas                            : 2.2.3
hstrat                            : 1.20.10
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Label for this notebook's output subdirectory.
# NOTE(review): presumably consumed by teeplot when saving figures; no use of
# this name is visible in this part of the notebook -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression: echoes the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _sole_value(frame: pd.DataFrame, column: str):
    """Return the single distinct value held in `column` of `frame`.

    `.item()` raises if the column does not hold exactly one distinct value,
    which guards against mixing genomes with different surface layouts.
    """
    return frame[column].unique().item()


def _pairwise_distance_frame(tree_df) -> pd.DataFrame:
    """Return the leaf-to-leaf distance matrix of a reconstructed tree.

    `tree_df` is a tree as returned by `hstrat.build_tree_trie` /
    `hstrat.build_tree_searchtable`. The result has rows and columns sorted
    by label so that two matrices built from the same taxa line up
    element-wise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # this may break on a dendropy upgrade.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_sample: int = 100,
    seed=None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into hstrat surfaces, each
    surface's storage is shuffled in place, and both reconstruction
    back-ends are run on the resulting population. The pairwise leaf
    distances of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`; each `dstream_*`
        column must hold exactly one distinct value among the sampled rows.
        The caller's frame is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with only
        `dstream` in scope (e.g. `"dstream.tilted_algo"`).
    n_sample : int, default 100
        Maximum number of genomes to compare; capped at the number of rows
        available. Kept small because the naive reconstruction and the
        dendropy distance matrix do not scale to large populations.
    seed : int, optional
        Seed for the row sampling and storage shuffling. When omitted, the
        global NumPy random state is used.

    Returns
    -------
    np.ndarray
        Square matrix of `(naive - shortcut) / ((naive + shortcut) / 2)`
        over pairwise leaf distances. Entries where both distances are zero
        (such as the diagonal) are NaN.

    Raises
    ------
    ValueError
        If a `dstream_*` column holds more than one distinct value, or if
        the two reconstructions do not cover the same taxon labels.
    """
    rng = None if seed is None else np.random.default_rng(seed)

    # sample to size dendropy/naive can handle; `sample` returns a new frame,
    # so adding the label column below leaves the caller's data untouched
    sampled_df = raw_genome_df.sample(
        min(n_sample, len(raw_genome_df)),
        random_state=rng,
    )
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    # SECURITY: `eval` runs arbitrary code. `dstream_algo` must come from a
    # trusted caller (here, literals in the notebook), never from external data.
    surface_kwargs = dict(
        dstream_algo=eval(dstream_algo, {"dstream": dstream}),
        dstream_S=_sole_value(sampled_df, "dstream_S"),
        dstream_storage_bitoffset=_sole_value(
            sampled_df, "dstream_storage_bitoffset"
        ),
        dstream_storage_bitwidth=_sole_value(
            sampled_df, "dstream_storage_bitwidth"
        ),
        dstream_T_bitoffset=_sole_value(sampled_df, "dstream_T_bitoffset"),
        dstream_T_bitwidth=_sole_value(sampled_df, "dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]

    # ensure synthetic data: scramble each surface's storage in place
    shuffle_in_place = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle_in_place(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The element-wise comparison below is only meaningful if both matrices
    # are indexed by the same labels in the same order.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions cover different taxon labels"
        )

    naive_values = naive_dist.values
    shortcut_values = shortcut_dist.values
    # 0 / 0 occurs wherever both distances are zero (e.g. the diagonal); the
    # resulting NaN is intended, so silence the accompanying RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_values - shortcut_values) / (
            naive_values / 2 + shortcut_values / 2
        )


In [6]:
# Genome records for the "sample" dataset, downloaded from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same naive-vs-shortcut comparison once per dstream algorithm and
# report the normalized error matrix with its summary statistics.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # The leading spaces on the f-string's continuation lines are part of the
    # string literal and appear verbatim in the printed summary.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 11089.88it/s]
100%|██████████| 100/100 [00:00<00:00, 359.75it/s]
6132it [00:00, 615018.82it/s]
100%|██████████| 100/100 [00:00<00:00, 156387.17it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,-0.127666,-0.012374,0.000000,0.000000,0.000000,0.255651,0.000000,...,0.000000,-0.143756,0.361958,0.000000,0.000000,0.000000,0.000000,0.000000,-0.144391,0.000000
1,0.000000,,0.000000,-0.130855,-0.012990,0.000000,0.000000,0.000000,0.423457,0.000000,...,0.000000,-0.147813,0.626713,0.000000,0.000000,0.000000,0.000000,0.000000,-0.148484,0.000000
2,0.000000,0.000000,,0.082148,-0.005954,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.088524,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.088764,0.000000
3,-0.127666,-0.130855,0.082148,,-0.007162,-0.034299,-0.525084,0.106545,-0.268373,0.672271,...,0.149900,0.000000,-0.331218,-0.095181,0.112014,-0.634330,-0.465431,-0.033576,0.000000,0.000000
4,-0.012374,-0.012990,-0.005954,-0.007162,,-0.030697,-0.008601,-0.030681,-0.007456,-0.012329,...,-0.017395,-0.008195,-0.010287,-0.007435,-0.009728,-0.011285,-0.007307,-0.028493,-0.008236,-0.185691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.000000,-0.634330,-0.011285,0.000000,0.508922,0.000000,-0.240706,0.000000,...,0.000000,-0.731140,-0.332717,0.000000,0.000000,,0.000000,0.000000,-0.735052,0.000000
96,0.000000,0.000000,0.000000,-0.465431,-0.007307,0.000000,-0.252073,0.000000,-0.176796,0.000000,...,0.000000,-0.515515,-0.221861,0.000000,0.000000,0.000000,,0.000000,-0.517458,0.000000
97,0.000000,0.000000,0.000000,-0.033576,-0.028493,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,-0.039372,0.000000,0.000000,0.000000,0.000000,0.000000,,-0.039610,0.000000
98,-0.144391,-0.148484,0.088764,0.000000,-0.008236,-0.040620,-0.592263,0.127007,-0.296104,0.786488,...,0.173495,0.000000,-0.374504,-0.104178,0.124686,-0.735052,-0.517458,-0.039610,,0.000000


np.nanmean(norm_err)=np.float64(-0.029918825188433838)
    np.nanmean(np.abs(norm_err))=np.float64(0.09189230168886033)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 28380.16it/s]
100%|██████████| 100/100 [00:00<00:00, 369.15it/s]
5967it [00:00, 565042.15it/s]
100%|██████████| 100/100 [00:00<00:00, 256062.52it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-5.007400e-07,-0.000959,0.026690,0.011828,-2.605756e-07,2.994316e-02,-4.242792e-07,-6.571207e-07,-3.320566e-06,...,0.023670,8.106846e-03,0.004769,-1.653328e-06,-1.614569e-06,-0.009726,0.011689,-4.697995e-07,-6.030317e-07,-5.626140e-07
1,-5.007400e-07,,-0.000558,0.004115,0.005310,-1.433284e-07,4.411477e-03,-2.011273e-07,-3.269378e-07,-5.590038e-07,...,0.006848,4.402639e-03,0.003023,-2.793041e-07,-2.781760e-07,-0.141048,0.004840,-2.728169e-07,-3.178609e-07,-2.745750e-07
2,-9.593932e-04,-5.577981e-04,,0.000000,-0.099579,-5.570768e-04,-1.329448e-07,-7.245270e-04,-6.337812e-04,3.303175e-03,...,-0.129387,-3.514002e-02,-0.000480,3.614198e-02,-1.061716e-03,-0.000655,-0.000762,-5.308142e-04,-6.189301e-04,-5.341710e-04
3,2.668987e-02,4.115082e-03,0.000000,,0.005143,1.659813e-02,-1.418854e-07,5.185724e-03,2.808206e-02,-2.937865e-02,...,0.006358,4.375442e-03,0.003595,-2.690013e-03,7.163531e-03,0.000576,0.005415,2.393115e-02,1.113002e-03,1.597627e-02
4,1.182818e-02,5.309993e-03,-0.099579,0.005143,,5.302901e-03,5.614088e-03,7.620886e-03,6.305145e-03,-5.016745e-07,...,0.000000,-2.120176e-07,0.009409,0.000000e+00,3.059258e-02,0.000000,0.017659,4.976919e-03,8.694931e-04,5.017805e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-9.726425e-03,-1.410480e-01,-0.000655,0.000576,0.000000,-4.834388e-03,6.239586e-04,-9.476725e-02,-5.654052e-03,-3.972484e-07,...,0.000000,-1.829537e-07,-0.000578,0.000000e+00,-1.124963e-02,,-0.001016,-4.562288e-03,-7.828215e-04,-4.596620e-03
96,1.168854e-02,4.839530e-03,-0.000762,0.005415,0.017659,4.832675e-03,5.940514e-03,7.142479e-03,5.815510e-03,-5.974064e-07,...,0.027108,1.338036e-02,0.108134,0.000000e+00,1.162112e-01,-0.001016,,4.518026e-03,-1.868809e-03,4.557355e-03
97,-4.697995e-07,-2.728169e-07,-0.000531,0.023931,0.004977,-1.381210e-07,2.559271e-02,-1.756304e-07,-3.200335e-07,-5.077246e-07,...,0.006304,4.171181e-03,0.002895,-2.536992e-07,-2.527680e-07,-0.004562,0.004518,,-3.005977e-07,-2.650049e-07
98,-6.030317e-07,-3.178609e-07,-0.000619,0.001113,0.000869,-1.587435e-07,1.201302e-03,-2.148269e-07,-3.676567e-07,-6.954821e-07,...,0.001171,7.031705e-04,-0.025655,-3.474350e-07,-3.427789e-07,-0.000783,-0.001869,-3.005977e-07,,-3.027335e-07


np.nanmean(norm_err)=np.float64(0.001423297834887428)
    np.nanmean(np.abs(norm_err))=np.float64(0.010407337173580127)
    np.nanmedian(norm_err)=np.float64(-1.5163379421649158e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.000686361541937835)
    


In [7]:
# Genome records for the "tail" dataset, downloaded from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same naive-vs-shortcut comparison once per dstream algorithm and
# report the normalized error matrix with its summary statistics.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # The leading spaces on the f-string's continuation lines are part of the
    # string literal and appear verbatim in the printed summary.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 33306.63it/s]
100%|██████████| 100/100 [00:00<00:00, 398.23it/s]
5960it [00:00, 606955.08it/s]
100%|██████████| 100/100 [00:00<00:00, 348364.12it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 30997.74it/s]
100%|██████████| 100/100 [00:00<00:00, 392.45it/s]
5973it [00:00, 578969.24it/s]
100%|██████████| 100/100 [00:00<00:00, 340170.64it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,-1.062854e-07,0.000000e+00,-1.035623e-07,0.000000e+00,-1.062836e-07,0.000000e+00,-1.161320e-07,0.000000e+00,...,-1.035068e-07,0.000000e+00,-1.034571e-07,-1.036004e-07,-1.095066e-07,-1.062881e-07,0.000000e+00,-1.064011e-07,-1.277030e-07,-1.063552e-07
1,0.000000e+00,,-1.034870e-07,0.000000e+00,-1.128397e-07,0.000000e+00,-1.034853e-07,0.000000e+00,-1.036082e-07,0.000000e+00,...,-1.127737e-07,0.000000e+00,-1.064255e-07,-1.065771e-07,-1.036437e-07,-1.034895e-07,0.000000e+00,-1.035966e-07,-1.035416e-07,-1.035532e-07
2,-1.062854e-07,-1.034870e-07,,-1.064326e-07,-2.071404e-07,-1.036958e-07,-2.319748e-07,-1.063509e-07,-2.128432e-07,-1.036674e-07,...,-2.070292e-07,-1.160120e-07,-2.069298e-07,-2.072165e-07,-2.129181e-07,-2.319855e-07,-1.035176e-07,-2.189256e-07,-2.127026e-07,-2.188285e-07
3,0.000000e+00,0.000000e+00,-1.064326e-07,,-1.037020e-07,0.000000e+00,-1.064307e-07,0.000000e+00,-1.096230e-07,0.000000e+00,...,-1.036463e-07,0.000000e+00,-1.035964e-07,-1.037402e-07,-1.163524e-07,-1.064352e-07,0.000000e+00,-1.065486e-07,-1.095484e-07,-1.065026e-07
4,-1.035623e-07,-1.128397e-07,-2.071404e-07,-1.037020e-07,,-1.098320e-07,-2.071369e-07,-1.036245e-07,-2.073832e-07,-1.067281e-07,...,-2.326282e-07,-1.035881e-07,-2.130269e-07,-2.133308e-07,-2.074543e-07,-2.071454e-07,-1.128760e-07,-2.073600e-07,-2.072497e-07,-2.072729e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.062881e-07,-1.034895e-07,-2.319855e-07,-1.064352e-07,-2.071454e-07,-1.036983e-07,-2.392560e-07,-1.063536e-07,-2.128485e-07,-1.036699e-07,...,-2.070343e-07,-1.196542e-07,-2.069348e-07,-2.072215e-07,-2.129234e-07,,-1.035201e-07,-2.189312e-07,-2.127079e-07,-2.188341e-07
96,0.000000e+00,0.000000e+00,-1.035176e-07,0.000000e+00,-1.128760e-07,0.000000e+00,-1.035158e-07,0.000000e+00,-1.036389e-07,0.000000e+00,...,-1.128100e-07,0.000000e+00,-1.064578e-07,-1.066096e-07,-1.036744e-07,-1.035201e-07,,-1.036273e-07,-1.035722e-07,-1.035838e-07
97,-1.064011e-07,-1.035966e-07,-2.189256e-07,-1.065486e-07,-2.073600e-07,-1.038059e-07,-2.189217e-07,-1.064667e-07,-2.130751e-07,-1.037775e-07,...,-2.072487e-07,-1.094828e-07,-2.071490e-07,-2.074363e-07,-2.131502e-07,-2.189312e-07,-1.036273e-07,,-2.129342e-07,-2.255502e-07
98,-1.277030e-07,-1.035416e-07,-2.127026e-07,-1.095484e-07,-2.072497e-07,-1.037506e-07,-2.126989e-07,-1.094619e-07,-2.324213e-07,-1.037222e-07,...,-2.071385e-07,-1.063702e-07,-2.070389e-07,-2.073259e-07,-2.191530e-07,-2.127079e-07,-1.035722e-07,-2.129342e-07,,-2.128423e-07


np.nanmean(norm_err)=np.float64(-1.2161206712367573e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.2161206712367573e-07)
    np.nanmedian(norm_err)=np.float64(-1.0658073701395446e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0658073701395446e-07)
    
