In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-06-08T00:24:56.273175+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1029-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10
numpy                             : 2.1.2
downstream                        : 1.14.3

Watermark: 2.4.3



In [4]:
# Subdirectory label for this notebook's outputs — presumably consumed by
# teeplot when saving figures; nothing in the visible cells reads it.
# TODO(review): confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _resolve_dstream_algo(dstream_algo: str):
    """Look up a dstream algorithm from a dotted name like "dstream.tilted_algo".

    Attributes are walked with ``getattr`` rather than ``eval`` so that only
    names reachable from the imported ``dstream`` package can be resolved.

    Raises
    ------
    ValueError
        If the name is not of the form ``dstream.<attr>[.<attr>...]``.
    AttributeError
        If an attribute along the dotted path does not exist.
    """
    root, *attr_names = dstream_algo.strip().split(".")
    if root != "dstream" or not attr_names:
        raise ValueError(
            "dstream_algo must look like 'dstream.<name>', "
            f"got {dstream_algo!r}",
        )
    resolved = dstream
    for attr_name in attr_names:
        resolved = getattr(resolved, attr_name)
    return resolved


def _pairwise_distance_frame(phylogeny_df: pd.DataFrame) -> pd.DataFrame:
    """Build a label-sorted pairwise distance table for a reconstructed tree.

    The tree is converted to dendropy via ``apc.RosettaTree`` and its
    ``phylogenetic_distance_matrix`` is exported as a data table; rows and
    columns are then sorted by label so that two tables can be compared
    positionally.
    """
    distance_table = (
        apc.RosettaTree(phylogeny_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE: ``_data`` is a private attribute of the dendropy data table.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare trie ("naive") and searchtable ("shortcut") reconstructions.

    A random subsample of genomes is deserialized into surfaces, their stored
    contents are shuffled, and a tree is reconstructed with both
    ``hstrat.build_tree_trie`` and ``hstrat.build_tree_searchtable``. The
    pairwise distance tables of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide ``data_hex`` plus the columns
        ``dstream_S``, ``dstream_storage_bitoffset``,
        ``dstream_storage_bitwidth``, ``dstream_T_bitoffset`` and
        ``dstream_T_bitwidth``, each holding a single unique value.
        The frame passed in is not modified.
    dstream_algo : str
        Dotted name of the dstream algorithm, e.g. ``"dstream.tilted_algo"``.
        Any ``dstream_algo`` column already present in the data is ignored.
    sample_size : int, default 100
        Maximum number of genomes to sample (kept small so the trie/dendropy
        path stays tractable). Capped at the number of available rows.
    seed : int, optional
        Seed for subsampling and shuffling. ``None`` (default) draws fresh
        entropy, so repeated calls give different results.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / mean(naive, shortcut)``.
        Entries where both distances are zero (including the diagonal) are
        NaN; use ``np.nanmean`` / ``np.nanmedian`` to summarize.

    Raises
    ------
    ValueError
        If ``dstream_algo`` is malformed, or if one of the dstream
        configuration columns does not hold exactly one unique value.
    """
    rng = np.random.default_rng(seed)

    # sample to size dendropy/naive can handle
    num_sampled = min(sample_size, len(raw_genome_df))
    genome_df = raw_genome_df.sample(num_sampled, random_state=rng)
    genome_df = genome_df.assign(
        taxon_label=np.arange(len(genome_df)).astype(str),
    )

    surface_kwargs = dict(
        dstream_algo=_resolve_dstream_algo(dstream_algo),
        **{
            # .item() raises unless the column holds exactly one unique value
            column: genome_df[column].unique().item()
            for column in (
                "dstream_S",
                "dstream_storage_bitoffset",
                "dstream_storage_bitwidth",
                "dstream_T_bitoffset",
                "dstream_T_bitwidth",
            )
        },
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        # ensure synthetic data: scramble each surface's stored contents
        # in place (reaches into private attributes of the surface)
        rng.shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    # Both tables are label-sorted and compared positionally; this assumes
    # the two trees expose the same label set (not verified here).
    naive_dist = _pairwise_distance_frame(naive_df).values
    shortcut_dist = _pairwise_distance_frame(shortcut_df).values

    # 0/0 on the diagonal is expected and intentionally left as NaN; silence
    # the RuntimeWarning it would otherwise emit.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
# Normalized reconstruction error on the "sample" dataset, once per
# dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # The continuation lines' leading spaces are part of the printed string.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10582.59it/s]
100%|██████████| 100/100 [00:00<00:00, 344.40it/s]
6112it [00:00, 572107.97it/s]
100%|██████████| 100/100 [00:00<00:00, 247743.89it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.249271,0.271558,0.000000,0.000000,0.588894,0.342220,0.000000,0.256876,0.000000,...,-0.254369,0.000000,0.000000,0.000000,0.000000,0.140219,0.532340,0.000000,0.197795,0.415451
1,0.249271,,0.117233,-0.129729,-0.161686,0.461316,0.000000,-0.024496,0.148502,-0.793294,...,0.054055,-0.302866,-0.022356,-0.952108,0.112336,0.251697,0.211888,-0.266501,0.027281,0.000000
2,0.271558,0.117233,,-0.116519,-0.208044,0.000000,0.140270,-0.026413,0.000000,-0.133058,...,0.058761,-0.167334,-0.023942,-0.124231,0.120348,0.225570,0.000000,-0.121662,0.029683,0.154634
3,0.000000,-0.129729,-0.116519,,0.000000,-0.178388,-0.158544,0.000000,-0.112385,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.170174,0.000000,0.045612,-0.177144
4,0.000000,-0.161686,-0.208044,0.000000,,-0.443401,-0.219760,0.000000,-0.196956,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.402052,0.000000,0.219351,-0.264704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.140219,0.251697,0.225570,0.000000,0.000000,0.386868,0.301271,0.000000,0.613546,0.000000,...,-0.032872,0.000000,0.000000,0.000000,-0.467745,,0.494841,0.000000,0.031674,0.332202
96,0.532340,0.211888,0.000000,-0.170174,-0.402052,0.000000,0.275493,-0.046243,-0.446002,-0.207918,...,0.112355,-0.305797,-0.039167,-0.187139,0.197546,0.494841,,-0.181371,0.057348,0.321050
97,0.000000,-0.266501,-0.121662,0.000000,0.000000,-0.190732,-0.333558,0.000000,-0.117162,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.181371,,0.050645,-0.378588
98,0.197795,0.027281,0.029683,0.045612,0.219351,0.063242,0.037259,0.173164,0.028102,0.064445,...,0.408610,0.159456,0.103250,0.053406,0.087213,0.031674,0.057348,0.050645,,0.045047


np.nanmean(norm_err)=np.float64(0.019717621483238208)
    np.nanmean(np.abs(norm_err))=np.float64(0.15463148608342847)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0984188431280834)
    


100%|██████████| 100/100 [00:00<00:00, 25461.69it/s]
100%|██████████| 100/100 [00:00<00:00, 363.70it/s]
5991it [00:00, 418857.10it/s]
100%|██████████| 100/100 [00:00<00:00, 295998.87it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,-2.074995e-07,-1.547616e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.310129e-07,-1.778589e-07,...,-1.631303e-07,-3.185973e-07,0.000000e+00,-2.349151e-07,-1.489646e-07,-1.724905e-07,-2.806875e-07,-0.000276,0.000000e+00,0.000000e+00
1,0.000000e+00,,-4.268560e-07,-2.783948e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-4.099375e-02,-3.178785e-07,...,-3.129886e-07,-2.110753e-06,0.000000e+00,-5.417663e-07,-2.646945e-07,-3.493627e-07,-1.047087e-06,0.015480,0.000000e+00,0.000000e+00
2,-2.074995e-07,-4.268560e-07,,-3.421659e-07,-1.488783e-07,-2.200920e-07,-2.807244e-07,-1.561008e-07,-2.850464e-07,-4.398264e-07,...,-3.627370e-07,-7.929192e-07,-2.039857e-07,-5.396714e-07,-3.280532e-07,-3.860268e-07,-6.788297e-07,-0.000304,-1.537666e-07,-3.863288e-07
3,-1.547616e-07,-2.783948e-07,-3.421659e-07,,-1.196293e-07,-2.315044e-02,-2.023229e-07,-1.242486e-07,-1.573282e-02,-3.008305e-07,...,-2.957291e-07,-5.029965e-07,-1.427895e-07,-3.739561e-07,-2.722613e-07,-3.110276e-07,-4.486883e-07,0.008464,-1.268433e-07,-2.477596e-07
4,0.000000e+00,0.000000e+00,-1.488783e-07,-1.196293e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.049269e-07,-1.329780e-07,...,-1.245690e-07,-1.985558e-07,0.000000e+00,-1.624837e-07,-1.161357e-07,-1.299540e-07,-1.831405e-07,-0.000217,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.724905e-07,-3.493627e-07,-3.860268e-07,-3.110276e-07,-1.299540e-07,-1.949799e-07,-2.337287e-07,-1.354234e-07,-9.778594e-02,-3.342172e-07,...,-3.314946e-07,-6.038573e-07,-1.577489e-07,-4.269772e-07,-3.022875e-07,,-5.272444e-07,0.009260,-1.385116e-07,-2.965563e-07
96,-2.806875e-07,-1.047087e-06,-6.788297e-07,-4.486883e-07,-1.831405e-07,-3.168694e-07,-4.598690e-07,-1.941932e-07,-3.553205e-07,-5.334197e-07,...,-4.847360e-07,-1.762666e-06,-2.436381e-07,-8.165428e-07,-4.247283e-07,-5.272444e-07,,-0.000389,-1.954773e-07,-8.327861e-07
97,-2.764869e-04,1.548021e-02,-3.036413e-04,8.463765e-03,-2.170512e-04,1.017826e-02,2.419555e-02,-2.249715e-04,1.476879e-02,-2.693734e-04,...,8.843104e-03,-4.313269e-04,-2.564387e-04,-3.295811e-04,8.197027e-03,9.259576e-03,-3.891840e-04,,1.567601e-02,-5.989700e-02
98,0.000000e+00,0.000000e+00,-1.537666e-07,-1.268433e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.458047e-02,-1.368643e-07,...,-1.324106e-07,-2.157769e-07,0.000000e+00,-1.664872e-07,-1.229226e-07,-1.385116e-07,-1.954773e-07,0.015676,,0.000000e+00


np.nanmean(norm_err)=np.float64(0.00031575758174815465)
    np.nanmean(np.abs(norm_err))=np.float64(0.007921175318735822)
    np.nanmedian(norm_err)=np.float64(-2.0228786148133454e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(2.93166044848394e-07)
    


In [7]:
# Normalized reconstruction error on the "tail" dataset, once per
# dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # The continuation lines' leading spaces are part of the printed string.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34888.57it/s]
100%|██████████| 100/100 [00:00<00:00, 427.33it/s]
5965it [00:00, 639529.24it/s]
100%|██████████| 100/100 [00:00<00:00, 325392.09it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 28328.41it/s]
100%|██████████| 100/100 [00:00<00:00, 415.39it/s]
5947it [00:00, 606780.33it/s]
100%|██████████| 100/100 [00:00<00:00, 383742.36it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.098321e-07,0.000000e+00,-1.038688e-07,0.000000e+00,0.000000e+00,-1.096546e-07,-1.163361e-07,-1.065419e-07,-1.036182e-07,...,0.000000e+00,0.000000e+00,-1.036652e-07,-1.035762e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,-1.098321e-07,,-1.039150e-07,-2.077898e-07,-1.037473e-07,-1.200115e-07,-2.400760e-07,-2.193547e-07,-2.131388e-07,-2.072885e-07,...,-1.037932e-07,-1.067762e-07,-2.073825e-07,-2.072044e-07,-1.038990e-07,-1.038074e-07,-1.098038e-07,-1.037737e-07,-1.067824e-07,-1.038229e-07
2,0.000000e+00,-1.039150e-07,,-1.068398e-07,0.000000e+00,0.000000e+00,-1.037561e-07,-1.037504e-07,-1.037099e-07,-1.065748e-07,...,0.000000e+00,0.000000e+00,-1.066244e-07,-1.065303e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
3,-1.038688e-07,-2.077898e-07,-1.068398e-07,,-1.066625e-07,-1.037163e-07,-2.074720e-07,-2.074607e-07,-2.073798e-07,-2.325983e-07,...,-1.067111e-07,-1.038857e-07,-2.400384e-07,-2.256171e-07,-1.068229e-07,-1.067261e-07,-1.038435e-07,-1.130120e-07,-1.038915e-07,-1.067424e-07
4,0.000000e+00,-1.037473e-07,0.000000e+00,-1.066625e-07,,0.000000e+00,-1.035889e-07,-1.035832e-07,-1.035429e-07,-1.063983e-07,...,0.000000e+00,0.000000e+00,-1.064479e-07,-1.063540e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-1.038074e-07,0.000000e+00,-1.067261e-07,0.000000e+00,0.000000e+00,-1.036488e-07,-1.036432e-07,-1.036028e-07,-1.064616e-07,...,0.000000e+00,0.000000e+00,-1.065112e-07,-1.064172e-07,0.000000e+00,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
96,0.000000e+00,-1.098038e-07,0.000000e+00,-1.038435e-07,0.000000e+00,0.000000e+00,-1.096264e-07,-1.199618e-07,-1.065153e-07,-1.035931e-07,...,0.000000e+00,0.000000e+00,-1.036400e-07,-1.035511e-07,0.000000e+00,0.000000e+00,,0.000000e+00,0.000000e+00,0.000000e+00
97,0.000000e+00,-1.037737e-07,0.000000e+00,-1.130120e-07,0.000000e+00,0.000000e+00,-1.036152e-07,-1.036096e-07,-1.035692e-07,-1.127155e-07,...,0.000000e+00,0.000000e+00,-1.127710e-07,-1.236189e-07,0.000000e+00,0.000000e+00,0.000000e+00,,0.000000e+00,0.000000e+00
98,0.000000e+00,-1.067824e-07,0.000000e+00,-1.038915e-07,0.000000e+00,0.000000e+00,-1.066146e-07,-1.066086e-07,-1.096284e-07,-1.036409e-07,...,0.000000e+00,0.000000e+00,-1.036879e-07,-1.035989e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.0666734826268942e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0666734826268942e-07)
    np.nanmedian(norm_err)=np.float64(-1.062631105446422e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.062631105446422e-07)
    
