In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-08-31T00:23:15.692624+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10
numpy                             : 2.1.2
pandas                            : 2.2.3

Watermark: 2.4.3



In [4]:
# Subdirectory label for this notebook's outputs; presumably consumed by
# teeplot when saving figures -- no use is visible in this section (TODO confirm).
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare trailing expression: echoes the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build a label-sorted pairwise phylogenetic distance table for a tree.

    Parameters
    ----------
    tree_df : pd.DataFrame
        Tree dataframe as returned by the `hstrat.build_tree_*` calls below.

    Returns
    -------
    pd.DataFrame
        Distance table with rows and columns each sorted by label, so that
        tables built from different reconstructions of the same taxa can be
        compared position-by-position.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): reaches into the table's underscore-prefixed `_data`
    # attribute; presumably a nested label -> label -> distance mapping --
    # confirm against the installed dendropy version.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: int | None = None,
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstructions.

    A random subset of genomes is deserialized into surfaces using
    `dstream_algo`, each surface's storage is shuffled in place, and a tree
    is reconstructed twice: with `hstrat.build_tree_trie` ("naive") and with
    `hstrat.build_tree_searchtable` ("shortcut"). The pairwise tip distances
    of both trees are then compared elementwise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column plus the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`, each holding a single
        unique value within the sampled rows. The frame is not modified.
    dstream_algo : str
        Expression naming the algorithm, evaluated with only `dstream` in
        scope (e.g. "dstream.tilted_algo"). It overrides any `dstream_algo`
        column already present in `raw_genome_df`.
    sample_size : int, default 100
        Maximum number of genomes to sample; kept small so the dendropy
        distance matrix and the naive reconstruction stay tractable. Clamped
        to the number of available rows.
    seed : int, optional
        Seed for the row sampling and the storage shuffle. When None (the
        default), NumPy's global random state is used, as before.

    Returns
    -------
    np.ndarray
        Square matrix of `(naive - shortcut) / ((naive + shortcut) / 2)`.
        Entries where both distances are zero (such as the diagonal) are NaN.

    Raises
    ------
    ValueError
        If one of the configuration columns does not hold exactly one unique
        value, or if the two reconstructions yield different taxon labels.
    """
    # With no seed, defer to the legacy global RNG so that an earlier
    # `np.random.seed(...)` call keeps governing the results.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # `sample` returns a new frame, so the column writes below do not touch
    # the caller's data.
    genome_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=None if seed is None else rng,
    )
    genome_df["taxon_label"] = np.arange(len(genome_df)).astype(str)
    genome_df["dstream_algo"] = dstream_algo

    # `.unique().item()` doubles as a check that each setting is constant
    # across the sampled rows (it raises otherwise).
    surf_kwargs = {
        column: genome_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE: `eval` is applied to a caller-supplied string with only
    # `dstream` in scope; do not pass untrusted input here.
    surf_kwargs["dstream_algo"] = eval(str(dstream_algo), {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        rng.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The comparison below is positional, so both tables must carry the
    # same labels in the same order.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions disagree on taxon labels; "
            "distance matrices cannot be compared elementwise"
        )

    # 0 / 0 (e.g. on the diagonal) intentionally yields NaN; callers
    # aggregate with nan-aware reductions, so silence the RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist.values - shortcut_dist.values) / (
            naive_dist.values / 2 + shortcut_dist.values / 2
        )


In [6]:
# Reconstruction-consistency check on the "sample" genome dataset.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")

# Run the same comparison once per dstream algorithm and report the matrix
# together with nan-aware summary statistics of the normalized error.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10770.64it/s]
100%|██████████| 100/100 [00:00<00:00, 382.75it/s]
6155it [00:00, 664622.74it/s]
100%|██████████| 100/100 [00:00<00:00, 220289.08it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.090900,1.226226,-0.208083,0.571221,0.000000,0.000000,0.509104,0.000000,0.000000,...,-0.168657,0.000000,0.000000,0.576454,0.000000,0.766981,-0.091117,0.612996,0.000000,0.042871
1,-0.090900,,0.039804,0.000000,0.034893,0.000000,0.057724,0.032929,0.000000,-0.106823,...,0.000000,-0.242406,0.000000,0.035049,0.027730,0.039868,0.000000,0.036096,-0.270582,0.414419
2,1.226226,0.039804,,0.078577,0.354274,-0.316689,0.113685,0.072345,-0.232419,0.732488,...,0.066781,0.134235,-0.360273,0.083425,0.108062,0.117105,0.039887,0.089610,0.155108,-0.725305
3,-0.208083,0.000000,0.078577,,0.061493,0.000000,0.202709,0.055643,0.000000,-0.315860,...,0.000000,-0.424585,0.000000,0.061978,0.088642,0.078827,0.000000,0.065329,-0.519301,0.430903
4,0.571221,0.034893,0.354274,0.061493,,-0.319815,0.081091,0.258322,-0.164745,0.511708,...,0.054025,0.115905,-0.252919,0.579900,0.078189,0.355298,0.034957,0.612551,0.131143,-0.074191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.766981,0.039868,0.117105,0.078827,0.355298,-0.318047,0.114211,0.000000,-0.233517,0.736124,...,0.066962,0.134479,-0.362032,0.000000,0.108537,,0.039951,0.000000,0.155433,-0.280596
96,-0.091117,0.000000,0.039887,0.000000,0.034957,0.000000,0.057899,0.032986,0.000000,-0.107123,...,0.000000,-0.242847,0.000000,0.035113,0.027810,0.039951,,0.036164,-0.271131,0.415170
97,0.612996,0.036096,0.089610,0.065329,0.612551,-0.248901,0.087897,0.000000,-0.178805,0.557052,...,0.056963,0.120343,-0.275054,0.000000,0.084497,0.000000,0.036164,,0.136853,-0.249969
98,0.000000,-0.270582,0.155108,-0.519301,0.131143,0.000000,0.000000,0.122027,0.000000,0.000000,...,-0.445112,0.000000,0.000000,0.131876,0.000000,0.155433,-0.271131,0.136853,,0.033673


np.nanmean(norm_err)=np.float64(0.01943582933171962)
    np.nanmean(np.abs(norm_err))=np.float64(0.15519733760666635)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.08654673605322241)
    


100%|██████████| 100/100 [00:00<00:00, 33909.81it/s]
100%|██████████| 100/100 [00:00<00:00, 452.94it/s]
5969it [00:00, 590857.18it/s]
100%|██████████| 100/100 [00:00<00:00, 179627.58it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.936169e-07,-7.393341e-07,-2.478667e-07,-2.218695e-07,-2.936898e-03,-0.071878,-5.164890e-07,-1.457342e-06,-0.007448,...,-1.438893e-06,-2.611217e-07,-4.335244e-07,-1.000421e-06,1.442628e-03,-7.457536e-07,-2.333090e-03,-2.745220e-03,-5.919395e-02,-3.767374e-07
1,-2.936169e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.064909e-02,-0.001238,-2.022105e-07,-3.627525e-07,0.005988,...,-4.114833e-07,0.000000e+00,-1.649287e-07,-3.307917e-07,6.082598e-03,-2.420296e-07,-8.966113e-03,-1.013514e-02,-1.465895e-07,-1.575048e-07
2,-7.393341e-07,0.000000e+00,,0.000000e+00,0.000000e+00,4.547827e-03,0.001798,-3.709881e-07,-1.421524e-06,0.000000,...,-2.440390e-06,0.000000e+00,-2.493768e-07,-9.813808e-07,-1.747369e-03,-4.811170e-07,3.246576e-03,4.102790e-03,-2.097078e-07,-2.310242e-07
3,-2.478667e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.483106e-03,-0.047590,-1.513738e-07,-2.435327e-07,-0.004594,...,-2.424936e-07,0.000000e+00,-1.361060e-07,-2.112815e-07,9.738433e-04,-1.846478e-07,-1.311661e-03,-1.432671e-03,-3.962478e-02,-1.243379e-07
4,-2.218695e-07,0.000000e+00,0.000000e+00,0.000000e+00,,1.652984e-03,0.001062,-2.111496e-07,-2.591979e-07,0.000000,...,-2.805556e-07,0.000000e+00,-1.395755e-07,-2.396037e-07,-1.044579e-03,-1.910920e-07,1.442797e-03,1.590167e-03,-1.262128e-07,-1.336338e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-7.457536e-07,-2.420296e-07,-4.811170e-07,-1.846478e-07,-1.910920e-07,-2.330466e-03,-0.231609,-4.349503e-07,-9.531575e-07,-0.006393,...,-9.452310e-07,-2.195120e-07,-4.523070e-07,-7.339215e-07,1.279074e-03,,-1.933415e-03,-2.208177e-03,-7.403883e-02,-3.314186e-07
96,-2.333090e-03,-8.966113e-03,3.246576e-03,-1.311661e-03,1.442797e-03,0.000000e+00,0.037276,1.624230e-03,-2.862129e-03,-0.001464,...,-4.892713e-02,2.485943e-02,-1.328075e-03,-1.295548e-02,-1.458568e-07,-1.933415e-03,,-2.557695e-07,-1.182656e-03,-8.279167e-02
97,-2.745220e-03,-1.013514e-02,4.102790e-03,-1.432671e-03,1.590167e-03,-3.136963e-07,0.062799,1.813481e-03,-3.508115e-03,-0.001617,...,-6.191518e-02,2.782481e-02,-1.452257e-03,-1.554627e-02,-3.159952e-07,-2.208177e-03,-2.557695e-07,,-1.280153e-03,-4.350072e-02
98,-5.919395e-02,-1.465895e-07,-2.097078e-07,-3.962478e-02,-1.262128e-07,-1.320248e-03,-0.059273,-2.744135e-07,-5.328815e-02,-0.004193,...,-4.161526e-07,-1.380147e-07,-5.644378e-02,-3.693355e-07,9.006816e-04,-7.403883e-02,-1.182656e-03,-1.280153e-03,,-2.292340e-07


np.nanmean(norm_err)=np.float64(-0.0013781786433584146)
    np.nanmean(np.abs(norm_err))=np.float64(0.010632364492562024)
    np.nanmedian(norm_err)=np.float64(-3.0063200251359326e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0011918655769100738)
    


In [7]:
# Reconstruction-consistency check on the "tail" genome dataset.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")

# Run the same comparison once per dstream algorithm and report the matrix
# together with nan-aware summary statistics of the normalized error.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 33447.40it/s]
100%|██████████| 100/100 [00:00<00:00, 456.29it/s]
5941it [00:00, 631500.04it/s]
100%|██████████| 100/100 [00:00<00:00, 360026.09it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32691.38it/s]
100%|██████████| 100/100 [00:00<00:00, 985.71it/s]
5923it [00:00, 611847.96it/s]
100%|██████████| 100/100 [00:00<00:00, 376508.44it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.065761e-07,-2.324709e-07,-1.036372e-07,-2.071685e-07,-1.063426e-07,-2.251104e-07,-2.070529e-07,-2.325418e-07,-1.125496e-07,...,-1.036317e-07,-2.473187e-07,-2.192435e-07,-1.096184e-07,-1.094900e-07,-1.034187e-07,-2.396921e-07,-1.095595e-07,-1.036876e-07,-1.095322e-07
1,-1.065761e-07,,-1.067480e-07,0.000000e+00,-1.038187e-07,0.000000e+00,-1.065302e-07,-1.037607e-07,-1.067780e-07,0.000000e+00,...,0.000000e+00,-1.066593e-07,-1.068077e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.067144e-07,0.000000e+00,0.000000e+00,0.000000e+00
2,-2.324709e-07,-1.067480e-07,,-1.037998e-07,-2.074933e-07,-1.065138e-07,-2.254940e-07,-2.073774e-07,-2.481020e-07,-1.127414e-07,...,-1.037943e-07,-2.326689e-07,-2.196073e-07,-1.098003e-07,-1.096715e-07,-1.035806e-07,-2.328000e-07,-1.097412e-07,-1.038504e-07,-1.097138e-07
3,-1.036372e-07,0.000000e+00,-1.037998e-07,,-1.480870e-07,0.000000e+00,-1.035938e-07,-1.066142e-07,-1.038281e-07,0.000000e+00,...,0.000000e+00,-1.037159e-07,-1.038562e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.037680e-07,0.000000e+00,0.000000e+00,0.000000e+00
4,-2.071685e-07,-1.038187e-07,-2.074933e-07,-1.480870e-07,,-1.035972e-07,-2.070818e-07,-2.131164e-07,-2.075499e-07,-1.035362e-07,...,-1.066697e-07,-2.073257e-07,-2.076061e-07,-1.038001e-07,-1.036850e-07,-1.064440e-07,-2.074298e-07,-1.037472e-07,-1.067289e-07,-1.037227e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.034187e-07,0.000000e+00,-1.035806e-07,0.000000e+00,-1.064440e-07,0.000000e+00,-1.033755e-07,-1.094349e-07,-1.036088e-07,0.000000e+00,...,0.000000e+00,-1.034971e-07,-1.036368e-07,0.000000e+00,0.000000e+00,,-1.035489e-07,0.000000e+00,0.000000e+00,0.000000e+00
96,-2.396921e-07,-1.067144e-07,-2.328000e-07,-1.037680e-07,-2.074298e-07,-1.064803e-07,-2.254190e-07,-2.073140e-07,-2.328711e-07,-1.127039e-07,...,-1.037625e-07,-2.399026e-07,-2.195361e-07,-1.097647e-07,-1.096360e-07,-1.035489e-07,,-1.097057e-07,-1.038185e-07,-1.096783e-07
97,-1.095595e-07,0.000000e+00,-1.097412e-07,0.000000e+00,-1.037472e-07,0.000000e+00,-1.095110e-07,-1.036893e-07,-1.097728e-07,0.000000e+00,...,0.000000e+00,-1.096474e-07,-1.165118e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.097057e-07,,0.000000e+00,0.000000e+00
98,-1.036876e-07,0.000000e+00,-1.038504e-07,0.000000e+00,-1.067289e-07,0.000000e+00,-1.036442e-07,-1.097361e-07,-1.038787e-07,0.000000e+00,...,0.000000e+00,-1.037664e-07,-1.039069e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.038185e-07,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.0683642923254183e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0683642923254183e-07)
    np.nanmedian(norm_err)=np.float64(-1.039408389532857e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.039408389532857e-07)
    
