In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-08-18T21:41:06.340649+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

numpy                             : 2.1.2
pandas                            : 2.2.3
hstrat                            : 1.20.10
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Label identifying this notebook's analysis; presumably the teeplot output
# subdirectory for saved figures -- no use of it is visible in this section,
# TODO confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
# bare expression so the value is echoed as the cell output
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A subsample of genomes is deserialized into hstrat surfaces, each
    surface's storage is shuffled in place, and a tree is built from the same
    population with both `hstrat.build_tree_trie` and
    `hstrat.build_tree_searchtable`. The pairwise distance matrices of the
    two trees are then compared elementwise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns "data_hex", "dstream_S",
        "dstream_storage_bitoffset", "dstream_storage_bitwidth",
        "dstream_T_bitoffset", and "dstream_T_bitwidth"; each "dstream_*"
        column must hold a single unique value. The frame is not modified.
    dstream_algo : str
        Dotted name of the dstream algorithm, e.g. "dstream.tilted_algo",
        resolved against the imported `dstream` package.
    sample_size : int, default 100
        Maximum number of genomes to compare (a size the naive
        reconstruction and dendropy can handle). Frames with fewer rows are
        used in full.
    seed : int, optional
        Seed for the subsampling and the storage shuffling. With the default
        of None, both draw from NumPy's global random state.

    Returns
    -------
    np.ndarray
        Square matrix of `(naive - shortcut) / ((naive + shortcut) / 2)`
        with rows and columns ordered by sorted taxon label. Entries are NaN
        wherever both distances are zero, which includes the diagonal.

    Raises
    ------
    ValueError
        If a "dstream_*" configuration column does not hold exactly one
        unique value, or if the two reconstructions disagree on taxon labels.
    """
    seeded_rng = None if seed is None else np.random.RandomState(seed)
    shuffle = np.random.shuffle if seeded_rng is None else seeded_rng.shuffle

    # `sample` returns a new frame, so the caller's data is left untouched;
    # random_state=None makes pandas fall back to NumPy's global state.
    genome_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=seeded_rng,
    )
    genome_df["taxon_label"] = np.arange(len(genome_df)).astype(str)

    # `.unique().item()` doubles as a check that every sampled genome shares
    # one surface configuration: it raises unless exactly one value exists.
    surface_kwargs = {
        column: genome_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(review): eval only exposes `dstream` as a global, but builtins are
    # still reachable -- pass trusted, hard-coded algorithm names only.
    surface_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        # permute each surface's stored values in place (original author's
        # note: "ensure synthetic data"); reaches into hstrat private fields
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _tree_distance_frame(naive_df)
    shortcut_dist = _tree_distance_frame(shortcut_df)

    # The arithmetic below is positional, so both matrices must list the
    # same taxa in the same order for the comparison to be meaningful.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions disagree on taxon labels"
        )

    naive_values = naive_dist.values
    shortcut_values = shortcut_dist.values
    # 0/0 is expected wherever both distances are zero (e.g., the diagonal);
    # keep the resulting NaN but silence the RuntimeWarning it would raise.
    with np.errstate(invalid="ignore"):
        return (naive_values - shortcut_values) / (
            naive_values / 2 + shortcut_values / 2
        )


def _tree_distance_frame(alife_df: pd.DataFrame) -> pd.DataFrame:
    """Return a tree's pairwise distance matrix, sorted by label on both axes."""
    distance_table = (
        apc.RosettaTree(alife_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # `_data` is a private attribute of the dendropy table holding the
    # label-keyed values; sorting both axes gives a canonical ordering.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


In [6]:
# Reconstruction-consistency report for the "sample" genome dataset: run the
# same comparison once per dstream algorithm and summarize the error matrix.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # nan-aware summaries, since the error matrix contains NaN entries;
    # signed values show bias, absolute values show magnitude
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10590.34it/s]
100%|██████████| 100/100 [00:00<00:00, 368.86it/s]
6114it [00:00, 561961.18it/s]
100%|██████████| 100/100 [00:00<00:00, 239537.64it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.266842,0.0,0.0,-0.177187,0.251780,0.000000,-0.278133,0.207817,0.000000,...,0.227668,0.0,0.228630,-0.032000,0.157448,-0.052092,0.0,0.000000,0.0,0.143905
1,0.266842,,0.0,0.0,0.180236,0.000000,0.000000,0.131682,0.000000,0.000000,...,0.000000,0.0,0.000000,-0.315396,0.000000,0.000000,0.0,0.498375,0.0,0.000000
2,0.000000,0.000000,,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000
3,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.079863,0.000000,0.000000,0.0,0.000000,0.0,0.000000
4,-0.177187,0.180236,0.0,0.0,,0.149942,0.000000,0.000000,0.091993,0.000000,...,0.113991,0.0,0.115205,-0.062293,0.053857,-0.249544,0.0,-0.364684,0.0,0.340936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.052092,0.000000,0.0,0.0,-0.249544,0.000000,0.000000,-0.165208,0.000000,0.000000,...,0.000000,0.0,0.000000,-0.061279,0.000000,,0.0,-0.095312,0.0,0.000000
96,0.000000,0.000000,0.0,0.0,0.000000,0.000000,-0.730219,0.000000,0.000000,-0.121357,...,0.000000,0.0,0.000000,0.314742,0.000000,0.000000,,0.000000,0.0,0.000000
97,0.000000,0.498375,0.0,0.0,-0.364684,0.448289,0.000000,-0.539257,0.325637,0.000000,...,0.377166,0.0,0.379815,-0.044363,0.216906,-0.095312,0.0,,0.0,0.288433
98,0.000000,0.000000,0.0,0.0,0.000000,0.000000,-0.100476,0.000000,0.000000,-0.509844,...,0.000000,0.0,0.000000,0.282032,0.000000,0.000000,0.0,0.000000,,0.000000


np.nanmean(norm_err)=np.float64(-0.001145112132652099)
    np.nanmean(np.abs(norm_err))=np.float64(0.06757643274885705)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 27921.08it/s]
100%|██████████| 100/100 [00:00<00:00, 421.26it/s]
5957it [00:00, 598540.36it/s]
100%|██████████| 100/100 [00:00<00:00, 288070.33it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,3.060827e-02,1.149639e-02,4.072363e-03,-5.646559e-03,2.774157e-02,-9.491954e-03,5.351801e-02,4.783957e-03,0.002398,...,-1.354200e-01,0.001954,3.159379e-02,-0.037350,4.895094e-03,5.736471e-03,2.404117e-02,6.263313e-02,6.774950e-02,1.368790e-02
1,0.030608,,1.550262e-02,4.079617e-03,-5.249125e-07,-4.282405e-07,-3.122322e-07,2.807825e-02,5.086111e-03,0.013429,...,-3.811829e-07,0.010181,6.052413e-02,0.007614,5.254550e-03,6.643611e-03,-2.122309e-07,2.879933e-02,3.837167e-02,2.056750e-02
2,0.011496,1.550262e-02,,0.000000e+00,-3.317081e-07,1.314532e-02,0.000000e+00,0.000000e+00,-2.485188e-07,0.040667,...,-2.246502e-07,0.029342,-2.881829e-07,0.009349,-2.584674e-07,0.000000e+00,1.051723e-02,5.649530e-03,9.817310e-03,-5.360337e-07
3,0.004072,4.079617e-03,0.000000e+00,,-1.878554e-07,3.674951e-03,0.000000e+00,0.000000e+00,-6.512941e-02,0.005100,...,-1.479310e-07,0.004103,-1.720776e-07,0.011310,-1.681201e-07,0.000000e+00,3.159959e-03,-1.217251e-02,1.260726e-03,-2.322788e-07
4,-0.005647,-5.249125e-07,-3.317081e-07,-1.878554e-07,,-4.470799e-07,-4.083045e-07,-1.641587e-07,-4.841465e-07,-0.007835,...,-4.451138e-07,-0.005706,-5.540278e-07,-0.009154,-5.030053e-07,-3.335074e-07,-1.797389e-07,-1.688148e-07,-4.660322e-07,-9.506756e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.005736,6.643611e-03,0.000000e+00,0.000000e+00,-3.335074e-07,5.633423e-03,0.000000e+00,0.000000e+00,-3.410041e-02,0.008009,...,-2.254740e-07,0.005798,-2.868187e-07,0.018913,-3.369317e-07,,4.507284e-03,-1.709309e-02,1.963214e-03,-5.049581e-07
96,0.024041,-2.122309e-07,1.051723e-02,3.159959e-03,-1.797389e-07,-1.556726e-07,0.000000e+00,2.226134e-02,3.731936e-03,0.009874,...,-1.428512e-07,0.007998,4.268577e-02,0.005404,3.821825e-03,4.507284e-03,,2.271220e-02,2.827482e-02,1.262657e-02
97,0.062633,2.879933e-02,5.649530e-03,-1.217251e-02,-1.688148e-07,2.614821e-02,0.000000e+00,4.907374e-02,-1.428096e-02,0.009365,...,-1.358637e-07,0.007661,2.588367e-02,0.005100,-1.460967e-02,-1.709309e-02,2.271220e-02,,6.306589e-02,6.701919e-03
98,0.067749,3.837167e-02,9.817310e-03,1.260726e-03,-4.660322e-07,3.380502e-02,-2.714342e-07,7.118682e-02,1.543889e-03,0.012221,...,-3.491489e-07,0.009472,3.658801e-02,0.006845,1.590320e-03,1.963214e-03,2.827482e-02,6.306589e-02,,1.255922e-02


np.nanmean(norm_err)=np.float64(0.0047125085005786235)
    np.nanmean(np.abs(norm_err))=np.float64(0.014512759577478494)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.003805079311351934)
    


In [7]:
# Reconstruction-consistency report for the "tail" genome dataset: run the
# same comparison once per dstream algorithm and summarize the error matrix.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # nan-aware summaries, since the error matrix contains NaN entries;
    # signed values show bias, absolute values show magnitude
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34709.57it/s]
100%|██████████| 100/100 [00:00<00:00, 417.38it/s]
5977it [00:00, 561074.17it/s]
100%|██████████| 100/100 [00:00<00:00, 320420.47it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 35769.26it/s]
100%|██████████| 100/100 [00:00<00:00, 1001.27it/s]
5946it [00:00, 573898.46it/s]
100%|██████████| 100/100 [00:00<00:00, 331827.85it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.036894e-07,-2.070954e-07,-2.070176e-07,-2.189881e-07,-2.643647e-07,-1.035716e-07,-1.200121e-07,-1.064098e-07,-2.074331e-07,...,-1.063782e-07,-1.064326e-07,-1.037740e-07,-1.066436e-07,-2.071769e-07,-2.072434e-07,-1.035082e-07,-2.075317e-07,-1.037780e-07,-1.064499e-07
1,-1.036894e-07,,-1.279396e-07,-1.095226e-07,-1.036189e-07,-1.035625e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.201237e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.128545e-07,-1.065853e-07,0.000000e+00,-1.067378e-07,0.000000e+00,0.000000e+00
2,-2.070954e-07,-1.279396e-07,,-2.187291e-07,-2.069548e-07,-2.068423e-07,-1.063827e-07,-1.036275e-07,-1.034499e-07,-2.398671e-07,...,-1.034200e-07,-1.034714e-07,-1.096606e-07,-1.036709e-07,-2.253734e-07,-2.128712e-07,-1.125918e-07,-2.131754e-07,-1.096651e-07,-1.034878e-07
3,-2.070176e-07,-1.095226e-07,-2.187291e-07,,-2.068771e-07,-2.067647e-07,-1.063417e-07,-1.035885e-07,-1.034111e-07,-2.191058e-07,...,-1.033812e-07,-1.034326e-07,-1.163009e-07,-1.036319e-07,-2.188200e-07,-2.127891e-07,-1.093205e-07,-2.130930e-07,-1.128648e-07,-1.034489e-07
4,-2.189881e-07,-1.036189e-07,-2.069548e-07,-2.068771e-07,,-2.187051e-07,-1.035012e-07,-1.095832e-07,-1.063356e-07,-2.072920e-07,...,-1.063040e-07,-1.063583e-07,-1.037033e-07,-1.065690e-07,-2.070361e-07,-2.071026e-07,-1.034380e-07,-2.073905e-07,-1.037074e-07,-1.063756e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.072434e-07,-1.065853e-07,-2.128712e-07,-2.127891e-07,-2.071026e-07,-2.069899e-07,-1.161887e-07,-1.037016e-07,-1.035238e-07,-2.132280e-07,...,-1.034938e-07,-1.035453e-07,-1.066747e-07,-1.037450e-07,-2.129573e-07,,-1.063939e-07,-2.194690e-07,-1.066790e-07,-1.035617e-07
96,-1.035082e-07,0.000000e+00,-1.125918e-07,-1.093205e-07,-1.034380e-07,-1.033818e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.127914e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.160672e-07,-1.063939e-07,,-1.065459e-07,0.000000e+00,0.000000e+00
97,-2.075317e-07,-1.067378e-07,-2.131754e-07,-2.130930e-07,-2.073905e-07,-2.072775e-07,-1.096783e-07,-1.038459e-07,-1.036676e-07,-2.135332e-07,...,-1.036376e-07,-1.036892e-07,-1.068275e-07,-1.038895e-07,-2.132617e-07,-2.194690e-07,-1.065459e-07,,-1.068318e-07,-1.037056e-07
98,-1.037780e-07,0.000000e+00,-1.096651e-07,-1.128648e-07,-1.037074e-07,-1.036509e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.098545e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.097108e-07,-1.066790e-07,0.000000e+00,-1.068318e-07,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.0241847791964168e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0241847791964168e-07)
    np.nanmedian(norm_err)=np.float64(-1.0381510654575895e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0381510654575895e-07)
    
