In [1]:
# Register the `watermark` IPython extension so the %watermark magic used
# in a later cell is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance for this run: timestamp, interpreter and machine details,
# and the versions of the imported packages (see the output below).
%watermark -diwmuv -iv


Last updated: 2025-05-10T17:58:54.133182+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

hstrat                            : 1.20.10
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3
numpy                             : 2.1.2
downstream                        : 1.14.3

Watermark: 2.4.3



In [4]:
# Subdirectory label for this notebook's outputs.
# NOTE(review): presumably meant to be passed to teeplot when saving figures,
# but no cell visible here reads it — confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance matrix of a reconstructed tree.

    Parameters
    ----------
    tree_df : pd.DataFrame
        Tree dataframe as returned by `hstrat.build_tree_trie` or
        `hstrat.build_tree_searchtable`.

    Returns
    -------
    pd.DataFrame
        Distance matrix with rows and columns sorted by label, so matrices
        built from different reconstructions of the same taxa line up
        element-wise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data
    # table; this may break on a dendropy upgrade.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare trie vs. searchtable tree reconstruction on sampled genomes.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled, and a tree is reconstructed from the resulting
    population twice: once with `hstrat.build_tree_trie` and once with
    `hstrat.build_tree_searchtable`. The pairwise distance matrices of the
    two trees are then compared element-wise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`; each `dstream_*`
        column must hold exactly one distinct value among the sampled rows.
    dstream_algo : str
        Dotted name of the dstream algorithm to deserialize with, e.g.
        `"dstream.tilted_algo"`. Overrides any `dstream_algo` column present
        in `raw_genome_df`.
    sample_size : int, default 100
        Maximum number of genomes to sample; capped at `len(raw_genome_df)`.
    seed : int, optional
        Seed for the genome sampling and the storage shuffle. If None, both
        draw from NumPy's global random state.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / mean(naive, shortcut)` for each
        pair of taxa. Entries where both distances are zero (such as the
        diagonal) are NaN.
    """
    # seed=None keeps the legacy behaviour of using NumPy's global RNG.
    random_state = None if seed is None else np.random.RandomState(seed)
    shuffle = (
        np.random.shuffle if random_state is None else random_state.shuffle
    )

    # sample to size dendropy/naive can handle; capping at the frame length
    # avoids a ValueError on inputs smaller than the requested sample
    raw_genome_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=random_state,
    )
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    # `.unique().item()` raises unless the column holds exactly one distinct
    # value, so inconsistent surface configuration fails loudly here.
    kwargs = {
        column: raw_genome_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE: eval resolves the dotted name with only `dstream` supplied as a
    # global, but it still executes arbitrary expressions; pass trusted,
    # hard-coded algorithm names only.
    kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **kwargs)
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_values = _pairwise_distance_frame(naive_df).values
    shortcut_values = _pairwise_distance_frame(shortcut_df).values

    # 0 / 0 on the diagonal is expected and intentionally left as NaN for the
    # nan-aware summary statistics downstream; silence the RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_values - shortcut_values) / (
            naive_values / 2 + shortcut_values / 2
        )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same comparison for each surface algorithm and report the full
# error matrix followed by its nan-aware summary statistics.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The continuation lines of the literal are indented by exactly four
    # spaces on purpose: that whitespace is part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10422.96it/s]
100%|██████████| 100/100 [00:00<00:00, 375.80it/s]
6126it [00:00, 664536.57it/s]
100%|██████████| 100/100 [00:00<00:00, 220173.44it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.026924,0.000000,0.000000,0.030225,0.000000,0.000000,0.000000,0.029894,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.032802,-0.046720,-0.035554,0.000000,0.000000
1,0.000000,,0.051073,0.000000,0.000000,0.064419,0.000000,0.000000,0.000000,0.062934,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.077376,-0.079165,-0.051669,0.000000,0.000000
2,0.026924,0.051073,,0.050607,0.053598,0.000000,0.045597,0.023227,0.048361,0.000000,...,0.022688,0.028036,0.031684,0.031948,0.097940,0.000000,0.032658,0.022687,0.050051,0.036884
3,0.000000,0.000000,0.050607,,0.000000,0.063679,0.000000,0.000000,0.000000,0.062228,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.076311,-0.466313,-0.286621,0.000000,0.000000
4,0.000000,0.000000,0.053598,0.000000,,0.068488,0.000000,0.000000,0.000000,0.066812,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.083322,-0.082162,-0.052929,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.032802,0.077376,0.000000,0.076311,0.083322,0.000000,0.065465,0.027474,0.071316,0.000000,...,0.026723,0.034467,0.040151,0.040576,0.281334,,0.041728,0.026722,0.075054,0.048884
96,-0.046720,-0.079165,0.032658,-0.466313,-0.082162,0.037644,-0.072430,0.041056,-0.075870,0.037133,...,-0.040212,-0.048383,-0.053715,0.054093,0.000000,0.041728,,0.000000,0.461616,-0.060997
97,-0.035554,-0.051669,0.022687,-0.286621,-0.052929,0.024986,-0.048713,0.032176,-0.050245,0.024760,...,-0.031655,-0.036509,-0.039465,0.039669,0.000000,0.026722,0.000000,,0.284843,-0.043259
98,0.000000,0.000000,0.050051,0.000000,0.000000,0.062801,0.000000,0.000000,0.000000,0.061389,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.075054,0.461616,0.284843,,0.000000


np.nanmean(norm_err)=np.float64(0.010286757146820506)
    np.nanmean(np.abs(norm_err))=np.float64(0.055545488381026464)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 30316.62it/s]
100%|██████████| 100/100 [00:00<00:00, 393.27it/s]
5973it [00:00, 602037.29it/s]
100%|██████████| 100/100 [00:00<00:00, 190217.87it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-3.205261e-07,2.903253e-02,-2.028666e-07,0.000000e+00,4.404896e-02,0.000000e+00,0.000000e+00,2.574796e-02,0.007878,...,-1.047844e-03,2.669964e-03,-1.766463e-07,2.594606e-02,3.405457e-02,3.329453e-02,3.343951e-02,2.708758e-03,0.004216,0.004701
1,-3.205261e-07,,-2.587615e-07,-6.123344e-07,-2.982866e-07,-2.262900e-07,-5.001399e-07,-2.436398e-07,-4.964899e-07,-0.039008,...,3.739001e-02,-9.248200e-07,-6.196130e-07,-3.703121e-07,-6.781760e-07,-6.514881e-07,-2.638266e-07,-4.768616e-07,-0.047383,-0.014609
2,2.903253e-02,-2.587615e-07,,2.836381e-02,2.798393e-02,2.400779e-02,4.862240e-02,2.505823e-02,3.469556e-02,0.006678,...,-8.755001e-04,2.298085e-03,-2.561145e-03,2.540332e-02,-3.399821e-02,3.222887e-02,2.620237e-02,2.326793e-03,0.003534,0.004245
3,-2.028666e-07,-6.123344e-07,2.836381e-02,,-1.843623e-07,-2.126755e-02,-2.273377e-07,-1.553232e-07,2.516906e-02,0.007609,...,-1.009079e-03,2.587478e-03,-3.443913e-07,2.521551e-02,3.313818e-02,3.241806e-02,3.262737e-02,2.623905e-03,0.004062,0.004603
4,0.000000e+00,-2.982866e-07,2.798393e-02,-1.843623e-07,,0.000000e+00,0.000000e+00,0.000000e+00,2.483993e-02,0.007459,...,-9.871462e-04,2.541501e-03,-1.696745e-07,2.480394e-02,3.262077e-02,3.192273e-02,3.216690e-02,2.576638e-03,0.003976,0.004548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3.329453e-02,-6.514881e-07,3.222887e-02,3.241806e-02,3.192273e-02,2.684995e-02,1.230170e-01,2.817067e-02,7.146085e-02,0.007974,...,-1.062140e-03,2.699058e-03,-2.924577e-03,2.943256e-02,3.791923e-02,,2.962494e-02,2.738709e-03,0.004271,0.004734
96,3.343951e-02,-2.638266e-07,2.620237e-02,3.262737e-02,3.216690e-02,2.738750e-02,0.000000e+00,-5.811854e-02,2.329349e-02,0.006780,...,-8.900856e-04,2.330423e-03,-1.579398e-07,2.290584e-02,3.022516e-02,2.962494e-02,,2.359948e-03,0.003591,0.004287
97,2.708758e-03,-4.768616e-07,2.326793e-03,2.623905e-03,2.576638e-03,2.105684e-03,3.597225e-03,2.225706e-03,2.256647e-03,0.005195,...,-7.375436e-02,-3.005526e-07,4.227077e-02,2.984734e-03,2.814496e-03,2.738709e-03,2.359948e-03,,0.011559,0.002741
98,4.216216e-03,-4.738298e-02,3.533520e-03,4.061766e-03,3.976252e-03,3.153304e-03,5.944093e-03,3.358370e-03,3.411849e-03,-0.048220,...,-4.811883e-07,1.129624e-02,8.253074e-03,4.731062e-03,4.411420e-03,4.271337e-03,3.591459e-03,1.155881e-02,,-0.074655


np.nanmean(norm_err)=np.float64(0.004685499522103227)
    np.nanmean(np.abs(norm_err))=np.float64(0.013523113348847551)
    np.nanmedian(norm_err)=np.float64(0.0016015943397011457)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0028692767940893776)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same comparison for each surface algorithm and report the full
# error matrix followed by its nan-aware summary statistics.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The continuation lines of the literal are indented by exactly four
    # spaces on purpose: that whitespace is part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35154.67it/s]
100%|██████████| 100/100 [00:00<00:00, 440.23it/s]
5937it [00:00, 663352.32it/s]
100%|██████████| 100/100 [00:00<00:00, 374157.36it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 30961.13it/s]
100%|██████████| 100/100 [00:00<00:00, 1017.00it/s]
5953it [00:00, 653784.71it/s]
100%|██████████| 100/100 [00:00<00:00, 320665.44it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.037885e-07,-2.073482e-07,-1.068344e-07,-2.331199e-07,-2.073948e-07,-2.129698e-07,-2.191840e-07,-1.037033e-07,-1.068386e-07,...,-1.097117e-07,-2.074988e-07,-2.195983e-07,-1.039196e-07,-2.072439e-07,-1.066131e-07,-1.095732e-07,-1.065092e-07,-1.127854e-07,-2.071837e-07
1,-1.037885e-07,,-1.064760e-07,0.000000e+00,-1.037785e-07,-1.095593e-07,-1.035067e-07,-1.035508e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.065554e-07,-1.037357e-07,0.000000e+00,-1.094750e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.126740e-07
2,-2.073482e-07,-1.064760e-07,,-1.037224e-07,-2.073281e-07,-2.127600e-07,-2.067858e-07,-2.068736e-07,-1.126707e-07,-1.037263e-07,...,-1.035434e-07,-2.189794e-07,-2.072427e-07,-1.129261e-07,-2.126012e-07,-1.035138e-07,-1.034201e-07,-1.034158e-07,-1.033963e-07,-2.125380e-07
3,-1.068344e-07,0.000000e+00,-1.037224e-07,,-1.068237e-07,-1.037457e-07,-1.095966e-07,-1.065825e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.037978e-07,-1.067784e-07,0.000000e+00,-1.036702e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036401e-07
4,-2.331199e-07,-1.037785e-07,-2.073281e-07,-1.068237e-07,,-2.073746e-07,-2.129486e-07,-2.191615e-07,-1.036932e-07,-1.068279e-07,...,-1.097004e-07,-2.074787e-07,-2.195757e-07,-1.039095e-07,-2.072238e-07,-1.066025e-07,-1.095620e-07,-1.064986e-07,-1.127735e-07,-2.071637e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.066131e-07,0.000000e+00,-1.035138e-07,0.000000e+00,-1.066025e-07,-1.035370e-07,-1.093638e-07,-1.063622e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.035889e-07,-1.065573e-07,0.000000e+00,-1.034618e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.034318e-07
96,-1.095732e-07,0.000000e+00,-1.034201e-07,0.000000e+00,-1.095620e-07,-1.034433e-07,-1.062170e-07,-1.234588e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.034950e-07,-1.237218e-07,0.000000e+00,-1.033682e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-1.033383e-07
97,-1.065092e-07,0.000000e+00,-1.034158e-07,0.000000e+00,-1.064986e-07,-1.034390e-07,-1.158929e-07,-1.062588e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.034908e-07,-1.064535e-07,0.000000e+00,-1.033639e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.033340e-07
98,-1.127854e-07,0.000000e+00,-1.033963e-07,0.000000e+00,-1.127735e-07,-1.034195e-07,-1.061919e-07,-1.092817e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.034712e-07,-1.094876e-07,0.000000e+00,-1.033444e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.033145e-07


np.nanmean(norm_err)=np.float64(-8.963342316487717e-08)
    np.nanmean(np.abs(norm_err))=np.float64(8.963342316487717e-08)
    np.nanmedian(norm_err)=np.float64(-1.0374206444529774e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0374206444529774e-07)
    
