In [1]:
# Load the watermark IPython extension so the %watermark magic is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record when and where this notebook ran (timestamp, interpreter, machine)
# along with the versions of the imported packages, for reproducibility.
%watermark -diwmuv -iv


Last updated: 2025-07-06T00:30:44.011194+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1029-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# NOTE(review): presumably the output subdirectory for teeplot-saved figures;
# nothing in the visible cells reads this name -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _resolve_dstream_algo(dotted_name: str):
    """Look up a dstream algorithm from a dotted name like "dstream.tilted_algo".

    Plain attribute access replaces the previous `eval` call, so the string can
    only ever name something reachable from the imported `dstream` module.

    Raises
    ------
    ValueError
        If the name is not of the form ``dstream.<attr>[.<attr>...]``.
    AttributeError
        If any attribute along the dotted path does not exist.
    """
    root, *attr_path = dotted_name.strip().split(".")
    if root != "dstream" or not attr_path:
        raise ValueError(
            f"expected a name of the form 'dstream.<algo>', got {dotted_name!r}"
        )
    resolved = dstream
    for attr in attr_path:
        resolved = getattr(resolved, attr)
    return resolved


def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return a tree's pairwise phylogenetic distance matrix, sorted by label.

    Rows and columns are both sorted so that two matrices built over the same
    set of labels can be compared element-wise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the data table; it is
    # used here as a nested mapping that pandas can ingest directly.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_samples: int = 100,
    seed=None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled, and a tree is reconstructed from the same population
    with both `hstrat.build_tree_trie` and `hstrat.build_tree_searchtable`.
    The result is the element-wise difference between the two pairwise
    distance matrices, normalized by their element-wise mean.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column plus the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`, each holding exactly
        one distinct value among the sampled rows. The caller's frame is not
        modified.
    dstream_algo : str
        Dotted name of the dstream algorithm, e.g. ``"dstream.tilted_algo"``.
    n_samples : int, default 100
        Number of genomes to sample; keep this small enough for the
        dendropy/naive reconstruction to handle.
    seed : int, optional
        Seed for both the row sampling and the storage shuffling. When
        omitted, behavior is unchanged: pandas draws an unseeded sample and
        the shuffle uses NumPy's global random state.

    Returns
    -------
    np.ndarray
        Square matrix ``(naive - shortcut) / ((naive + shortcut) / 2)`` with
        rows and columns in sorted (string) taxon-label order. Entries where
        both distances are zero, including the diagonal, are NaN.

    Raises
    ------
    ValueError
        If `raw_genome_df` has fewer than `n_samples` rows, if a configuration
        column is not single-valued, if `dstream_algo` is malformed, or if the
        two reconstructions do not yield identical taxon labels.
    """
    sampled_df = raw_genome_df.sample(n_samples, random_state=seed).assign(
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    def sole_value(column: str):
        # `.item()` raises unless the column holds exactly one distinct value.
        return sampled_df[column].unique().item()

    surf_kwargs = dict(
        dstream_algo=_resolve_dstream_algo(dstream_algo),
        dstream_S=sole_value("dstream_S"),
        dstream_storage_bitoffset=sole_value("dstream_storage_bitoffset"),
        dstream_storage_bitwidth=sole_value("dstream_storage_bitwidth"),
        dstream_T_bitoffset=sole_value("dstream_T_bitoffset"),
        dstream_T_bitwidth=sole_value("dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]

    # Without a seed, keep using the legacy global RNG exactly as before.
    if seed is None:
        shuffle = np.random.shuffle
    else:
        shuffle = np.random.default_rng(seed).shuffle
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The matrices are compared by position, so their labels must line up.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions yielded different taxon labels"
        )

    naive = naive_dist.to_numpy()
    shortcut = shortcut_dist.to_numpy()
    # 0/0 on the diagonal is expected and left as NaN for the nan-aware
    # summaries downstream; silence the RuntimeWarning it would raise.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Load the "sample" genome dataset, then report the normalized reconstruction
# error (full matrix plus mean/median summaries) once per dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The `=` specifier echoes each expression next to its value.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10801.99it/s]
100%|██████████| 100/100 [00:00<00:00, 399.36it/s]
6127it [00:00, 655791.48it/s]
100%|██████████| 100/100 [00:00<00:00, 267323.39it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.056326,0.056824,0.057361,-0.806778,0.000000,-0.080537,-0.064051,...,0.000000,0.000000,0.338711,0.000000,-0.062289,-0.071524,0.000000,0.059426,0.000000,0.192765
1,0.000000,,0.000000,0.076084,0.076994,0.077984,-0.169646,0.000000,-0.098892,-0.090890,...,0.000000,0.000000,0.420820,0.000000,-0.087383,-0.085641,0.000000,0.081851,0.000000,0.247802
2,0.000000,0.000000,,0.058158,0.058688,0.059262,0.000000,0.000000,-0.082392,-0.066430,...,0.000000,0.000000,0.346920,0.000000,-0.064537,-0.072983,0.000000,0.061468,0.000000,0.198102
3,0.056326,0.076084,0.058158,,0.000000,0.000000,0.054043,0.108560,0.000000,0.000000,...,0.070580,0.051664,-0.072902,0.057738,0.000000,0.000000,0.073524,0.000000,0.061934,-0.113020
4,0.056824,0.076994,0.058688,0.000000,,0.000000,0.054501,0.110423,0.000000,0.000000,...,0.071363,0.052082,-0.331627,0.058260,0.000000,0.000000,0.074374,0.000000,0.062536,-0.556119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.071524,-0.085641,-0.072983,0.000000,0.000000,0.000000,-0.069656,-0.102974,0.583501,0.000000,...,-0.082041,-0.067649,0.000000,-0.072652,0.000000,,-0.083996,0.000000,-0.075886,0.000000
96,0.000000,0.000000,0.000000,0.073524,0.074374,0.075297,0.000000,0.000000,-0.096705,-0.087261,...,0.000000,0.000000,0.350568,0.000000,-0.084023,-0.083996,,0.078896,0.000000,0.240973
97,0.059426,0.081851,0.061468,0.000000,0.000000,0.000000,0.056890,0.120692,0.000000,0.000000,...,0.075516,0.054259,-0.078177,0.060999,0.000000,0.000000,0.078896,,0.065702,-0.126223
98,0.000000,0.000000,0.000000,0.061934,0.062536,0.063187,0.000000,0.000000,-0.086110,-0.071402,...,0.000000,0.000000,0.888348,0.000000,-0.069220,-0.075886,0.000000,0.065702,,0.208946


np.nanmean(norm_err)=np.float64(0.01898772304430463)
    np.nanmean(np.abs(norm_err))=np.float64(0.1156718927225362)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.06352441358483531)
    


100%|██████████| 100/100 [00:00<00:00, 34675.13it/s]
100%|██████████| 100/100 [00:00<00:00, 448.45it/s]
5974it [00:00, 647578.94it/s]
100%|██████████| 100/100 [00:00<00:00, 305040.29it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,-0.009094,3.448453e-02,0.000000e+00,0.000000e+00,-1.185217e-01,-8.368978e-03,-2.040924e-07,-1.481803e-02,...,-0.009807,-3.354837e-07,-2.359883e-07,-1.253999e-02,-1.775529e-07,7.000753e-03,-2.184375e-07,-2.226854e-02,0.000000e+00,-1.137823e-02
1,0.000000e+00,,0.001857,-1.779249e-03,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.215058e-07,2.436012e-03,...,0.001944,-1.584514e-07,-1.443244e-07,2.234698e-03,-1.115768e-07,-1.218449e-07,-2.074027e-03,-1.794548e-07,-3.747698e-03,0.000000e+00
2,-9.093560e-03,1.856612e-03,,0.000000e+00,5.031611e-02,1.797815e-02,-1.023978e-02,-1.104480e-02,1.241662e-02,-1.667950e-07,...,0.000000,2.192894e-02,2.230708e-03,-1.458044e-07,3.061993e-02,-6.211607e-03,1.730107e-02,-5.087018e-02,0.000000e+00,-1.337600e-02
3,3.448453e-02,-1.779249e-03,0.000000,,-2.316101e-02,-5.218281e-02,0.000000e+00,0.000000e+00,-1.184659e-02,-1.477306e-07,...,0.000000,-1.305282e-01,-2.120212e-03,-1.356842e-07,-1.454411e-02,9.948353e-02,-2.030048e-03,-1.766086e-07,-1.845502e-03,0.000000e+00
4,0.000000e+00,0.000000e+00,0.050316,-2.316101e-02,,0.000000e+00,0.000000e+00,0.000000e+00,-2.116511e-07,8.384651e-02,...,0.020046,-3.606181e-07,-2.412876e-07,3.453581e-02,-1.987615e-07,-2.090400e-07,2.590299e-02,-4.653213e-07,-3.070417e-03,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7.000753e-03,-1.218449e-07,-0.006212,9.948353e-02,-2.090400e-07,-1.471842e-07,-3.920876e-02,-5.864769e-03,-2.702355e-07,-8.438380e-03,...,-0.006536,-3.648316e-07,-2.967928e-07,-7.647281e-03,-2.459016e-07,,-2.825186e-07,-1.042452e-02,-1.269158e-07,-7.198978e-03
96,-2.184375e-07,-2.074027e-03,0.017301,-2.030048e-03,2.590299e-02,5.974883e-02,-2.516008e-07,-1.234500e-07,1.160127e-02,2.386841e-02,...,0.013649,1.030403e-01,-2.547715e-03,4.351410e-02,1.474659e-02,-2.825186e-07,,-4.499987e-07,-1.091558e-01,-1.526408e-07
97,-2.226854e-02,-1.794548e-07,-0.050870,-1.766086e-07,-4.653213e-07,-2.404144e-07,-6.355548e-07,-1.769857e-07,-4.196188e-07,-8.539690e-02,...,-0.020281,-7.024300e-07,-4.873310e-07,-3.506091e-02,-3.637281e-07,-1.042452e-02,-4.499987e-07,,9.390594e-03,-2.438397e-07
98,0.000000e+00,-3.747698e-03,0.000000,-1.845502e-03,-3.070417e-03,-2.225227e-03,3.488523e-02,5.522758e-03,-2.054671e-03,-1.574465e-07,...,0.000000,-2.712768e-03,-4.507976e-03,-1.438365e-07,-1.880557e-03,-1.269158e-07,-1.091558e-01,9.390594e-03,,6.690431e-03


np.nanmean(norm_err)=np.float64(0.0002282177818498509)
    np.nanmean(np.abs(norm_err))=np.float64(0.010764072441868491)
    np.nanmedian(norm_err)=np.float64(-1.802950975041105e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(5.631526552985951e-07)
    


In [7]:
# Load the "tail" genome dataset, then report the normalized reconstruction
# error (full matrix plus mean/median summaries) once per dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The `=` specifier echoes each expression next to its value.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 28928.23it/s]
100%|██████████| 100/100 [00:00<00:00, 479.74it/s]
5926it [00:00, 630288.97it/s]
100%|██████████| 100/100 [00:00<00:00, 306153.58it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32458.63it/s]
100%|██████████| 100/100 [00:00<00:00, 1022.36it/s]
5923it [00:00, 673065.91it/s]
100%|██████████| 100/100 [00:00<00:00, 349525.33it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.068605e-07,0.000000e+00,-1.065542e-07,0.000000e+00,-1.066360e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.038673e-07,0.000000e+00,-1.482780e-07,-1.036389e-07,-1.068016e-07,0.000000e+00,-1.038591e-07,0.000000e+00,0.000000e+00,-1.066845e-07
1,-1.068605e-07,,-1.038011e-07,-2.325714e-07,-1.203619e-07,-2.258749e-07,-1.037040e-07,-1.066044e-07,-1.039244e-07,-1.038483e-07,...,-2.077120e-07,-1.065072e-07,-2.135252e-07,-2.072553e-07,-2.197304e-07,-1.038307e-07,-2.076955e-07,-1.037331e-07,-1.067328e-07,-2.194827e-07
2,0.000000e+00,-1.038011e-07,,-1.035121e-07,0.000000e+00,-1.035892e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.066159e-07,0.000000e+00,-1.037200e-07,-1.063753e-07,-1.037455e-07,0.000000e+00,-1.096722e-07,0.000000e+00,0.000000e+00,-1.036350e-07
3,-1.065542e-07,-2.325714e-07,-1.035121e-07,,-1.163154e-07,-2.251908e-07,-1.034156e-07,-1.062996e-07,-1.036347e-07,-1.035590e-07,...,-2.071334e-07,-1.062030e-07,-2.129138e-07,-2.066792e-07,-2.190830e-07,-1.035415e-07,-2.071170e-07,-1.034444e-07,-1.064272e-07,-2.188367e-07
4,0.000000e+00,-1.203619e-07,0.000000e+00,-1.163154e-07,,-1.129654e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.038797e-07,0.000000e+00,-1.067876e-07,-1.036512e-07,-1.098917e-07,0.000000e+00,-1.038714e-07,0.000000e+00,0.000000e+00,-1.097677e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-1.038307e-07,0.000000e+00,-1.035415e-07,0.000000e+00,-1.036187e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.097144e-07,0.000000e+00,-1.037495e-07,-1.094596e-07,-1.037750e-07,,-1.066385e-07,0.000000e+00,0.000000e+00,-1.036645e-07
96,-1.038591e-07,-2.076955e-07,-1.096722e-07,-2.071170e-07,-1.038714e-07,-2.072714e-07,-1.095639e-07,-1.036172e-07,-1.067374e-07,-1.066570e-07,...,-2.133304e-07,-1.035253e-07,-2.075331e-07,-2.128487e-07,-2.075842e-07,-1.066385e-07,,-1.095963e-07,-1.037384e-07,-2.073631e-07
97,0.000000e+00,-1.037331e-07,0.000000e+00,-1.034444e-07,0.000000e+00,-1.035215e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.065442e-07,0.000000e+00,-1.036521e-07,-1.063039e-07,-1.036775e-07,0.000000e+00,-1.095963e-07,,0.000000e+00,-1.035672e-07
98,0.000000e+00,-1.067328e-07,0.000000e+00,-1.064272e-07,0.000000e+00,-1.065088e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.037466e-07,0.000000e+00,-1.164104e-07,-1.035188e-07,-1.066740e-07,0.000000e+00,-1.037384e-07,0.000000e+00,,-1.065572e-07


np.nanmean(norm_err)=np.float64(-1.0686368269594719e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0686368269594719e-07)
    np.nanmedian(norm_err)=np.float64(-1.0632468677949318e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0632468677949318e-07)
    
