In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-10T17:58:12.678637+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
hstrat                            : 1.20.10
pandas                            : 2.2.3
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> pd.DataFrame:
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    # raw_genome_df = raw_genome_df[raw_genome_df["layer"] > 1700].sample(20).copy()
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
        dstream_S=raw_genome_df["dstream_S"].unique().item(),
        dstream_storage_bitoffset=raw_genome_df["dstream_storage_bitoffset"]
        .unique()
        .item(),
        dstream_storage_bitwidth=raw_genome_df["dstream_storage_bitwidth"]
        .unique()
        .item(),
        dstream_T_bitoffset=raw_genome_df["dstream_T_bitoffset"]
        .unique()
        .item(),
        dstream_T_bitwidth=raw_genome_df["dstream_T_bitwidth"].unique().item(),
    )

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = (
        pd.DataFrame(
            apc.RosettaTree(naive_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    shortcut_dist = (
        pd.DataFrame(
            apc.RosettaTree(shortcut_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    return (naive_dist.values - shortcut_dist.values) / (
        naive_dist.values / 2 + shortcut_dist.values / 2
    )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 10378.60it/s]
100%|██████████| 100/100 [00:00<00:00, 380.58it/s]
6130it [00:00, 622258.13it/s]
100%|██████████| 100/100 [00:00<00:00, 182044.44it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.133959,0.000000,-0.072198,-0.051057,0.000000,0.0,-0.081249,-0.043997,0.070515,...,-0.089639,-0.044121,-0.120321,-0.043745,0.134279,-0.049697,-0.044405,0.064491,-0.091675,-0.057926
1,-0.133959,,0.000000,-0.128709,-0.074049,0.000000,0.0,-0.160601,-0.060069,0.000000,...,-0.197059,-0.060299,0.000000,-0.059599,0.000000,-0.071221,-0.060832,0.000000,-0.207173,-0.089428
2,0.000000,0.000000,,0.097962,0.062723,0.000000,0.0,0.115404,0.052394,0.000000,...,0.133099,0.052569,0.000000,0.052036,0.000000,0.060682,0.052973,0.000000,0.137637,0.073417
3,-0.072198,-0.128709,0.097962,,0.000000,0.116066,0.0,0.000000,0.000000,-0.217592,...,0.000000,0.000000,-0.116063,0.000000,-0.275876,-0.000006,0.000005,-0.198514,0.000000,0.000000
4,-0.051057,-0.074049,0.062723,0.000000,,0.069682,0.0,0.155822,0.000000,-0.118786,...,0.374402,0.000000,-0.069681,0.000000,-0.154044,0.600772,-0.181474,-0.112865,0.355077,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.049697,-0.071221,0.060682,-0.000006,0.600772,0.067173,0.0,0.149583,-0.000004,-0.113949,...,0.356525,-0.000004,-0.067172,0.502064,-0.147937,,0.000000,-0.108489,0.337936,0.698123
96,-0.044405,-0.060832,0.052973,0.000005,-0.181474,0.057853,0.0,-0.000005,0.000003,-0.096392,...,-0.000006,0.000003,-0.057853,-0.158014,-0.125654,0.000000,,-0.092456,-0.000006,-0.202833
97,0.064491,0.000000,0.000000,-0.198514,-0.112865,0.000000,0.0,-0.249438,-0.091282,0.000000,...,-0.308533,-0.091636,0.000000,-0.090558,0.000000,-0.108489,-0.092456,,-0.325096,-0.136757
98,-0.091675,-0.207173,0.137637,0.000000,0.355077,0.176268,0.0,0.000000,0.000000,-0.379600,...,0.000000,0.000000,-0.176260,0.271078,-0.464278,0.337936,-0.000006,-0.325096,,0.455100


np.nanmean(norm_err)=np.float64(0.002142464620585391)
    np.nanmean(np.abs(norm_err))=np.float64(0.12510810526000857)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.05241189943563099)
    


100%|██████████| 100/100 [00:00<00:00, 34205.71it/s]
100%|██████████| 100/100 [00:00<00:00, 641.06it/s]
5940it [00:00, 570679.75it/s]
100%|██████████| 100/100 [00:00<00:00, 278506.24it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.002441,0.002977,-3.275217e-07,-1.186101e-07,3.305879e-03,1.920820e-03,0.015436,0.001941,-2.055458e-07,...,0.014803,0.016902,-4.177904e-07,0.001891,0.014506,-1.458916e-07,-4.266551e-07,-2.405183e-07,-1.577847e-07,-2.779120e-07
1,2.441165e-03,,0.000000,3.005717e-03,1.794425e-02,-2.696912e-07,-1.578561e-07,0.002535,0.004510,4.163780e-03,...,0.002415,0.002818,-5.451449e-02,0.000000,0.002360,8.508319e-03,3.684600e-02,1.599502e-02,9.346461e-03,2.159791e-02
2,2.977310e-03,0.000000,,3.862010e-03,2.147170e-02,-3.985397e-07,-1.669988e-07,0.003118,0.005401,6.009466e-03,...,0.002939,0.003558,-1.703853e-01,0.000000,0.002857,1.074379e-02,5.560282e-02,2.627122e-02,1.211572e-02,2.692116e-02
3,-3.275217e-07,0.003006,0.003862,,-1.392638e-07,4.433694e-03,2.253953e-03,0.018560,0.002281,-2.766461e-07,...,0.017652,0.020721,-5.654959e-07,0.002213,0.017232,-1.784429e-07,-5.818593e-07,-3.439596e-07,-1.965648e-07,-3.363519e-07
4,-1.186101e-07,0.017944,0.021472,-1.392638e-07,,4.163278e-02,1.438905e-02,-0.010060,0.005413,0.000000e+00,...,-0.009682,-0.010924,-1.823350e-07,0.014182,-0.003791,-1.095480e-02,-1.933504e-07,0.000000e+00,0.000000e+00,1.169197e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.458916e-07,0.008508,0.010744,-1.784429e-07,-1.095480e-02,1.219427e-02,6.480326e-03,-0.012475,0.008755,0.000000e+00,...,-0.011899,-0.013833,-2.558967e-07,0.006369,-0.004637,,-2.193547e-02,6.784995e-02,3.024258e-03,-1.591218e-07
96,-4.266551e-07,0.036846,0.055603,-5.818593e-07,-1.933504e-07,1.500145e-01,2.444450e-02,-0.018618,0.009228,-5.171314e-07,...,-0.017364,-0.021814,-1.148733e-06,0.023854,-0.006687,-2.193547e-02,,-9.268867e-07,-3.068480e-07,4.474671e-02
97,-2.405183e-07,0.015995,0.026271,-3.439596e-07,0.000000e+00,3.704797e-02,1.007054e-02,-0.008413,0.013700,0.000000e+00,...,-0.007779,-0.010082,-8.256885e-07,0.009804,-0.018849,6.784995e-02,-9.268867e-07,,-1.215324e-02,-4.577347e-03
98,-1.577847e-07,0.009346,0.012116,-1.965648e-07,0.000000e+00,1.399270e-02,6.955391e-03,-0.005394,0.077945,0.000000e+00,...,-0.005126,-0.006035,-2.948833e-07,0.006827,-0.027825,3.024258e-03,-3.068480e-07,-1.215324e-02,,-2.844793e-03


np.nanmean(norm_err)=np.float64(0.0007660849454804025)
    np.nanmean(np.abs(norm_err))=np.float64(0.011190622464441987)
    np.nanmedian(norm_err)=np.float64(-1.8938363330101742e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(8.908909760765518e-07)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 28755.68it/s]
100%|██████████| 100/100 [00:00<00:00, 467.69it/s]
5955it [00:00, 512024.77it/s]
100%|██████████| 100/100 [00:00<00:00, 312774.35it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31693.40it/s]
100%|██████████| 100/100 [00:00<00:00, 1017.19it/s]
5933it [00:00, 599836.23it/s]
100%|██████████| 100/100 [00:00<00:00, 388002.22it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.237974e-07,0.000000e+00,-1.096469e-07,0.000000e+00,-1.064827e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.066098e-07,-1.037957e-07,0.000000e+00,-1.036486e-07,-1.036080e-07
1,-1.237974e-07,,-1.036250e-07,-2.190708e-07,-1.035943e-07,-2.127552e-07,-1.034996e-07,-1.036646e-07,-1.064246e-07,-1.036195e-07,...,-1.200306e-07,-1.037483e-07,-1.097157e-07,-1.063830e-07,-1.036225e-07,-2.130089e-07,-2.073917e-07,-1.034635e-07,-2.070981e-07,-2.070171e-07
2,0.000000e+00,-1.036250e-07,,-1.036905e-07,0.000000e+00,-1.035953e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.037156e-07,-1.164740e-07,0.000000e+00,-1.128487e-07,-1.065021e-07
3,-1.096469e-07,-2.190708e-07,-1.036905e-07,,-1.036598e-07,-2.128934e-07,-1.035649e-07,-1.037302e-07,-1.064937e-07,-1.036850e-07,...,-1.097509e-07,-1.038140e-07,-1.164948e-07,-1.064521e-07,-1.036880e-07,-2.131473e-07,-2.075229e-07,-1.035289e-07,-2.072290e-07,-2.071479e-07
4,0.000000e+00,-1.035943e-07,0.000000e+00,-1.036598e-07,,-1.035646e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036848e-07,-1.066678e-07,0.000000e+00,-1.065125e-07,-1.127643e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.066098e-07,-2.130089e-07,-1.037156e-07,-2.131473e-07,-1.036848e-07,-2.190604e-07,-1.035899e-07,-1.037553e-07,-1.323997e-07,-1.037101e-07,...,-1.067081e-07,-1.038391e-07,-1.067443e-07,-1.162097e-07,-1.037131e-07,,-2.075731e-07,-1.035539e-07,-2.072790e-07,-2.071979e-07
96,-1.037957e-07,-2.073917e-07,-1.164740e-07,-2.075229e-07,-1.066678e-07,-2.073322e-07,-1.096301e-07,-1.098153e-07,-1.037108e-07,-1.130166e-07,...,-1.038889e-07,-1.131698e-07,-1.039232e-07,-1.036713e-07,-1.066977e-07,-2.075731e-07,,-1.128311e-07,-2.258655e-07,-2.131539e-07
97,0.000000e+00,-1.034635e-07,0.000000e+00,-1.035289e-07,0.000000e+00,-1.034340e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.035539e-07,-1.128311e-07,,-1.197291e-07,-1.063316e-07
98,-1.036486e-07,-2.070981e-07,-1.128487e-07,-2.072290e-07,-1.065125e-07,-2.070388e-07,-1.094660e-07,-1.096506e-07,-1.035639e-07,-1.162820e-07,...,-1.037416e-07,-1.164441e-07,-1.037758e-07,-1.035245e-07,-1.065423e-07,-2.072790e-07,-2.258655e-07,-1.197291e-07,,-2.128437e-07


np.nanmean(norm_err)=np.float64(-9.599534824808921e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.599534824808921e-08)
    np.nanmedian(norm_err)=np.float64(-1.0372773088797305e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0372773088797305e-07)
    
