In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-06-01T00:26:12.665774+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1029-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
hstrat                            : 1.20.10
pandas                            : 2.2.3
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# NOTE(review): presumably the output subdirectory handed to teeplot when
# saving figures; no use of this name is visible in this part of the
# notebook -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
# Bare expression as the last line echoes the value as the cell output.
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df) -> pd.DataFrame:
    """Build a label-sorted pairwise distance table for a reconstructed tree.

    The tree is converted to dendropy via `apc.RosettaTree`, and the result of
    dendropy's `phylogenetic_distance_matrix()` is loaded into a DataFrame
    whose rows and columns are both sorted by label, so that two tables built
    over the same labels line up element-wise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the data table object;
    # this may break across dendropy versions.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_sample: int = 100,
    seed=None,
) -> np.ndarray:
    """Compare trie vs. searchtable tree reconstruction on sampled genomes.

    A random subset of genomes is deserialized into hstrat surfaces, the
    surface storage is shuffled, and a tree is built twice: once with
    `hstrat.build_tree_trie` and once with `hstrat.build_tree_searchtable`.
    The pairwise distance matrices of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column plus the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`, each holding exactly
        one distinct value across the sampled rows. The frame itself is not
        modified.
    dstream_algo : str
        Dotted name of the algorithm, e.g. `"dstream.tilted_algo"`. It is
        resolved with `eval` against a namespace exposing `dstream`.
    n_sample : int, default 100
        Maximum number of genomes to sample (a size dendropy/naive can
        handle). Clamped to the number of available rows.
    seed : int, optional
        If given, seeds a dedicated generator used for both the row sampling
        and the storage shuffling, making the result reproducible. If None,
        the legacy global NumPy/pandas random state is used.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / ((naive + shortcut) / 2)`
        distances, in sorted-label order. Entries where both distances are
        zero (such as the diagonal) are NaN.

    Raises
    ------
    ValueError
        If one of the `dstream_*` configuration columns does not hold exactly
        one distinct value (raised by `.item()`).
    """
    rng = None if seed is None else np.random.default_rng(seed)

    # `sample` returns a new frame, so the column added below does not leak
    # back into the caller's DataFrame.
    genome_df = raw_genome_df.sample(
        min(n_sample, len(raw_genome_df)),
        random_state=rng,
    )
    genome_df["taxon_label"] = np.arange(len(genome_df)).astype(str)

    # Each configuration column must collapse to a single scalar; `.item()`
    # raises if the sampled rows disagree.
    surface_kwargs = {
        field: genome_df[field].unique().item()
        for field in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # SECURITY NOTE: `eval` executes arbitrary expressions; only pass trusted,
    # hard-coded algorithm names here.
    surface_kwargs["dstream_algo"] = eval(
        str(dstream_algo),
        {"dstream": dstream},
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]

    # ensure synthetic data
    # NOTE(review): reaches into private surface attributes.
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df).values
    shortcut_dist = _pairwise_distance_frame(shortcut_df).values

    # 0 / 0 (e.g. on the diagonal) intentionally yields NaN, which callers
    # skip via nanmean/nanmedian; silence the RuntimeWarning it would raise.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
# Load the "sample" genome dataset from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same reconstruction comparison for each dstream algorithm, showing
# the normalized error matrix followed by its summary statistics.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # NaN entries (e.g. the diagonal) are skipped by the nan-aware reducers.
    print(
        f"{np.nanmean(norm_err)=}\n"
        f"    {np.nanmean(np.abs(norm_err))=}\n"
        f"    {np.nanmedian(norm_err)=}\n"
        f"    {np.nanmedian(np.abs(norm_err))=}\n"
        "    ",
    )


100%|██████████| 100/100 [00:00<00:00, 10183.56it/s]
100%|██████████| 100/100 [00:00<00:00, 368.06it/s]
6130it [00:00, 656934.02it/s]
100%|██████████| 100/100 [00:00<00:00, 249958.52it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.085379,0.000000,0.000000,0.035771,0.000000,0.000000,0.000000,0.043491,...,0.000000,-0.107821,0.000000,0.000000,0.000000,0.067437,0.000000,0.035614,0.000000,-0.166368
1,0.000000,,-0.247297,0.000000,0.000000,-0.098693,0.000000,0.000000,-0.466873,-0.120887,...,0.117740,0.000000,0.735827,0.186754,-0.728479,-0.191873,0.000000,-0.098245,0.106858,0.000000
2,0.085379,-0.247297,,0.235981,-0.121516,0.000000,-0.133048,-0.350959,-0.146885,0.000000,...,-0.226415,0.105685,-0.219359,-0.160574,-0.222226,0.000000,-0.181752,0.000000,-0.200264,0.195779
3,0.000000,0.000000,0.235981,,0.000000,0.048826,0.000000,0.000000,0.000000,0.064441,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.135985,0.000000,0.048534,0.000000,0.000000
4,0.000000,0.000000,-0.121516,0.000000,,-0.069842,0.000000,0.000000,0.132039,-0.080271,...,0.256896,0.000000,0.185596,0.064961,0.165695,-0.106412,0.000000,-0.069617,0.239180,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.067437,-0.191873,0.000000,0.135985,-0.106412,0.000000,-0.115152,-0.248918,-0.125375,0.000000,...,-0.515301,0.079503,-0.145023,-0.368097,-0.176430,,-0.149924,0.000000,-0.457087,0.121597
96,0.000000,0.000000,-0.181752,0.000000,0.000000,-0.086276,0.000000,0.000000,-0.112566,-0.102770,...,0.100490,0.000000,0.514886,0.590133,-0.152066,-0.149924,,-0.085933,0.092454,0.000000
97,0.035614,-0.098245,0.000000,0.048534,-0.069617,0.000000,-0.073255,-0.111306,-0.077262,0.000000,...,-0.347574,0.038717,-0.059411,-0.281204,-0.094031,0.000000,-0.085933,,-0.323275,0.046567
98,0.000000,0.106858,-0.200264,0.000000,0.239180,-0.324486,0.077944,0.418751,0.219840,-0.382145,...,0.000000,0.000000,0.197533,-0.419976,0.275806,-0.457087,0.092454,-0.323275,,0.000000


np.nanmean(norm_err)=np.float64(-0.03090659718197147)
    np.nanmean(np.abs(norm_err))=np.float64(0.11572661242429629)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.06820463014514438)
    


100%|██████████| 100/100 [00:00<00:00, 34039.15it/s]
100%|██████████| 100/100 [00:00<00:00, 412.52it/s]
5990it [00:00, 607267.74it/s]
100%|██████████| 100/100 [00:00<00:00, 239811.55it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.005686,-4.135447e-07,-4.111432e-07,-0.001566,-3.025627e-07,5.383360e-02,2.765621e-02,0.000000e+00,-3.270889e-07,...,-2.436383e-07,-3.926033e-07,0.000000e+00,-0.001723,0.000000e+00,-3.072583e-07,3.639952e-02,6.737431e-04,-1.981875e-03,-3.257078e-07
1,-5.686101e-03,,-2.375750e-03,-2.366060e-03,0.029238,-1.889714e-03,1.451406e-03,-1.564931e-03,-4.247376e-03,-2.004282e-03,...,-1.595546e-03,-2.290098e-03,-1.193526e-03,0.031696,-4.327684e-03,-1.911989e-03,1.118983e-03,1.730498e-03,-1.700411e-03,-5.499007e-03
2,-4.135447e-07,-0.002376,,1.062156e-02,-0.001884,6.782895e-03,1.351281e-02,-3.009741e-07,-2.575885e-07,-9.552754e-07,...,5.096224e-03,-4.138898e-02,3.314176e-03,-0.403905,-2.649155e-07,-9.387652e-02,7.031086e-03,7.878730e-04,-2.310594e-03,-7.796198e-07
3,-4.111432e-07,-0.002366,1.062156e-02,,-0.350417,-8.610220e-07,1.330611e-02,-1.983625e-02,-2.566548e-07,7.509550e-03,...,-6.405811e-07,9.801851e-03,-2.061457e-07,-0.002109,-2.639280e-07,6.886645e-03,6.974710e-03,7.857373e-04,-1.468322e-02,-7.753510e-07
4,-1.565988e-03,0.029238,-1.883623e-03,-3.504172e-01,,-2.372347e-02,1.099670e-03,-1.207979e-02,-1.223587e-03,-1.642321e-03,...,-2.055189e-02,-1.829382e-03,-1.594225e-02,0.047731,-1.243528e-03,-1.579841e-03,8.975891e-04,1.535404e-03,-9.613086e-03,-1.523310e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-3.072583e-07,-0.001912,-9.387652e-02,6.886645e-03,-0.001580,5.038034e-03,7.037933e-03,-2.404415e-07,-2.119258e-07,-6.825685e-07,...,4.043931e-03,-2.708252e-02,2.834536e-03,-0.085182,-2.168604e-07,,4.754911e-03,6.786302e-04,-1.996604e-03,-5.879197e-07
96,3.639952e-02,0.001119,7.031086e-03,6.974710e-03,0.000898,4.663742e-03,-2.319801e-06,2.697359e-02,2.324847e-02,5.148045e-03,...,3.578055e-03,6.547767e-03,2.380056e-03,0.001003,2.388126e-02,4.754911e-03,,-1.518825e-03,-3.616663e-07,3.443991e-02
97,6.737431e-04,0.001730,7.878730e-04,7.857373e-04,0.001535,6.729954e-04,-1.798196e-03,5.864737e-04,5.429575e-04,7.015772e-04,...,5.948338e-04,7.687901e-04,4.755342e-04,0.001633,5.507990e-04,6.786302e-04,-1.518825e-03,,-2.228475e-07,6.576475e-04
98,-1.981875e-03,-0.001700,-2.310594e-03,-1.468322e-02,-0.009613,-1.260709e-02,-4.262768e-07,-9.597705e-02,-1.603317e-03,-2.062727e-03,...,-1.116179e-02,-2.255890e-03,-8.944055e-03,-0.001606,-1.626096e-03,-1.996604e-03,-3.616663e-07,-2.228475e-07,,-1.936065e-03


np.nanmean(norm_err)=np.float64(-0.0009539728670736)
    np.nanmean(np.abs(norm_err))=np.float64(0.01191628860869038)
    np.nanmedian(norm_err)=np.float64(-3.945286233303086e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.002245483774354674)
    


In [7]:
# Load the "tail" genome dataset from OSF (rebinds `raw_genome_df_`).
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same reconstruction comparison for each dstream algorithm, showing
# the normalized error matrix followed by its summary statistics.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # NaN entries (e.g. the diagonal) are skipped by the nan-aware reducers.
    print(
        f"{np.nanmean(norm_err)=}\n"
        f"    {np.nanmean(np.abs(norm_err))=}\n"
        f"    {np.nanmedian(norm_err)=}\n"
        f"    {np.nanmedian(np.abs(norm_err))=}\n"
        "    ",
    )


100%|██████████| 100/100 [00:00<00:00, 25646.96it/s]
100%|██████████| 100/100 [00:00<00:00, 470.36it/s]
5941it [00:00, 650185.52it/s]
100%|██████████| 100/100 [00:00<00:00, 280931.28it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31540.86it/s]
100%|██████████| 100/100 [00:00<00:00, 1016.11it/s]
5934it [00:00, 601198.10it/s]
100%|██████████| 100/100 [00:00<00:00, 329740.88it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.034836e-07,-1.034618e-07,-1.065831e-07,-2.191677e-07,-1.036319e-07,-1.037649e-07,-2.328723e-07,-1.200697e-07,-1.035106e-07,...,-2.131589e-07,-2.131810e-07,-1.066024e-07,-1.094823e-07,-2.132210e-07,-2.072823e-07,-2.069255e-07,-2.073838e-07,-2.191286e-07,-2.075910e-07
1,-1.034836e-07,,0.000000e+00,0.000000e+00,-1.035109e-07,0.000000e+00,0.000000e+00,-1.036477e-07,0.000000e+00,0.000000e+00,...,-1.035635e-07,-1.035739e-07,0.000000e+00,0.000000e+00,-1.035929e-07,-1.064069e-07,-1.124829e-07,-1.237249e-07,-1.034935e-07,-1.128763e-07
2,-1.034618e-07,0.000000e+00,,0.000000e+00,-1.034891e-07,0.000000e+00,0.000000e+00,-1.036258e-07,0.000000e+00,0.000000e+00,...,-1.035416e-07,-1.035520e-07,0.000000e+00,0.000000e+00,-1.035710e-07,-1.126679e-07,-1.061958e-07,-1.064372e-07,-1.034716e-07,-1.065464e-07
3,-1.065831e-07,0.000000e+00,0.000000e+00,,-1.066121e-07,0.000000e+00,0.000000e+00,-1.067571e-07,0.000000e+00,0.000000e+00,...,-1.097363e-07,-1.097480e-07,0.000000e+00,0.000000e+00,-1.097693e-07,-1.037248e-07,-1.035461e-07,-1.037756e-07,-1.065936e-07,-1.038794e-07
4,-2.191677e-07,-1.035109e-07,-1.034891e-07,-1.066121e-07,,-1.036593e-07,-1.037924e-07,-2.195357e-07,-1.097409e-07,-1.035380e-07,...,-2.132168e-07,-2.132389e-07,-1.066314e-07,-1.127497e-07,-2.132790e-07,-2.073371e-07,-2.069801e-07,-2.074386e-07,-2.559577e-07,-2.076460e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.072823e-07,-1.064069e-07,-1.126679e-07,-1.037248e-07,-2.073371e-07,-1.065637e-07,-1.164787e-07,-2.076115e-07,-1.037816e-07,-1.127259e-07,...,-2.074426e-07,-2.074635e-07,-1.037430e-07,-1.035777e-07,-2.075015e-07,,-2.127696e-07,-2.132542e-07,-2.073022e-07,-2.134733e-07
96,-2.069255e-07,-1.124829e-07,-1.061958e-07,-1.035461e-07,-2.069801e-07,-1.160864e-07,-1.065152e-07,-2.072535e-07,-1.036027e-07,-1.062473e-07,...,-2.070852e-07,-2.071060e-07,-1.035643e-07,-1.033995e-07,-2.071439e-07,-2.127696e-07,,-2.254580e-07,-2.069452e-07,-2.648837e-07
97,-2.073838e-07,-1.237249e-07,-1.064372e-07,-1.037756e-07,-2.074386e-07,-1.129299e-07,-1.067581e-07,-2.077132e-07,-1.038325e-07,-1.064889e-07,...,-2.075442e-07,-2.075651e-07,-1.037939e-07,-1.036284e-07,-2.076032e-07,-2.132542e-07,-2.254580e-07,,-2.074036e-07,-2.262484e-07
98,-2.191286e-07,-1.034935e-07,-1.034716e-07,-1.065936e-07,-2.559577e-07,-1.036418e-07,-1.037748e-07,-2.194964e-07,-1.097213e-07,-1.035205e-07,...,-2.131798e-07,-2.132019e-07,-1.066129e-07,-1.127290e-07,-2.132420e-07,-2.073022e-07,-2.069452e-07,-2.074036e-07,,-2.076109e-07


np.nanmean(norm_err)=np.float64(-1.0460969652461719e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0460969652461719e-07)
    np.nanmedian(norm_err)=np.float64(-1.0391801575954246e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0391801575954246e-07)
    
