In [1]:
# Load the `watermark` IPython extension so the `%watermark` magic used
# further down is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Print a provenance report for this run: last-updated timestamp,
# Python/IPython versions, compiler and machine details, the versions of the
# imported packages, and the watermark version itself.
%watermark -diwmuv -iv


Last updated: 2025-07-27T00:25:11.818786+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
hstrat                            : 1.20.10
alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Name identifying this analysis.
# NOTE(review): presumably used as the output subdirectory for teeplot-saved
# figures; no consumer is visible in this part of the notebook -- confirm.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression: echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build the pairwise phylogenetic distance matrix of a reconstructed tree.

    Parameters
    ----------
    tree_df : pd.DataFrame
        Tree dataframe as returned by ``hstrat.build_tree_trie`` or
        ``hstrat.build_tree_searchtable``.

    Returns
    -------
    pd.DataFrame
        Square frame of pairwise distances with rows and columns sorted by
        label, so frames built from different trees over the same labels
        share one ordering. Sorting is whatever pandas does for the label
        dtype (lexicographic for string labels, i.e. "10" sorts before "2").
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE: `_data` is a private attribute of the dendropy data table; it is
    # read directly because it can be handed straight to the DataFrame
    # constructor.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive-trie and searchtable tree reconstructions pairwise.

    A random subset of genomes is deserialized into hstrat surfaces, each
    surface's storage is shuffled in place, and a tree is reconstructed from
    the same population twice: with ``hstrat.build_tree_trie`` and with
    ``hstrat.build_tree_searchtable``. The pairwise distance matrices of the
    two trees are then compared entry by entry.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns ``data_hex``, ``dstream_S``,
        ``dstream_storage_bitoffset``, ``dstream_storage_bitwidth``,
        ``dstream_T_bitoffset`` and ``dstream_T_bitwidth``; each ``dstream_*``
        column must hold a single distinct value within the sampled rows.
        The caller's frame is not modified.
    dstream_algo : str
        Python expression naming the dstream algorithm, evaluated with only
        ``dstream`` in scope (e.g. ``"dstream.tilted_algo"``).
    sample_size : int, default 100
        Number of genomes to draw without replacement. The default keeps the
        problem at a size dendropy and the naive reconstruction can handle.
    seed : int, optional
        Seed for a dedicated random generator driving both the sampling and
        the storage shuffle, making the result reproducible. When omitted,
        pandas' default sampling and NumPy's global random state are used.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / mean(naive, shortcut)`` over
        all taxon pairs, ordered by sorted taxon label. Entries where both
        distances are zero (including the diagonal) are NaN.

    Raises
    ------
    ValueError
        If ``raw_genome_df`` has fewer than ``sample_size`` rows, if a
        ``dstream_*`` column does not hold exactly one distinct value, or if
        the two reconstructed trees do not expose the same taxon labels.
    """
    rng = None if seed is None else np.random.default_rng(seed)

    # `sample` returns a new frame, so adding a column below leaves the
    # caller's dataframe untouched.
    genome_df = raw_genome_df.sample(sample_size, random_state=rng)
    genome_df["taxon_label"] = np.arange(len(genome_df)).astype(str)

    def unique_value(column: str):
        # `.item()` only succeeds when exactly one distinct value is present.
        return genome_df[column].unique().item()

    # SECURITY: `eval` executes arbitrary Python. Only pass trusted,
    # hard-coded algorithm names here -- never user-supplied strings.
    surface_kwargs = dict(
        dstream_algo=eval(str(dstream_algo), {"dstream": dstream}),
        dstream_S=unique_value("dstream_S"),
        dstream_storage_bitoffset=unique_value("dstream_storage_bitoffset"),
        dstream_storage_bitwidth=unique_value("dstream_storage_bitwidth"),
        dstream_T_bitoffset=unique_value("dstream_T_bitoffset"),
        dstream_T_bitwidth=unique_value("dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]

    # Scramble every surface's storage in place ("ensure synthetic data").
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The arithmetic below is positional, so it is only meaningful when both
    # matrices are laid out over exactly the same labels.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and searchtable reconstructions do not share the same "
            "taxon labels; their distance matrices cannot be compared"
        )

    naive = naive_dist.to_numpy()
    shortcut = shortcut_dist.to_numpy()
    # 0/0 on the diagonal (and for any pair at distance zero in both trees)
    # is expected and yields NaN; silence the RuntimeWarning it would raise.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Genome records downloaded from OSF (the "sample" dataset).
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same reconstruction-consistency check once per dstream algorithm:
# show the normalized error matrix, then its signed/absolute mean and median
# (NaN entries ignored).
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # The indentation of the continuation lines is part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10348.39it/s]
100%|██████████| 100/100 [00:00<00:00, 375.05it/s]
6155it [00:00, 564110.24it/s]
100%|██████████| 100/100 [00:00<00:00, 245712.01it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,-0.304531,0.000000,0.000000,-0.326081,-0.673151,-0.041628,-0.178316,0.000000,...,-0.398563,0.000000,-0.521412,0.135110,0.000000,0.000000,-0.339820,-0.264460,0.000000,-0.189529
1,0.000000,,0.067638,-0.314783,0.000000,0.008628,0.572399,0.130458,0.130767,-0.344593,...,0.009975,-0.099989,0.470520,0.000000,-0.566261,-0.137662,0.661135,0.194529,-0.000024,0.136698
2,-0.304531,0.067638,,0.007633,0.059947,-1.131736,-0.352037,0.078351,0.085187,0.012260,...,-1.035019,0.007545,-0.298957,-0.027338,0.011470,0.065353,-0.220236,-0.054092,0.068677,0.119792
3,0.000000,-0.314783,0.007633,,-0.272761,0.474428,0.012413,0.013522,0.013549,0.075993,...,0.561015,0.427757,0.010789,0.000000,0.000000,-0.302099,0.008235,0.013101,-0.320608,0.014055
4,0.000000,0.000000,0.059947,-0.272761,,0.007628,0.450211,0.104578,0.104777,-0.270235,...,0.008661,-0.087871,0.384696,0.000000,-0.443384,-0.109163,0.560700,0.156125,0.000000,0.108550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,-0.137662,0.065353,-0.302099,-0.109163,0.008330,0.532977,0.122215,0.122487,-0.320555,...,0.009579,-0.096367,0.443552,0.000000,-0.526496,,0.517028,0.182308,-0.142035,0.127675
96,-0.339820,0.661135,-0.220236,0.008235,0.560700,-0.291361,0.000000,0.089511,0.000000,0.013889,...,-0.341477,0.008132,0.000000,-0.031446,0.012883,0.517028,,0.000000,0.675404,0.000000
97,-0.264460,0.194529,-0.054092,0.013101,0.156125,-0.154123,0.000000,0.272642,0.000000,0.037189,...,-0.197285,0.012843,0.000000,-0.107916,0.030762,0.182308,0.000000,,0.200341,0.000000
98,0.000000,-0.000024,0.068677,-0.320608,0.000000,0.008764,0.591326,0.134380,0.134708,-0.356151,...,0.010156,-0.101642,0.483234,0.000000,-0.585395,-0.142035,0.675404,0.200341,,0.141010


np.nanmean(norm_err)=np.float64(-0.005375836941817746)
    np.nanmean(np.abs(norm_err))=np.float64(0.15226637420997138)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.04846781553446014)
    


100%|██████████| 100/100 [00:00<00:00, 32029.81it/s]
100%|██████████| 100/100 [00:00<00:00, 373.81it/s]
5981it [00:00, 596536.09it/s]
100%|██████████| 100/100 [00:00<00:00, 271651.81it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.206526e-03,-5.632973e-03,-3.816220e-03,-1.771173e-03,-2.847158e-03,-6.051612e-03,-4.090934e-07,0.022195,0.013272,...,0.170903,-1.286439e-02,-4.324198e-03,-0.000001,-1.174958e-02,-0.006052,3.568470e-02,-2.632821e-03,-7.407329e-03,3.380544e-02
1,-0.002207,,0.000000e+00,0.000000e+00,-1.334024e-07,0.000000e+00,0.000000e+00,-1.020562e-03,-0.001345,-0.001010,...,-0.001583,-1.911826e-03,-1.490919e-07,-0.002191,-2.308513e-07,-0.001430,-1.248361e-03,-1.758410e-07,0.000000e+00,-1.212420e-03
2,-0.005633,0.000000e+00,,-6.420864e-02,-1.485379e-07,0.000000e+00,8.640515e-02,-2.279309e-03,-0.003109,-0.002253,...,-0.003759,-1.910517e-02,-1.842520e-07,-0.005582,-1.920600e-02,-0.013498,-1.415840e-03,-1.907444e-07,0.000000e+00,-1.369786e-03
3,-0.003816,0.000000e+00,-6.420864e-02,,-1.233672e-07,0.000000e+00,0.000000e+00,-1.911159e-03,-0.002462,-0.001893,...,-0.002853,-1.363447e-02,-7.563299e-02,-0.003793,-2.103870e-07,-0.010517,-1.140648e-03,-1.511439e-07,0.000000e+00,-1.110567e-03
4,-0.001771,-1.334024e-07,-1.485379e-07,-1.233672e-07,,-1.498328e-07,-1.530764e-07,-9.164360e-04,-0.001169,-0.000908,...,-0.001346,-1.576139e-03,-2.629571e-07,-0.001761,-3.823854e-07,-0.001233,-8.926557e-03,-2.896462e-07,-2.051803e-07,-8.699287e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.006052,-1.429745e-03,-1.349789e-02,-1.051669e-02,-1.233363e-03,-1.673775e-03,-1.407780e-02,-2.345133e-03,-0.003233,-0.002318,...,-0.003942,-3.660062e-07,-1.143644e-02,-0.005994,-1.956151e-02,,-1.467494e-03,-1.597407e-03,-2.623118e-03,-1.418082e-03
96,0.035685,-1.248361e-03,-1.415840e-03,-1.140648e-03,-8.926557e-03,-1.430459e-03,-1.467281e-03,7.426238e-03,0.032753,0.024350,...,0.025174,-1.979852e-03,-1.227662e-03,0.016373,-1.923428e-03,-0.001467,,-1.374331e-03,-3.418096e-02,-3.168622e-07
97,-0.002633,-1.758410e-07,-1.907444e-07,-1.511439e-07,-2.896462e-07,-2.269533e-07,-1.982941e-07,-1.103257e-03,-0.001492,-0.001091,...,-0.001791,-2.223893e-03,-3.270041e-07,-0.002610,-5.346660e-07,-0.001597,-1.374331e-03,,-2.905765e-07,-1.330902e-03
98,-0.007407,0.000000e+00,0.000000e+00,0.000000e+00,-2.051803e-07,0.000000e+00,0.000000e+00,-1.511367e-03,-0.002350,-0.001488,...,-0.003191,-4.882316e-03,-2.414140e-07,-0.007233,-5.660000e-07,-0.002623,-3.418096e-02,-2.905765e-07,,-3.255426e-02


np.nanmean(norm_err)=np.float64(-0.0016963854159464957)
    np.nanmean(np.abs(norm_err))=np.float64(0.008392681681150404)
    np.nanmedian(norm_err)=np.float64(-0.001252612425782805)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0016414332140063793)
    


In [7]:
# Genome records downloaded from OSF (the "tail" dataset).
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same reconstruction-consistency check once per dstream algorithm:
# show the normalized error matrix, then its signed/absolute mean and median
# (NaN entries ignored).
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # The indentation of the continuation lines is part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35454.81it/s]
100%|██████████| 100/100 [00:00<00:00, 417.72it/s]
5936it [00:00, 624072.90it/s]
100%|██████████| 100/100 [00:00<00:00, 344359.93it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33415.42it/s]
100%|██████████| 100/100 [00:00<00:00, 1002.50it/s]
5948it [00:00, 588778.44it/s]
100%|██████████| 100/100 [00:00<00:00, 348653.70it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.096011e-07,-1.037393e-07,-2.070857e-07,-2.130627e-07,-2.331310e-07,-2.259620e-07,-2.133772e-07,-2.075948e-07,-2.130507e-07,...,-2.076827e-07,-1.037012e-07,-2.077908e-07,-2.258423e-07,-1.037426e-07,-1.036443e-07,-2.072729e-07,-1.035904e-07,-1.065294e-07,-1.038902e-07
1,-1.096011e-07,,0.000000e+00,-1.034220e-07,-1.064035e-07,-1.097160e-07,-1.095954e-07,-1.065603e-07,-1.036760e-07,-1.063975e-07,...,-1.037198e-07,0.000000e+00,-1.037737e-07,-1.095390e-07,0.000000e+00,0.000000e+00,-1.035154e-07,0.000000e+00,0.000000e+00,0.000000e+00
2,-1.037393e-07,0.000000e+00,,-1.063998e-07,-1.036097e-07,-1.038422e-07,-1.037341e-07,-1.037585e-07,-1.066686e-07,-1.036040e-07,...,-1.067151e-07,0.000000e+00,-1.067722e-07,-1.036837e-07,0.000000e+00,0.000000e+00,-1.064987e-07,0.000000e+00,0.000000e+00,0.000000e+00
3,-2.070857e-07,-1.034220e-07,-1.063998e-07,,-2.068276e-07,-2.072907e-07,-2.070754e-07,-2.071239e-07,-2.190348e-07,-2.068162e-07,...,-2.256128e-07,-1.063598e-07,-2.477300e-07,-2.069749e-07,-1.064034e-07,-1.062999e-07,-2.319743e-07,-1.062432e-07,-1.034119e-07,-1.096207e-07
4,-2.130627e-07,-1.064035e-07,-1.036097e-07,-2.068276e-07,,-2.132798e-07,-2.130519e-07,-2.399074e-07,-2.073354e-07,-2.253470e-07,...,-2.074232e-07,-1.035718e-07,-2.075309e-07,-2.129454e-07,-1.036131e-07,-1.035149e-07,-2.070143e-07,-1.034612e-07,-1.094453e-07,-1.037603e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.036443e-07,0.000000e+00,0.000000e+00,-1.062999e-07,-1.035149e-07,-1.037470e-07,-1.036391e-07,-1.036634e-07,-1.065682e-07,-1.035093e-07,...,-1.066146e-07,0.000000e+00,-1.066715e-07,-1.035887e-07,0.000000e+00,,-1.063985e-07,0.000000e+00,0.000000e+00,0.000000e+00
96,-2.072729e-07,-1.035154e-07,-1.064987e-07,-2.319743e-07,-2.070143e-07,-2.074783e-07,-2.072626e-07,-2.073112e-07,-2.192442e-07,-2.070030e-07,...,-2.258351e-07,-1.064586e-07,-2.328595e-07,-2.071619e-07,-1.065022e-07,-1.063985e-07,,-1.063418e-07,-1.035053e-07,-1.097256e-07
97,-1.035904e-07,0.000000e+00,0.000000e+00,-1.062432e-07,-1.034612e-07,-1.036930e-07,-1.035852e-07,-1.036095e-07,-1.065112e-07,-1.034555e-07,...,-1.065575e-07,0.000000e+00,-1.066144e-07,-1.035349e-07,0.000000e+00,0.000000e+00,-1.063418e-07,,0.000000e+00,0.000000e+00
98,-1.065294e-07,0.000000e+00,0.000000e+00,-1.034119e-07,-1.094453e-07,-1.066380e-07,-1.065240e-07,-1.096112e-07,-1.036658e-07,-1.094389e-07,...,-1.037097e-07,0.000000e+00,-1.037636e-07,-1.064708e-07,0.000000e+00,0.000000e+00,-1.035053e-07,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.1095377708079698e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1095377708079698e-07)
    np.nanmedian(norm_err)=np.float64(-1.0637471205307621e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0637471205307621e-07)
    
