In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from hstrat import _auxiliary_lib as hstrat_aux
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-09T18:55:12.916870+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

numpy                             : 2.1.2
downstream                        : 1.14.3
hstrat                            : 1.20.10
alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3

Watermark: 2.4.3



In [4]:
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> pd.DataFrame:
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    # raw_genome_df = raw_genome_df[raw_genome_df["layer"] > 1700].sample(20).copy()
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
        dstream_S=raw_genome_df["dstream_S"].unique().item(),
        dstream_storage_bitoffset=raw_genome_df["dstream_storage_bitoffset"]
        .unique()
        .item(),
        dstream_storage_bitwidth=raw_genome_df["dstream_storage_bitwidth"]
        .unique()
        .item(),
        dstream_T_bitoffset=raw_genome_df["dstream_T_bitoffset"]
        .unique()
        .item(),
        dstream_T_bitwidth=raw_genome_df["dstream_T_bitwidth"].unique().item(),
    )

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = (
        pd.DataFrame(
            apc.RosettaTree(naive_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    shortcut_dist = (
        pd.DataFrame(
            apc.RosettaTree(shortcut_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    return (naive_dist.values - shortcut_dist.values) / (
        naive_dist.values / 2 + shortcut_dist.values / 2
    )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 11029.23it/s]
100%|██████████| 100/100 [00:00<00:00, 381.49it/s]
6141it [00:00, 632482.59it/s]
100%|██████████| 100/100 [00:00<00:00, 243713.19it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.162031,-0.089237,-0.441271,-0.081115,-0.081454,-0.308383,-0.265999,0.0,-0.085226,...,-0.233735,-0.101835,0.0,-0.174570,-0.406290,-0.234795,-0.633244,-0.111122,-0.085873,-0.086903
1,-0.162031,,0.000000,0.239877,0.000000,0.000000,0.361653,0.158685,0.0,0.000000,...,-0.296685,0.000000,0.0,-0.235833,-0.074607,0.000000,0.077891,-0.140299,0.000000,0.000000
2,-0.089237,0.000000,,0.064823,-0.238122,0.507094,0.053529,0.049011,0.0,0.000000,...,-0.091044,0.000000,0.0,-0.080429,-0.035085,0.070439,0.035793,0.044834,0.000000,0.000000
3,-0.441271,0.239877,0.064823,,0.060429,0.060617,-0.387846,0.000028,0.0,0.152633,...,0.000000,0.174466,0.0,0.000000,0.000000,0.117898,-0.000005,0.075640,0.153520,0.154927
4,-0.081115,0.000000,-0.238122,0.060429,,0.256388,0.050497,0.046457,0.0,0.000000,...,-0.082607,0.000000,0.0,-0.073773,-0.031380,0.062975,0.031945,0.041689,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.234795,0.000000,0.070439,0.117898,0.062975,0.063282,0.085203,0.074301,0.0,0.000000,...,-0.247587,0.000000,0.0,-0.182198,-0.130418,,0.140806,0.084123,0.000000,0.000000
96,-0.633244,0.077891,0.035793,-0.000005,0.031945,0.032103,-0.000003,-0.000003,0.0,0.033875,...,0.000000,0.042052,0.0,0.000000,0.000000,0.140806,,0.046907,0.034182,0.034673
97,-0.111122,-0.140299,0.044834,0.075640,0.041689,0.041823,0.060697,0.054953,0.0,-0.088514,...,-0.197374,-0.101391,0.0,-0.168457,-0.045697,0.084123,0.046907,,-0.089036,-0.089864
98,-0.085873,0.000000,0.000000,0.153520,0.000000,0.000000,0.434233,0.115650,0.0,-0.372069,...,-0.150295,0.577659,0.0,-0.132920,-0.033536,0.000000,0.034182,-0.089036,,0.000000


np.nanmean(norm_err)=np.float64(0.008108383181981208)
    np.nanmean(np.abs(norm_err))=np.float64(0.08972040490300148)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.02875219082590367)
    


100%|██████████| 100/100 [00:00<00:00, 33067.68it/s]
100%|██████████| 100/100 [00:00<00:00, 730.69it/s]
5950it [00:00, 605290.05it/s]
100%|██████████| 100/100 [00:00<00:00, 205200.78it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-4.270528e-03,8.273096e-03,-3.306415e-02,-2.025970e-02,2.323240e-03,-1.304304e-02,0.004045,-1.729160e-02,1.467042e-02,...,-1.213251e-07,-1.645125e-02,-2.821436e-07,3.672212e-03,3.550448e-03,0.000000,-2.791395e-03,1.142764e-02,0.000000,-0.005060
1,-0.004271,,-2.081475e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.003203,1.532953e-01,0.000000e+00,...,1.370543e-01,0.000000e+00,-1.120363e-02,0.000000e+00,0.000000e+00,-0.003801,0.000000e+00,-3.813873e-07,-0.004028,-0.004757
2,0.008273,-2.081475e-07,,-1.993545e-07,-1.329948e-07,-1.765651e-01,-2.094953e-07,0.007129,-1.158954e-07,2.120743e-02,...,-1.762002e-03,-1.427129e-07,1.178292e-02,-3.034320e-03,-2.951363e-03,0.007809,-1.466094e-07,2.714144e-02,0.008040,0.129384
3,-0.033064,0.000000e+00,-1.993545e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-0.018717,-1.142458e-01,0.000000e+00,...,3.059197e-03,0.000000e+00,-8.157034e-02,0.000000e+00,0.000000e+00,-0.029555,0.000000e+00,-3.528690e-07,-0.031258,-0.027367
4,-0.020260,0.000000e+00,-1.329948e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-0.012671,-7.309433e-02,0.000000e+00,...,2.079442e-03,0.000000e+00,-3.187347e-02,0.000000e+00,0.000000e+00,-0.018886,0.000000e+00,-1.873778e-07,-0.019567,-0.016120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,-3.800914e-03,7.808660e-03,-2.955505e-02,-1.888574e-02,2.176919e-03,-1.159270e-02,0.003823,-1.628069e-02,1.327079e-02,...,-1.147774e-07,-1.524967e-02,-2.664999e-07,3.319535e-03,3.219719e-03,,-2.582809e-03,1.056007e-02,0.000000,-0.004717
96,-0.002791,0.000000e+00,-1.466094e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.002292,7.585487e-03,0.000000e+00,...,9.221748e-03,0.000000e+00,-4.687576e-03,0.000000e+00,0.000000e+00,-0.002583,,-2.155838e-07,-0.002686,-0.002992
97,0.011428,-3.813873e-07,2.714144e-02,-3.528690e-07,-1.873778e-07,-1.725465e-07,-3.859367e-07,0.009354,-1.551303e-07,-3.156320e-07,...,-1.874587e-02,-2.072625e-07,1.941692e-02,-3.100000e-07,-2.960694e-07,0.010560,-2.155838e-07,,0.010988,0.037292
98,0.000000,-4.028314e-03,8.040369e-03,-3.125803e-02,-1.956693e-02,2.249653e-03,-1.229450e-02,0.003934,-1.678441e-02,1.395420e-02,...,-1.180478e-07,-1.584372e-02,-2.650328e-07,3.491679e-03,3.381414e-03,0.000000,-2.685836e-03,1.098831e-02,,-0.004887


np.nanmean(norm_err)=np.float64(-0.0037612399617412955)
    np.nanmean(np.abs(norm_err))=np.float64(0.014489598106500664)
    np.nanmedian(norm_err)=np.float64(-2.2428720092541278e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0028905625452769427)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 35302.62it/s]
100%|██████████| 100/100 [00:00<00:00, 424.31it/s]
5944it [00:00, 652950.16it/s]
100%|██████████| 100/100 [00:00<00:00, 260839.80it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 28735.98it/s]
100%|██████████| 100/100 [00:00<00:00, 1014.16it/s]
5945it [00:00, 558557.80it/s]
100%|██████████| 100/100 [00:00<00:00, 351281.74it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.133827e-07,-1.036910e-07,-2.073922e-07,-1.038241e-07,-1.037526e-07,-2.260386e-07,-1.035946e-07,-1.038297e-07,-1.064509e-07,...,-1.038827e-07,-1.096275e-07,-1.037223e-07,-2.134574e-07,-1.036703e-07,-1.096810e-07,-1.036263e-07,-2.190403e-07,-2.073559e-07,-1.036689e-07
1,-2.133827e-07,,-1.037043e-07,-2.074189e-07,-1.038375e-07,-1.037660e-07,-2.134223e-07,-1.036080e-07,-1.038431e-07,-1.198439e-07,...,-1.038961e-07,-1.065791e-07,-1.037357e-07,-2.261415e-07,-1.036836e-07,-1.066297e-07,-1.036397e-07,-2.129553e-07,-2.073826e-07,-1.036823e-07
2,-1.036910e-07,-1.037043e-07,,-1.065019e-07,0.000000e+00,0.000000e+00,-1.037097e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.037396e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.034891e-07,-1.064827e-07,0.000000e+00
3,-2.073922e-07,-2.074189e-07,-1.065019e-07,,-1.066423e-07,-1.065669e-07,-2.074297e-07,-1.126864e-07,-1.066483e-07,-1.034823e-07,...,-1.067041e-07,-1.035901e-07,-1.095957e-07,-2.074896e-07,-1.279007e-07,-1.036379e-07,-1.094885e-07,-2.069885e-07,-2.190923e-07,-1.095361e-07
4,-1.038241e-07,-1.038375e-07,0.000000e+00,-1.066423e-07,,0.000000e+00,-1.038428e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.038729e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036217e-07,-1.066231e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.096810e-07,-1.066297e-07,0.000000e+00,-1.036379e-07,0.000000e+00,0.000000e+00,-1.097019e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.066670e-07,0.000000e+00,,0.000000e+00,-1.126886e-07,-1.036198e-07,0.000000e+00
96,-1.036263e-07,-1.036397e-07,0.000000e+00,-1.094885e-07,0.000000e+00,0.000000e+00,-1.036450e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.036749e-07,0.000000e+00,0.000000e+00,,-1.034247e-07,-1.161335e-07,0.000000e+00
97,-2.190403e-07,-2.129553e-07,-1.034891e-07,-2.069885e-07,-1.036217e-07,-1.035506e-07,-2.190821e-07,-1.033932e-07,-1.036274e-07,-1.062382e-07,...,-1.036801e-07,-1.126321e-07,-1.035204e-07,-2.130298e-07,-1.034685e-07,-1.126886e-07,-1.034247e-07,,-2.069524e-07,-1.034672e-07
98,-2.073559e-07,-2.073826e-07,-1.064827e-07,-2.190923e-07,-1.066231e-07,-1.065478e-07,-2.073933e-07,-1.094329e-07,-1.066291e-07,-1.034642e-07,...,-1.066849e-07,-1.035720e-07,-1.162541e-07,-2.074532e-07,-1.095173e-07,-1.036198e-07,-1.161335e-07,-2.069524e-07,,-1.127528e-07


np.nanmean(norm_err)=np.float64(-1.0888144702486005e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0888144702486005e-07)
    np.nanmedian(norm_err)=np.float64(-1.0510624339598263e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0510624339598263e-07)
    
