In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-10T15:28:31.583345+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3
hstrat                            : 1.20.10
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Dated label for this analysis, echoed below as the cell's output.
# NOTE(review): no cell visible here reads this name — presumably it is the
# output subdirectory handed to teeplot by plotting cells elsewhere; confirm.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the label-sorted pairwise phylogenetic distance table of a tree.

    The tree dataframe is wrapped in `apc.RosettaTree`, converted to dendropy,
    and dendropy's phylogenetic distance matrix is read out as a DataFrame.
    Rows and columns are both sorted by label so that tables built from
    different reconstructions of the same taxa line up position-by-position.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the data table; this
    # may break on a dendropy upgrade. Kept because no public accessor is
    # used anywhere in this notebook.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    *,
    n_samples: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive-trie and searchtable tree reconstructions pairwise.

    A random subset of genomes is deserialized into hstrat surfaces, each
    surface's storage is shuffled, and two trees are reconstructed from the
    same population: one with `hstrat.build_tree_trie` ("naive") and one with
    `hstrat.build_tree_searchtable` ("shortcut"). The pairwise phylogenetic
    distances of both trees are then compared element-wise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`. Each `dstream_*`
        column must hold a single distinct value among the sampled rows.
        The caller's frame is not modified.
    dstream_algo : str
        Python expression naming the dstream algorithm, evaluated with only
        `dstream` bound, e.g. ``"dstream.tilted_algo"``.
    n_samples : int, default 100
        Maximum number of genomes to sample (kept small so the naive
        reconstruction and dendropy stay tractable). If the frame has fewer
        rows, all rows are used.
    seed : int, optional
        When given, a dedicated `np.random.default_rng(seed)` drives both the
        row sampling and the storage shuffle so those steps are repeatable.
        When None (default), numpy's global random state is used.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / ((naive + shortcut) / 2)``.
        Rows/columns follow the sorted taxon labels; the labels are the
        strings "0", "1", ..., so the order is lexicographic, not numeric.
        Entries where both distances are zero are NaN.

    Raises
    ------
    ValueError
        If there are no genomes to sample, if a `dstream_*` column holds more
        than one distinct value, or if the two reconstructions do not yield
        the same set of taxon labels.
    """
    if seed is None:
        # Legacy behavior: draw from numpy's global random state.
        sample_state, shuffle = None, np.random.shuffle
    else:
        rng = np.random.default_rng(seed)
        sample_state, shuffle = rng, rng.shuffle

    # sample to size dendropy/naive can handle
    sampled = raw_genome_df.sample(
        min(n_samples, len(raw_genome_df)),
        random_state=sample_state,
    )
    if sampled.empty:
        raise ValueError("raw_genome_df has no rows to sample")
    # `.sample` returned a new frame and `.assign` returns another, so the
    # caller's frame is never written to.
    genomes = sampled.assign(
        taxon_label=np.arange(len(sampled)).astype(str),
    )

    # `.unique().item()` raises unless the column holds exactly one distinct
    # value, i.e. every sampled genome must share the same surface layout.
    surf_kwargs = {
        column: genomes[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(security): `eval` executes arbitrary Python. Only `dstream` is
    # bound, but builtins remain reachable, so `dstream_algo` must never come
    # from untrusted input.
    surf_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(genomes["data_hex"].astype(str))
    ]
    for surface in population:
        # ensure synthetic data: scramble each surface's storage in place
        # NOTE(review): reaches into private attributes of the hstrat surface.
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genomes["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genomes["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)
    # The arrays below are combined positionally, so the label axes must
    # agree or the comparison would silently pair up different taxa.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions disagree on taxon labels"
        )

    naive = naive_dist.values
    shortcut = shortcut_dist.values
    # 0/0 entries are expected and intentionally left as NaN for the caller's
    # nan-aware statistics; silence the RuntimeWarning they would trigger.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Reconstruction consistency on the "sample" genome dataset: for each dstream
# algorithm, show the normalized error matrix between the two reconstructions
# and print its nan-aware mean/median (signed and absolute).
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # The f-string's continuation lines are deliberately kept at a four-space
    # indent: they are part of the printed text, not of the code layout.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10445.80it/s]
100%|██████████| 100/100 [00:00<00:00, 356.83it/s]
6131it [00:00, 600880.41it/s]
100%|██████████| 100/100 [00:00<00:00, 240361.26it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.000004,0.000000,-0.080670,-0.087867,0.000000,0.0,0.000000,-0.121755,0.000000,...,-0.113114,0.000000,-0.078519,0.000000,0.000000,-0.000003,0.000000,0.000000,0.000000,0.000000
1,-0.000004,,-0.000004,-0.077997,-0.084705,-0.000004,0.0,-0.000005,-0.115766,0.000000,...,-0.107927,-0.250869,-0.075984,-0.000003,-0.219216,-0.186650,0.301005,0.354744,0.600199,0.000003
2,0.000000,-0.000004,,-0.091615,-0.101012,0.000000,0.0,0.000000,-0.148537,0.000000,...,-0.135875,0.000000,-0.088850,0.000000,0.000000,-0.000004,0.000000,0.000000,0.000000,0.000000
3,-0.080670,-0.077997,-0.091615,,0.000000,-0.088794,0.0,-0.108430,0.000000,0.069011,...,0.000000,-0.116417,0.604008,-0.077333,-0.102655,-0.069768,-0.188962,-0.220402,-0.257005,-0.070750
4,-0.087867,-0.084705,-0.101012,0.000000,,-0.097593,0.0,-0.121845,0.000000,0.080257,...,0.000000,-0.132023,0.688448,-0.083923,-0.114599,-0.075087,-0.209011,-0.248168,-0.295567,-0.076225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.000003,-0.186650,-0.000004,-0.069768,-0.075087,-0.000003,0.0,-0.000004,-0.098519,0.000000,...,-0.092784,-0.210872,-0.068153,-0.000003,-0.188050,,0.364340,0.708439,0.345761,0.000003
96,0.000000,0.301005,0.000000,-0.188962,-0.209011,0.000000,0.0,0.000000,-0.312430,0.000000,...,-0.284544,0.000000,-0.530565,0.000000,0.000000,0.364340,,0.000000,0.000000,0.000000
97,0.000000,0.354744,0.000000,-0.220402,-0.248168,0.000000,0.0,0.000000,-0.408863,0.000000,...,-0.362386,0.000000,-1.012423,0.000000,0.000000,0.708439,0.000000,,0.000000,0.000000
98,0.000000,0.600199,0.000000,-0.257005,-0.295567,0.000000,0.0,0.000000,-0.555674,0.000000,...,-0.473195,0.000000,-0.561665,0.000000,0.000000,0.345761,0.000000,0.000000,,0.000000


np.nanmean(norm_err)=np.float64(-0.008269076775891176)
    np.nanmean(np.abs(norm_err))=np.float64(0.07590875116928969)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33098.99it/s]
100%|██████████| 100/100 [00:00<00:00, 423.92it/s]
5985it [00:00, 605823.67it/s]
100%|██████████| 100/100 [00:00<00:00, 268693.40it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-3.424869e-07,-5.610657e-07,-1.974750e-07,-6.173524e-07,0.004078,-4.200727e-07,-0.030957,0.003721,0.002540,...,-6.163068e-07,0.002344,-0.002655,0.002678,-7.794703e-07,-2.299779e-07,-5.185677e-07,-7.351690e-07,2.078472e-02,-3.009745e-07
1,-3.424869e-07,,-4.641809e-07,-1.629824e-07,-4.784381e-07,0.006796,-3.632994e-07,0.040067,0.006294,0.004522,...,-4.854099e-07,0.004211,0.026059,0.004739,-5.896600e-07,-1.926571e-07,-4.058049e-07,-5.772867e-07,-1.367561e-07,-2.088792e-07
2,-5.610657e-07,-4.641809e-07,,-2.590245e-07,-1.049717e-06,0.012913,-6.255202e-07,0.036053,0.011214,0.006604,...,-1.083873e-06,0.005960,0.017466,0.007077,-1.790845e-06,-3.429856e-07,-7.537263e-07,-1.729025e-06,-1.985193e-07,-3.980159e-07
3,-1.974750e-07,-1.629824e-07,-2.590245e-07,,-2.828347e-07,0.003797,-1.977428e-07,-0.028666,0.003485,0.002427,...,-2.823957e-07,0.002248,-0.002533,0.002553,-3.494257e-07,0.000000e+00,-2.659890e-07,-3.315148e-07,1.985303e-02,0.000000e+00
4,-6.173524e-07,-4.784381e-07,-1.049717e-06,-2.828347e-07,,0.006989,-6.448054e-07,-0.089544,0.006001,0.003429,...,-1.261219e-06,0.003082,-0.003631,0.003685,-2.206349e-06,-3.719028e-07,-8.589298e-07,-1.884849e-06,-5.722170e-02,-4.571515e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.299779e-07,-1.926571e-07,-3.429856e-07,0.000000e+00,-3.719028e-07,0.004872,-2.431900e-07,-0.005277,0.004371,0.002826,...,-3.803967e-07,0.002586,-0.096245,0.002999,-5.217098e-07,,-2.909444e-07,-4.827670e-07,0.000000e+00,0.000000e+00
96,-5.185677e-07,-4.058049e-07,-7.537263e-07,-2.659890e-07,-8.589298e-07,0.005287,-5.194914e-07,-0.041053,0.004702,0.002961,...,-8.569072e-07,0.002699,-0.003115,0.003151,-1.208691e-06,-2.909444e-07,,-1.105399e-06,2.429508e-02,-3.565578e-07
97,-7.351690e-07,-5.772867e-07,-1.729025e-06,-3.315148e-07,-1.884849e-06,0.019389,-8.499220e-07,0.057367,0.015796,0.007965,...,-1.997896e-06,0.007047,0.021299,0.008662,-7.336515e-06,-4.827670e-07,-1.105399e-06,,-2.384864e-07,-5.994196e-07
98,2.078472e-02,-1.367561e-07,-1.985193e-07,1.985303e-02,-5.722170e-02,0.002978,-1.604176e-07,-0.035031,0.002783,0.002065,...,-2.119639e-07,0.001933,-0.002143,0.002155,-2.476171e-07,0.000000e+00,2.429508e-02,-2.384864e-07,,2.494737e-02


np.nanmean(norm_err)=np.float64(0.0029535397254279584)
    np.nanmean(np.abs(norm_err))=np.float64(0.013052755120253065)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0035554432025805766)
    


In [7]:
# Reconstruction consistency on the "tail" genome dataset: for each dstream
# algorithm, show the normalized error matrix between the two reconstructions
# and print its nan-aware mean/median (signed and absolute).
# NOTE: this rebinds `raw_genome_df_`, replacing whatever dataset an earlier
# cell loaded under the same name.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # The f-string's continuation lines are deliberately kept at a four-space
    # indent: they are part of the printed text, not of the code layout.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35690.13it/s]
100%|██████████| 100/100 [00:00<00:00, 433.31it/s]
5958it [00:00, 671927.70it/s]
100%|██████████| 100/100 [00:00<00:00, 338250.32it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31254.13it/s]
100%|██████████| 100/100 [00:00<00:00, 1027.05it/s]
5962it [00:00, 42911.09it/s]
100%|██████████| 100/100 [00:00<00:00, 366314.76it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.035424e-07,-1.035207e-07,-1.160861e-07,-1.035196e-07,-1.065292e-07,-1.035100e-07,-2.067405e-07,-1.033659e-07,-2.130839e-07,...,-1.035713e-07,-1.036745e-07,-1.034351e-07,-2.127605e-07,-2.185171e-07,-2.190983e-07,-2.068806e-07,-1.035965e-07,-2.069865e-07,-2.066761e-07
1,-1.035424e-07,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.064343e-07,0.000000e+00,-1.038259e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.036723e-07,-1.035167e-07,-1.037775e-07,-1.128078e-07,0.000000e+00,-1.065647e-07,-1.236435e-07
2,-1.035207e-07,0.000000e+00,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.094649e-07,0.000000e+00,-1.038041e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.036505e-07,-1.034950e-07,-1.037557e-07,-1.064855e-07,0.000000e+00,-1.096028e-07,-1.063772e-07
3,-1.160861e-07,0.000000e+00,0.000000e+00,,0.000000e+00,0.000000e+00,0.000000e+00,-1.034946e-07,0.000000e+00,-1.066740e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.065119e-07,-1.093974e-07,-1.096888e-07,-1.035648e-07,0.000000e+00,-1.036178e-07,-1.034623e-07
4,-1.035196e-07,0.000000e+00,0.000000e+00,0.000000e+00,,0.000000e+00,0.000000e+00,-1.064102e-07,0.000000e+00,-1.038030e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.036495e-07,-1.034939e-07,-1.037546e-07,-1.198686e-07,0.000000e+00,-1.065405e-07,-1.126592e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.190983e-07,-1.037775e-07,-1.037557e-07,-1.096888e-07,-1.037546e-07,-1.067781e-07,-1.037449e-07,-2.072092e-07,-1.036002e-07,-2.135818e-07,...,-1.038065e-07,-1.039102e-07,-1.036697e-07,-2.132568e-07,-2.323842e-07,,-2.073499e-07,-1.038318e-07,-2.074562e-07,-2.071444e-07
96,-2.068806e-07,-1.128078e-07,-1.064855e-07,-1.035648e-07,-1.198686e-07,-1.037111e-07,-1.064742e-07,-2.126527e-07,-1.063217e-07,-2.074465e-07,...,-1.065390e-07,-1.129646e-07,-1.126804e-07,-2.071399e-07,-2.068292e-07,-2.073499e-07,,-1.096282e-07,-2.129130e-07,-2.251306e-07
97,-1.035965e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.064914e-07,0.000000e+00,-1.038803e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.037265e-07,-1.035707e-07,-1.038318e-07,-1.096282e-07,,-1.066219e-07,-1.095134e-07
98,-2.069865e-07,-1.065647e-07,-1.096028e-07,-1.036178e-07,-1.065405e-07,-1.037644e-07,-1.128322e-07,-2.321902e-07,-1.236132e-07,-2.075530e-07,...,-1.096595e-07,-1.067046e-07,-1.064510e-07,-2.072460e-07,-2.069350e-07,-2.074562e-07,-2.129130e-07,-1.066219e-07,,-2.126964e-07


np.nanmean(norm_err)=np.float64(-9.807879275942943e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.807879275942943e-08)
    np.nanmedian(norm_err)=np.float64(-1.0377912191786999e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0377912191786999e-07)
    
