In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-09-14T04:09:32.565541+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
numpy                             : 2.1.2
downstream                        : 1.14.3
pandas                            : 2.2.3
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> pd.DataFrame:
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    # raw_genome_df = raw_genome_df[raw_genome_df["layer"] > 1700].sample(20).copy()
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
        dstream_S=raw_genome_df["dstream_S"].unique().item(),
        dstream_storage_bitoffset=raw_genome_df["dstream_storage_bitoffset"]
        .unique()
        .item(),
        dstream_storage_bitwidth=raw_genome_df["dstream_storage_bitwidth"]
        .unique()
        .item(),
        dstream_T_bitoffset=raw_genome_df["dstream_T_bitoffset"]
        .unique()
        .item(),
        dstream_T_bitwidth=raw_genome_df["dstream_T_bitwidth"].unique().item(),
    )

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = (
        pd.DataFrame(
            apc.RosettaTree(naive_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    shortcut_dist = (
        pd.DataFrame(
            apc.RosettaTree(shortcut_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    return (naive_dist.values - shortcut_dist.values) / (
        naive_dist.values / 2 + shortcut_dist.values / 2
    )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 12322.05it/s]
100%|██████████| 100/100 [00:00<00:00, 395.18it/s]
6139it [00:00, 678064.79it/s]
100%|██████████| 100/100 [00:00<00:00, 236298.82it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.031810,-0.074715,0.046336,-0.375612,-0.467450,0.055002,-0.059275,-0.040188,0.036258,...,0.000000,0.000000,-0.046064,-0.043642,-0.041500,0.0,0.051974,0.009818,0.050327,0.045616
1,0.031810,,-0.271745,0.171368,-0.031414,-0.036170,0.143774,-0.335187,-0.063697,1.122990,...,0.041913,0.043677,-0.070858,-0.259988,-0.249050,0.0,0.187855,0.006329,0.403146,0.124278
2,-0.074715,-0.271745,,-0.177342,0.280676,0.397334,-0.240920,0.000000,0.217002,-0.121007,...,-0.148134,-0.169288,0.281634,0.000000,0.000000,0.0,-0.216717,-0.075886,-0.204506,-0.172777
3,0.046336,0.171368,-0.177342,,-0.043092,-0.052577,-0.324562,-0.135473,-0.235071,0.054149,...,0.071408,0.076684,-1.270220,-0.096132,-0.090961,0.0,0.000000,0.011232,0.085317,-0.239684
4,-0.375612,-0.031414,0.280676,-0.043092,,0.000000,-0.049433,0.358511,0.439668,-0.035140,...,-0.309242,-0.326256,0.278141,0.263436,0.250434,0.0,-0.047267,-0.022899,-0.046067,-0.042546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000
96,0.051974,0.187855,-0.216717,0.000000,-0.047267,-0.058927,-0.416973,-0.157306,-0.260127,0.060905,...,0.085742,0.093464,-1.212409,-0.106635,-0.100309,0.0,,0.013766,0.103386,-0.286589
97,0.009818,0.006329,-0.075886,0.011232,-0.022899,-0.030771,0.015331,-0.044480,-0.022796,0.007632,...,0.020488,0.023773,-0.028253,-0.025900,-0.023942,0.0,0.013766,,0.012979,0.010939
98,0.050327,0.403146,-0.204506,0.085317,-0.046067,-0.057073,0.000000,-0.150771,-0.437840,0.000000,...,0.081351,0.088270,-0.301216,-0.103591,-0.097611,0.0,0.103386,0.012979,,0.000000


np.nanmean(norm_err)=np.float64(-0.015620048916618981)
    np.nanmean(np.abs(norm_err))=np.float64(0.14502523174789655)
    np.nanmedian(norm_err)=np.float64(-0.008556895344584755)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0709695961651616)
    


100%|██████████| 100/100 [00:00<00:00, 33681.07it/s]
100%|██████████| 100/100 [00:00<00:00, 415.11it/s]
6010it [00:00, 577815.23it/s]
100%|██████████| 100/100 [00:00<00:00, 282064.83it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,-0.010252,-3.843498e-07,-0.000903,3.696577e-02,-3.736990e-07,4.231086e-04,5.369387e-02,0.000000e+00,...,-1.269271e-02,-1.814170e-07,-0.001456,-3.804951e-07,0.000000e+00,0.000000e+00,0.000000e+00,3.060834e-02,0.000000e+00,5.076087e-02
1,0.000000e+00,,0.063560,-2.516212e-02,0.000681,0.000000e+00,-2.486406e-02,-4.971970e-03,0.000000e+00,0.000000e+00,...,0.000000e+00,-9.871091e-03,0.000790,-2.027284e-02,-1.926370e-02,0.000000e+00,-9.610395e-03,0.000000e+00,-1.726056e-02,-1.584557e-07
2,-1.025212e-02,6.355951e-02,,7.447460e-03,0.000408,-2.969183e-04,7.348062e-03,-1.345925e-07,-3.570555e-04,7.588012e-03,...,-1.484812e-01,4.864735e-03,0.000486,7.168888e-04,5.535201e-03,8.881214e-03,4.725474e-03,-2.669441e-04,5.986697e-04,-3.482374e-04
3,-3.843498e-07,-2.516212e-02,0.007447,,0.001407,-2.351271e-07,-6.713453e-07,-7.843949e-03,-3.158948e-07,1.833735e-02,...,-2.255909e-07,4.408432e-02,0.001970,-3.898703e-02,-2.576356e-07,2.446728e-02,1.325796e-01,-2.008848e-07,-2.919047e-02,-6.052472e-07
4,-9.029984e-04,6.806905e-04,0.000408,1.406805e-03,,-1.901203e-01,1.370722e-03,0.000000e+00,-3.004590e-03,1.459707e-03,...,-4.847165e-04,6.919384e-04,0.000000,3.510776e-02,8.414282e-04,2.051590e-03,6.634396e-04,-1.570784e-01,2.524160e-02,-2.857106e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,0.000000e+00,0.008881,2.446728e-02,0.002052,0.000000e+00,2.371687e-02,-4.631168e-02,0.000000e+00,0.000000e+00,...,0.000000e+00,1.091411e-02,0.003519,-1.525161e-02,1.353195e-02,,1.042323e-02,0.000000e+00,-1.050215e-02,-4.132962e-07
96,0.000000e+00,-9.610395e-03,0.004725,1.325796e-01,0.000663,0.000000e+00,4.182200e-02,-4.881946e-03,0.000000e+00,9.123926e-03,...,0.000000e+00,-2.964726e-02,0.000767,-1.516884e-01,1.004924e-01,1.042323e-02,,0.000000e+00,-8.988280e-02,-1.547428e-07
97,3.060834e-02,0.000000e+00,-0.000267,-2.008848e-07,-0.157078,-1.226513e-01,-1.979363e-07,2.774974e-04,1.126981e-01,0.000000e+00,...,0.000000e+00,-1.267693e-07,-0.073554,-1.998268e-07,0.000000e+00,0.000000e+00,0.000000e+00,,0.000000e+00,2.011006e-01
98,0.000000e+00,-1.726056e-02,0.000599,-2.919047e-02,0.025242,0.000000e+00,-2.870120e-02,-6.455495e-03,0.000000e+00,-8.585814e-03,...,0.000000e+00,-1.128485e-01,0.031837,-6.688054e-02,-2.033581e-02,-1.050215e-02,-8.988280e-02,0.000000e+00,,-2.260495e-07


np.nanmean(norm_err)=np.float64(-0.0003976333413701183)
    np.nanmean(np.abs(norm_err))=np.float64(0.011609588092266629)
    np.nanmedian(norm_err)=np.float64(-1.8055515219582712e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0004973587407638967)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 36142.21it/s]
100%|██████████| 100/100 [00:00<00:00, 458.73it/s]
5927it [00:00, 679819.51it/s]
100%|██████████| 100/100 [00:00<00:00, 398698.10it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33648.65it/s]
100%|██████████| 100/100 [00:00<00:00, 1007.97it/s]
5957it [00:00, 46843.74it/s]
100%|██████████| 100/100 [00:00<00:00, 416514.80it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.133511e-07,-1.037706e-07,-1.036966e-07,-2.075800e-07,-1.034938e-07,-1.066504e-07,-1.036714e-07,-2.070858e-07,-2.073790e-07,...,-2.259603e-07,-2.072938e-07,-2.072446e-07,-1.035206e-07,-1.095746e-07,-2.074350e-07,-1.163155e-07,-2.188055e-07,-1.128966e-07,-2.132477e-07
1,-2.133511e-07,,-1.039257e-07,-1.038515e-07,-2.078904e-07,-1.036481e-07,-1.202867e-07,-1.038262e-07,-2.073947e-07,-2.076887e-07,...,-2.136520e-07,-2.076033e-07,-2.075540e-07,-1.036750e-07,-1.066785e-07,-2.077449e-07,-1.067310e-07,-2.130312e-07,-1.067513e-07,-2.197263e-07
2,-1.037706e-07,-1.039257e-07,,0.000000e+00,-1.099299e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.163412e-07,-1.067442e-07,...,-1.039130e-07,-1.066991e-07,-1.066730e-07,0.000000e+00,0.000000e+00,-1.067739e-07,0.000000e+00,-1.036192e-07,0.000000e+00,-1.038767e-07
3,-1.036966e-07,-1.038515e-07,0.000000e+00,,-1.067722e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.065108e-07,-1.129844e-07,...,-1.038387e-07,-1.096865e-07,-1.096590e-07,0.000000e+00,0.000000e+00,-1.097656e-07,0.000000e+00,-1.035454e-07,0.000000e+00,-1.038025e-07
4,-2.075800e-07,-2.078904e-07,-1.099299e-07,-1.067722e-07,,-1.065573e-07,-1.039213e-07,-1.130737e-07,-2.193488e-07,-2.135294e-07,...,-2.078648e-07,-2.134391e-07,-2.133870e-07,-1.096494e-07,-1.037928e-07,-2.135888e-07,-1.038425e-07,-2.072771e-07,-1.038617e-07,-2.077922e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.074350e-07,-2.077449e-07,-1.067739e-07,-1.097656e-07,-2.135888e-07,-1.095384e-07,-1.038486e-07,-1.066688e-07,-2.130656e-07,-2.195153e-07,...,-2.077194e-07,-2.328110e-07,-2.258588e-07,-1.065092e-07,-1.037203e-07,,-1.037699e-07,-2.071325e-07,-1.037891e-07,-2.076469e-07
96,-1.163155e-07,-1.067310e-07,0.000000e+00,0.000000e+00,-1.038425e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.035951e-07,-1.037418e-07,...,-1.130423e-07,-1.036992e-07,-1.036746e-07,0.000000e+00,0.000000e+00,-1.037699e-07,,-1.094610e-07,0.000000e+00,-1.066792e-07
97,-2.188055e-07,-2.130312e-07,-1.036192e-07,-1.035454e-07,-2.072771e-07,-1.033432e-07,-1.064905e-07,-1.035203e-07,-2.067844e-07,-2.070767e-07,...,-2.191220e-07,-2.069917e-07,-2.069427e-07,-1.033700e-07,-1.277211e-07,-2.071325e-07,-1.094610e-07,,-1.094824e-07,-2.129281e-07
98,-1.128966e-07,-1.067513e-07,0.000000e+00,0.000000e+00,-1.038617e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036143e-07,-1.037611e-07,...,-1.165187e-07,-1.037184e-07,-1.036938e-07,0.000000e+00,0.000000e+00,-1.037891e-07,0.000000e+00,-1.094824e-07,,-1.066996e-07


np.nanmean(norm_err)=np.float64(-1.0464132122314915e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0464132122314915e-07)
    np.nanmedian(norm_err)=np.float64(-1.0387083495561733e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0387083495561733e-07)
    
