In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Report the run timestamp, Python/machine details, and imported package versions.
%watermark -diwmuv -iv


Last updated: 2025-08-03T00:24:58.808393+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

numpy                             : 2.1.2
pandas                            : 2.2.3
hstrat                            : 1.20.10
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Subdirectory name tagging this notebook's outputs; presumably consumed by
# teeplot when saving figures (no use is visible in these cells -- TODO confirm).
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _tree_to_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise distance matrix of a reconstructed tree.

    The tree dataframe is converted to a dendropy tree via
    ``apc.RosettaTree``; its phylogenetic distance matrix is then loaded into
    a DataFrame whose rows and columns are both sorted by label, so that two
    matrices built from the same label set line up positionally.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the data table object;
    # it may change between library versions.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: int | None = None,
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstruction.

    A random subset of genomes is deserialized into hstrat surfaces, each
    surface's storage is shuffled in place (to ensure synthetic data), and a
    tree is built from the same population twice: once with
    ``hstrat.build_tree_trie`` ("naive") and once with
    ``hstrat.build_tree_searchtable`` ("shortcut"). The pairwise distance
    matrices of both trees are then compared elementwise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a ``data_hex`` column and the columns
        ``dstream_S``, ``dstream_storage_bitoffset``,
        ``dstream_storage_bitwidth``, ``dstream_T_bitoffset`` and
        ``dstream_T_bitwidth``, each holding one unique value among the
        sampled rows. The caller's frame is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with only the
        name ``dstream`` bound (e.g. ``"dstream.tilted_algo"``).
    sample_size : int, default 100
        Maximum number of genomes to compare; kept small so the
        dendropy/naive pipeline can handle it. Clamped to the number of
        available rows.
    seed : int, optional
        Seed for row sampling and storage shuffling. If omitted, NumPy's
        global random state is used, as before this parameter existed.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / mean(naive, shortcut)``
        distances, with rows and columns in sorted taxon-label order.
        Entries where both distances are zero (e.g. the diagonal) are NaN.

    Raises
    ------
    ValueError
        If a configuration column holds more than one distinct value, or if
        the two reconstructions do not yield the same set of taxon labels.
    """
    if seed is None:
        # legacy path: defer to NumPy's global random state
        sample_state, shuffle = None, np.random.shuffle
    else:
        rng = np.random.default_rng(seed)
        sample_state, shuffle = rng, rng.shuffle

    # sample to size dendropy/naive can handle
    num_sampled = min(sample_size, len(raw_genome_df))
    sampled_df = raw_genome_df.sample(
        num_sampled, random_state=sample_state
    ).assign(taxon_label=np.arange(num_sampled).astype(str))

    # SECURITY: `eval` executes arbitrary expressions (builtins remain
    # reachable); only pass trusted, hard-coded algorithm names here.
    algo = eval(str(dstream_algo), {"dstream": dstream})

    # every surface-layout column must be constant across the sample;
    # `.item()` raises ValueError otherwise
    surface_config = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }

    population = [
        hstrat.surf_from_hex(genome_hex, dstream_algo=algo, **surface_config)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _tree_to_distance_frame(naive_df)
    shortcut_dist = _tree_to_distance_frame(shortcut_df)

    # the comparison below is positional, so both matrices must be labelled
    # identically or the result would silently pair up unrelated taxa
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions disagree on taxon labels"
        )

    naive = naive_dist.to_numpy()
    shortcut = shortcut_dist.to_numpy()
    # 0/0 is expected wherever both distances are zero (e.g. the diagonal);
    # let it produce NaN without emitting a RuntimeWarning
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# genomes from the dataset annotated "sample" in the original cell
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")

# run the same comparison once per dstream algorithm: show the full
# normalized-error matrix, then its summary statistics
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10117.97it/s]
100%|██████████| 100/100 [00:00<00:00, 413.29it/s]
6112it [00:00, 644110.20it/s]
100%|██████████| 100/100 [00:00<00:00, 219597.07it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.847047,0.000000,0.000000,0.410499,0.000000,0.000000,0.173659,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,1.066225,0.000000,0.000000,0.000000
1,0.000000,,-0.030183,-0.687252,0.822831,0.148885,-0.186546,0.564287,0.093416,0.000000,...,0.097769,0.122036,0.000000,0.093943,0.000000,0.0,-0.450381,0.567263,0.000000,0.057115
2,0.000000,-0.030183,,-0.219576,0.000000,0.000000,-0.220234,0.000000,0.000000,0.000000,...,0.000000,0.000000,-0.032727,0.000000,0.000000,0.0,-0.292277,0.000000,0.000000,0.000000
3,0.847047,-0.687252,-0.219576,,-0.070005,-0.083900,0.000000,-0.045281,-0.041860,0.259061,...,-0.044525,-0.061146,-0.741987,-0.042177,0.110508,0.0,0.000000,-0.045550,0.147503,-0.120831
4,0.000000,0.822831,0.000000,-0.070005,,0.000000,-0.070105,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.890222,0.000000,0.000000,0.0,-0.079458,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000
96,1.066225,-0.450381,-0.292277,0.000000,-0.079458,-0.097852,0.000000,-0.049056,-0.045066,0.332167,...,-0.048170,-0.068237,-0.485595,-0.045434,0.136062,0.0,,-0.049372,0.196850,-0.152056
97,0.000000,0.567263,0.000000,-0.045550,0.000000,0.000000,-0.045592,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.598498,0.000000,0.000000,0.0,-0.049372,,0.000000,0.000000
98,0.000000,0.000000,0.000000,0.147503,0.000000,0.000000,0.147949,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.196850,0.000000,,0.000000


np.nanmean(norm_err)=np.float64(-0.0027595137541759113)
    np.nanmean(np.abs(norm_err))=np.float64(0.06713966092816427)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 35374.07it/s]
100%|██████████| 100/100 [00:00<00:00, 432.73it/s]
5962it [00:00, 613023.15it/s]
100%|██████████| 100/100 [00:00<00:00, 249215.92it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,1.231716e-03,-1.862026e-03,1.176406e-03,1.167780e-03,-1.430978e-03,-1.501101e-03,-1.149623e-02,-1.082109e-03,-2.394287e-03,...,-9.690421e-03,-1.739041e-07,-1.426051e-03,-1.832126e-03,0.003231,2.248161e-03,2.470663e-03,1.932692e-03,1.076541e-03,1.968691e-03
1,0.001232,,-2.159659e-07,0.000000e+00,-1.401437e-07,0.000000e+00,-1.763236e-07,-1.661493e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,1.227338e-03,0.000000e+00,0.000000e+00,0.001018,-2.640781e-07,-3.030185e-07,0.000000e+00,-1.428683e-07,0.000000e+00
2,-0.001862,-2.159659e-07,,-2.025120e-07,-4.009844e-07,-3.618310e-02,-5.838494e-07,1.862977e-01,-2.522662e-07,-6.954265e-07,...,5.368332e-02,-1.851467e-03,-2.697241e-07,-4.073943e-07,-0.002851,-1.196375e-06,-1.482660e-06,-4.430102e-07,-3.589027e-07,-4.584976e-07
3,0.001176,0.000000e+00,-2.025120e-07,,-1.343517e-07,0.000000e+00,-1.672518e-07,-1.580702e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,1.172419e-03,0.000000e+00,0.000000e+00,0.000980,-2.442374e-07,-2.674642e-07,0.000000e+00,-1.255898e-07,0.000000e+00
4,0.001168,-1.401437e-07,-4.009844e-07,-1.343517e-07,,-1.589511e-07,-3.317435e-07,-3.136739e-07,-1.233445e-07,-2.484598e-07,...,-1.344781e-07,1.163852e-03,-1.584609e-07,-1.977129e-07,0.000974,-4.788250e-07,-5.189278e-07,-2.099756e-07,-2.475948e-07,-2.133920e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.002248,-2.640781e-07,-1.196375e-06,-2.442374e-07,-4.788250e-07,-3.361080e-07,-7.372601e-07,-6.535860e-07,-2.087086e-07,-1.410858e-06,...,-2.427099e-07,2.233391e-03,-3.339234e-07,-5.741087e-07,0.001625,,-3.957849e-06,-7.073724e-07,-4.229278e-07,-7.477002e-07
96,0.002471,-3.030185e-07,-1.482660e-06,-2.674642e-07,-5.189278e-07,-3.770107e-07,-8.368351e-07,-7.306601e-07,-2.237848e-07,-2.590684e-06,...,-2.633412e-07,2.452806e-03,-3.742643e-07,-7.047015e-07,0.001738,-3.957849e-06,,-1.078688e-06,-4.714414e-07,-1.018458e-06
97,0.001933,0.000000e+00,-4.430102e-07,0.000000e+00,-2.099756e-07,0.000000e+00,-3.031852e-07,-2.743026e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,1.921801e-03,0.000000e+00,0.000000e+00,0.001453,-7.073724e-07,-1.078688e-06,,-2.018547e-07,0.000000e+00
98,0.001077,-1.428683e-07,-3.589027e-07,-1.255898e-07,-2.475948e-07,-1.454322e-07,-3.024085e-07,-2.873206e-07,-1.150458e-07,-2.169380e-07,...,-1.246732e-07,1.073211e-03,-1.450216e-07,-1.772216e-07,0.000909,-4.229278e-07,-4.714414e-07,-2.018547e-07,,-1.921049e-07


np.nanmean(norm_err)=np.float64(-0.0008175053297644278)
    np.nanmean(np.abs(norm_err))=np.float64(0.00986126447890741)
    np.nanmedian(norm_err)=np.float64(-1.5285017575704967e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(7.173172769347348e-07)
    


In [7]:
# genomes from the dataset annotated "tail" in the original cell
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")

# run the same comparison once per dstream algorithm: show the full
# normalized-error matrix, then its summary statistics
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 27140.57it/s]
100%|██████████| 100/100 [00:00<00:00, 400.55it/s]
5950it [00:00, 624058.73it/s]
100%|██████████| 100/100 [00:00<00:00, 269210.78it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 27626.82it/s]
100%|██████████| 100/100 [00:00<00:00, 1011.04it/s]
5947it [00:00, 619838.13it/s]
100%|██████████| 100/100 [00:00<00:00, 343513.84it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.065640e-07,-2.073428e-07,-1.035681e-07,-2.126802e-07,-2.073420e-07,-2.070174e-07,-2.128703e-07,-1.065341e-07,-1.037412e-07,...,-1.037193e-07,-2.070401e-07,-1.035973e-07,-1.036642e-07,-2.322258e-07,-1.036148e-07,-2.258833e-07,-2.131348e-07,-2.256888e-07,-1.094651e-07
1,-1.065640e-07,,-1.037204e-07,0.000000e+00,-1.236321e-07,-1.037200e-07,-1.035575e-07,-1.095447e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.035689e-07,0.000000e+00,0.000000e+00,-1.064488e-07,0.000000e+00,-1.066796e-07,-1.129320e-07,-1.065928e-07,0.000000e+00
2,-2.073428e-07,-1.037204e-07,,-1.162479e-07,-2.070164e-07,-2.132388e-07,-2.254794e-07,-2.071965e-07,-1.036921e-07,-1.164661e-07,...,-1.066705e-07,-2.129196e-07,-1.096026e-07,-1.066123e-07,-2.071247e-07,-1.128656e-07,-2.075615e-07,-2.074472e-07,-2.073973e-07,-1.035759e-07
3,-1.035681e-07,0.000000e+00,-1.162479e-07,,-1.034052e-07,-1.065101e-07,-1.126175e-07,-1.034951e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.063508e-07,0.000000e+00,0.000000e+00,-1.034592e-07,0.000000e+00,-1.036772e-07,-1.036201e-07,-1.035953e-07,0.000000e+00
4,-2.126802e-07,-1.236321e-07,-2.070164e-07,-1.034052e-07,,-2.070156e-07,-2.066920e-07,-2.186161e-07,-1.235918e-07,-1.035778e-07,...,-1.035560e-07,-2.067146e-07,-1.034343e-07,-1.035011e-07,-2.124507e-07,-1.034518e-07,-2.129103e-07,-2.253610e-07,-2.127376e-07,-1.062396e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.036148e-07,0.000000e+00,-1.128656e-07,0.000000e+00,-1.034518e-07,-1.065596e-07,-1.197466e-07,-1.035418e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.064001e-07,0.000000e+00,0.000000e+00,-1.035059e-07,,-1.037241e-07,-1.036670e-07,-1.036421e-07,0.000000e+00
96,-2.258833e-07,-1.066796e-07,-2.075615e-07,-1.036772e-07,-2.129103e-07,-2.075607e-07,-2.072354e-07,-2.131008e-07,-1.066496e-07,-1.038507e-07,...,-1.038288e-07,-2.072582e-07,-1.037065e-07,-1.037736e-07,-2.256244e-07,-1.037241e-07,,-2.133659e-07,-2.328438e-07,-1.095870e-07
97,-2.131348e-07,-1.129320e-07,-2.074472e-07,-1.036201e-07,-2.253610e-07,-2.074463e-07,-2.071214e-07,-2.190965e-07,-1.128984e-07,-1.037935e-07,...,-1.037715e-07,-2.071441e-07,-1.036494e-07,-1.037164e-07,-2.129044e-07,-1.036670e-07,-2.133659e-07,,-2.131925e-07,-1.064665e-07
98,-2.256888e-07,-1.065928e-07,-2.073973e-07,-1.035953e-07,-2.127376e-07,-2.073965e-07,-2.070717e-07,-2.129278e-07,-1.065629e-07,-1.037685e-07,...,-1.037466e-07,-2.070945e-07,-1.036245e-07,-1.036915e-07,-2.254304e-07,-1.036421e-07,-2.328438e-07,-2.131925e-07,,-1.094955e-07


np.nanmean(norm_err)=np.float64(-1.1309824441832272e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1309824441832272e-07)
    np.nanmedian(norm_err)=np.float64(-1.0640836391036587e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0640836391036587e-07)
    
