In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record the run timestamp, interpreter/platform details, and the versions of
# the imported packages alongside the results, for provenance.
%watermark -diwmuv -iv


Last updated: 2025-05-09T18:55:35.970477+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
hstrat                            : 1.20.10
downstream                        : 1.14.3
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Output subdirectory name for this notebook's saved plots.
# NOTE(review): presumably consumed by teeplot calls; none appear in the
# cells shown here -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance table for a reconstructed tree.

    Rows and columns are sorted by label so that tables computed from
    different reconstructions of the same population line up cell-for-cell.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the returned table
    # object; it may change between library releases.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled in place, and both `hstrat.build_tree_trie` and
    `hstrat.build_tree_searchtable` are run on the resulting population. The
    pairwise tip-to-tip distances of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column plus `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`; each of the latter
        must hold a single distinct value across the sampled rows.
    dstream_algo : str
        Expression naming the downstream algorithm, evaluated with only
        `dstream` in scope (e.g. `"dstream.tilted_algo"`).
    sample_size : int, default 100
        Number of genomes to sample; kept small enough for the
        dendropy/naive pipeline to handle.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / mean(naive, shortcut)` for
        every taxon pair, ordered by sorted taxon label. Entries where both
        distances are zero (such as the diagonal) are NaN.

    Raises
    ------
    ValueError
        If a surface-layout column holds more than one distinct value, if
        `sample_size` exceeds the number of rows, or if the two
        reconstructions do not cover the same taxon labels.

    Notes
    -----
    Row sampling and the storage shuffle are unseeded here, so results vary
    between runs.
    """
    sampled_df = raw_genome_df.sample(sample_size)
    sampled_df["taxon_label"] = np.arange(len(sampled_df)).astype(str)

    surface_layout_fields = (
        "dstream_S",
        "dstream_storage_bitoffset",
        "dstream_storage_bitwidth",
        "dstream_T_bitoffset",
        "dstream_T_bitwidth",
    )
    # `.unique().item()` doubles as a homogeneity check: it raises unless the
    # column holds exactly one distinct value.
    kwargs = {
        field: sampled_df[field].unique().item()
        for field in surface_layout_fields
    }
    # NOTE(review): `eval` on a caller-supplied string; only pass trusted,
    # hard-coded algorithm names here.
    kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The comparison below is positional, so the two tables must be labelled
    # identically or the error matrix would silently pair up the wrong taxa.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance matrices have mismatched taxon labels"
        )

    naive = naive_dist.to_numpy()
    shortcut = shortcut_dist.to_numpy()
    # 0/0 on the diagonal (and for any zero-distance pair) is expected and
    # yields NaN; silence the accompanying RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Load the "sample" genome dataset, then report the normalized distance error
# between naive and shortcut reconstructions for each downstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # Summary statistics skip NaN entries (e.g. the matrix diagonal).
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10938.62it/s]
100%|██████████| 100/100 [00:00<00:00, 349.17it/s]
6131it [00:00, 614198.86it/s]
100%|██████████| 100/100 [00:00<00:00, 147790.84it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.089122,0.000000,0.000000,0.000000,0.102315,0.000000,0.142937,0.000000,0.000000,0.062213
1,0.000000,,-0.132237,-0.262608,0.000000,0.000000,-0.575865,0.000000,-0.429454,0.211817,...,-0.020802,0.494573,0.154949,0.000000,-0.022236,-0.203826,-0.025628,0.175434,0.000000,-0.017102
2,0.000000,-0.132237,,-0.117386,0.000000,-0.124311,-0.331079,-0.104866,-0.379409,-0.474057,...,-0.016920,0.143302,0.282640,-0.113221,-0.017857,0.000000,-0.019981,-0.572370,-0.239547,-0.014388
3,0.000000,-0.262608,-0.117386,,0.000000,-0.544989,-0.459115,1.055880,-0.350826,0.162434,...,-0.018304,0.393110,0.131546,0.210060,-0.019405,-0.170564,-0.021939,0.140146,0.000000,-0.015377
4,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.041318,0.000000,0.000000,0.000000,-0.047388,0.000000,-0.066006,0.000000,0.000000,-0.028900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,-0.203826,0.000000,-0.170564,0.000000,-0.185587,-0.636259,-0.145350,-0.709308,-1.029701,...,-0.024457,0.263780,0.478623,-0.161910,-0.026464,,-0.031411,-1.160385,-0.458183,-0.019498
96,0.142937,-0.025628,-0.019981,-0.021939,-0.066006,-0.023626,-0.044886,-0.019028,-0.040635,-0.039235,...,0.000000,-0.232584,-0.163585,-0.020952,0.000000,-0.031411,,-0.033375,-0.037771,0.378459
97,0.000000,0.175434,-0.572370,0.140146,0.000000,0.155680,-0.702877,0.115459,-0.778950,0.000000,...,-0.025631,0.768339,0.205389,0.131484,-0.027843,-1.160385,-0.033375,,-0.155986,-0.020237
98,0.000000,0.000000,-0.239547,0.000000,0.000000,0.000000,-0.873146,0.000000,-1.186917,-0.224574,...,-0.028147,0.976496,0.234828,0.000000,-0.030838,-0.458183,-0.037771,-0.155986,,-0.021773


np.nanmean(norm_err)=np.float64(-0.010292421527612474)
    np.nanmean(np.abs(norm_err))=np.float64(0.12438767783964981)
    np.nanmedian(norm_err)=np.float64(-0.013402675766029295)
    np.nanmedian(np.abs(norm_err))=np.float64(0.06253323177802704)
    


100%|██████████| 100/100 [00:00<00:00, 32724.54it/s]
100%|██████████| 100/100 [00:00<00:00, 419.15it/s]
5973it [00:00, 605895.76it/s]
100%|██████████| 100/100 [00:00<00:00, 248920.12it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-4.618976e-02,3.102403e-02,-2.161260e-07,-0.014228,-3.476725e-02,1.776704e-03,-0.004677,-0.024762,-0.063326,...,2.269568e-03,-0.121825,2.274375e-03,6.636898e-02,1.699866e-02,0.016705,-1.794793e-07,2.877441e-03,-5.299792e-02,5.902860e-03
1,-4.618976e-02,,-2.557485e-03,-3.011294e-02,-0.020192,0.000000e+00,-2.554522e-07,-0.007044,-0.085409,-0.033052,...,-2.458706e-03,-0.074494,-3.422207e-07,-6.022601e-03,-7.070104e-03,-0.006916,-3.844863e-02,-4.588308e-07,-3.587981e-07,-7.502230e-03
2,3.102403e-02,-2.557485e-03,,2.331238e-02,-0.001493,-2.785533e-03,1.230978e-03,-0.001126,0.006647,0.024893,...,0.000000e+00,-0.001529,1.450951e-03,0.000000e+00,-2.050111e-07,0.000000,2.758295e-02,1.674893e-03,-2.798252e-03,-2.092864e-07
3,-2.161260e-07,-3.011294e-02,2.331238e-02,,-0.008585,-2.168907e-02,1.183794e-03,-0.003266,-0.017328,-0.046368,...,1.726345e-03,-0.070888,1.385872e-03,3.886548e-02,9.520649e-03,0.009428,-2.550151e-07,1.588800e-03,-3.286542e-02,3.246252e-03
4,-1.422755e-02,-2.019195e-02,-1.493233e-03,-8.584617e-03,,-2.473331e-02,5.342324e-03,0.000000,-0.012598,-0.009553,...,-1.426833e-03,-0.013458,7.929644e-03,-4.476733e-03,-5.728415e-03,-0.005531,-1.140136e-02,5.100904e-02,-2.501668e-02,-6.312862e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.670486e-02,-6.916480e-03,0.000000e+00,9.427859e-03,-0.005531,-8.883301e-03,3.170470e-03,-0.002506,0.160729,0.010609,...,0.000000e+00,-0.006047,5.201508e-03,0.000000e+00,-1.190354e-06,,1.293848e-02,9.989829e-03,-9.012728e-03,-1.293309e-06
96,-1.794793e-07,-3.844863e-02,2.758295e-02,-2.550151e-07,-0.011401,-2.832606e-02,1.494333e-03,-0.004022,-0.021314,0.018212,...,2.028744e-03,-0.062206,1.831431e-03,5.238785e-02,1.311398e-02,0.012938,,2.203326e-03,-4.305230e-02,4.510610e-03
97,2.877441e-03,-4.588308e-07,1.674893e-03,1.588800e-03,0.051009,-6.026992e-07,-8.506117e-07,0.010575,0.002481,0.001794,...,1.590502e-03,0.027838,-1.471981e-06,6.974636e-03,1.066148e-02,0.009990,2.203326e-03,,-1.224709e-06,1.292604e-02
98,-5.299792e-02,-3.587981e-07,-2.798252e-03,-3.286542e-02,-0.025017,-3.954169e-07,-5.937863e-07,-0.007986,-0.203740,-0.036398,...,-2.680434e-03,-0.093961,-8.418646e-07,-7.552192e-03,-9.275158e-03,-0.009013,-4.305230e-02,-1.224709e-06,,-1.003327e-02


np.nanmean(norm_err)=np.float64(-0.0025660191377723314)
    np.nanmean(np.abs(norm_err))=np.float64(0.013892543236410189)
    np.nanmedian(norm_err)=np.float64(-2.296960453523091e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.003472987910472227)
    


In [7]:
# Load the "tail" genome dataset, then report the normalized distance error
# between naive and shortcut reconstructions for each downstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # Summary statistics skip NaN entries (e.g. the matrix diagonal).
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35128.17it/s]
100%|██████████| 100/100 [00:00<00:00, 421.16it/s]
5957it [00:00, 614452.18it/s]
100%|██████████| 100/100 [00:00<00:00, 332881.27it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32298.66it/s]
100%|██████████| 100/100 [00:00<00:00, 1029.59it/s]
5971it [00:00, 656655.63it/s]
100%|██████████| 100/100 [00:00<00:00, 393093.16it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.130060e-07,-2.134745e-07,-1.037568e-07,-1.037673e-07,-2.263031e-07,-2.130356e-07,-1.095655e-07,-1.066376e-07,-1.095825e-07,...,-1.038564e-07,-1.036911e-07,-2.072291e-07,-1.037671e-07,-2.129727e-07,-1.065124e-07,-1.035991e-07,-2.132916e-07,-1.039317e-07,-2.071955e-07
1,-2.130060e-07,,-2.396575e-07,-1.034882e-07,-1.034986e-07,-2.130605e-07,-2.550937e-07,-1.062234e-07,-1.094040e-07,-1.062394e-07,...,-1.035872e-07,-1.034228e-07,-2.066934e-07,-1.034984e-07,-2.184897e-07,-1.159130e-07,-1.033313e-07,-2.321419e-07,-1.036622e-07,-2.066599e-07
2,-2.134745e-07,-2.396575e-07,,-1.037093e-07,-1.037198e-07,-2.135292e-07,-2.396950e-07,-1.064565e-07,-1.096512e-07,-1.064725e-07,...,-1.038088e-07,-1.036437e-07,-2.071345e-07,-1.037197e-07,-2.189827e-07,-1.161905e-07,-1.035518e-07,-2.326985e-07,-1.038841e-07,-2.071008e-07
3,-1.037568e-07,-1.034882e-07,-1.037093e-07,,0.000000e+00,-1.037826e-07,-1.035021e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,-1.126586e-07,0.000000e+00,-1.034724e-07,0.000000e+00,0.000000e+00,-1.036230e-07,0.000000e+00,-1.094082e-07
4,-1.037673e-07,-1.034986e-07,-1.037198e-07,0.000000e+00,,-1.037931e-07,-1.035126e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,-1.094386e-07,0.000000e+00,-1.034829e-07,0.000000e+00,0.000000e+00,-1.036335e-07,0.000000e+00,-1.126511e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.065124e-07,-1.159130e-07,-1.161905e-07,0.000000e+00,0.000000e+00,-1.065396e-07,-1.159305e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,-1.033555e-07,0.000000e+00,-1.092547e-07,,0.000000e+00,-1.197254e-07,0.000000e+00,-1.033388e-07
96,-1.035991e-07,-1.033313e-07,-1.035518e-07,0.000000e+00,0.000000e+00,-1.036249e-07,-1.033452e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,-1.124727e-07,0.000000e+00,-1.033156e-07,0.000000e+00,,-1.034657e-07,0.000000e+00,-1.092329e-07
97,-2.132916e-07,-2.321419e-07,-2.326985e-07,-1.036230e-07,-1.036335e-07,-2.133462e-07,-2.321771e-07,-1.063655e-07,-1.095547e-07,-1.063815e-07,...,-1.037223e-07,-1.035574e-07,-2.069623e-07,-1.036333e-07,-2.187902e-07,-1.197254e-07,-1.034657e-07,,-1.037975e-07,-2.069287e-07
98,-1.039317e-07,-1.036622e-07,-1.038841e-07,0.000000e+00,0.000000e+00,-1.039576e-07,-1.036762e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,-1.096215e-07,0.000000e+00,-1.036464e-07,0.000000e+00,0.000000e+00,-1.037975e-07,,-1.238346e-07


np.nanmean(norm_err)=np.float64(-1.046389471040651e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.046389471040651e-07)
    np.nanmedian(norm_err)=np.float64(-1.0392665563346649e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0392665563346649e-07)
    
