In [1]:
# Load the watermark IPython extension so the %watermark magic used below is
# available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance next to the results: timestamp, interpreter and machine
# details, and the versions of the packages imported above.
%watermark -diwmuv -iv


Last updated: 2025-05-09T18:54:40.735998+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
numpy                             : 2.1.2
hstrat                            : 1.20.10
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Name of the output subdirectory for this analysis.
# NOTE(review): presumably consumed by teeplot when saving figures; no teeplot
# call is visible in this part of the notebook -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Load dendropy's phylogenetic distance matrix for a tree into a frame.

    `tree_df` is wrapped in `apc.RosettaTree`, converted to dendropy, and the
    resulting distance table is returned as a DataFrame with both its rows
    and its columns sorted by label, so that tables computed from different
    trees can be compared position-by-position.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy table and
    # may change between dendropy releases.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstructions.

    A random subset of genomes is deserialized into hstrat surfaces, the
    stored contents of each surface are shuffled, and a tree is then
    reconstructed twice from the same population: once with
    `hstrat.build_tree_trie` and once with `hstrat.build_tree_searchtable`.
    The pairwise distance matrices of the two trees are compared entrywise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column plus the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`, each of which must
        hold a single distinct value across the sampled rows. The frame
        passed in is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, e.g. `"dstream.tilted_algo"`.
        It is evaluated with only `dstream` in scope.
    sample_size : int, default 100
        Number of rows to sample; kept small because of what dendropy and
        the naive reconstruction can handle. Capped at the number of rows
        available.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / ((naive + shortcut) / 2)`.
        Entries where both distances are zero, including the diagonal, are
        NaN. Rows and columns follow the sorted order of the distance
        tables' labels.
        NOTE(review): the labels are presumably the stringified taxon
        indices, in which case the order is lexicographic, not numeric --
        confirm before mapping positions back to taxa.

    Raises
    ------
    ValueError
        If one of the dstream configuration columns holds more than one
        distinct value in the sample.

    Notes
    -----
    The shuffle uses NumPy's global random state, and `DataFrame.sample` is
    called without a `random_state`, so results differ from run to run
    unless the caller seeds NumPy beforehand.
    """
    # sample to size dendropy/naive can handle; `sample` returns a new frame,
    # so the caller's frame is left untouched
    sampled_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
    ).assign(
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    # Each configuration column must be constant within the sample;
    # `.item()` raises if it is not.
    surf_kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # SECURITY: `eval` executes arbitrary code; only pass trusted, hard-coded
    # algorithm names. Globals are restricted to the `dstream` module.
    surf_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        # ensure synthetic data: scramble the stored contents in place
        np.random.shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df).values
    shortcut_dist = _pairwise_distance_frame(shortcut_df).values

    # 0 / 0 on the diagonal (and wherever both distances are zero) is expected
    # and yields NaN; silence the RuntimeWarning it would otherwise emit.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Produce the same error report for each dstream algorithm: the full
# normalized-error matrix, then NaN-aware mean/median of the signed and
# absolute errors.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10704.12it/s]
100%|██████████| 100/100 [00:00<00:00, 396.74it/s]
6138it [00:00, 663675.54it/s]
100%|██████████| 100/100 [00:00<00:00, 253585.49it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.054899,0.000000,0.069442,-0.076363,-0.036360,-0.048205,-0.032134,-0.038893,-0.035669,...,0.000000,0.078485,-0.029252,-0.039764,0.0,0.0,-0.030164,-0.032296,-0.054242,0.054753
1,0.054899,,0.170598,-0.000002,0.395242,0.291491,0.332423,0.273469,0.067670,0.288690,...,0.106208,-0.000002,0.059183,0.068321,0.0,0.0,0.060102,0.062147,0.846121,0.037263
2,0.000000,0.170598,,0.140122,0.323957,0.149506,0.200034,0.131701,0.077027,0.146589,...,0.000000,0.158548,0.058073,0.078735,0.0,0.0,0.059870,0.064068,0.226144,0.000000
3,0.069442,-0.000002,0.140122,,0.102926,0.075084,0.085995,0.070309,0.351871,0.074340,...,0.285414,0.000000,0.296594,0.356285,0.0,0.0,0.302387,0.513676,0.090488,0.043437
4,-0.076363,0.395242,0.323957,0.102926,,0.000000,0.000000,-0.149896,-0.120206,-0.169487,...,0.134813,0.112533,-0.095808,-0.122276,0.0,0.0,-0.098239,-0.347526,0.000000,-0.215601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,,0.000000,0.000000,0.000000,0.000000
96,-0.030164,0.060102,0.059870,0.302387,-0.098239,-0.072564,-0.082703,-0.068096,0.000000,-0.071870,...,0.182418,0.322610,0.000000,0.000000,0.0,0.0,,0.152133,-0.086848,-0.835968
97,-0.032296,0.062147,0.064068,0.513676,-0.347526,-0.245211,-0.284500,-0.228334,0.177207,-0.242571,...,0.195423,0.552929,0.149200,0.179446,0.0,0.0,0.152133,,-0.791014,-0.131299
98,-0.054242,0.846121,0.226144,0.090488,0.000000,0.000000,0.000000,0.000000,-0.103582,0.000000,...,0.099127,0.097830,-0.084943,-0.105115,0.0,0.0,-0.086848,-0.791014,,-0.180888


np.nanmean(norm_err)=np.float64(0.010399561134308916)
    np.nanmean(np.abs(norm_err))=np.float64(0.15685331942055925)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.09090372276143507)
    


100%|██████████| 100/100 [00:00<00:00, 29240.83it/s]
100%|██████████| 100/100 [00:00<00:00, 429.43it/s]
5968it [00:00, 549444.80it/s]
100%|██████████| 100/100 [00:00<00:00, 282635.04it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.041050e-07,-1.629769e-07,-1.464194e-07,-2.721872e-07,-2.059373e-07,-2.910329e-07,-1.407874e-07,-4.201862e-07,-2.759711e-07,...,-1.941949e-07,-2.376854e-07,-5.339346e-07,-2.328852e-07,-4.692653e-07,-3.333856e-07,-1.964218e-07,-1.451023e-07,-2.670254e-07,-1.568010e-07
1,-2.041050e-07,,0.000000e+00,0.000000e+00,8.230406e-03,0.000000e+00,-2.232014e-07,0.000000e+00,-4.151206e-07,8.400470e-03,...,0.000000e+00,0.000000e+00,-5.371722e-07,0.000000e+00,-4.092757e-07,-2.772201e-07,0.000000e+00,9.061237e-03,0.000000e+00,0.000000e+00
2,-1.629769e-07,0.000000e+00,,0.000000e+00,-1.396870e-07,0.000000e+00,-1.496322e-07,0.000000e+00,-2.187814e-07,-1.416809e-07,...,0.000000e+00,0.000000e+00,-2.811552e-07,0.000000e+00,-2.455218e-07,-1.721160e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
3,-1.464194e-07,0.000000e+00,0.000000e+00,,1.309329e-02,0.000000e+00,-1.571990e-07,0.000000e+00,-2.304575e-07,1.328781e-02,...,0.000000e+00,0.000000e+00,-2.637231e-07,0.000000e+00,-2.286448e-07,-1.811229e-07,0.000000e+00,1.402296e-02,0.000000e+00,0.000000e+00
4,-2.721872e-07,8.230406e-03,-1.396870e-07,1.309329e-02,,1.164381e-02,5.681178e-02,7.995594e-03,-1.688340e-03,-2.928846e-07,...,-6.280118e-02,1.340993e-02,-4.640323e-07,-7.549711e-02,-4.088487e-07,9.449940e-03,1.111298e-02,7.123004e-02,-2.134453e-03,8.895316e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-3.333856e-07,-2.772201e-07,-1.721160e-07,-1.811229e-07,9.449940e-03,-2.845356e-07,-3.594863e-07,-1.735641e-07,-5.700346e-07,9.609489e-03,...,-2.565268e-07,-3.489312e-07,-6.753971e-07,-3.286528e-07,-5.644996e-07,,-2.666854e-07,1.021832e-02,-4.010063e-07,-2.299890e-07
96,-1.964218e-07,0.000000e+00,0.000000e+00,0.000000e+00,1.111298e-02,0.000000e+00,-2.147988e-07,0.000000e+00,-3.845289e-07,1.133428e-02,...,0.000000e+00,0.000000e+00,-4.870336e-07,0.000000e+00,-3.795086e-07,-2.666854e-07,,1.219097e-02,0.000000e+00,0.000000e+00
97,-1.451023e-07,9.061237e-03,0.000000e+00,1.402296e-02,7.123004e-02,1.283274e-02,6.091212e-02,8.538884e-03,-1.863276e-03,3.038401e-02,...,-9.868626e-02,1.501165e-02,-2.594806e-07,-2.194552e-01,-2.254490e-07,1.021832e-02,1.219097e-02,,-2.422048e-03,9.572915e-03
98,-2.670254e-07,0.000000e+00,0.000000e+00,0.000000e+00,-2.134453e-03,0.000000e+00,-2.970230e-07,0.000000e+00,-9.380454e-07,-2.191986e-03,...,0.000000e+00,0.000000e+00,-1.414178e-06,0.000000e+00,-7.758768e-07,-4.010063e-07,0.000000e+00,-2.422048e-03,,0.000000e+00


np.nanmean(norm_err)=np.float64(0.0011111696208202279)
    np.nanmean(np.abs(norm_err))=np.float64(0.008610601833873128)
    np.nanmedian(norm_err)=np.float64(-1.5700346126800715e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(3.4677248637412804e-07)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Produce the same error report for each dstream algorithm: the full
# normalized-error matrix, then NaN-aware mean/median of the signed and
# absolute errors.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34421.86it/s]
100%|██████████| 100/100 [00:00<00:00, 422.97it/s]
5950it [00:00, 647285.92it/s]
100%|██████████| 100/100 [00:00<00:00, 389082.00it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33508.86it/s]
100%|██████████| 100/100 [00:00<00:00, 1025.07it/s]
5951it [00:00, 619746.82it/s]
100%|██████████| 100/100 [00:00<00:00, 340723.31it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,-1.036969e-07,0.000000e+00,0.000000e+00,-1.067051e-07,-1.067827e-07,-1.129934e-07,-1.066213e-07,0.000000e+00,...,-1.037427e-07,0.000000e+00,-1.068662e-07,-1.038764e-07,-1.129077e-07,-1.067770e-07,0.000000e+00,0.000000e+00,-1.036987e-07,0.000000e+00
1,0.000000e+00,,-1.034560e-07,0.000000e+00,0.000000e+00,-1.064501e-07,-1.065273e-07,-1.094730e-07,-1.063667e-07,0.000000e+00,...,-1.035016e-07,0.000000e+00,-1.066104e-07,-1.036347e-07,-1.093926e-07,-1.065216e-07,0.000000e+00,0.000000e+00,-1.034578e-07,0.000000e+00
2,-1.036969e-07,-1.034560e-07,,-1.198182e-07,-1.093759e-07,-2.070313e-07,-2.071773e-07,-2.069726e-07,-2.068736e-07,-1.064007e-07,...,-2.126764e-07,-1.062675e-07,-2.073346e-07,-2.129573e-07,-2.068289e-07,-2.071665e-07,-1.033642e-07,-1.036210e-07,-2.186771e-07,-1.033886e-07
3,0.000000e+00,0.000000e+00,-1.198182e-07,,0.000000e+00,-1.036623e-07,-1.037355e-07,-1.036328e-07,-1.035832e-07,0.000000e+00,...,-1.064929e-07,0.000000e+00,-1.038143e-07,-1.066338e-07,-1.035608e-07,-1.037301e-07,0.000000e+00,0.000000e+00,-1.095022e-07,0.000000e+00
4,0.000000e+00,0.000000e+00,-1.093759e-07,0.000000e+00,,-1.035509e-07,-1.036240e-07,-1.035215e-07,-1.034720e-07,0.000000e+00,...,-1.063754e-07,0.000000e+00,-1.037027e-07,-1.065160e-07,-1.034497e-07,-1.036186e-07,0.000000e+00,0.000000e+00,-1.321048e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.067770e-07,-1.065216e-07,-2.071665e-07,-1.037301e-07,-1.036186e-07,-2.326725e-07,-2.194607e-07,-2.131072e-07,-2.191199e-07,-1.036883e-07,...,-2.072579e-07,-1.035618e-07,-2.403991e-07,-2.075247e-07,-2.129549e-07,,-1.127132e-07,-1.097666e-07,-2.071701e-07,-1.064500e-07
96,0.000000e+00,0.000000e+00,-1.033642e-07,0.000000e+00,0.000000e+00,-1.126332e-07,-1.094845e-07,-1.063218e-07,-1.093148e-07,0.000000e+00,...,-1.034097e-07,0.000000e+00,-1.128127e-07,-1.035425e-07,-1.062460e-07,-1.127132e-07,,0.000000e+00,-1.033660e-07,0.000000e+00
97,0.000000e+00,0.000000e+00,-1.036210e-07,0.000000e+00,0.000000e+00,-1.096907e-07,-1.130251e-07,-1.065936e-07,-1.128443e-07,0.000000e+00,...,-1.036667e-07,0.000000e+00,-1.098610e-07,-1.038002e-07,-1.065173e-07,-1.097666e-07,0.000000e+00,,-1.036228e-07,0.000000e+00
98,-1.036987e-07,-1.034578e-07,-2.186771e-07,-1.095022e-07,-1.321048e-07,-2.070349e-07,-2.071810e-07,-2.069762e-07,-2.068772e-07,-1.064026e-07,...,-2.126802e-07,-1.062694e-07,-2.073382e-07,-2.129611e-07,-2.068325e-07,-2.071701e-07,-1.033660e-07,-1.036228e-07,,-1.033904e-07


np.nanmean(norm_err)=np.float64(-1.2616690529408356e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.2616690529408356e-07)
    np.nanmedian(norm_err)=np.float64(-1.0662766085003218e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0662766085003218e-07)
    
