In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-07-20T00:27:25.483274+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1030-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

numpy                             : 2.1.2
hstrat                            : 1.20.10
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3

Watermark: 2.4.3



In [4]:
# Subdirectory label for this notebook's outputs.
# NOTE(review): presumably consumed by teeplot when saving figures; it is not
# referenced by any of the visible cells -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare trailing expression so the cell echoes the value


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: int | None = None,
) -> np.ndarray:
    """Compare pairwise distances from two hstrat tree-reconstruction routines.

    A random subsample of genomes is deserialized with
    ``hstrat.surf_from_hex``, each surface's storage is shuffled in place, and
    a tree is rebuilt from the same population twice: once with
    ``hstrat.build_tree_trie`` and once with
    ``hstrat.build_tree_searchtable``. The pairwise phylogenetic distance
    matrices of the two trees are then compared elementwise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns ``data_hex``, ``dstream_S``,
        ``dstream_storage_bitoffset``, ``dstream_storage_bitwidth``,
        ``dstream_T_bitoffset`` and ``dstream_T_bitwidth``; each ``dstream_*``
        column must hold exactly one distinct value among the sampled rows.
        The caller's frame is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with the
        ``dstream`` module in scope (e.g. ``"dstream.tilted_algo"``).
    sample_size : int, default 100
        Maximum number of genomes to compare; the default keeps the problem
        at a size dendropy and the naive builder can handle. Capped at the
        number of available rows.
    seed : int, optional
        Seed for row sampling and storage shuffling. ``None`` (the default)
        draws from numpy's global random state.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / ((naive + shortcut) / 2)``
        with rows and columns ordered by sorted taxon label. Entries where
        both distances are zero (notably the diagonal) are NaN.

    Raises
    ------
    ValueError
        If there are no genomes to sample, if a ``dstream_*`` column holds
        more than one distinct value, or if the two distance tables do not
        cover the same taxon labels.
    """
    n_sampled = min(sample_size, len(raw_genome_df))
    if n_sampled <= 0:
        raise ValueError("no genomes available to sample")

    # `sample` returns a new frame, so the label column added below never
    # leaks back into the caller's data.
    sampled_df = raw_genome_df.sample(n_sampled, random_state=seed)
    sampled_df["taxon_label"] = np.arange(len(sampled_df)).astype(str)

    def sole_value(column: str):
        """Return the single distinct value of `column` (`.item()` enforces it)."""
        return sampled_df[column].unique().item()

    # NOTE(review): `eval` runs arbitrary code; the restricted globals are not
    # a sandbox. Only pass trusted, hard-coded algorithm names.
    surface_kwargs = dict(
        dstream_algo=eval(str(dstream_algo), {"dstream": dstream}),
        dstream_S=sole_value("dstream_S"),
        dstream_storage_bitoffset=sole_value("dstream_storage_bitoffset"),
        dstream_storage_bitwidth=sole_value("dstream_storage_bitwidth"),
        dstream_T_bitoffset=sole_value("dstream_T_bitoffset"),
        dstream_T_bitwidth=sole_value("dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]

    # Scramble each surface's stored contents in place to ensure synthetic
    # data. NOTE(review): this reaches into private hstrat attributes.
    shuffle = (
        np.random.shuffle
        if seed is None
        else np.random.default_rng(seed).shuffle
    )
    for surface in population:
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    def distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
        """Return the tree's pairwise distance table, sorted on both axes."""
        table = (
            apc.RosettaTree(tree_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
        )
        return pd.DataFrame(table._data).sort_index(axis=0).sort_index(axis=1)

    naive_dist = distance_frame(naive_df)
    shortcut_dist = distance_frame(shortcut_df)

    # The arithmetic below is positional, so both tables must be labelled
    # identically for the comparison to be meaningful.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance matrices cover different taxa"
        )

    naive, shortcut = naive_dist.values, shortcut_dist.values
    # 0/0 on the diagonal is expected and handled downstream with the
    # nan-aware reducers; silence the resulting RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Genome records for the "sample" dataset, downloaded from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same consistency report once per dstream algorithm instead of
# duplicating the call/display/print sequence.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The `=` specifier echoes each expression's source text, so the name
    # `norm_err` is part of the printed output. nan-aware reducers skip the
    # NaN diagonal.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 12129.98it/s]
100%|██████████| 100/100 [00:00<00:00, 403.11it/s]
6135it [00:00, 644187.13it/s]
100%|██████████| 100/100 [00:00<00:00, 146551.50it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
1,0.0,,0.000000,-0.115722,0.145049,-0.146879,0.0,0.132659,0.147280,0.000000,...,0.242889,0.0,0.287434,0.0,0.228246,-0.600205,0.229019,0.0,0.269145,0.177201
2,0.0,0.000000,,-0.130272,0.159980,-0.162209,0.0,0.145039,0.162698,0.000000,...,0.268021,0.0,0.312179,0.0,0.250302,-0.670864,0.251232,0.0,0.290722,0.190213
3,0.0,-0.115722,-0.130272,,0.000000,0.000000,0.0,0.000000,0.000000,0.128614,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
4,0.0,0.145049,0.159980,0.000000,,0.000000,0.0,0.000000,0.000000,-0.102826,...,0.115718,0.0,0.074574,0.0,0.100378,0.000000,0.101129,0.0,0.066382,0.061452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,-0.600205,-0.670864,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.662862,...,0.000000,0.0,0.000000,0.0,0.000000,,0.000000,0.0,0.000000,0.000000
96,0.0,0.229019,0.251232,0.000000,0.101129,0.000000,0.0,0.084602,0.104439,-0.248758,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,,0.0,-0.512209,0.000000
97,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,,0.000000,0.000000
98,0.0,0.269145,0.290722,0.000000,0.066382,0.000000,0.0,0.058837,0.067792,-0.197817,...,-0.573213,0.0,0.246379,0.0,-0.145040,0.000000,-0.512209,0.0,,-0.099501


np.nanmean(norm_err)=np.float64(0.000738834696058258)
    np.nanmean(np.abs(norm_err))=np.float64(0.11222127304756163)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 29137.23it/s]
100%|██████████| 100/100 [00:00<00:00, 428.33it/s]
5960it [00:00, 559065.43it/s]
100%|██████████| 100/100 [00:00<00:00, 271125.02it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,8.604632e-03,0.000000,-1.624354e-07,2.326894e-03,0.000000e+00,-1.500533e-07,1.496870e-03,1.097431e-02,3.475573e-02,...,-1.005420e-02,-1.761252e-02,-1.517261e-07,-1.397962e-07,-1.545205e-02,3.083273e-02,1.548855e-02,2.218055e-03,5.446068e-03,4.148486e-03
1,8.604632e-03,,-0.050852,-4.549941e-03,-2.225947e-07,2.424209e-02,2.563425e-02,-1.559343e-07,0.000000e+00,0.000000e+00,...,-1.320825e-07,0.000000e+00,-4.290057e-03,-3.994719e-03,0.000000e+00,-4.415223e-03,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
2,0.000000e+00,-5.085181e-02,,2.556539e-03,-2.146039e-03,0.000000e+00,-1.423992e-07,-1.420013e-03,-6.455955e-02,-1.088000e-02,...,7.202069e-03,1.221075e-02,2.394194e-03,2.212350e-03,1.081234e-02,-1.462160e-02,-9.032884e-02,-2.052655e-03,-5.192360e-03,-3.999621e-03
3,-1.624354e-07,-4.549941e-03,0.002557,,-2.670360e-03,-1.486866e-07,-3.181999e-07,-1.632130e-03,-5.899915e-03,-6.168832e-03,...,4.394232e-02,2.703266e-01,-3.923082e-07,-3.533270e-07,3.486505e-02,-5.430741e-03,-8.600520e-03,-2.527380e-03,-3.594323e-02,-1.775840e-02
4,2.326894e-03,-2.225947e-07,-0.002146,-2.670360e-03,,1.998425e-03,2.243019e-03,4.523894e-02,-3.507314e-07,-3.831361e-07,...,-5.874152e-07,-1.631315e-06,-2.334376e-03,-2.007448e-03,-8.832233e-07,-6.007321e-07,-8.975206e-07,-2.118579e-06,-3.487183e-07,-2.108801e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3.083273e-02,-4.415223e-03,-0.014622,-5.430741e-03,-6.007321e-07,9.333839e-03,9.959361e-03,-3.809699e-07,-5.668851e-03,1.387584e-01,...,-3.121183e-07,-2.765424e-07,-5.064559e-03,-4.658022e-03,-2.418206e-07,,-8.104053e-03,-2.857412e-07,-1.710528e-07,-1.295247e-07
96,1.548855e-02,0.000000e+00,-0.090329,-8.600520e-03,-8.975206e-07,4.156940e-02,4.583832e-02,-3.295253e-07,0.000000e+00,0.000000e+00,...,-2.385077e-07,0.000000e+00,-7.716879e-03,-6.811087e-03,0.000000e+00,-8.104053e-03,,0.000000e+00,0.000000e+00,0.000000e+00
97,2.218055e-03,0.000000e+00,-0.002053,-2.527380e-03,-2.118579e-06,1.917643e-03,2.141711e-03,-1.007005e-01,0.000000e+00,0.000000e+00,...,-2.797088e-07,0.000000e+00,-2.224339e-03,-1.925500e-03,0.000000e+00,-2.857412e-07,0.000000e+00,,0.000000e+00,0.000000e+00
98,5.446068e-03,0.000000e+00,-0.005192,-3.594323e-02,-3.487183e-07,4.973603e-03,5.331084e-03,-2.088497e-07,0.000000e+00,0.000000e+00,...,-1.681746e-07,0.000000e+00,-3.329642e-02,-3.039891e-02,0.000000e+00,-1.710528e-07,0.000000e+00,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(0.0010606978882936333)
    np.nanmean(np.abs(norm_err))=np.float64(0.011225852105255512)
    np.nanmedian(norm_err)=np.float64(-1.626634821589743e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(6.985058787615333e-07)
    


In [7]:
# Genome records for the "tail" dataset, downloaded from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same consistency report once per dstream algorithm instead of
# duplicating the call/display/print sequence.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The `=` specifier echoes each expression's source text, so the name
    # `norm_err` is part of the printed output. nan-aware reducers skip the
    # NaN diagonal.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34166.70it/s]
100%|██████████| 100/100 [00:00<00:00, 437.19it/s]
5950it [00:00, 612645.36it/s]
100%|██████████| 100/100 [00:00<00:00, 365676.02it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 27340.49it/s]
100%|██████████| 100/100 [00:00<00:00, 1008.09it/s]
5950it [00:00, 629552.96it/s]
100%|██████████| 100/100 [00:00<00:00, 367598.95it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,-1.065728e-07,-1.036011e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.063777e-07,-1.034198e-07,-1.035238e-07,-1.034355e-07,-1.063821e-07,0.000000e+00,-1.034466e-07,-1.063770e-07,-1.034487e-07,0.000000e+00
1,0.000000e+00,,-1.066266e-07,-1.036520e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.064314e-07,-1.034705e-07,-1.035746e-07,-1.034863e-07,-1.064358e-07,0.000000e+00,-1.034973e-07,-1.064306e-07,-1.034995e-07,0.000000e+00
2,-1.065728e-07,-1.066266e-07,,-2.078120e-07,-1.068119e-07,-1.037514e-07,-1.068632e-07,-1.038194e-07,-1.039458e-07,-1.098327e-07,...,-2.402817e-07,-2.074474e-07,-2.076565e-07,-2.074791e-07,-2.260536e-07,-1.098801e-07,-2.075012e-07,-2.260420e-07,-2.075056e-07,-1.038747e-07
3,-1.036011e-07,-1.036520e-07,-2.078120e-07,,-1.038270e-07,-1.065694e-07,-1.038755e-07,-1.066412e-07,-1.165624e-07,-1.037770e-07,...,-2.074411e-07,-2.130805e-07,-2.194361e-07,-2.131139e-07,-2.074496e-07,-1.038193e-07,-2.192627e-07,-2.074398e-07,-2.131419e-07,-1.164731e-07
4,0.000000e+00,0.000000e+00,-1.068119e-07,-1.038270e-07,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.066159e-07,-1.036449e-07,-1.037494e-07,-1.036608e-07,-1.066204e-07,0.000000e+00,-1.036718e-07,-1.066152e-07,-1.036740e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,0.000000e+00,-1.098801e-07,-1.038193e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.096727e-07,-1.036372e-07,-1.037416e-07,-1.036531e-07,-1.096775e-07,,-1.036641e-07,-1.096720e-07,-1.036663e-07,0.000000e+00
96,-1.034466e-07,-1.034973e-07,-2.075012e-07,-2.192627e-07,-1.036718e-07,-1.064060e-07,-1.037202e-07,-1.064776e-07,-1.096756e-07,-1.036220e-07,...,-2.071314e-07,-2.127537e-07,-2.255671e-07,-2.127870e-07,-2.071398e-07,-1.036641e-07,,-2.071301e-07,-2.128150e-07,-1.095965e-07
97,-1.063770e-07,-1.064306e-07,-2.260420e-07,-2.074398e-07,-1.066152e-07,-1.035658e-07,-1.066663e-07,-1.036336e-07,-1.037595e-07,-1.096248e-07,...,-2.256032e-07,-2.070764e-07,-2.072848e-07,-2.071080e-07,-2.397954e-07,-1.096720e-07,-2.071301e-07,,-2.071344e-07,-1.036887e-07
98,-1.034487e-07,-1.034995e-07,-2.075056e-07,-2.131419e-07,-1.036740e-07,-1.094616e-07,-1.037224e-07,-1.198628e-07,-1.066128e-07,-1.036242e-07,...,-2.071358e-07,-2.188617e-07,-2.129783e-07,-2.188969e-07,-2.071442e-07,-1.036663e-07,-2.128150e-07,-2.071344e-07,,-1.065380e-07


np.nanmean(norm_err)=np.float64(-1.0240732065731298e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0240732065731298e-07)
    np.nanmedian(norm_err)=np.float64(-1.0381719743320545e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0381719743320545e-07)
    
