In [1]:
# Load the `watermark` IPython extension so the `%watermark` magic used in a
# later cell is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance for this run: per the captured output below, this prints a
# timestamp, interpreter/IPython versions, machine details, the versions of
# the imported packages, and the watermark version itself.
%watermark -diwmuv -iv


Last updated: 2025-08-10T00:25:00.402760+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
numpy                             : 2.1.2
pandas                            : 2.2.3
hstrat                            : 1.20.10
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Output-subdirectory label for this notebook; presumably consumed by teeplot
# when saving figures, but no such call appears in the cells shown here
# -- TODO confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_matrix(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build a label-sorted pairwise distance table for a reconstructed tree.

    The tree dataframe is converted with ``apc.RosettaTree(...).as_dendropy``
    and dendropy's phylogenetic distance matrix is loaded into a DataFrame
    whose rows and columns are both sorted by label.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # it may change between dendropy releases.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_samples: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled in place, and two trees are built from the same
    population: one with ``hstrat.build_tree_trie`` (the "naive" tree) and one
    with ``hstrat.build_tree_searchtable`` (the "shortcut" tree).  The
    pairwise distance matrices of the two trees are then compared
    element-wise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records.  Must provide the columns ``data_hex``, ``dstream_S``,
        ``dstream_storage_bitoffset``, ``dstream_storage_bitwidth``,
        ``dstream_T_bitoffset`` and ``dstream_T_bitwidth``; each of the
        ``dstream_*`` columns must hold a single distinct value within the
        sampled rows.  Any ``dstream_algo`` column present in the frame is
        ignored in favour of the ``dstream_algo`` argument.  The frame itself
        is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with only
        ``dstream`` bound (e.g. ``"dstream.tilted_algo"``).
    n_samples : int, default 100
        Number of genomes to draw; the default keeps the problem at a size
        dendropy and the naive reconstruction can handle.
    seed : int, optional
        Seed for a dedicated NumPy random generator driving both the row
        sampling and the storage shuffling.  When omitted, the global NumPy
        random state is used.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / ((naive + shortcut) / 2)``
        over pairwise distances.  Entries whose denominator is zero, such as
        the diagonal, are NaN.

    Raises
    ------
    ValueError
        If one of the ``dstream_*`` configuration columns does not hold
        exactly one distinct value among the sampled rows.
    """
    rng = None if seed is None else np.random.default_rng(seed)

    # `sample` and `assign` both return new frames, so the caller's dataframe
    # is never mutated.
    sampled_df = raw_genome_df.sample(n_samples, random_state=rng).assign(
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    # NOTE(review): `eval` executes arbitrary code; only pass trusted,
    # hard-coded algorithm names here.
    surface_kwargs = {
        "dstream_algo": eval(str(dstream_algo), {"dstream": dstream}),
    }
    for column in (
        "dstream_S",
        "dstream_storage_bitoffset",
        "dstream_storage_bitwidth",
        "dstream_T_bitoffset",
        "dstream_T_bitwidth",
    ):
        # `.item()` doubles as a check that the configuration is homogeneous
        surface_kwargs[column] = sampled_df[column].unique().item()

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    # Both tables are sorted by label, so entries are compared positionally;
    # this assumes both trees carry the same set of taxon labels.
    naive_dist = _pairwise_distance_matrix(naive_df).values
    shortcut_dist = _pairwise_distance_matrix(shortcut_df).values

    # 0 / 0 (e.g. on the diagonal) is expected and deliberately left as NaN
    # for the nan-aware summaries downstream; silence the RuntimeWarning
    # instead of letting it clutter the cell output.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
# Genome records from the dataset the original author labelled "sample".
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")

# Run the same comparison once per dstream algorithm and summarize each.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The f-string continuation lines stay at four spaces on purpose: that
    # whitespace is part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10645.44it/s]
100%|██████████| 100/100 [00:00<00:00, 367.71it/s]
6129it [00:00, 655670.91it/s]
100%|██████████| 100/100 [00:00<00:00, 261327.35it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.102618,0.000000,0.000000,0.130140,-0.045962,-0.061252,-0.539281,0.130075,0.340156,...,-0.049021,0.000000,0.000000,-0.059170,-0.047247,-0.035647,-0.038370,0.000000,-0.174565,0.108736
1,0.102618,,0.132209,0.106989,0.082135,-0.045998,-0.061315,0.000000,0.082109,0.081230,...,-0.049062,0.000000,-0.090599,-0.059229,-0.047285,-0.035669,-0.038395,-0.200942,-0.382787,-1.435558
2,0.000000,0.132209,,0.000000,0.181739,-0.057495,-0.083600,-0.685976,0.181611,0.451932,...,-0.062363,0.000000,0.000000,-0.079769,-0.059521,-0.042215,-0.046088,0.000000,-0.205930,0.142542
3,0.000000,0.106989,0.000000,,-0.137261,-0.047709,-0.064394,-0.436866,-0.137188,0.224186,...,-0.051013,0.000000,0.000000,-0.062097,-0.049095,-0.036689,-0.039580,0.000000,-0.179557,0.113656
4,0.130140,0.082135,0.181739,-0.137261,,-0.037566,-0.047195,-0.342755,0.000000,0.266680,...,-0.039585,0.000000,0.000000,-0.045949,-0.038420,-0.030381,-0.032336,0.000000,-0.149238,0.086009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.035647,-0.035669,-0.042215,-0.036689,-0.030381,0.000000,0.000000,-0.029899,-0.030374,-0.030132,...,0.000000,0.123270,-0.052367,0.000000,0.000000,,-0.680194,-0.041685,0.030607,-0.037122
96,-0.038370,-0.038395,-0.046088,-0.039580,-0.032336,0.151597,0.208898,-0.031791,-0.032328,-0.032055,...,-0.502120,0.140502,-0.058460,0.129568,0.156268,-0.680194,,-0.045456,0.633349,-0.040083
97,0.000000,-0.200942,0.000000,0.000000,0.000000,-0.056516,-0.081546,0.519138,0.000000,0.000000,...,-0.061213,0.000000,0.000000,-0.077897,-0.058472,-0.041685,-0.045456,,-0.346608,-0.216872
98,-0.174565,-0.382787,-0.205930,-0.179557,-0.149238,0.160772,0.205638,-0.146912,-0.149204,-0.148038,...,0.039963,0.106234,-0.312674,0.146153,0.164688,0.030607,0.633349,-0.346608,,-0.399560


np.nanmean(norm_err)=np.float64(-0.018587581854332815)
    np.nanmean(np.abs(norm_err))=np.float64(0.10750391235707761)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.04682175006573493)
    


100%|██████████| 100/100 [00:00<00:00, 33857.80it/s]
100%|██████████| 100/100 [00:00<00:00, 419.98it/s]
5970it [00:00, 630460.38it/s]
100%|██████████| 100/100 [00:00<00:00, 292489.82it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,2.169248e-02,6.340567e-03,-1.891176e-07,-6.283224e-03,-7.973736e-02,-4.414599e-03,0.000000e+00,4.771099e-03,1.890404e-02,...,-3.045194e-07,1.532246e-02,1.089850e-02,0.000000e+00,1.639951e-02,2.871816e-02,4.593526e-03,-2.755694e-07,-2.338433e-07,-4.005765e-03
1,2.169248e-02,,0.000000e+00,3.203816e-03,0.000000e+00,0.000000e+00,0.000000e+00,2.016911e-02,-1.331726e-07,0.000000e+00,...,2.055526e-02,0.000000e+00,-1.315199e-07,4.696399e-03,0.000000e+00,2.138490e-02,-1.297325e-07,4.087878e-03,3.690386e-03,0.000000e+00
2,6.340567e-03,0.000000e+00,,9.858911e-03,0.000000e+00,0.000000e+00,0.000000e+00,5.710244e-03,-1.773524e-07,0.000000e+00,...,5.866076e-03,0.000000e+00,-1.724617e-07,1.697187e-02,0.000000e+00,-2.062537e-07,2.350853e-02,2.979079e-02,3.975436e-02,0.000000e+00
3,-1.891176e-07,3.203816e-03,9.858911e-03,,-4.413939e-02,-4.021789e-03,-1.202955e-02,-1.749608e-07,2.681807e-02,4.908558e-03,...,-3.570751e-07,4.299624e-03,3.396692e-03,-2.637782e-07,4.492830e-03,3.910134e-03,7.865532e-03,-5.918954e-03,-5.299186e-03,-1.118146e-02
4,-6.283224e-03,0.000000e+00,0.000000e+00,-4.413939e-02,,0.000000e+00,0.000000e+00,-5.663693e-03,-1.744832e-07,0.000000e+00,...,-5.817432e-03,0.000000e+00,-1.706969e-07,-7.658758e-02,0.000000e+00,-2.037345e-07,-1.686248e-07,-2.157626e-02,-1.868012e-02,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2.871816e-02,2.138490e-02,-2.062537e-07,3.910134e-03,-2.037345e-07,-1.941299e-07,-1.523470e-07,2.610758e-02,-3.266261e-07,-2.506613e-07,...,2.675834e-02,-2.128339e-07,-3.216691e-07,6.387928e-03,-2.245823e-07,,-3.163393e-07,5.312343e-03,4.660037e-03,-1.402104e-07
96,4.593526e-03,-1.297325e-07,2.350853e-02,7.865532e-03,-1.686248e-07,-1.619914e-07,-1.318228e-07,4.253371e-03,-2.811983e-07,-1.982478e-07,...,4.339190e-03,-1.738149e-07,-2.750156e-07,1.181673e-02,-1.815720e-07,-3.163393e-07,,2.194776e-02,3.032378e-02,-1.226374e-07
97,-2.755694e-07,4.087878e-03,2.979079e-02,-5.918954e-03,-2.157626e-02,-5.520318e-03,-4.416432e-02,-2.465058e-07,1.055573e-02,7.340864e-03,...,-5.073311e-07,6.057801e-03,4.407243e-03,-1.057455e-02,6.448502e-03,5.312343e-03,2.194776e-02,,-4.491223e-02,-6.157619e-02
98,-2.338433e-07,3.690386e-03,3.975436e-02,-5.299186e-03,-1.868012e-02,-4.819382e-03,-3.969441e-02,-2.125752e-07,9.431518e-03,6.151106e-03,...,-4.357567e-07,5.223977e-03,3.948684e-03,-8.746856e-03,5.511968e-03,4.660037e-03,3.032378e-02,-4.491223e-02,,-4.608164e-02


np.nanmean(norm_err)=np.float64(-0.0005838026290135183)
    np.nanmean(np.abs(norm_err))=np.float64(0.011822394522846927)
    np.nanmedian(norm_err)=np.float64(-1.6729109778180005e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0028624509620355154)
    


In [7]:
# Genome records from the dataset the original author labelled "tail".
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")

# Run the same comparison once per dstream algorithm and summarize each.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The f-string continuation lines stay at four spaces on purpose: that
    # whitespace is part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35605.30it/s]
100%|██████████| 100/100 [00:00<00:00, 449.91it/s]
5951it [00:00, 666585.74it/s]
100%|██████████| 100/100 [00:00<00:00, 383391.59it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33188.04it/s]
100%|██████████| 100/100 [00:00<00:00, 1018.12it/s]
5952it [00:00, 47056.48it/s]
100%|██████████| 100/100 [00:00<00:00, 412419.27it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.038411e-07,-2.073229e-07,-1.038650e-07,-1.068113e-07,-1.036674e-07,-1.038979e-07,-1.037451e-07,-1.038727e-07,-1.065425e-07,...,-2.132965e-07,-2.130182e-07,-1.067995e-07,-1.037940e-07,-1.066614e-07,-1.129963e-07,-2.136017e-07,-2.192399e-07,-2.075060e-07,-1.163649e-07
1,-1.038411e-07,,-1.238053e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.037154e-07,-1.035838e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.038597e-07,-1.036299e-07,-1.066166e-07,0.000000e+00
2,-2.073229e-07,-1.238053e-07,,-1.065450e-07,-1.036898e-07,-1.093863e-07,-1.096430e-07,-1.127073e-07,-1.096149e-07,-1.034364e-07,...,-2.070723e-07,-2.068100e-07,-1.036787e-07,-1.095273e-07,-1.035485e-07,-1.035628e-07,-2.073599e-07,-2.069020e-07,-2.128544e-07,-1.034989e-07
3,-1.038650e-07,0.000000e+00,-1.065450e-07,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.037392e-07,-1.036075e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.038835e-07,-1.036537e-07,-1.129573e-07,0.000000e+00
4,-1.068113e-07,0.000000e+00,-1.036898e-07,0.000000e+00,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.097474e-07,-1.096000e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.131696e-07,-1.065879e-07,-1.037814e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.129963e-07,0.000000e+00,-1.035628e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.065438e-07,-1.064049e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.066961e-07,-1.095096e-07,-1.036541e-07,0.000000e+00
96,-2.136017e-07,-1.038597e-07,-2.073599e-07,-1.038835e-07,-1.131696e-07,-1.036859e-07,-1.039164e-07,-1.037636e-07,-1.038913e-07,-1.199669e-07,...,-2.194726e-07,-2.191780e-07,-1.131564e-07,-1.038125e-07,-1.201177e-07,-1.066961e-07,,-2.131549e-07,-2.075430e-07,-1.066283e-07
97,-2.192399e-07,-1.036299e-07,-2.069020e-07,-1.036537e-07,-1.065879e-07,-1.034569e-07,-1.036864e-07,-1.035343e-07,-1.036614e-07,-1.063202e-07,...,-2.128510e-07,-2.125739e-07,-1.065761e-07,-1.035830e-07,-1.064385e-07,-1.095096e-07,-2.131549e-07,,-2.070843e-07,-1.094383e-07
98,-2.075060e-07,-1.066166e-07,-2.128544e-07,-1.129573e-07,-1.037814e-07,-1.064335e-07,-1.066764e-07,-1.065154e-07,-1.066499e-07,-1.035276e-07,...,-2.072549e-07,-2.069922e-07,-1.037703e-07,-1.065669e-07,-1.036398e-07,-1.036541e-07,-2.075430e-07,-2.070843e-07,,-1.035902e-07


np.nanmean(norm_err)=np.float64(-1.0022920138577732e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0022920138577732e-07)
    np.nanmedian(norm_err)=np.float64(-1.0381386713491134e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0381386713491134e-07)
    
