In [1]:
# Load the watermark IPython extension; it provides the %watermark magic that
# a later cell uses to record the execution environment.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record the run environment for provenance: last-updated timestamp,
# Python/IPython versions, machine details, versions of the imported packages,
# and the watermark version itself (see the output below).
%watermark -diwmuv -iv


Last updated: 2025-10-05T00:23:30.553919+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
numpy                             : 2.1.2
hstrat                            : 1.20.10
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Subdirectory label for this notebook's outputs — presumably consumed by
# teeplot when saving figures (TODO confirm; no cell in view references it).
teeplot_subdir = "2025-05-09-reconstruction-consistency"
# Bare trailing expression echoes the value as the cell output.
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _tip_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return a tree's pairwise taxon distance matrix, sorted by label.

    The tree dataframe is converted to dendropy through `apc.RosettaTree`,
    and its phylogenetic distance matrix is loaded into a DataFrame. Rows and
    columns are both sorted by label so that matrices produced from different
    reconstructions of the same taxa line up elementwise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # this may break on dendropy upgrades.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare trie and searchtable tree reconstructions on sampled genomes.

    A random subset of genomes is deserialized into hstrat surfaces, each
    surface's storage is randomly permuted in place (so reconstruction runs on
    synthetic data), and a tree is built from the same population with both
    `hstrat.build_tree_trie` (the "naive" reference) and
    `hstrat.build_tree_searchtable` (the "shortcut"). The pairwise taxon
    distances of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column and the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`, each holding a single
        unique value across the sampled rows. The input frame is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with only
        `dstream` in its globals (e.g. `"dstream.tilted_algo"`). Any
        `dstream_algo` column in `raw_genome_df` is ignored in favor of this
        argument.
    sample_size : int, default 100
        Maximum number of genomes to sample; kept small so the dendropy
        conversion and the naive reconstruction stay tractable. Capped at
        `len(raw_genome_df)` so small inputs do not raise.
    seed : int, optional
        Seed for the row sampling and the storage shuffling. With the default
        `None`, NumPy's global random state is used, as before.

    Returns
    -------
    np.ndarray
        Square matrix of `(naive - shortcut) / ((naive + shortcut) / 2)` over
        all taxon pairs, with rows and columns ordered by sorted taxon label.
        Entries where both distances are zero, such as the diagonal, are NaN.

    Raises
    ------
    ValueError
        If any of the `dstream_*` parameter columns does not hold exactly one
        unique value among the sampled rows.
    """
    rng = None if seed is None else np.random.RandomState(seed)

    # sample to size dendropy/naive can handle
    sampled_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=rng,
    )
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    # `.unique().item()` doubles as validation: it raises unless the column
    # holds exactly one distinct value.
    kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # SECURITY NOTE: `eval` executes arbitrary expressions; only pass trusted,
    # hard-coded algorithm names here (builtins remain reachable even though
    # the globals are limited to `dstream`).
    kwargs["dstream_algo"] = eval(str(dstream_algo), {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    # NOTE(review): only the trie build receives an explicit bias_adjustment;
    # presumably the searchtable build applies an equivalent adjustment
    # internally — confirm.
    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _tip_distance_frame(naive_df).values
    shortcut_dist = _tip_distance_frame(shortcut_df).values

    # The diagonal (and any pair at distance zero in both trees) is 0/0; the
    # resulting NaN is intended and skipped by the nan-aware statistics
    # downstream, so silence the RuntimeWarning rather than emit it per call.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
# Sampled-genome dataset: for each dstream algorithm, show the normalized
# pairwise-distance error matrix and its nan-aware summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_expr in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_expr)
    display(pd.DataFrame(norm_err))
    # The f-string text (including its indentation) is part of the printed
    # output, so it is kept verbatim.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10369.11it/s]
100%|██████████| 100/100 [00:00<00:00, 396.72it/s]
6130it [00:00, 674654.51it/s]
100%|██████████| 100/100 [00:00<00:00, 228697.06it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.132860,0.00000,1.017950,0.000000,0.000000,0.000000,0.000000,0.083888,0.000000,...,0.0,0.725375,-0.201496,0.000000,0.000000,0.000000,0.000000,0.355516,0.000000,0.000000
1,0.132860,,0.00000,-0.730498,0.851599,0.000000,0.000000,0.000000,0.000000,0.345980,...,0.0,-0.327550,-0.075052,0.384313,-0.624198,0.255695,0.000000,0.213066,0.000000,0.158210
2,0.000000,0.000000,,0.000000,0.000000,-0.290483,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.094231,0.000000,0.000000,0.000000,0.000000,-0.510670,0.000000,0.000000
3,1.017950,-0.730498,0.00000,,0.116349,0.000000,0.000000,0.000000,-0.258329,0.134910,...,0.0,0.215601,-0.062641,0.118219,-0.452271,0.087690,0.000000,0.254648,0.000000,0.452959
4,0.000000,0.851599,0.00000,0.116349,,0.000000,0.000000,0.000000,0.590911,0.000000,...,0.0,0.096804,-0.419681,0.000000,0.000000,0.000000,0.000000,0.269390,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.255695,0.00000,0.087690,0.000000,0.000000,0.000000,0.000000,0.170538,0.000000,...,0.0,0.076109,-0.422957,0.000000,0.000000,,0.000000,0.188061,0.000000,0.000000
96,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.177707,0.000000,0.000000,0.000000,,-0.229914,0.000000,0.000000
97,0.355516,0.213066,-0.51067,0.254648,0.269390,-1.054404,-0.709671,-0.421017,0.138801,0.329352,...,0.0,0.216396,0.000000,0.275152,0.111716,0.188061,-0.229914,,-0.429882,0.429117
98,0.000000,0.000000,0.00000,0.000000,0.000000,-0.254246,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.082751,0.000000,0.000000,0.000000,0.000000,-0.429882,,0.000000


np.nanmean(norm_err)=np.float64(-5.0586714738393036e-05)
    np.nanmean(np.abs(norm_err))=np.float64(0.11648256817702775)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 34138.89it/s]
100%|██████████| 100/100 [00:00<00:00, 447.00it/s]
5966it [00:00, 555220.17it/s]
100%|██████████| 100/100 [00:00<00:00, 191258.73it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.006801,-1.733606e-02,-5.885966e-07,0.048129,-1.005010e-06,-1.002634e-03,-4.765289e-07,-7.750505e-07,-1.509212e-06,...,-9.243643e-07,-6.632467e-07,-9.286616e-07,-2.240744e-07,-9.315277e-04,-1.175206e-06,8.085278e-02,4.398496e-02,-1.867953e-07,-9.974470e-07
1,6.800515e-03,,1.961135e-02,-2.931360e-02,-0.035695,1.109054e-03,-1.473741e-07,-1.289683e-02,-1.171213e-02,1.017792e-03,...,8.422684e-04,-3.066742e-02,2.637075e-02,-1.324658e-02,-1.408957e-07,9.306711e-04,3.752523e-03,4.058897e-03,5.087140e-04,2.721004e-02
2,-1.733606e-02,0.019611,,-2.495837e-07,0.006671,-3.027798e-07,-6.408182e-04,-2.269517e-07,-4.091491e-07,-5.505227e-07,...,-4.472907e-07,-2.620922e-07,5.277011e-02,-1.476992e-07,-6.110088e-04,-4.988097e-07,3.389539e-03,3.682471e-03,-1.305284e-07,5.463696e-02
3,-5.885966e-07,-0.029314,-2.495837e-07,,-0.060299,0.000000e+00,1.454745e-02,0.000000e+00,1.228405e-02,-6.693080e-07,...,-4.287182e-07,0.000000e+00,-4.260580e-07,0.000000e+00,1.354672e-02,-5.345551e-07,-1.951825e-07,-2.199400e-07,0.000000e+00,-4.548389e-07
4,4.812889e-02,-0.035695,6.670773e-03,-6.029943e-02,,2.790710e-03,-4.386362e-07,-2.420753e-02,-2.034461e-02,2.277546e-03,...,1.553109e-03,-6.632203e-02,1.086693e-02,-1.928140e-02,-4.105445e-07,1.883006e-03,1.750448e-02,1.960539e-02,7.029067e-04,1.151516e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.175206e-06,0.000931,-4.988097e-07,-5.345551e-07,0.001883,-8.631252e-07,-4.622426e-02,-4.404769e-07,-7.266821e-07,2.413366e-02,...,1.491979e-02,-5.954180e-07,-8.510743e-07,-2.157702e-07,-2.815762e-02,,-2.910669e-01,-2.737935e-02,-1.220527e-02,-9.084909e-07
96,8.085278e-02,0.003753,3.389539e-03,-1.951825e-07,0.017504,-1.855563e-03,-2.595410e-02,-1.810622e-07,-3.330596e-07,-2.621086e-02,...,-2.223439e-02,-2.027498e-07,4.395766e-03,-1.267868e-07,-1.635842e-02,-2.910669e-01,,8.570328e-03,-2.169778e-02,4.515735e-03
97,4.398496e-02,0.004059,3.682471e-03,-2.199400e-07,0.019605,-2.134340e-03,-1.847347e-02,-2.021735e-07,-3.684452e-07,-2.848234e-01,...,-3.831831e-02,-2.295962e-07,4.901406e-03,-1.367888e-07,-2.228245e-02,-2.737935e-02,8.570328e-03,,-1.509640e-02,5.051031e-03
98,-1.867953e-07,0.000509,-1.305284e-07,0.000000e+00,0.000703,0.000000e+00,-5.041665e-02,0.000000e+00,-1.561558e-07,-1.989773e-07,...,-1.705276e-07,0.000000e+00,-1.666223e-07,0.000000e+00,-1.555868e-02,-1.220527e-02,-2.169778e-02,-1.509640e-02,,-1.708503e-07


np.nanmean(norm_err)=np.float64(-0.0013849992252868185)
    np.nanmean(np.abs(norm_err))=np.float64(0.01126252701716604)
    np.nanmedian(norm_err)=np.float64(-2.2696011581898092e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(8.540610390104903e-07)
    


In [7]:
# Tail dataset: for each dstream algorithm, show the normalized
# pairwise-distance error matrix and its nan-aware summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_expr in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_expr)
    display(pd.DataFrame(norm_err))
    # The f-string text (including its indentation) is part of the printed
    # output, so it is kept verbatim.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34467.12it/s]
100%|██████████| 100/100 [00:00<00:00, 464.17it/s]
5959it [00:00, 671769.54it/s]
100%|██████████| 100/100 [00:00<00:00, 270077.53it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 28466.84it/s]
100%|██████████| 100/100 [00:00<00:00, 472.54it/s]
5950it [00:00, 622114.14it/s]
100%|██████████| 100/100 [00:00<00:00, 262965.77it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.035028e-07,-2.068994e-07,-1.160531e-07,-1.063333e-07,-2.075088e-07,-1.036999e-07,-2.130859e-07,-2.068614e-07,-1.201018e-07,...,-2.322846e-07,-2.074285e-07,-1.034334e-07,-1.095912e-07,-1.064991e-07,-2.069719e-07,-2.126829e-07,-2.132119e-07,-2.254655e-07,-2.189536e-07
1,-1.035028e-07,,-1.159753e-07,0.000000e+00,0.000000e+00,-1.163583e-07,0.000000e+00,-1.036018e-07,-1.093064e-07,0.000000e+00,...,-1.034874e-07,-1.065609e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.160208e-07,-1.034113e-07,-1.036614e-07,-1.035060e-07,-1.034881e-07
2,-2.068994e-07,-1.159753e-07,,-1.033636e-07,-1.033506e-07,-2.398959e-07,-1.127865e-07,-2.070974e-07,-2.184945e-07,-1.036672e-07,...,-2.068687e-07,-2.130094e-07,-1.233849e-07,-1.035371e-07,-1.035073e-07,-2.391786e-07,-2.067167e-07,-2.072164e-07,-2.069059e-07,-2.068701e-07
3,-1.160531e-07,0.000000e+00,-1.033636e-07,,0.000000e+00,-1.036678e-07,0.000000e+00,-1.064516e-07,-1.033446e-07,0.000000e+00,...,-1.276854e-07,-1.036277e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.033998e-07,-1.062505e-07,-1.065145e-07,-1.126305e-07,-1.093804e-07
4,-1.063333e-07,0.000000e+00,-1.033506e-07,0.000000e+00,,-1.036547e-07,0.000000e+00,-1.161613e-07,-1.033317e-07,0.000000e+00,...,-1.063171e-07,-1.036147e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.033868e-07,-1.092801e-07,-1.198893e-07,-1.063367e-07,-1.063178e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.069719e-07,-1.160208e-07,-2.391786e-07,-1.033998e-07,-1.033868e-07,-2.477878e-07,-1.128296e-07,-2.071700e-07,-2.185754e-07,-1.037036e-07,...,-2.069412e-07,-2.130862e-07,-1.195675e-07,-1.035734e-07,-1.035436e-07,,-2.067891e-07,-2.072891e-07,-2.069784e-07,-2.069425e-07
96,-2.126829e-07,-1.034113e-07,-2.067167e-07,-1.062505e-07,-1.092801e-07,-2.073250e-07,-1.036081e-07,-2.190032e-07,-2.066788e-07,-1.065713e-07,...,-2.126505e-07,-2.072449e-07,-1.033421e-07,-1.064338e-07,-1.094553e-07,-2.067891e-07,,-2.191363e-07,-2.126897e-07,-2.126519e-07
97,-2.132119e-07,-1.036614e-07,-2.072164e-07,-1.065145e-07,-1.198893e-07,-2.078277e-07,-1.038592e-07,-2.329735e-07,-2.071783e-07,-1.068370e-07,...,-2.131793e-07,-2.077472e-07,-1.035918e-07,-1.066988e-07,-1.201001e-07,-2.072891e-07,-2.191363e-07,,-2.132188e-07,-2.131807e-07
98,-2.254655e-07,-1.035060e-07,-2.069059e-07,-1.126305e-07,-1.063367e-07,-2.075153e-07,-1.037031e-07,-2.130927e-07,-2.068679e-07,-1.129911e-07,...,-2.254291e-07,-2.074350e-07,-1.034366e-07,-1.095948e-07,-1.065025e-07,-2.069784e-07,-2.126897e-07,-2.132188e-07,,-2.189608e-07


np.nanmean(norm_err)=np.float64(-9.191880742969607e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.191880742969607e-08)
    np.nanmedian(norm_err)=np.float64(-1.0379383596457703e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0379383596457703e-07)
    
