In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-11T00:23:20.781602+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Subdirectory name, presumably for teeplot figure outputs — no teeplot call
# is visible in this part of the notebook (TODO confirm where it is used).
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression so the cell echoes the value


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> pd.DataFrame:
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    # raw_genome_df = raw_genome_df[raw_genome_df["layer"] > 1700].sample(20).copy()
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
        dstream_S=raw_genome_df["dstream_S"].unique().item(),
        dstream_storage_bitoffset=raw_genome_df["dstream_storage_bitoffset"]
        .unique()
        .item(),
        dstream_storage_bitwidth=raw_genome_df["dstream_storage_bitwidth"]
        .unique()
        .item(),
        dstream_T_bitoffset=raw_genome_df["dstream_T_bitoffset"]
        .unique()
        .item(),
        dstream_T_bitwidth=raw_genome_df["dstream_T_bitwidth"].unique().item(),
    )

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = (
        pd.DataFrame(
            apc.RosettaTree(naive_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    shortcut_dist = (
        pd.DataFrame(
            apc.RosettaTree(shortcut_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    return (naive_dist.values - shortcut_dist.values) / (
        naive_dist.values / 2 + shortcut_dist.values / 2
    )


In [6]:
# Reconstruction-consistency check on the "sample" genome dataset: for each
# dstream algorithm, show the normalized error matrix and its summary stats.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # the continuation lines of the f-string are part of the printed text, so
    # their leading whitespace is kept exactly as it was
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10189.50it/s]
100%|██████████| 100/100 [00:00<00:00, 403.30it/s]
6140it [00:00, 638130.35it/s]
100%|██████████| 100/100 [00:00<00:00, 254354.40it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.000000,-0.104324,0.000000,0.026339,0.000000,0.000000,0.000000,...,0.000000,-0.089294,-0.179564,-0.039165,0.000000,0.739005,0.029434,0.000000,-0.868873,0.000000
1,0.000000,,0.160055,0.000000,0.272978,0.000000,0.000000,0.065310,0.000000,0.081078,...,0.000000,0.215385,0.000000,-0.645957,0.000000,0.000000,0.000000,0.000000,0.206138,0.000000
2,0.000000,0.160055,,0.000000,0.255713,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.214018,0.000000,-0.424584,0.000000,0.000000,0.000000,0.107302,0.782372,0.000000
3,0.000000,0.000000,0.000000,,-0.117761,0.000000,0.031478,0.000000,0.000000,0.000000,...,0.000000,-0.098959,-0.241621,-0.047893,0.000000,-0.502719,0.036002,0.000000,-0.162673,0.000000
4,-0.104324,0.272978,0.255713,-0.117761,,-0.036411,-0.790942,0.168783,-0.135424,0.190004,...,-0.165078,0.161291,-0.194524,-0.030285,-0.072006,-0.074784,-0.584487,0.220227,0.091289,-0.103836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.739005,0.000000,0.000000,-0.502719,-0.074784,0.000000,0.016829,0.000000,0.257074,0.000000,...,0.000000,-0.066733,-0.095054,-0.024015,0.472807,,0.018041,0.000000,-0.109767,0.436360
96,0.029434,0.000000,0.000000,0.036002,-0.584487,0.000000,-0.434104,0.000000,0.046511,0.000000,...,0.071512,-0.698192,-0.321386,-0.049384,0.017137,0.018041,,0.000000,-0.097087,0.029214
97,0.000000,0.000000,0.107302,0.000000,0.220227,0.000000,0.000000,0.054395,0.000000,0.064910,...,0.000000,0.181148,0.000000,-0.388828,0.000000,0.000000,0.000000,,0.174562,0.000000
98,-0.868873,0.206138,0.782372,-0.162673,0.091289,-0.027950,-0.090872,0.612192,-0.437029,0.693788,...,-0.156435,0.080659,-0.123570,-0.024188,-0.780578,-0.109767,-0.097087,0.174562,,-0.576877


np.nanmean(norm_err)=np.float64(-0.0005274178109962906)
    np.nanmean(np.abs(norm_err))=np.float64(0.10153971230634466)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 34709.57it/s]
100%|██████████| 100/100 [00:00<00:00, 683.37it/s]
5972it [00:00, 604726.67it/s]
100%|██████████| 100/100 [00:00<00:00, 266643.61it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.057212e-06,-0.008964,-9.373595e-07,-1.605672e-03,-5.957062e-03,-0.020476,-4.990059e-07,-1.969006e-06,-3.896025e-07,...,-3.485359e-06,-8.828334e-07,-1.466657e-06,-6.842394e-02,-1.669744e-02,-7.009209e-07,-1.781493e-03,-0.002362,-7.520287e-07,-0.006905
1,-1.057212e-06,,-0.014532,0.000000e+00,-1.475864e-03,2.593359e-02,-0.004967,-2.247772e-07,-7.104945e-07,0.000000e+00,...,-9.855028e-07,0.000000e+00,0.000000e+00,2.622252e-02,-2.321076e-02,-3.005676e-07,-1.623140e-03,-0.220822,0.000000e+00,-0.002044
2,-8.964465e-03,-1.453215e-02,,-3.399994e-03,3.028149e-02,-3.996525e-03,-0.006469,-1.517687e-03,-1.409420e-02,-8.440569e-03,...,-4.293118e-03,-1.341607e-02,-8.035115e-03,-4.031021e-03,-5.485709e-07,-3.875247e-03,3.289805e-02,-0.006851,-1.241092e-02,-0.003336
3,-9.373595e-07,0.000000e+00,-0.003400,,-3.585492e-02,-3.011413e-07,0.033003,-1.487465e-02,-6.338885e-07,0.000000e+00,...,-9.329218e-02,0.000000e+00,0.000000e+00,-3.043309e-07,-5.264155e-03,-2.900248e-07,-3.941490e-02,0.006123,0.000000e+00,0.005975
4,-1.605672e-03,-1.475864e-03,0.030281,-3.585492e-02,,-1.108163e-03,-0.001409,-3.123618e-02,-1.457405e-03,-1.138090e-03,...,-2.608636e-02,-1.427044e-03,-1.541134e-03,-1.113491e-03,3.500632e-02,-1.089106e-03,-1.175310e-07,-0.001010,-1.378942e-03,-0.000998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-7.009209e-07,-3.005676e-07,-0.003875,-2.900248e-07,-1.089106e-03,-3.193007e-03,-0.006862,-3.130320e-07,-5.887730e-07,-2.021071e-07,...,-6.768228e-07,-2.845865e-07,-4.299787e-07,-6.439578e-03,-4.845081e-03,,-1.167294e-03,-0.001401,-2.694771e-07,-0.004137
96,-1.781493e-03,-1.623140e-03,0.032898,-3.941490e-02,-1.175310e-07,-1.189210e-03,-0.001543,-3.319484e-02,-1.600824e-03,-1.223749e-03,...,-2.893995e-02,-1.564292e-03,-1.702427e-03,-1.195347e-03,3.855092e-02,-1.167294e-03,,-0.001077,-1.506687e-03,-0.001063
97,-2.362445e-03,-2.208215e-01,-0.006851,6.122504e-03,-1.010192e-03,-1.431938e-03,0.003942,3.533274e-03,-1.450108e-02,-1.641643e-02,...,1.575800e-01,-2.222162e-02,-2.227552e-03,-1.440701e-03,-8.316150e-03,-1.400781e-03,-1.077122e-03,,-1.343415e-02,0.002515
98,-7.520287e-07,0.000000e+00,-0.012411,0.000000e+00,-1.378942e-03,1.381021e-02,-0.004029,-2.069236e-07,-7.504293e-07,0.000000e+00,...,-7.150199e-07,0.000000e+00,0.000000e+00,1.394659e-02,-1.823318e-02,-2.694771e-07,-1.506687e-03,-0.013434,,-0.001865


np.nanmean(norm_err)=np.float64(-0.00203130345983721)
    np.nanmean(np.abs(norm_err))=np.float64(0.011143645858227254)
    np.nanmedian(norm_err)=np.float64(-9.666515454615202e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.002078827276217718)
    


In [7]:
# Reconstruction-consistency check on the "tail" genome dataset: for each
# dstream algorithm, show the normalized error matrix and its summary stats.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # the continuation lines of the f-string are part of the printed text, so
    # their leading whitespace is kept exactly as it was
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 37019.45it/s]
100%|██████████| 100/100 [00:00<00:00, 494.32it/s]
5958it [00:00, 654350.96it/s]
100%|██████████| 100/100 [00:00<00:00, 296836.80it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33378.20it/s]
100%|██████████| 100/100 [00:00<00:00, 940.60it/s]
5951it [00:00, 666567.94it/s]
100%|██████████| 100/100 [00:00<00:00, 385860.53it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,0.000000e+00,-1.035686e-07,0.000000e+00,0.000000e+00,-1.035214e-07,0.000000e+00,0.000000e+00,-1.036337e-07,...,0.000000e+00,-1.038247e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.064743e-07,0.000000e+00,-1.036961e-07,-1.036729e-07,-1.037655e-07
1,0.000000e+00,,0.000000e+00,-1.035967e-07,0.000000e+00,0.000000e+00,-1.035494e-07,0.000000e+00,0.000000e+00,-1.036619e-07,...,0.000000e+00,-1.038530e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.065040e-07,0.000000e+00,-1.037243e-07,-1.037011e-07,-1.037937e-07
2,0.000000e+00,0.000000e+00,,-1.034455e-07,0.000000e+00,0.000000e+00,-1.033983e-07,0.000000e+00,0.000000e+00,-1.035104e-07,...,0.000000e+00,-1.037010e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.093938e-07,0.000000e+00,-1.035727e-07,-1.035495e-07,-1.036419e-07
3,-1.035686e-07,-1.035967e-07,-1.034455e-07,,-1.035940e-07,-1.064509e-07,-2.186969e-07,-1.095746e-07,-1.035711e-07,-2.322796e-07,...,-1.127846e-07,-2.258686e-07,-1.063228e-07,-1.036990e-07,-1.095411e-07,-2.069678e-07,-1.065186e-07,-2.324363e-07,-2.190351e-07,-2.257284e-07
4,0.000000e+00,0.000000e+00,0.000000e+00,-1.035940e-07,,0.000000e+00,-1.035467e-07,0.000000e+00,0.000000e+00,-1.036591e-07,...,0.000000e+00,-1.038502e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.065011e-07,0.000000e+00,-1.037216e-07,-1.036983e-07,-1.037910e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.064743e-07,-1.065040e-07,-1.093938e-07,-2.069678e-07,-1.065011e-07,-1.035783e-07,-2.068735e-07,-1.036390e-07,-1.095343e-07,-2.070978e-07,...,-1.036132e-07,-2.074792e-07,-1.034571e-07,-1.066121e-07,-1.036091e-07,,-1.036424e-07,-2.072224e-07,-2.071761e-07,-2.073609e-07
96,0.000000e+00,0.000000e+00,0.000000e+00,-1.065186e-07,0.000000e+00,0.000000e+00,-1.064686e-07,0.000000e+00,0.000000e+00,-1.065874e-07,...,0.000000e+00,-1.067895e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036424e-07,,-1.066534e-07,-1.066289e-07,-1.067268e-07
97,-1.036961e-07,-1.037243e-07,-1.035727e-07,-2.324363e-07,-1.037216e-07,-1.065856e-07,-2.189812e-07,-1.097173e-07,-1.036987e-07,-2.399147e-07,...,-1.129358e-07,-2.261718e-07,-1.064572e-07,-1.038268e-07,-1.096837e-07,-2.072224e-07,-1.066534e-07,,-2.193203e-07,-2.260313e-07
98,-1.036729e-07,-1.037011e-07,-1.035495e-07,-2.190351e-07,-1.036983e-07,-1.065610e-07,-2.253973e-07,-1.163846e-07,-1.036754e-07,-2.191807e-07,...,-1.096624e-07,-2.196079e-07,-1.064327e-07,-1.038036e-07,-1.129033e-07,-2.071761e-07,-1.066289e-07,-2.193203e-07,,-2.194754e-07


np.nanmean(norm_err)=np.float64(-9.188004857961647e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.188004857961647e-08)
    np.nanmedian(norm_err)=np.float64(-1.0375155776726217e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0375155776726217e-07)
    
