In [1]:
# Load the `watermark` IPython extension so the `%watermark` magic used in a
# later cell is available.
%load_ext watermark


In [2]:
from typing import Optional

from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record run provenance. The output below lists the last-updated timestamp,
# Python/IPython versions, compiler and machine details, the versions of the
# imported packages, and the watermark version itself.
%watermark -diwmuv -iv


Last updated: 2025-09-15T23:50:54.016602+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3
hstrat                            : 1.20.10
downstream                        : 1.14.3
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Subdirectory label for this notebook.
# NOTE(review): judging by the name, presumably consumed by teeplot when
# saving figures -- no teeplot call is visible in these cells, so confirm this
# variable is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare trailing expression echoes the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _sorted_distance_matrix(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build the pairwise taxon distance table for a reconstructed tree.

    The tree dataframe is converted to dendropy via ``apc.RosettaTree`` and its
    phylogenetic distance matrix is loaded into a DataFrame whose rows and
    columns are both sorted by label, so that two matrices built from the same
    taxon labels line up elementwise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # it is read directly here and may change between dendropy versions.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: Optional[int] = None,
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstruction.

    A random subset of genomes is deserialized into hstrat surfaces, each
    surface's storage is shuffled in place, and a tree is reconstructed twice:
    once with ``hstrat.build_tree_trie`` and once with
    ``hstrat.build_tree_searchtable``.  The pairwise taxon distances of the
    two trees are then compared elementwise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records.  Must provide the columns ``data_hex``, ``dstream_S``,
        ``dstream_storage_bitoffset``, ``dstream_storage_bitwidth``,
        ``dstream_T_bitoffset`` and ``dstream_T_bitwidth``; each ``dstream_*``
        column must hold a single unique value within the sampled rows.  The
        caller's frame is not modified.
    dstream_algo : str
        Python expression naming the dstream algorithm, evaluated with only
        ``dstream`` bound (e.g. ``"dstream.tilted_algo"``).
    sample_size : int, default 100
        Number of rows sampled (without replacement).  The default keeps the
        problem at a size the dendropy / naive reconstruction can handle.
    seed : int, optional
        Seed for the row sampling and storage shuffling.  When ``None`` (the
        default) NumPy's global random state is used, so successive calls give
        different results unless that state was seeded by the caller.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / ((naive + shortcut) / 2)`` for
        every taxon pair, ordered by sorted taxon label.  Entries where both
        distances are zero (e.g. the diagonal) are NaN, so summarize with
        NaN-aware reductions such as ``np.nanmean``.

    Raises
    ------
    ValueError
        If ``sample_size`` exceeds the number of rows, or if a ``dstream_*``
        configuration column has more than one unique value in the sample.
    """
    # With no seed, fall back to the module-level numpy.random functions so the
    # default behavior (global random state) is the same as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    sampled_df = raw_genome_df.sample(
        sample_size,
        random_state=None if seed is None else rng,
    ).assign(
        # Taxon labels are consecutive integers rendered as strings.
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    # NOTE(review): eval() runs arbitrary code; the namespace is restricted to
    # `dstream`, but builtins remain reachable.  Only pass trusted strings.
    surf_kwargs = {"dstream_algo": eval(str(dstream_algo), {"dstream": dstream})}
    for column in (
        "dstream_S",
        "dstream_storage_bitoffset",
        "dstream_storage_bitwidth",
        "dstream_T_bitoffset",
        "dstream_T_bitwidth",
    ):
        # `.item()` enforces that the column holds exactly one unique value.
        surf_kwargs[column] = sampled_df[column].unique().item()

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        # Shuffle the stored contents in place to ensure synthetic data.
        # NOTE(review): reaches into private hstrat attributes.
        rng.shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive = _sorted_distance_matrix(naive_df).values
    shortcut = _sorted_distance_matrix(shortcut_df).values

    # 0/0 on the diagonal is expected and yields NaN; silence the resulting
    # RuntimeWarning instead of letting it clutter the notebook output.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Dataset labelled "sample" by the original cell comment.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")

# Run the same comparison once per dstream algorithm and report the signed and
# absolute normalized error, as both mean and median (NaN entries ignored).
for algo_expr in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_expr)
    display(pd.DataFrame(norm_err))
    print(
        f"{np.nanmean(norm_err)=}\n"
        f"    {np.nanmean(np.abs(norm_err))=}\n"
        f"    {np.nanmedian(norm_err)=}\n"
        f"    {np.nanmedian(np.abs(norm_err))=}\n"
        "    ",
    )


100%|██████████| 100/100 [00:00<00:00, 10239.00it/s]
100%|██████████| 100/100 [00:00<00:00, 410.62it/s]
6116it [00:00, 582928.77it/s]
100%|██████████| 100/100 [00:00<00:00, 243430.30it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.233915,0.000000,0.000000,-0.329733,0.587528,0.000000,-0.199838,0.000000,...,0.000000,0.000000,0.527417,0.000000,0.000000,0.0000,0.000000,0.000000,0.498744,0.000000
1,0.000000,,0.481504,0.000000,0.000000,-0.517202,0.255999,0.000000,-0.256097,0.000000,...,0.000000,0.000000,0.227724,0.000000,0.000000,0.0000,0.000000,0.000000,0.214417,0.000000
2,0.233915,0.481504,,0.000000,0.000000,-0.143612,0.216568,0.331978,-0.067335,0.000000,...,0.000000,0.000000,0.189963,0.000000,0.000000,0.0000,0.000000,0.000000,0.177696,0.286303
3,0.000000,0.000000,0.000000,,0.840352,0.301465,-0.160140,0.000000,0.348289,-0.101511,...,0.289386,-0.164180,-0.148598,0.000000,0.000000,0.0000,0.081311,0.253127,-0.142815,0.000000
4,0.000000,0.000000,0.000000,0.840352,,0.364023,-0.368084,0.000000,0.397629,0.201492,...,-0.068988,0.000000,-0.336087,-0.188471,0.000000,0.0000,0.000000,0.044619,-0.320434,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.000000,0.000000,0.000000,0.334265,-0.144137,0.000000,0.144204,0.000000,...,0.000000,0.000000,-0.126455,0.000000,0.000000,,0.000000,0.000000,-0.118300,0.000000
96,0.000000,0.000000,0.000000,0.081311,0.000000,1.042594,-0.353451,0.000000,0.353611,0.072090,...,0.000000,0.000000,-0.310826,-0.081036,0.000000,0.0000,,0.000000,-0.291099,0.000000
97,0.000000,0.000000,0.000000,0.253127,0.044619,0.444137,-0.955787,0.000000,0.453456,0.230208,...,0.000000,0.000000,-0.844012,-0.577888,0.000000,0.0000,0.000000,,-0.791968,0.000000
98,0.498744,0.214417,0.177696,-0.142815,-0.320434,0.000000,0.000000,0.194867,0.000000,-0.135218,...,-0.441931,-0.221484,0.000000,-0.499023,-0.111911,-0.1183,-0.291099,-0.791968,,0.410546


np.nanmean(norm_err)=np.float64(0.0042929902966752)
    np.nanmean(np.abs(norm_err))=np.float64(0.11498586569436275)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 30090.42it/s]
100%|██████████| 100/100 [00:00<00:00, 380.66it/s]
5975it [00:00, 405006.08it/s]
100%|██████████| 100/100 [00:00<00:00, 274137.52it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.477727e-02,8.718282e-04,-2.859383e-03,5.225173e-03,1.302893e-03,4.530274e-03,1.679941e-02,4.456901e-03,-1.108541e-02,...,9.733187e-04,-1.137505e-02,6.850357e-02,1.041504e-02,-2.730818e-02,-9.918787e-03,1.558666e-02,2.220524e-03,-1.924163e-02,-3.586078e-07
1,-2.477727e-02,,-2.260780e-07,-1.521634e-03,-3.340111e-07,-3.425450e-07,-2.881034e-07,-7.097230e-07,-5.666032e-07,-1.064141e-02,...,-2.532114e-07,-1.092569e-02,-3.518666e-07,-2.170742e-07,-4.152702e-02,-9.499637e-03,-2.746340e-07,-2.777057e-07,-1.877317e-02,7.268670e-04
2,8.718282e-04,-2.260780e-07,,-5.272764e-07,0.000000e+00,0.000000e+00,0.000000e+00,-4.971645e-07,-3.672134e-07,-1.997639e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-2.542289e-07,0.000000e+00,-1.544205e-07,-1.563655e-07,0.000000e+00,-8.530256e-04
3,-2.859383e-03,-1.521634e-03,-5.272764e-07,,-2.140418e-06,-2.547054e-06,-1.059028e-06,-6.866081e-06,-1.993833e-06,-6.109170e-07,...,-7.029608e-07,-3.201596e-07,-3.171859e-06,-4.807677e-07,-9.085462e-07,-2.520396e-07,-4.205475e-07,-4.277933e-07,-1.294132e-06,2.757673e-03
4,5.225173e-03,-3.340111e-07,0.000000e+00,-2.140418e-06,,0.000000e+00,0.000000e+00,-1.755068e-06,-8.007582e-07,-2.795975e-07,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-3.993415e-07,0.000000e+00,-1.986414e-07,-2.032157e-07,0.000000e+00,-5.059022e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-9.918787e-03,-9.499637e-03,0.000000e+00,-2.520396e-07,0.000000e+00,0.000000e+00,0.000000e+00,-2.442287e-07,-2.080590e-07,-4.250294e-02,...,0.000000e+00,7.192935e-02,0.000000e+00,0.000000e+00,-1.681461e-07,,-1.168369e-07,-1.179469e-07,-2.611033e-01,9.766609e-03
96,1.558666e-02,-2.746340e-07,-1.544205e-07,-4.205475e-07,-1.986414e-07,-2.011305e-07,-1.814467e-07,-4.144634e-07,-3.590606e-07,-2.542887e-07,...,-1.666156e-07,-1.296218e-07,1.360569e-02,-7.451179e-02,-2.944368e-07,-1.168369e-07,,-2.163998e-07,-1.864270e-07,-1.301125e-01
97,2.220524e-03,-2.777057e-07,-1.563655e-07,-4.277933e-07,-2.032157e-07,-2.044427e-07,-1.852558e-07,-4.186084e-07,-4.123231e-07,-2.569199e-07,...,-1.688821e-07,-1.309895e-07,-2.082587e-07,-1.522892e-07,-2.979703e-07,-1.179469e-07,-2.163998e-07,,-1.892692e-07,-2.190379e-03
98,-1.924163e-02,-1.877317e-02,0.000000e+00,-1.294132e-06,0.000000e+00,0.000000e+00,0.000000e+00,-1.111592e-06,-6.205720e-07,-2.251287e-01,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-3.633327e-07,-2.611033e-01,-1.864270e-07,-1.892692e-07,,1.867760e-02


np.nanmean(norm_err)=np.float64(-0.0006951576459843935)
    np.nanmean(np.abs(norm_err))=np.float64(0.010238334859134772)
    np.nanmedian(norm_err)=np.float64(-2.0162188636304673e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(6.270884880309108e-07)
    


In [7]:
# Dataset labelled "tail" by the original cell comment.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")

# Run the same comparison once per dstream algorithm and report the signed and
# absolute normalized error, as both mean and median (NaN entries ignored).
for algo_expr in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_expr)
    display(pd.DataFrame(norm_err))
    print(
        f"{np.nanmean(norm_err)=}\n"
        f"    {np.nanmean(np.abs(norm_err))=}\n"
        f"    {np.nanmedian(norm_err)=}\n"
        f"    {np.nanmedian(np.abs(norm_err))=}\n"
        "    ",
    )


100%|██████████| 100/100 [00:00<00:00, 33508.86it/s]
100%|██████████| 100/100 [00:00<00:00, 462.14it/s]
5994it [00:00, 641965.63it/s]
100%|██████████| 100/100 [00:00<00:00, 264959.19it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 30242.30it/s]
100%|██████████| 100/100 [00:00<00:00, 448.99it/s]
5960it [00:00, 433512.28it/s]
100%|██████████| 100/100 [00:00<00:00, 397941.56it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.065585e-07,0.000000e+00,0.000000e+00,-1.037691e-07,...,0.000000e+00,-1.036096e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.037702e-07,0.000000e+00,-1.038014e-07,0.000000e+00,0.000000e+00
1,0.000000e+00,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.096215e-07,0.000000e+00,0.000000e+00,-1.037699e-07,...,0.000000e+00,-1.036104e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.037711e-07,0.000000e+00,-1.038023e-07,0.000000e+00,0.000000e+00
2,0.000000e+00,0.000000e+00,,0.000000e+00,0.000000e+00,0.000000e+00,-1.035930e-07,0.000000e+00,0.000000e+00,-1.129036e-07,...,0.000000e+00,-1.064256e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.096593e-07,0.000000e+00,-1.096941e-07,0.000000e+00,0.000000e+00
3,0.000000e+00,0.000000e+00,0.000000e+00,,0.000000e+00,0.000000e+00,-1.064963e-07,0.000000e+00,0.000000e+00,-1.037102e-07,...,0.000000e+00,-1.035508e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.037113e-07,0.000000e+00,-1.037424e-07,0.000000e+00,0.000000e+00
4,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,,0.000000e+00,-1.067121e-07,0.000000e+00,0.000000e+00,-1.039148e-07,...,0.000000e+00,-1.037549e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.039160e-07,0.000000e+00,-1.039472e-07,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.037702e-07,-1.037711e-07,-1.096593e-07,-1.037113e-07,-1.039160e-07,-1.065817e-07,-2.075274e-07,-1.066175e-07,-1.036588e-07,-2.196988e-07,...,-1.038408e-07,-2.132116e-07,-1.036596e-07,-1.095365e-07,-1.037715e-07,,-1.066973e-07,-2.332067e-07,-1.066717e-07,-1.036141e-07
96,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036895e-07,0.000000e+00,0.000000e+00,-1.066961e-07,...,0.000000e+00,-1.095878e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.066973e-07,,-1.067303e-07,0.000000e+00,0.000000e+00
97,-1.038014e-07,-1.038023e-07,-1.096941e-07,-1.037424e-07,-1.039472e-07,-1.066146e-07,-2.075898e-07,-1.066504e-07,-1.036899e-07,-2.197687e-07,...,-1.038720e-07,-2.132774e-07,-1.036907e-07,-1.095712e-07,-1.038027e-07,-2.332067e-07,-1.067303e-07,,-1.067047e-07,-1.036452e-07
98,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036654e-07,0.000000e+00,0.000000e+00,-1.066706e-07,...,0.000000e+00,-1.128005e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.066717e-07,0.000000e+00,-1.067047e-07,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.0873334548045808e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0873334548045808e-07)
    np.nanmedian(norm_err)=np.float64(-1.0631219629625126e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0631219629625126e-07)
    
