In [1]:
# Load the watermark IPython extension; it provides the %watermark magic that
# a later cell uses to print the execution environment.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Print an environment report for provenance. With these flags the captured
# output lists a last-updated timestamp, interpreter and machine details, the
# versions of the imported packages, and the watermark version itself.
%watermark -diwmuv -iv


Last updated: 2025-08-20T21:18:39.992604+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
downstream                        : 1.14.3
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Notebook identifier, echoed as this cell's output.
# NOTE(review): presumably the output subdirectory for figures saved via
# teeplot, but no visible cell imports teeplot or reads this variable --
# confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _sorted_distance_frame(tree_df) -> pd.DataFrame:
    """Return the pairwise distance table of a reconstructed tree.

    `tree_df` is a tree as returned by `hstrat.build_tree_trie` or
    `hstrat.build_tree_searchtable`. Rows and columns of the result are sorted
    by label so two tables built from the same taxa can be compared
    position-by-position.
    """
    data_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # presumably a {row label: {column label: distance}} mapping -- confirm
    # whenever dendropy is upgraded.
    return pd.DataFrame(data_table._data).sort_index(axis=0).sort_index(axis=1)


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_sample: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into hstrat surfaces, each
    surface's storage is shuffled, and a tree is built from the same population
    with both `hstrat.build_tree_trie` and `hstrat.build_tree_searchtable`.
    The pairwise distance matrices of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`. The frame passed in is
        not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with only the name
        `dstream` in scope (e.g. `"dstream.tilted_algo"`).
    n_sample : int, default 100
        Number of genomes to sample; kept small so the naive/dendropy path
        stays tractable.
    seed : int, optional
        Seed for row sampling and storage shuffling. When omitted, numpy's
        global random state is used, so repeated calls differ.

    Returns
    -------
    np.ndarray
        Square `(n_sample, n_sample)` array of
        `(naive - shortcut) / ((naive + shortcut) / 2)`, ordered by sorted
        taxon label. Entries where both distances are zero (such as the
        diagonal) are NaN.

    Raises
    ------
    ValueError
        If `raw_genome_df` has fewer than `n_sample` rows, if a configuration
        column holds more than one distinct value among the sampled rows, or
        if the two reconstructions do not yield the same set of labels.
    """
    # Without a seed, keep drawing from numpy's global RNG as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    sampled_df = raw_genome_df.sample(
        n_sample,  # sample to size dendropy/naive can handle
        random_state=None if seed is None else rng,
    )
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    # Each configuration column must be homogeneous: `.item()` raises if the
    # sampled rows disagree.
    surface_kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(review): eval runs arbitrary expressions. Globals are limited to
    # `dstream`, but builtins remain reachable, so only pass trusted,
    # hard-coded algorithm names here.
    surface_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        rng.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _sorted_distance_frame(naive_df)
    shortcut_dist = _sorted_distance_frame(shortcut_df)

    # The arrays are compared positionally below, so the labels must agree.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions produced different taxon labels"
        )

    naive = naive_dist.values
    shortcut = shortcut_dist.values
    # 0 / 0 occurs wherever both distances are zero (always on the diagonal);
    # NaN is the intended result there, so silence numpy's RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Reconstruction consistency on the "sample" dataset: for each dstream
# algorithm, show the normalized error matrix between the naive and shortcut
# reconstructions, followed by NaN-aware summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # One statistic per line; separate f-strings keep source indentation out
    # of the printed report.
    print(
        f"{np.nanmean(norm_err)=}",
        f"{np.nanmean(np.abs(norm_err))=}",
        f"{np.nanmedian(norm_err)=}",
        f"{np.nanmedian(np.abs(norm_err))=}",
        sep="\n",
    )


100%|██████████| 100/100 [00:00<00:00, 10441.38it/s]
100%|██████████| 100/100 [00:00<00:00, 364.89it/s]
6143it [00:00, 594280.13it/s]
100%|██████████| 100/100 [00:00<00:00, 241051.95it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.031922,0.000000,0.000000,0.023658,0.000000,0.000000,-0.691371,...,0.000000,0.696226,0.000000,-0.141694,1.026868,0.035786,0.000000,-0.120246,0.000000,0.000000
1,0.000000,,0.000000,-0.344596,0.000000,0.000000,-0.234268,0.000000,0.125159,0.000000,...,0.000000,0.000000,0.000000,0.000000,-0.303008,-0.403310,0.000000,0.000000,0.097920,0.000000
2,0.000000,0.000000,,-0.041762,0.000000,0.000000,-0.028663,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,-0.063604,-0.048632,0.000000,0.000000,0.000000,0.000000
3,0.031922,-0.344596,-0.041762,,-0.126371,-0.139138,0.000000,-0.324655,-0.106164,0.272752,...,-0.421389,0.034943,-0.166856,0.019114,-0.113904,0.000000,-0.392933,0.017434,-0.092995,-0.287807
4,0.000000,0.000000,0.000000,-0.126371,,0.000000,-0.099009,0.000000,-0.190140,0.000000,...,0.000000,0.000000,0.000000,0.000000,-0.430945,-0.138174,0.000000,0.000000,-0.160983,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.035786,-0.403310,-0.048632,0.000000,-0.138174,-0.153583,0.000000,-0.348557,-0.114372,0.314120,...,-0.462559,0.039627,-0.188069,0.020435,-0.126032,,-0.441827,0.018527,-0.099233,-0.334255
96,0.000000,0.000000,0.000000,-0.392933,0.000000,0.000000,-0.289349,0.000000,0.223687,0.000000,...,0.000000,0.000000,0.000000,0.000000,-0.193593,-0.441827,,0.000000,0.184397,0.000000
97,-0.120246,0.000000,0.000000,0.017434,0.000000,0.000000,0.014641,0.000000,0.000000,-0.193734,...,0.000000,0.256695,0.000000,0.000000,0.175460,0.018527,0.000000,,0.000000,0.000000
98,0.000000,0.097920,0.000000,-0.092995,-0.160983,-0.175645,-0.077279,0.360499,0.000000,0.000000,...,0.459026,0.000000,-0.206583,0.000000,-0.665883,-0.099233,0.184397,0.000000,,0.063006


np.nanmean(norm_err)=np.float64(-0.01971772210117643)
    np.nanmean(np.abs(norm_err))=np.float64(0.1044084187897866)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31907.98it/s]
100%|██████████| 100/100 [00:00<00:00, 412.91it/s]
5983it [00:00, 398205.63it/s]
100%|██████████| 100/100 [00:00<00:00, 309542.73it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-7.173137e-02,-2.202180e-07,-1.093199e-01,1.441457e-02,-2.906508e-06,-4.557198e-02,-5.851070e-02,-5.372771e-02,-6.028139e-02,...,-2.509132e-07,-9.648533e-02,-5.069460e-07,-6.282779e-03,-2.194182e-07,1.510173e-02,-1.439556e-06,-5.120068e-03,-0.000002,1.216887e-02
1,-7.173137e-02,,-1.495494e-07,-3.210825e-07,2.921350e-03,5.797467e-03,-1.682119e-07,-4.103604e-07,-1.871060e-07,-2.180235e-07,...,-1.630991e-07,-2.683139e-07,1.200615e-02,-3.434184e-03,7.359335e-03,3.004937e-03,-3.520561e-07,-3.055054e-03,0.018231,1.892092e-02
2,-2.202180e-07,-1.495494e-07,,0.000000e+00,-2.171300e-03,-2.095453e-07,0.000000e+00,-1.378462e-07,0.000000e+00,0.000000e+00,...,-1.024499e-02,0.000000e+00,0.000000e+00,2.442151e-03,0.000000e+00,0.000000e+00,0.000000e+00,2.243856e-03,0.000000,-6.049425e-03
3,-1.093199e-01,-3.210825e-07,0.000000e+00,,3.458661e-03,8.350977e-03,0.000000e+00,-2.493856e-07,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,1.521777e-02,-4.200786e-03,8.452887e-03,3.576435e-03,0.000000e+00,-3.647070e-03,0.026828,2.522596e-02
4,1.441457e-02,2.921350e-03,-2.171300e-03,3.458661e-03,,4.454374e-03,2.320038e-03,2.650823e-03,2.537496e-03,2.690704e-03,...,-2.345368e-03,3.304444e-03,3.294674e-03,0.000000e+00,2.161826e-03,4.301265e-02,-4.466529e-03,-1.630050e-07,0.004597,-2.336811e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.510173e-02,3.004937e-03,0.000000e+00,3.576435e-03,4.301265e-02,4.651667e-03,2.372444e-03,2.719464e-03,2.600319e-03,2.761449e-03,...,0.000000e+00,3.411786e-03,3.401372e-03,0.000000e+00,2.207258e-03,,0.000000e+00,-1.672229e-07,0.004808,3.932273e-03
96,-1.439556e-06,-3.520561e-07,0.000000e+00,0.000000e+00,-4.466529e-03,-1.079981e-06,0.000000e+00,-2.934129e-07,0.000000e+00,0.000000e+00,...,-3.917201e-03,0.000000e+00,0.000000e+00,5.786735e-03,0.000000e+00,0.000000e+00,,4.785129e-03,0.000000,-2.872298e-01
97,-5.120068e-03,-3.055054e-03,2.243856e-03,-3.647070e-03,-1.630050e-07,-1.446304e-02,-2.403371e-03,-2.760495e-03,-2.637504e-03,-2.803411e-03,...,1.247716e-01,-3.476018e-03,-1.499089e-01,-1.308157e-02,-4.600667e-02,-1.672229e-07,4.785129e-03,,-0.220389,-4.934476e-07
98,-1.739740e-06,1.823094e-02,0.000000e+00,2.682808e-02,4.597438e-03,-2.096592e-02,1.187751e-02,1.505976e-02,1.389290e-02,1.548955e-02,...,0.000000e+00,2.396013e-02,0.000000e+00,-1.284887e-01,0.000000e+00,4.807894e-03,0.000000e+00,-2.203895e-01,,1.186611e-01


np.nanmean(norm_err)=np.float64(-0.0007062727201991688)
    np.nanmean(np.abs(norm_err))=np.float64(0.012402921976458276)
    np.nanmedian(norm_err)=np.float64(-2.0598983945902524e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.002153274063202641)
    


In [7]:
# Reconstruction consistency on the "tail" dataset: for each dstream
# algorithm, show the normalized error matrix between the naive and shortcut
# reconstructions, followed by NaN-aware summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # One statistic per line; separate f-strings keep source indentation out
    # of the printed report.
    print(
        f"{np.nanmean(norm_err)=}",
        f"{np.nanmean(np.abs(norm_err))=}",
        f"{np.nanmedian(norm_err)=}",
        f"{np.nanmedian(np.abs(norm_err))=}",
        sep="\n",
    )


100%|██████████| 100/100 [00:00<00:00, 34903.09it/s]
100%|██████████| 100/100 [00:00<00:00, 454.19it/s]
5934it [00:00, 649233.10it/s]
100%|██████████| 100/100 [00:00<00:00, 365995.11it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33581.30it/s]
100%|██████████| 100/100 [00:00<00:00, 1034.70it/s]
5939it [00:00, 588749.03it/s]
100%|██████████| 100/100 [00:00<00:00, 344077.44it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.197600e-07,-2.329289e-07,-1.096627e-07,-2.260529e-07,-2.135201e-07,-2.328833e-07,-1.036467e-07,-2.137305e-07,-1.037428e-07,...,-2.137355e-07,-2.074999e-07,-2.259280e-07,-2.075531e-07,-1.036229e-07,-1.129959e-07,-1.067231e-07,-2.072923e-07,-2.077138e-07,-1.039299e-07
1,-2.197600e-07,,-2.194478e-07,-1.128680e-07,-2.194712e-07,-2.134476e-07,-2.194073e-07,-1.036125e-07,-2.136578e-07,-1.037086e-07,...,-2.136628e-07,-2.074314e-07,-2.193534e-07,-2.074845e-07,-1.035888e-07,-1.097068e-07,-1.066868e-07,-2.072239e-07,-2.076451e-07,-1.038956e-07
2,-2.329289e-07,-2.194478e-07,,-1.095073e-07,-2.257227e-07,-2.132255e-07,-2.476275e-07,-1.035078e-07,-2.134352e-07,-1.036037e-07,...,-2.134403e-07,-2.072217e-07,-2.255981e-07,-2.072747e-07,-1.034841e-07,-1.128309e-07,-1.065759e-07,-2.070145e-07,-2.074349e-07,-1.037903e-07
3,-1.096627e-07,-1.128680e-07,-1.095073e-07,,-1.095189e-07,-1.065188e-07,-1.094871e-07,0.000000e+00,-1.066235e-07,0.000000e+00,...,-1.066260e-07,-1.035221e-07,-1.094603e-07,-1.035486e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.034187e-07,-1.036286e-07,0.000000e+00
4,-2.260529e-07,-2.194712e-07,-2.257227e-07,-1.095189e-07,,-2.132475e-07,-2.256798e-07,-1.035182e-07,-2.134573e-07,-1.036141e-07,...,-2.134623e-07,-2.072424e-07,-2.324984e-07,-2.072955e-07,-1.034945e-07,-1.279873e-07,-1.065868e-07,-2.070353e-07,-2.074557e-07,-1.038007e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.129959e-07,-1.097068e-07,-1.128309e-07,0.000000e+00,-1.279873e-07,-1.065966e-07,-1.128095e-07,0.000000e+00,-1.067014e-07,0.000000e+00,...,-1.067039e-07,-1.035956e-07,-1.162169e-07,-1.036221e-07,0.000000e+00,,0.000000e+00,-1.034920e-07,-1.037022e-07,0.000000e+00
96,-1.067231e-07,-1.066868e-07,-1.065759e-07,0.000000e+00,-1.065868e-07,-1.129592e-07,-1.065568e-07,0.000000e+00,-1.165312e-07,0.000000e+00,...,-1.098242e-07,-1.036398e-07,-1.065313e-07,-1.036663e-07,0.000000e+00,0.000000e+00,,-1.035362e-07,-1.037465e-07,0.000000e+00
97,-2.072923e-07,-2.072239e-07,-2.070145e-07,-1.034187e-07,-2.070353e-07,-2.071420e-07,-2.069785e-07,-1.062411e-07,-2.073399e-07,-1.196883e-07,...,-2.073447e-07,-2.126993e-07,-2.069305e-07,-2.188582e-07,-1.275201e-07,-1.034920e-07,-1.035362e-07,,-2.129240e-07,-1.065387e-07
98,-2.077138e-07,-2.076451e-07,-2.074349e-07,-1.036286e-07,-2.074557e-07,-2.075629e-07,-2.073987e-07,-1.127562e-07,-2.077616e-07,-1.065640e-07,...,-2.077664e-07,-2.326410e-07,-2.073506e-07,-2.131991e-07,-1.064375e-07,-1.037022e-07,-1.037465e-07,-2.129240e-07,,-1.098354e-07


np.nanmean(norm_err)=np.float64(-1.1947454683193908e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1947454683193908e-07)
    np.nanmedian(norm_err)=np.float64(-1.0653560230830106e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0653560230830106e-07)
    
