In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-09-14T04:10:43.794245+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10
pandas                            : 2.2.3
downstream                        : 1.14.3
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Name under which this notebook's outputs are presumably grouped; the bare
# expression on the last line echoes the value as the cell output.
# NOTE(review): not referenced again in the visible cells — presumably consumed
# by teeplot calls elsewhere; confirm, or drop if unused.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare pairwise distances between two reconstructions of one sample.

    A subsample of genomes is deserialized into hstrat surfaces, each surface's
    storage is shuffled in place, and the population is reconstructed twice:
    with `hstrat.build_tree_trie` ("naive") and with
    `hstrat.build_tree_searchtable` ("shortcut"). For every pair of taxa the
    result is `(naive - shortcut) / ((naive + shortcut) / 2)`, so positive
    entries mean the naive tree places the pair farther apart.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column plus `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`, each holding a single
        distinct value across the sampled rows. The frame is not modified.
    dstream_algo : str
        Dotted expression naming the algorithm, evaluated with only `dstream`
        in scope (e.g. "dstream.tilted_algo").
    sample_size : int, default 100
        Maximum number of genomes to reconstruct; capped at the number of
        available rows. Kept small so dendropy/naive reconstruction can cope.
    seed : int, optional
        If given, seeds both the subsampling and the storage shuffle so the
        result is reproducible. If omitted, numpy's global random state is
        used, as before.

    Returns
    -------
    np.ndarray
        Square float matrix of normalized errors, rows and columns in sorted
        taxon-label order. Entries where both distances are zero (such as the
        diagonal, NaN in the recorded outputs) are NaN.

    Raises
    ------
    ValueError
        If any of the dstream configuration columns does not hold exactly one
        distinct value among the sampled rows (this includes an empty sample).
    """
    rng = None if seed is None else np.random.RandomState(seed)
    shuffle = np.random.shuffle if rng is None else rng.shuffle

    # sample to size dendropy/naive can handle
    sampled_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=rng,
    )
    # Plain list of str: consumers see positional order regardless of the
    # shuffled row index that `sample` leaves on the frame.
    taxon_labels = [str(i) for i in range(len(sampled_df))]

    # `.unique().item()` doubles as a check that each setting is homogeneous.
    surf_kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(review): `eval` only sees `dstream` and builtins are withheld, but
    # this is still not a sandbox; never pass untrusted text as `dstream_algo`.
    surf_kwargs["dstream_algo"] = eval(
        str(dstream_algo),
        {"dstream": dstream, "__builtins__": {}},
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=taxon_labels,
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=taxon_labels,
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    # NOTE(review): assumes both tables carry the same label set, so that
    # sorting alone lines up their rows and columns — TODO confirm.
    naive_dist = _pairwise_distance_frame(naive_df).values
    shortcut_dist = _pairwise_distance_frame(shortcut_df).values

    # 0 / 0 (both distances zero) is expected and intentionally left as NaN;
    # only that "invalid value" warning is silenced, so a nonzero / 0 would
    # still warn.
    with np.errstate(invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return a tree's pairwise distance table, sorted by label on both axes."""
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # it may change between dendropy releases.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


In [6]:
# Genome records fetched from OSF; the trailing tag is the dataset's nickname.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Identical report per dstream algorithm: the full normalized-error matrix,
# followed by signed/absolute mean and median with NaN entries skipped.
for algo_name_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name_)
    display(pd.DataFrame(norm_err))
    # Continuation lines of the literal stay at their original column so the
    # printed text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10324.95it/s]
100%|██████████| 100/100 [00:00<00:00, 394.60it/s]
6139it [00:00, 625731.04it/s]
100%|██████████| 100/100 [00:00<00:00, 240361.26it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.064782,0.320457,-0.112951,-0.182083,0.277893,0.000000,0.660253,0.000000,-1.275557,...,-0.355581,-0.154733,0.041861,0.000000,-0.451610,-0.136487,-0.112823,-0.111585,0.046852,-0.107112
1,-0.064782,,0.047267,-0.301355,-0.743347,0.052719,0.000000,0.053301,-0.073761,-0.094469,...,-0.077117,-0.198551,0.679601,-0.061992,-0.090470,-0.076562,-0.301070,-1.275531,0.761904,-0.066355
2,0.320457,0.047267,,0.000000,0.000000,-0.180648,-0.231456,0.187668,0.413943,0.000000,...,0.000000,0.130050,-0.421221,0.295764,0.000000,0.000000,0.000000,0.000000,-0.497325,0.000000
3,-0.112951,-0.301355,0.000000,,0.000000,0.000000,0.330855,0.000000,-0.129438,0.262726,...,0.381729,-0.342806,0.000000,-0.107871,0.472562,0.000000,0.000000,0.000000,0.000000,0.000000
4,-0.182083,-0.743347,0.000000,0.000000,,0.000000,0.507042,0.000000,-0.229132,0.902044,...,0.297529,-0.808702,0.000000,-0.169237,0.401406,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.136487,-0.076562,0.000000,0.000000,0.000000,0.000000,0.089545,0.000000,-0.197204,0.000000,...,0.000000,-0.436941,0.000000,-0.122539,0.000000,,0.000000,0.000000,0.000000,0.000000
96,-0.112823,-0.301070,0.000000,0.000000,0.000000,0.000000,0.330512,0.000000,-0.129270,0.262412,...,0.381344,-0.342218,0.000000,-0.107755,0.471973,0.000000,,0.000000,0.000000,0.000000
97,-0.111585,-1.275531,0.000000,0.000000,0.000000,0.000000,0.327181,0.000000,-0.127647,0.784653,...,0.202514,-0.336556,0.000000,-0.106625,0.245811,0.000000,0.000000,,0.000000,0.000000
98,0.046852,0.761904,-0.497325,0.000000,0.000000,-0.216007,-0.162367,-0.141030,0.053984,0.000000,...,0.000000,0.137533,0.000000,0.044670,0.000000,0.000000,0.000000,0.000000,,0.000000


np.nanmean(norm_err)=np.float64(0.009772809246813835)
    np.nanmean(np.abs(norm_err))=np.float64(0.15228542593708108)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.05023479013942939)
    


100%|██████████| 100/100 [00:00<00:00, 29283.70it/s]
100%|██████████| 100/100 [00:00<00:00, 400.03it/s]
5982it [00:00, 603364.91it/s]
100%|██████████| 100/100 [00:00<00:00, 168445.94it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.006127,0.000000e+00,-4.983919e-03,-1.646660e-07,-1.325998e-07,-0.000883,-0.006513,-1.877823e-03,0.000000e+00,...,-2.107334e-07,0.000000e+00,0.000000e+00,-1.383817e-03,-1.215381e-03,-1.545334e-03,0.000000e+00,0.000000e+00,0.000000e+00,-1.244978e-03
1,-6.126884e-03,,-4.007839e-04,-7.317838e-03,-1.013354e-02,-7.359704e-03,-0.001387,-0.013552,1.978360e-01,-6.316302e-04,...,-1.491337e-02,-7.185800e-03,-4.997534e-04,9.688646e-02,2.287095e-02,3.172221e-02,-1.319414e-02,-6.474293e-04,-3.249323e-04,1.064745e-01
2,0.000000e+00,-0.000401,,-3.048056e-04,-2.361112e-07,-1.713809e-07,0.001114,0.028443,-5.804476e-04,0.000000e+00,...,-3.478311e-07,0.000000e+00,0.000000e+00,-3.741153e-04,-3.149937e-04,-4.354873e-04,0.000000e+00,0.000000e+00,0.000000e+00,-3.250060e-04
3,-4.983919e-03,-0.007318,-3.048056e-04,,-7.346914e-03,-5.770195e-03,-0.001139,-0.242948,-3.876614e-07,-4.221415e-04,...,-9.570911e-03,-5.662721e-03,-3.588530e-04,-2.814512e-07,-1.229888e-07,-1.579309e-07,-8.832250e-03,-4.291405e-04,-2.588506e-04,-1.260950e-07
4,-1.646660e-07,-0.010134,-2.361112e-07,-7.346914e-03,,-4.083996e-07,-0.001259,-0.011658,-3.824123e-03,-4.225117e-07,...,-1.015261e-06,-1.984735e-07,-3.102859e-07,-2.214416e-03,-1.812584e-03,-2.659306e-03,-4.449688e-07,-4.371324e-07,-1.842048e-07,-1.879204e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.545334e-03,0.031722,-4.354873e-04,-1.579309e-07,-2.659306e-03,-1.878853e-03,-0.057043,-0.002562,-3.342082e-07,-7.225811e-04,...,-4.111744e-03,-1.831201e-03,-5.549654e-04,-2.024688e-07,0.000000e+00,,-3.572443e-03,-7.433428e-04,-3.473408e-04,0.000000e+00
96,0.000000e+00,-0.013194,0.000000e+00,-8.832250e-03,-4.449688e-07,-2.555880e-07,-0.001484,-0.016197,-6.046731e-03,0.000000e+00,...,-1.014900e-06,0.000000e+00,0.000000e+00,-2.813053e-03,-2.194916e-03,-3.572443e-03,,0.000000e+00,0.000000e+00,-2.293378e-03
97,0.000000e+00,-0.000647,0.000000e+00,-4.291405e-04,-4.371324e-07,-2.572481e-07,0.001515,0.066058,-1.294886e-03,0.000000e+00,...,-1.078397e-06,0.000000e+00,0.000000e+00,-5.805742e-04,-4.497084e-04,-7.433428e-04,0.000000e+00,,0.000000e+00,-4.703974e-04
98,0.000000e+00,-0.000325,0.000000e+00,-2.588506e-04,-1.842048e-07,-1.422798e-07,0.026203,0.001470,-4.337901e-04,0.000000e+00,...,-2.457965e-07,0.000000e+00,0.000000e+00,-3.071794e-04,-2.661411e-04,-3.473408e-04,0.000000e+00,0.000000e+00,,-2.732536e-04


np.nanmean(norm_err)=np.float64(-0.0011216075916993678)
    np.nanmean(np.abs(norm_err))=np.float64(0.006244680697979491)
    np.nanmedian(norm_err)=np.float64(-1.382409539618028e-06)
    np.nanmedian(np.abs(norm_err))=np.float64(0.00042559373353942683)
    


In [7]:
# Genome records fetched from OSF; the trailing tag is the dataset's nickname.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Identical report per dstream algorithm: the full normalized-error matrix,
# followed by signed/absolute mean and median with NaN entries skipped.
for algo_name_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name_)
    display(pd.DataFrame(norm_err))
    # Continuation lines of the literal stay at their original column so the
    # printed text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34475.62it/s]
100%|██████████| 100/100 [00:00<00:00, 425.75it/s]
5960it [00:00, 603482.41it/s]
100%|██████████| 100/100 [00:00<00:00, 356355.48it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31338.19it/s]
100%|██████████| 100/100 [00:00<00:00, 435.32it/s]
5955it [00:00, 626934.75it/s]
100%|██████████| 100/100 [00:00<00:00, 315836.14it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.035839e-07,-1.038231e-07,-2.131578e-07,-1.067693e-07,-1.038725e-07,-1.066629e-07,-1.038477e-07,-1.037461e-07,-2.074894e-07,...,-2.072976e-07,-2.076890e-07,-2.129480e-07,-1.038179e-07,-2.077902e-07,-1.035797e-07,-1.037252e-07,-1.238883e-07,-1.036948e-07,-2.256840e-07
1,-1.035839e-07,,0.000000e+00,-1.034498e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.063963e-07,...,-1.125688e-07,-1.065012e-07,-1.033510e-07,0.000000e+00,-1.065544e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.034122e-07
2,-1.038231e-07,0.000000e+00,,-1.036884e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.066487e-07,...,-1.128514e-07,-1.067541e-07,-1.035891e-07,0.000000e+00,-1.068076e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036506e-07
3,-2.131578e-07,-1.034498e-07,-1.036884e-07,,-1.129406e-07,-1.037377e-07,-1.238064e-07,-1.037130e-07,-1.036116e-07,-2.072204e-07,...,-2.070291e-07,-2.074194e-07,-2.187625e-07,-1.036832e-07,-2.075204e-07,-1.034456e-07,-1.035908e-07,-1.064394e-07,-1.035604e-07,-2.127942e-07
4,-1.067693e-07,0.000000e+00,0.000000e+00,-1.129406e-07,,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.037902e-07,...,-1.036942e-07,-1.038900e-07,-1.095819e-07,0.000000e+00,-1.039407e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.065869e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.035797e-07,0.000000e+00,0.000000e+00,-1.034456e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.063918e-07,...,-1.093375e-07,-1.064968e-07,-1.033468e-07,0.000000e+00,-1.065500e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.034080e-07
96,-1.037252e-07,0.000000e+00,0.000000e+00,-1.035908e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.065454e-07,...,-1.094997e-07,-1.066506e-07,-1.034917e-07,0.000000e+00,-1.067040e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-1.035531e-07
97,-1.238883e-07,0.000000e+00,0.000000e+00,-1.064394e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036125e-07,...,-1.035168e-07,-1.037120e-07,-1.063348e-07,0.000000e+00,-1.037625e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.126856e-07
98,-1.036948e-07,0.000000e+00,0.000000e+00,-1.035604e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.095727e-07,...,-1.064122e-07,-1.096840e-07,-1.034614e-07,0.000000e+00,-1.097405e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.035227e-07


np.nanmean(norm_err)=np.float64(-1.0244131838079801e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0244131838079801e-07)
    np.nanmedian(norm_err)=np.float64(-1.0382574509904222e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0382574509904222e-07)
    
