In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-11T00:23:23.829430+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

numpy                             : 2.1.2
hstrat                            : 1.20.10
alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
pandas                            : 2.2.3

Watermark: 2.4.3



In [4]:
# Label for this notebook's analysis; the bare name on the last line echoes
# the value as the cell output.
# NOTE(review): presumably the output subdirectory handed to teeplot, but no
# use of this name is visible in this section -- confirm.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _sorted_distance_frame(tree_df) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance table for `tree_df`.

    The tree is converted to dendropy through `apc.RosettaTree`; rows and
    columns of the resulting table are both sorted by label so that tables
    built from different trees can be compared position by position.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy table; it
    # is read here exactly as the original code did.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed=None,
) -> np.ndarray:
    """Compare naive and shortcut tree reconstructions on sampled genomes.

    Up to `sample_size` rows of `raw_genome_df` are drawn, each row's
    `data_hex` is decoded into a surface with `hstrat.surf_from_hex`, and
    every surface's storage is shuffled in place. A tree is then built from
    the same population with both `hstrat.build_tree_trie` ("naive") and
    `hstrat.build_tree_searchtable` ("shortcut"), and the pairwise
    phylogenetic distances of the two trees are compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide `data_hex` plus the columns
        `dstream_S`, `dstream_storage_bitoffset`,
        `dstream_storage_bitwidth`, `dstream_T_bitoffset` and
        `dstream_T_bitwidth`, each holding exactly one distinct value among
        the sampled rows. Any existing `dstream_algo` column is ignored in
        favor of the `dstream_algo` argument.
    dstream_algo : str
        Expression naming the dstream algorithm, e.g.
        ``"dstream.tilted_algo"``. It is evaluated with `eval`, so only
        trusted strings may be passed.
    sample_size : int, default 100
        Maximum number of genomes to compare (kept small enough for the
        dendropy/naive path); capped at the number of available rows.
    seed : int, optional
        If given, seeds both the row sampling and the storage shuffle so a
        run can be repeated. If None (default), NumPy's global random state
        is used.

    Returns
    -------
    np.ndarray
        Square matrix of ``(naive - shortcut) / mean(naive, shortcut)``,
        ordered by the sorted labels of the distance tables. Entries whose
        two distances are both zero are NaN (0/0); this is presumably why
        the diagonal is NaN.

    Raises
    ------
    ValueError
        If `raw_genome_df` is empty, or if one of the `dstream_*` columns
        holds more than one distinct value among the sampled rows.
    """
    if len(raw_genome_df) == 0:
        raise ValueError("raw_genome_df must contain at least one genome")

    # One optional RNG drives both random steps; without a seed, fall back to
    # NumPy's global state for sampling and shuffling alike.
    rng = None if seed is None else np.random.RandomState(seed)
    shuffler = np.random if rng is None else rng

    # sample to size dendropy/naive can handle
    sampled_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=rng,
    )
    sampled_df["taxon_label"] = np.arange(len(sampled_df)).astype(str)

    # `.unique().item()` doubles as a check: it raises ValueError unless the
    # column holds exactly one distinct value.
    surface_kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # SECURITY NOTE: `eval` executes `dstream_algo` as a Python expression
    # (only `dstream` is bound explicitly); never pass untrusted text here.
    surface_kwargs["dstream_algo"] = eval(
        str(dstream_algo),
        {"dstream": dstream},
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        # ensure synthetic data
        shuffler.shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    # Both tables are label-sorted, so equal positions are assumed to refer
    # to the same pair of taxa in either reconstruction.
    naive = _sorted_distance_frame(naive_df).values
    shortcut = _sorted_distance_frame(shortcut_df).values

    # 0/0 entries are expected and handled downstream with nan-aware
    # statistics, so silence only the "invalid value" warning; a genuine
    # divide-by-zero (nonzero numerator) still warns.
    with np.errstate(invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Genome records labelled "sample", fetched from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Produce the same report for each dstream algorithm, in order: the matrix of
# normalized errors, then its NaN-aware mean and median (signed and absolute).
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # The continuation lines of this f-string keep their four-space indent
    # because that whitespace is part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10482.09it/s]
100%|██████████| 100/100 [00:00<00:00, 365.69it/s]
6143it [00:00, 660301.11it/s]
100%|██████████| 100/100 [00:00<00:00, 253279.23it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.069130,...,0.000000,0.000000,0.000000,0.000000,0.388545,-0.272281,0.000000,0.069534,0.000000,0.000000
1,0.000000,,0.000000,0.118325,0.167810,0.097781,0.000000,0.101661,0.0,0.142974,...,0.098369,0.000000,0.124162,0.090449,0.141161,-0.123718,0.000000,0.143613,0.216015,0.106138
2,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,-0.369901,...,0.000000,0.000000,0.000000,0.000000,-0.430689,0.573764,0.000000,-0.371439,0.000000,0.000000
3,0.000000,0.118325,0.000000,,0.000000,0.119234,0.111745,0.125053,0.0,0.199623,...,0.407096,0.171725,0.518538,1.296039,0.097784,-0.233809,0.167998,0.200404,0.133158,0.000000
4,0.000000,0.167810,0.000000,0.000000,,0.370724,0.154876,0.399634,0.0,0.244967,...,0.731671,0.300199,1.058877,0.486393,0.151475,-0.357112,0.212475,0.246145,0.238825,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.272281,-0.123718,0.573764,-0.233809,-0.357112,-0.187645,-0.113549,-0.196158,0.0,0.000000,...,-0.389582,-0.234712,-0.525110,-0.351357,0.000000,,-0.072284,0.000000,-0.263052,-0.206098
96,0.000000,0.000000,0.000000,0.167998,0.212475,0.146194,0.000000,0.150487,0.0,0.504470,...,0.146850,0.000000,0.173798,0.137841,0.077908,-0.072284,,0.506475,0.100996,0.155336
97,0.069534,0.143613,-0.371439,0.200404,0.246145,0.176972,0.137778,0.181640,0.0,0.000000,...,0.177688,0.184985,0.206522,0.167812,-0.231425,0.000000,0.506475,,0.160947,0.186881
98,0.000000,0.216015,0.000000,0.133158,0.238825,0.101242,0.188936,0.106872,0.0,0.160007,...,0.102085,0.888529,0.143263,0.091053,0.356814,-0.263052,0.100996,0.160947,,0.113588


np.nanmean(norm_err)=np.float64(0.019681383498809563)
    np.nanmean(np.abs(norm_err))=np.float64(0.17199232207960635)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.11610805057045359)
    


100%|██████████| 100/100 [00:00<00:00, 33827.76it/s]
100%|██████████| 100/100 [00:00<00:00, 735.79it/s]
5968it [00:00, 689101.34it/s]
100%|██████████| 100/100 [00:00<00:00, 286105.32it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,3.908875e-03,-1.593511e-02,-5.259265e-07,-0.025983,-8.062921e-02,-0.002119,-1.179785e-01,-3.195059e-03,-0.017999,...,-5.021396e-03,-1.876221e-02,-1.146585e-02,-3.417305e-03,4.861912e-03,-3.743310e-03,-0.001970,2.254861e-02,-9.340273e-03,-3.812460e-03
1,3.908875e-03,,-3.767633e-07,5.314494e-03,0.010671,1.360305e-02,0.004746,2.132140e-02,-2.151379e-07,0.007100,...,-3.573820e-07,-4.564837e-07,-2.595201e-07,-4.632135e-07,-1.691442e-06,-2.562008e-07,0.004373,-9.910110e-07,-7.683038e-07,-2.614762e-07
2,-1.593511e-02,-3.767633e-07,,-1.937024e-02,-0.011999,-1.430929e-02,-0.003645,-1.916852e-02,8.711279e-03,-0.008717,...,5.740458e-02,-4.326994e-02,0.000000e+00,9.253133e-03,-4.444353e-07,4.423117e-02,-0.003421,0.000000e+00,3.594017e-02,1.020047e-02
3,-5.259265e-07,5.314494e-03,-1.937024e-02,,-0.019512,-2.515541e-02,-0.002467,-4.063568e-02,-3.579076e-03,-0.012808,...,-6.039622e-03,-2.371360e-02,-1.314300e-02,-3.860258e-03,7.245810e-03,-4.281472e-03,-0.002268,3.618574e-02,-1.360648e-02,-4.372169e-03
4,-2.598289e-02,1.067069e-02,-1.199949e-02,-1.951158e-02,,8.723262e-03,0.000000,1.134657e-02,-2.666207e-03,0.000000,...,-3.828057e-03,-1.353529e-02,-9.276620e-03,-2.819238e-03,3.744886e-02,-3.037440e-03,0.000000,3.265014e-03,-5.912101e-03,-3.082812e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-3.743310e-03,-2.562008e-07,4.423117e-02,-4.281472e-03,-0.003037,-3.464570e-03,-0.011508,-4.251644e-03,0.000000e+00,-0.002359,...,0.000000e+00,2.605922e-02,3.541304e-02,-1.529930e-07,-2.857920e-07,,-0.005447,0.000000e+00,0.000000e+00,0.000000e+00
96,-1.969955e-03,4.373309e-03,-3.420798e-03,-2.267656e-03,0.000000,-2.220266e-07,0.005971,-2.750639e-07,-4.844480e-03,0.000000,...,-6.678555e-03,-3.790634e-03,-2.731420e-03,-5.094578e-03,4.906984e-03,-5.446825e-03,,2.585009e-03,-9.626775e-03,-5.519364e-03
97,2.254861e-02,-9.910110e-07,0.000000e+00,3.618574e-02,0.003265,4.454355e-03,0.002852,8.566164e-03,0.000000e+00,0.002012,...,0.000000e+00,0.000000e+00,0.000000e+00,-2.691359e-07,-1.653082e-06,0.000000e+00,0.002585,,0.000000e+00,0.000000e+00
98,-9.340273e-03,-7.683038e-07,3.594017e-02,-1.360648e-02,-0.005912,-7.778109e-03,-0.021222,-1.330973e-02,0.000000e+00,-0.003791,...,0.000000e+00,4.511470e-02,2.355090e-02,-2.541536e-07,-1.114295e-06,0.000000e+00,-0.009627,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(0.0015957189179747345)
    np.nanmean(np.abs(norm_err))=np.float64(0.012340479080534308)
    np.nanmedian(norm_err)=np.float64(-2.0947429185657433e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.002654073076902368)
    


In [7]:
# Genome records labelled "tail", fetched from OSF; this rebinds
# `raw_genome_df_` from the previous cell.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Produce the same report for each dstream algorithm, in order: the matrix of
# normalized errors, then its NaN-aware mean and median (signed and absolute).
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # The continuation lines of this f-string keep their four-space indent
    # because that whitespace is part of the printed text.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35699.24it/s]
100%|██████████| 100/100 [00:00<00:00, 476.89it/s]
5933it [00:00, 654312.31it/s]
100%|██████████| 100/100 [00:00<00:00, 277768.48it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33253.82it/s]
100%|██████████| 100/100 [00:00<00:00, 962.74it/s]
5968it [00:00, 539067.65it/s]
100%|██████████| 100/100 [00:00<00:00, 368568.01it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,0.000000e+00,0.000000e+00,-1.036675e-07,-1.036240e-07,0.000000e+00,-1.037911e-07,-1.065143e-07,-1.036471e-07,...,0.000000e+00,0.000000e+00,-1.037916e-07,-1.129538e-07,0.000000e+00,0.000000e+00,-1.096791e-07,-1.064625e-07,-1.064802e-07,0.000000e+00
1,0.000000e+00,,0.000000e+00,0.000000e+00,-1.037884e-07,-1.037447e-07,0.000000e+00,-1.039122e-07,-1.066419e-07,-1.037679e-07,...,0.000000e+00,0.000000e+00,-1.039127e-07,-1.098407e-07,0.000000e+00,0.000000e+00,-1.130693e-07,-1.065899e-07,-1.066076e-07,0.000000e+00
2,0.000000e+00,0.000000e+00,,0.000000e+00,-1.065885e-07,-1.065424e-07,0.000000e+00,-1.201660e-07,-1.036475e-07,-1.065669e-07,...,0.000000e+00,0.000000e+00,-1.067196e-07,-1.037652e-07,0.000000e+00,0.000000e+00,-1.037416e-07,-1.035984e-07,-1.036151e-07,0.000000e+00
3,0.000000e+00,0.000000e+00,0.000000e+00,,-1.067118e-07,-1.066657e-07,0.000000e+00,-1.099214e-07,-1.037641e-07,-1.066901e-07,...,0.000000e+00,0.000000e+00,-1.068432e-07,-1.038821e-07,0.000000e+00,0.000000e+00,-1.038584e-07,-1.037149e-07,-1.037317e-07,0.000000e+00
4,-1.036675e-07,-1.037884e-07,-1.065885e-07,-1.067118e-07,,-2.324935e-07,-1.035283e-07,-2.133723e-07,-2.072329e-07,-2.191894e-07,...,-1.163015e-07,-1.164273e-07,-2.195125e-07,-2.074681e-07,-1.038213e-07,-1.066581e-07,-2.074210e-07,-2.071347e-07,-2.071682e-07,-1.035404e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.066581e-07,-1.066120e-07,0.000000e+00,-1.098644e-07,-1.037133e-07,-1.066365e-07,...,0.000000e+00,0.000000e+00,-1.067894e-07,-1.038312e-07,0.000000e+00,,-1.038076e-07,-1.036641e-07,-1.036809e-07,0.000000e+00
96,-1.096791e-07,-1.130693e-07,-1.037416e-07,-1.038584e-07,-2.074210e-07,-2.073338e-07,-1.064664e-07,-2.076683e-07,-2.131194e-07,-2.073801e-07,...,-1.037104e-07,-1.038105e-07,-2.076692e-07,-2.195070e-07,-1.067764e-07,-1.038076e-07,,-2.130155e-07,-2.130510e-07,-1.095368e-07
97,-1.064625e-07,-1.065899e-07,-1.035984e-07,-1.037149e-07,-2.071347e-07,-2.070477e-07,-1.196547e-07,-2.073813e-07,-2.189239e-07,-2.070939e-07,...,-1.035673e-07,-1.036670e-07,-2.073822e-07,-2.130652e-07,-1.129381e-07,-1.036641e-07,-2.130155e-07,,-2.188517e-07,-1.063284e-07
98,-1.064802e-07,-1.066076e-07,-1.036151e-07,-1.037317e-07,-2.071682e-07,-2.070812e-07,-1.093822e-07,-2.074149e-07,-2.322949e-07,-2.071274e-07,...,-1.035840e-07,-1.036838e-07,-2.074158e-07,-2.131007e-07,-1.097094e-07,-1.036809e-07,-2.130510e-07,-2.188517e-07,,-1.063461e-07


np.nanmean(norm_err)=np.float64(-1.0667262263160858e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0667262263160858e-07)
    np.nanmedian(norm_err)=np.float64(-1.0626439784290664e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0626439784290664e-07)
    
