In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-08-17T00:24:20.027376+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10
pandas                            : 2.2.3
numpy                             : 2.1.2
downstream                        : 1.14.3

Watermark: 2.4.3



In [4]:
# Name of the output subdirectory associated with this notebook -- presumably
# consumed by teeplot when saving figures (TODO confirm: no use of this
# variable is visible in the cells shown here).
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression so the cell echoes the value


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _sorted_tip_distances(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return a tree's pairwise phylogenetic distance matrix, label-sorted.

    `tree_df` is a tree dataframe as produced by the `hstrat.build_tree_*`
    calls in `make_norm_err_matrix`. It is converted to dendropy through
    `apc.RosettaTree`, and dendropy's phylogenetic distance matrix is loaded
    into a DataFrame. Rows and columns are both sorted by their labels so that
    matrices obtained from different reconstructions of the same taxa line up
    element-for-element.

    NOTE(review): the labels are presumably the `taxon_label` strings, in
    which case the ordering is lexicographic ("0", "1", "10", ...), not
    numeric -- confirm. Only the consistency of the ordering between matrices
    matters to the caller.
    """
    return (
        pd.DataFrame(
            apc.RosettaTree(tree_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data  # private dendropy attribute holding the table contents
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare trie vs. searchtable tree reconstruction on sampled genomes.

    A sample of 100 rows is drawn from `raw_genome_df`, each row's `data_hex`
    is deserialized into a surface with `hstrat.surf_from_hex`, and every
    surface's storage is shuffled in place. A tree is then reconstructed twice
    from the same population -- once with `hstrat.build_tree_trie` ("naive")
    and once with `hstrat.build_tree_searchtable` ("shortcut") -- and the
    pairwise distance matrices of the two trees are compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`, and contain at least
        100 rows (sampling is without replacement). The caller's frame is not
        modified.
    dstream_algo : str
        Expression naming the dstream algorithm to decode with, e.g.
        `"dstream.tilted_algo"`. It is evaluated with only `dstream` in scope
        and is used regardless of what the input data may record.
    seed : int, optional
        When given, seeds both the row sampling and the storage shuffling so
        the result is reproducible. When omitted, NumPy's global random state
        is used for both, exactly as before this parameter existed.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / ((naive + shortcut) / 2)`, the
        difference in pairwise distance normalized by the mean of the two
        distances. Entries where both distances are zero (the diagonal) are
        NaN.

    Raises
    ------
    ValueError
        If fewer than 100 rows are available, or if any of the `dstream_*`
        configuration columns holds more than one distinct value among the
        sampled rows.
    """
    rng = None if seed is None else np.random.RandomState(seed)

    raw_genome_df = raw_genome_df.sample(
        100,
        random_state=rng,
    )  # sample to size dendropy/naive can handle
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)

    # `.unique().item()` doubles as a sanity check: `.item()` raises unless
    # the column holds exactly one distinct value across the sampled rows.
    surface_config_columns = (
        "dstream_S",
        "dstream_storage_bitoffset",
        "dstream_storage_bitwidth",
        "dstream_T_bitoffset",
        "dstream_T_bitwidth",
    )
    kwargs = {
        column: raw_genome_df[column].unique().item()
        for column in surface_config_columns
    }
    # SECURITY NOTE: `eval` executes arbitrary expressions. Only `dstream` is
    # exposed to it, and callers in this notebook pass hard-coded strings --
    # never route untrusted input here.
    kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _sorted_tip_distances(naive_df)
    shortcut_dist = _sorted_tip_distances(shortcut_df)

    # Self-distances are zero in both trees, so the diagonal is 0 / 0. Let
    # those entries become NaN without a RuntimeWarning; the notebook
    # aggregates the result with nan-aware reductions.
    with np.errstate(invalid="ignore"):
        return (naive_dist.values - shortcut_dist.values) / (
            naive_dist.values / 2 + shortcut_dist.values / 2
        )


In [6]:
# Reconstruction-consistency report for the "sample" genome dataset.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Identical report for each dstream algorithm: show the normalized-error
# matrix, then summarize it with nan-aware statistics (the matrix diagonal is
# NaN). Each iteration draws its own fresh sample inside
# `make_norm_err_matrix`.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_,
    )
    display(pd.DataFrame(norm_err))
    # The f-string's continuation lines are part of the printed text; their
    # indentation is kept as-is so the emitted output stays unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10141.46it/s]
100%|██████████| 100/100 [00:00<00:00, 372.16it/s]
6116it [00:00, 632423.53it/s]
100%|██████████| 100/100 [00:00<00:00, 255438.73it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.461104,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,,0.096198,0.000000,-0.314146,0.000000,0.104690,0.254663,-0.282128,0.0,...,-0.351973,-0.074117,-0.284133,-0.074611,0.0,0.000000,-0.265577,0.000000,0.176134,0.186882
2,0.000000,0.096198,,0.000000,0.000000,0.106906,0.000000,0.216677,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,-0.437042,0.000000,-0.311380,-0.421280,-0.488473
3,0.461104,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,-0.314146,0.000000,0.000000,,-0.342121,0.000000,0.044403,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.228268,0.000000,0.182186,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,-0.437042,0.000000,0.228268,0.000000,-0.498246,0.047622,0.493375,0.0,...,0.260862,0.091230,0.498501,0.091979,0.0,,0.452296,-0.114795,-0.879239,-0.749509
96,0.000000,-0.265577,0.000000,0.000000,0.000000,-0.298604,0.000000,0.069531,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.452296,,0.319043,0.000000,0.000000
97,0.000000,0.000000,-0.311380,0.000000,0.182186,0.000000,-0.341245,0.037680,0.338949,0.0,...,0.202366,0.072825,0.341361,0.073302,0.0,-0.114795,0.319043,,-0.769713,-0.529591
98,0.000000,0.176134,-0.421280,0.000000,0.000000,0.193915,-0.512265,0.317158,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,-0.879239,0.000000,-0.769713,,0.000000


np.nanmean(norm_err)=np.float64(0.026021784389783944)
    np.nanmean(np.abs(norm_err))=np.float64(0.10987765864926954)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33827.76it/s]
100%|██████████| 100/100 [00:00<00:00, 441.10it/s]
5981it [00:00, 634561.81it/s]
100%|██████████| 100/100 [00:00<00:00, 287675.17it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,4.361061e-03,-0.006185,1.085630e-02,-6.170598e-07,-5.017831e-03,-0.008415,-0.034555,-7.789334e-03,1.943254e-02,...,5.445751e-03,0.000000e+00,-0.011032,-1.803145e-01,0.000000e+00,2.124277e-02,2.524239e-02,0.000000e+00,2.260240e-02,-3.527403e-07
1,4.361061e-03,,-0.001010,2.799966e-03,-4.634192e-07,-2.308492e-07,-0.001224,-0.000951,-1.168845e-03,3.537176e-03,...,1.173038e-01,-1.959788e-07,-0.001423,-3.981710e-07,3.361506e-02,3.016222e-03,3.370190e-03,3.574268e-03,5.025120e-03,4.306519e-03
2,-6.184729e-03,-1.009605e-03,,-1.564488e-02,8.288948e-03,-4.094210e-03,0.000000,-0.003035,-1.560491e-07,-2.463321e-03,...,-1.174569e-03,1.701749e-03,0.000000,-7.015013e-03,-1.231161e-02,-2.084455e-03,-2.341167e-03,-5.007591e-03,-3.579803e-03,-6.102650e-03
3,1.085630e-02,2.799966e-03,-0.015645,,-2.099788e-07,-1.715827e-03,-0.018784,-0.003663,-2.396835e-01,1.097976e-01,...,3.210588e-03,0.000000e+00,-0.021640,-4.503495e-03,-5.367437e-03,9.573273e-03,1.063668e-02,9.005765e-03,9.245186e-03,1.072970e-02
4,-6.170598e-07,-4.634192e-07,0.008289,-2.099788e-07,,-3.278296e-06,0.012856,0.001815,1.144947e-02,-3.409334e-07,...,-3.131335e-07,-1.230981e-01,0.020163,-2.007687e-06,-5.979114e-07,-4.839368e-07,-3.051230e-07,-3.495249e-07,-2.283827e-06,-1.182225e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2.124277e-02,3.016222e-03,-0.002084,9.573273e-03,-4.839368e-07,-5.941398e-03,-0.002540,-0.001961,-2.422234e-03,-1.495476e-07,...,3.498192e-03,-2.032679e-07,-0.002967,-1.694412e-03,2.978768e-03,,-2.932977e-02,1.727393e-02,-4.289821e-07,2.096633e-02
96,2.524239e-02,3.370190e-03,-0.002341,1.063668e-02,-3.051230e-07,-7.490025e-03,-0.002932,-0.002187,-2.775951e-03,0.000000e+00,...,3.983368e-03,0.000000e+00,-0.003515,-2.058289e-03,3.527598e-03,-2.932977e-02,,1.982881e-02,-2.626882e-07,2.485305e-02
97,0.000000e+00,3.574268e-03,-0.005008,9.005765e-03,-3.495249e-07,-2.851359e-03,-0.006376,-0.057569,-6.010056e-03,1.501453e-02,...,4.271625e-03,0.000000e+00,-0.007773,-5.423573e-02,0.000000e+00,1.727393e-02,1.982881e-02,,1.439255e-02,-2.493860e-07
98,2.260240e-02,5.025120e-03,-0.003580,9.245186e-03,-2.283827e-06,-2.794364e-02,-0.005174,-0.003230,-4.706820e-03,-2.888043e-07,...,6.522009e-03,-6.014790e-07,-0.007316,-5.189863e-03,7.304459e-03,-4.289821e-07,-2.626882e-07,1.439255e-02,,2.188599e-02


np.nanmean(norm_err)=np.float64(0.0009269776826089347)
    np.nanmean(np.abs(norm_err))=np.float64(0.012595358097174417)
    np.nanmedian(norm_err)=np.float64(-2.2294120820806882e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.004152213077134783)
    


In [7]:
# Reconstruction-consistency report for the "tail" genome dataset.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Identical report for each dstream algorithm: show the normalized-error
# matrix, then summarize it with nan-aware statistics (the matrix diagonal is
# NaN). Each iteration draws its own fresh sample inside
# `make_norm_err_matrix`.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_,
    )
    display(pd.DataFrame(norm_err))
    # The f-string's continuation lines are part of the printed text; their
    # indentation is kept as-is so the emitted output stays unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35940.91it/s]
100%|██████████| 100/100 [00:00<00:00, 464.03it/s]
5915it [00:00, 636723.85it/s]
100%|██████████| 100/100 [00:00<00:00, 424524.70it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31268.11it/s]
100%|██████████| 100/100 [00:00<00:00, 1023.14it/s]
5936it [00:00, 672339.08it/s]
100%|██████████| 100/100 [00:00<00:00, 409600.00it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.073378e-07,-1.162747e-07,-1.036595e-07,-1.037237e-07,-1.097956e-07,-2.072406e-07,-1.036435e-07,-1.067502e-07,-2.130776e-07,...,-2.563551e-07,-2.075951e-07,-2.648221e-07,-2.072342e-07,-1.066861e-07,-2.074251e-07,-1.038982e-07,-1.038278e-07,-1.037873e-07,-2.257343e-07
1,-2.073378e-07,,-1.034396e-07,-1.063415e-07,-1.094625e-07,-1.036195e-07,-2.126005e-07,-1.126016e-07,-1.036443e-07,-2.068900e-07,...,-2.071410e-07,-2.255669e-07,-2.068687e-07,-2.251410e-07,-1.035838e-07,-2.253663e-07,-1.065928e-07,-1.279564e-07,-1.095334e-07,-2.069326e-07
2,-1.162747e-07,-1.034396e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.033912e-07,0.000000e+00,0.000000e+00,-1.062966e-07,...,-1.161509e-07,-1.035677e-07,-1.159797e-07,-1.033880e-07,0.000000e+00,-1.034831e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.125954e-07
3,-1.036595e-07,-1.063415e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-1.093368e-07,0.000000e+00,0.000000e+00,-1.034356e-07,...,-1.035611e-07,-1.064768e-07,-1.034250e-07,-1.062870e-07,0.000000e+00,-1.063874e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.034569e-07
4,-1.037237e-07,-1.094625e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.063579e-07,0.000000e+00,0.000000e+00,-1.034996e-07,...,-1.036252e-07,-1.096059e-07,-1.034889e-07,-1.094048e-07,0.000000e+00,-1.095112e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.035209e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.074251e-07,-2.253663e-07,-1.034831e-07,-1.063874e-07,-1.095112e-07,-1.036631e-07,-2.126924e-07,-1.160811e-07,-1.036879e-07,-2.069770e-07,...,-2.072281e-07,-2.325489e-07,-2.069557e-07,-2.393784e-07,-1.036274e-07,,-1.066389e-07,-1.128710e-07,-1.095821e-07,-2.070196e-07
96,-1.038982e-07,-1.065928e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.238343e-07,0.000000e+00,0.000000e+00,-1.036733e-07,...,-1.037994e-07,-1.067288e-07,-1.036626e-07,-1.065380e-07,0.000000e+00,-1.066389e-07,,0.000000e+00,0.000000e+00,-1.036947e-07
97,-1.038278e-07,-1.279564e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.064674e-07,0.000000e+00,0.000000e+00,-1.036033e-07,...,-1.037291e-07,-1.129716e-07,-1.035926e-07,-1.127579e-07,0.000000e+00,-1.128710e-07,0.000000e+00,,0.000000e+00,-1.036246e-07
98,-1.037873e-07,-1.095334e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.064248e-07,0.000000e+00,0.000000e+00,-1.035630e-07,...,-1.036887e-07,-1.096770e-07,-1.035523e-07,-1.094756e-07,0.000000e+00,-1.095821e-07,0.000000e+00,0.000000e+00,,-1.035843e-07


np.nanmean(norm_err)=np.float64(-1.1103481366025082e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1103481366025082e-07)
    np.nanmedian(norm_err)=np.float64(-1.0639006951512976e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0639006951512976e-07)
    
