In [1]:
# Load the `watermark` IPython extension so the %watermark magic used in a
# later cell is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record run provenance: timestamp, interpreter, platform details, and the
# versions of the imported packages (see the printed output of this cell).
%watermark -diwmuv -iv


Last updated: 2025-08-19T19:57:53.589513+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3
numpy                             : 2.1.2
hstrat                            : 1.20.10
downstream                        : 1.14.3

Watermark: 2.4.3



In [4]:
# Subdirectory name associated with this notebook's outputs.
# NOTE(review): presumably consumed by teeplot when saving figures; it is not
# referenced anywhere else in the visible cells -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(phylogeny_df) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance matrix of a reconstructed tree.

    The tree is converted to dendropy through ``apc.RosettaTree`` and its
    distance table is loaded into a DataFrame whose rows and columns are both
    sorted by label, so matrices built from different trees can be compared.
    """
    distance_table = (
        apc.RosettaTree(phylogeny_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy table;
    # presumably a nested mapping keyed by taxon label -- re-check on upgrade.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into hstrat surfaces, the
    storage of each surface is shuffled in place, and two phylogenies are
    built from the same population: one with ``hstrat.build_tree_trie`` and
    one with ``hstrat.build_tree_searchtable``. The pairwise distance matrices
    of the two trees are then compared entrywise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide ``data_hex`` plus the columns
        ``dstream_S``, ``dstream_storage_bitoffset``,
        ``dstream_storage_bitwidth``, ``dstream_T_bitoffset`` and
        ``dstream_T_bitwidth``, each holding a single distinct value among the
        sampled rows. The caller's frame is not modified.
    dstream_algo : str
        Python expression naming the dstream algorithm, evaluated with only
        ``dstream`` bound (e.g. ``"dstream.tilted_algo"``).
    sample_size : int, default 100
        Number of genomes to sample; the default is the size noted as
        manageable for the dendropy/naive pipeline.
    seed : int, optional
        Seed for sampling and shuffling. When omitted, NumPy's global random
        state is used, so results differ between runs unless it was seeded.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / mean(naive, shortcut)``.
        Entries where both distances are zero are NaN.

    Raises
    ------
    ValueError
        If a configuration column does not hold exactly one distinct value,
        or if the two distance matrices do not cover the same labels.
    """
    if seed is None:
        # Preserve the historical behavior: draw from NumPy's global state.
        sample_state = None
        shuffle = np.random.shuffle
    else:
        rng = np.random.default_rng(seed)
        sample_state = rng
        shuffle = rng.shuffle

    # sample to size dendropy/naive can handle; `sample` returns a new frame,
    # so adding a column below leaves the caller's data untouched
    genome_df = raw_genome_df.sample(sample_size, random_state=sample_state)
    genome_df["taxon_label"] = np.arange(len(genome_df)).astype(str)

    def unique_value(column: str):
        # `.item()` raises unless the column holds exactly one distinct value
        return genome_df[column].unique().item()

    surface_kwargs = dict(
        # SECURITY: `eval` runs arbitrary Python (builtins remain reachable);
        # only pass trusted, hard-coded algorithm names here.
        dstream_algo=eval(dstream_algo, {"dstream": dstream}),
        dstream_S=unique_value("dstream_S"),
        dstream_storage_bitoffset=unique_value("dstream_storage_bitoffset"),
        dstream_storage_bitwidth=unique_value("dstream_storage_bitwidth"),
        dstream_T_bitoffset=unique_value("dstream_T_bitoffset"),
        dstream_T_bitwidth=unique_value("dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        # ensure synthetic data
        # NOTE(review): reaches into private hstrat attributes.
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The arithmetic below is positional, so both matrices must list the same
    # labels in the same order for the comparison to be meaningful.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance matrices cover different labels"
        )

    difference = naive_dist.values - shortcut_dist.values
    mean_distance = naive_dist.values / 2 + shortcut_dist.values / 2
    # 0/0 (both distances zero) intentionally yields NaN, which callers skip
    # with nan-aware statistics; silence only that expected warning.
    with np.errstate(invalid="ignore"):
        return difference / mean_distance


In [6]:
# Genome records downloaded from OSF (labelled "sample" by the author).
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Produce the same consistency report for each dstream algorithm instead of
# repeating the stanza: show the normalized error matrix, then print
# nan-aware mean/median of the signed and absolute errors.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # The continuation lines of the f-string are part of the printed text, so
    # their leading whitespace is kept exactly as in the original report.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10849.49it/s]
100%|██████████| 100/100 [00:00<00:00, 362.10it/s]
6141it [00:00, 633120.00it/s]
100%|██████████| 100/100 [00:00<00:00, 227333.55it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.129683,0.062306,0.000000,0.000000,0.027506,0.000000,0.000000,-0.082620,-0.059940,...,0.500879,-0.083198,-0.027670,0.000000,0.266341,-0.053356,0.000000,0.000000,-0.242647,-0.080408
1,-0.129683,,0.000000,-0.119888,-0.125297,0.000000,-0.098327,-0.116613,0.301147,-0.206747,...,-0.101472,0.000000,0.000000,-0.111691,-0.432758,-0.166304,-0.207663,-0.136655,-0.109323,0.240797
2,0.062306,0.000000,,0.059953,0.061275,0.000000,0.054029,0.059122,0.000000,0.075898,...,0.054964,0.000000,0.000000,0.057830,0.093900,0.069677,0.170300,0.063872,0.458979,0.000000
3,0.000000,-0.119888,0.059953,,0.000000,0.025448,0.000000,0.000000,-0.077008,-0.056224,...,0.166200,-0.076926,-0.025588,0.000000,0.242594,-0.050392,0.000000,0.000031,-0.232289,-0.075083
4,0.000000,-0.125297,0.061275,0.000000,,0.026585,0.000000,0.000000,-0.080118,-0.058289,...,1.523833,-0.080390,-0.026738,0.000000,0.255621,-0.052045,0.000000,0.000000,-0.238098,-0.078037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.053356,-0.166304,0.069677,-0.050392,-0.052045,0.035173,-0.043393,-0.049372,-0.102812,0.000000,...,-0.628780,-0.106631,-0.035440,-0.047811,0.000000,,0.000000,-0.055391,-0.175786,-0.099410
96,0.000000,-0.207663,0.170300,0.000000,0.000000,0.043778,0.000000,0.000000,-0.124225,0.000000,...,0.000000,-0.133062,-0.044193,0.000000,0.000000,0.000000,,0.000000,0.092339,-0.119292
97,0.000000,-0.136655,0.063872,0.000031,0.000000,0.028969,-0.000016,0.000000,-0.086558,-0.062519,...,0.179562,-0.087662,-0.029151,0.000000,0.283682,-0.055391,0.000000,,-0.249596,-0.084134
98,-0.242647,-0.109323,0.458979,-0.232289,-0.238098,0.023225,-0.206679,-0.228660,-0.070851,-0.193268,...,-0.135694,-0.070159,-0.023341,-0.223039,-0.178826,-0.175786,0.092339,-0.249596,,-0.069218


np.nanmean(norm_err)=np.float64(0.0038452507684769866)
    np.nanmean(np.abs(norm_err))=np.float64(0.08479159845550543)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0501290981427419)
    


100%|██████████| 100/100 [00:00<00:00, 33049.44it/s]
100%|██████████| 100/100 [00:00<00:00, 375.41it/s]
5978it [00:00, 552585.11it/s]
100%|██████████| 100/100 [00:00<00:00, 275759.63it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.001653,-1.820310e-07,0.000000e+00,-0.143048,0.000000e+00,-5.021851e-03,-0.006501,1.404688e-03,-2.304532e-07,...,-1.421079e-07,-0.007445,-2.618477e-07,0.000000e+00,-0.007878,-0.038981,0.000000e+00,-0.030791,-2.122710e-07,-0.006716
1,-1.652783e-03,,-2.000216e-03,-1.381052e-03,0.009067,-1.405164e-03,1.149971e-02,0.016348,-2.353419e-07,-5.708619e-03,...,-2.931253e-03,0.011289,-3.422191e-03,2.050584e-02,0.012155,0.073689,-3.974422e-03,0.009508,-5.049428e-03,0.017133
2,-1.820310e-07,-0.002000,,-1.641327e-07,-0.004669,-1.970451e-07,-3.527104e-02,-0.048019,1.647631e-03,-5.726201e-07,...,-3.230837e-07,-0.005680,9.556487e-02,-2.585459e-07,-0.006062,-0.006359,-3.780841e-07,-0.004873,-5.175449e-07,-0.033035
3,0.000000e+00,-0.001381,-1.641327e-07,,-0.003710,0.000000e+00,-2.663522e-02,-0.033314,1.203424e-03,-1.887225e-07,...,-1.250560e-07,-0.004322,1.505694e-01,0.000000e+00,-0.004539,-0.004704,0.000000e+00,-0.003838,-1.763523e-07,-0.022694
4,-1.430484e-01,0.009067,-4.668811e-03,-3.710136e-03,,-3.752860e-03,2.078762e-02,0.015275,-2.769314e-07,-1.384257e-03,...,-9.460561e-04,0.018509,-6.139476e-03,1.597312e-02,0.010726,-0.022355,-4.687802e-02,-0.011095,-1.301216e-03,0.015667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-3.898104e-02,0.073689,-6.358860e-03,-4.703586e-03,-0.022355,-4.772466e-03,5.381101e-03,0.007107,-3.728532e-07,-2.032728e-03,...,-1.209833e-03,0.002653,-9.438063e-03,1.519433e-02,0.002819,,-2.324093e-01,-0.047126,-1.858554e-03,0.007364
96,0.000000e+00,-0.003974,-3.780841e-07,0.000000e+00,-0.046878,0.000000e+00,-8.940805e-03,-0.015030,2.789969e-03,-6.708616e-07,...,-2.387632e-07,-0.013124,-1.030547e-06,0.000000e+00,-0.014531,-0.232409,,-0.087069,-5.369697e-07,-0.016231
97,-3.079094e-02,0.009508,-4.873433e-03,-3.838179e-03,-0.011095,-3.883923e-03,1.289970e-02,0.059968,-1.443359e-07,-1.457596e-03,...,-9.797070e-04,0.010631,-6.498331e-03,1.677497e-02,0.186794,-0.047126,-8.706947e-02,,-1.365803e-03,0.027544
98,-2.122710e-07,-0.005049,-5.175449e-07,-1.763523e-07,-0.001301,-1.795229e-07,-1.659688e-03,-0.002386,3.255519e-02,-7.609814e-07,...,-3.802080e-07,-0.001628,-9.133075e-07,-3.267337e-07,-0.001757,-0.001859,-5.369697e-07,-0.001366,,-0.002505


np.nanmean(norm_err)=np.float64(-0.00019347096111803258)
    np.nanmean(np.abs(norm_err))=np.float64(0.012758793557868384)
    np.nanmedian(norm_err)=np.float64(-3.8107790431445525e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0021611582986793302)
    


In [7]:
# Genome records downloaded from OSF (labelled "tail" by the author).
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Produce the same consistency report for each dstream algorithm instead of
# repeating the stanza: show the normalized error matrix, then print
# nan-aware mean/median of the signed and absolute errors.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # The continuation lines of the f-string are part of the printed text, so
    # their leading whitespace is kept exactly as in the original report.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 33597.44it/s]
100%|██████████| 100/100 [00:00<00:00, 450.93it/s]
5963it [00:00, 654060.90it/s]
100%|██████████| 100/100 [00:00<00:00, 380954.04it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33396.80it/s]
100%|██████████| 100/100 [00:00<00:00, 989.83it/s]
5933it [00:00, 607361.26it/s]
100%|██████████| 100/100 [00:00<00:00, 380954.04it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,-1.035204e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.064842e-07,0.000000e+00,0.000000e+00,-1.159570e-07,...,-1.036457e-07,0.000000e+00,-1.128697e-07,0.000000e+00,-1.094711e-07,-1.034848e-07,0.000000e+00,0.000000e+00,-1.064820e-07,-1.034930e-07
1,0.000000e+00,,-1.035482e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.128135e-07,0.000000e+00,0.000000e+00,-1.062956e-07,...,-1.036735e-07,0.000000e+00,-1.065931e-07,0.000000e+00,-1.064466e-07,-1.035125e-07,0.000000e+00,0.000000e+00,-1.128111e-07,-1.035208e-07
2,-1.035204e-07,-1.035482e-07,,-1.066749e-07,-1.096938e-07,-1.035632e-07,-2.074033e-07,-1.035463e-07,-1.036272e-07,-2.069901e-07,...,-2.133058e-07,-1.127014e-07,-2.075541e-07,-1.037754e-07,-2.072763e-07,-2.255573e-07,-1.063801e-07,-1.065484e-07,-2.073992e-07,-2.190988e-07
3,0.000000e+00,0.000000e+00,-1.066749e-07,,0.000000e+00,0.000000e+00,-1.038483e-07,0.000000e+00,0.000000e+00,-1.036411e-07,...,-1.098846e-07,0.000000e+00,-1.039239e-07,0.000000e+00,-1.037846e-07,-1.066371e-07,0.000000e+00,0.000000e+00,-1.038462e-07,-1.066458e-07
4,0.000000e+00,0.000000e+00,-1.096938e-07,0.000000e+00,,0.000000e+00,-1.038035e-07,0.000000e+00,0.000000e+00,-1.035965e-07,...,-1.067606e-07,0.000000e+00,-1.038791e-07,0.000000e+00,-1.037399e-07,-1.096539e-07,0.000000e+00,0.000000e+00,-1.038015e-07,-1.163528e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.034848e-07,-1.035125e-07,-2.255573e-07,-1.066371e-07,-1.096539e-07,-1.035276e-07,-2.073319e-07,-1.035107e-07,-1.035916e-07,-2.069190e-07,...,-2.132302e-07,-1.160876e-07,-2.074826e-07,-1.037397e-07,-2.072050e-07,,-1.063425e-07,-1.065107e-07,-2.073278e-07,-2.190191e-07
96,0.000000e+00,0.000000e+00,-1.063801e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.035688e-07,0.000000e+00,0.000000e+00,-1.033628e-07,...,-1.095718e-07,0.000000e+00,-1.036440e-07,0.000000e+00,-1.035055e-07,-1.063425e-07,,0.000000e+00,-1.035668e-07,-1.063511e-07
97,0.000000e+00,0.000000e+00,-1.065484e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.037284e-07,0.000000e+00,0.000000e+00,-1.035217e-07,...,-1.130016e-07,0.000000e+00,-1.038039e-07,0.000000e+00,-1.036649e-07,-1.065107e-07,0.000000e+00,,-1.037264e-07,-1.065194e-07
98,-1.064820e-07,-1.128111e-07,-2.073992e-07,-1.038462e-07,-1.038015e-07,-1.065273e-07,-2.402173e-07,-1.065094e-07,-1.096593e-07,-2.129104e-07,...,-2.076507e-07,-1.035986e-07,-2.135072e-07,-1.067519e-07,-2.132133e-07,-2.073278e-07,-1.035668e-07,-1.037264e-07,,-2.073442e-07


np.nanmean(norm_err)=np.float64(-1.1958779705768853e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1958779705768853e-07)
    np.nanmedian(norm_err)=np.float64(-1.0653193643745305e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0653193643745305e-07)
    
