In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-06-01T00:26:00.851108+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1029-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
downstream                        : 1.14.3
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Identifier for this analysis.
# NOTE(review): presumably the teeplot output subdirectory; no visible cell
# reads this variable -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build the pairwise phylogenetic distance table for a reconstructed tree.

    The tree is converted to dendropy via `apc.RosettaTree`, and the
    resulting distance table is returned as a DataFrame with both axes
    sorted by label so that tables built from different reconstructions
    of the same taxa line up position-by-position.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy table;
    # presumably a nested mapping keyed by taxon label -- confirm on upgrade.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled in place (to ensure synthetic data), and the
    population is reconstructed twice: with `hstrat.build_tree_trie`
    ("naive") and with `hstrat.build_tree_searchtable` ("shortcut"). The
    pairwise taxon distances of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column and the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`, each holding a
        single unique value across the sampled rows. Not modified.
    dstream_algo : str
        Python expression naming the dstream algorithm, evaluated with
        `dstream` in scope (e.g. `"dstream.tilted_algo"`).
    sample_size : int, default 100
        Maximum number of genomes to sample; kept small because the
        dendropy / naive reconstruction path does not scale. Clamped to the
        number of available rows.
    seed : int, optional
        Seed for sampling and shuffling. When None (default), numpy's
        global random state is used, as before.

    Returns
    -------
    np.ndarray
        Square matrix of `(naive - shortcut) / mean(naive, shortcut)` for
        each taxon pair. Entries where both distances are zero (such as the
        diagonal) are NaN. Rows and columns follow the sorted order of the
        distance tables' labels (string sort), so position `i` need not
        correspond to taxon label `str(i)`.

    Raises
    ------
    ValueError
        If any of the configuration columns does not hold exactly one
        unique value among the sampled rows.
    """
    if seed is None:
        # Preserve historical behavior: draw from numpy's global state.
        random_state, shuffle = None, np.random.shuffle
    else:
        rng = np.random.default_rng(seed)
        random_state, shuffle = rng, rng.shuffle

    # Sample down to a size dendropy/naive can handle; clamp so that inputs
    # smaller than `sample_size` do not raise.
    n_sampled = min(sample_size, len(raw_genome_df))
    sampled_df = raw_genome_df.sample(n_sampled, random_state=random_state)
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    # Every surface shares one configuration; `.item()` enforces that each
    # configuration column holds exactly one unique value.
    surface_kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # WARNING: `eval` executes arbitrary code. `dstream_algo` must be a
    # trusted, hard-coded expression -- never pass user-supplied text.
    surface_kwargs["dstream_algo"] = eval(
        str(dstream_algo),
        {"dstream": dstream},
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive = _pairwise_distance_frame(naive_df).values
    shortcut = _pairwise_distance_frame(shortcut_df).values

    # 0 / 0 on the diagonal is expected and intentionally yields NaN;
    # silence the accompanying RuntimeWarning instead of emitting it on
    # every call.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Reconstruction consistency on the "sample" genome dataset, evaluated once
# per dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_expr in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_expr)
    display(pd.DataFrame(norm_err))
    # Pieces are concatenated into the same text as before: one
    # `expr=value` entry per line, continuation lines indented four spaces.
    print(
        f"{np.nanmean(norm_err)=}\n"
        f"    {np.nanmean(np.abs(norm_err))=}\n"
        f"    {np.nanmedian(norm_err)=}\n"
        f"    {np.nanmedian(np.abs(norm_err))=}\n"
        "    ",
    )


100%|██████████| 100/100 [00:00<00:00, 12172.93it/s]
100%|██████████| 100/100 [00:00<00:00, 401.07it/s]
6127it [00:00, 653174.58it/s]
100%|██████████| 100/100 [00:00<00:00, 257319.26it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.093694,0.179500,0.000000,0.174706,-0.034081,0.358450,-0.092190,0.000000,-0.021491,...,0.000000,0.121995,0.000000,0.000000,0.169056,0.179424,0.000000,0.289329,-0.109110,0.000000
1,-0.093694,,0.095963,-0.105806,0.092628,-0.048599,0.075966,0.356502,-0.112994,-0.026478,...,-0.222151,-0.087257,-0.112414,-0.105841,0.088767,-0.809783,-0.086067,0.059742,0.225811,-0.121321
2,0.179500,0.095963,,0.206314,0.000000,-0.865541,-0.178864,0.093687,0.222682,-0.339590,...,0.522518,0.165630,0.221347,0.206393,0.000000,0.195367,0.163093,-0.124248,0.121548,0.242086
3,0.000000,-0.105806,0.206314,,0.200005,-0.039326,0.406761,-0.103893,0.000000,-0.023464,...,0.000000,-0.143336,0.000000,0.000000,0.192635,0.201513,0.000000,0.320007,-0.125893,0.000000
4,0.174706,0.092628,0.000000,0.200005,,-0.805573,-0.919790,0.090505,0.215351,-0.465538,...,0.483865,0.161540,0.214102,0.200080,0.000000,0.189701,0.159126,-0.587680,0.116245,0.233446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.179424,-0.809783,0.195367,0.201513,0.189701,-0.037180,0.519476,-0.208693,0.340012,-0.022683,...,0.087343,0.167588,0.213462,0.318427,0.183059,,0.258800,0.408279,-0.253119,0.229457
96,0.000000,-0.086067,0.163093,0.000000,0.159126,-0.030892,0.555812,-0.084797,0.000000,-0.020177,...,0.000000,0.000000,0.000000,0.000000,0.154425,0.258800,,0.448174,-0.098904,0.000000
97,0.289329,0.059742,-0.124248,0.320007,-0.587680,-0.384126,0.000000,0.058852,0.573243,-0.315504,...,0.298712,0.272497,0.336217,0.540652,-0.115885,0.408279,0.448174,,0.068752,0.357506
98,-0.109110,0.225811,0.121548,-0.125893,0.116245,-0.063503,0.091154,0.000000,-0.136202,-0.030361,...,-0.334064,-0.100479,-0.135360,-0.125943,0.110229,-0.253119,-0.098904,0.068752,,-0.148487


np.nanmean(norm_err)=np.float64(-0.025976896096066557)
    np.nanmean(np.abs(norm_err))=np.float64(0.1527053592555017)
    np.nanmedian(norm_err)=np.float64(-0.020075782691730354)
    np.nanmedian(np.abs(norm_err))=np.float64(0.11199892885476118)
    


100%|██████████| 100/100 [00:00<00:00, 36456.36it/s]
100%|██████████| 100/100 [00:00<00:00, 428.92it/s]
5976it [00:00, 590825.02it/s]
100%|██████████| 100/100 [00:00<00:00, 303056.65it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.001331,-1.924399e-07,-2.332437e-07,-0.001042,-4.198136e-07,-2.144080e-07,-6.635583e-07,-3.863479e-07,-2.312690e-07,...,-0.002960,0.021173,-3.328639e-07,-0.000806,2.597449e-02,-2.911565e-07,-1.436921e-06,-0.003110,-1.975624e-07,-0.007506
1,-1.331360e-03,,-5.815674e-04,-6.695451e-04,-0.069689,-6.197595e-04,-1.905360e-03,-1.221361e-03,-2.809643e-03,-6.637952e-04,...,0.021058,-0.006352,-8.443898e-04,-0.009863,-2.390481e-07,-2.354137e-03,-1.265102e-03,-0.132309,-1.794691e-03,-0.000564
2,-1.924399e-07,-0.000582,,0.000000e+00,-0.000519,-1.171901e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-0.000766,0.008592,0.000000e+00,-0.000453,2.016025e-02,0.000000e+00,-1.866509e-07,-0.000775,0.000000e+00,-0.004347
3,-2.332437e-07,-0.000670,0.000000e+00,,-0.000587,-1.278660e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-0.000926,0.007115,0.000000e+00,-0.000504,8.322259e-03,0.000000e+00,-2.247933e-07,-0.000940,0.000000e+00,-0.030264
4,-1.041903e-03,-0.069689,-5.186149e-04,-5.874499e-04,,-5.487872e-04,-1.682343e-03,-9.732545e-04,-2.350226e-03,-5.830188e-04,...,-0.006126,-0.005340,-7.178706e-04,-0.033442,-1.984843e-07,-2.022825e-03,-1.000880e-03,0.000000,-1.595474e-03,-0.000504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.911565e-07,-0.002354,0.000000e+00,0.000000e+00,-0.002023,-1.435152e-07,2.437667e-01,0.000000e+00,0.000000e+00,0.000000e+00,...,-0.003485,-0.001679,0.000000e+00,-0.001701,9.018766e-04,,-2.781062e-07,-0.003552,0.000000e+00,0.033872
96,-1.436921e-06,-0.001265,-1.866509e-07,-2.247933e-07,-0.001001,-4.060758e-07,-2.072464e-07,-6.795378e-07,-3.637011e-07,-2.229586e-07,...,-0.002652,0.067195,-3.368373e-07,-0.000782,2.431752e-02,-2.781062e-07,,-0.002771,-1.914659e-07,-0.007290
97,-3.109955e-03,-0.132309,-7.753003e-04,-9.399531e-04,0.000000,-8.446192e-04,-2.620461e-03,-2.570097e-03,-4.701583e-03,-9.286600e-04,...,-0.017590,-0.010411,-1.325173e-03,-0.051005,-4.126542e-07,-3.551624e-03,-2.770953e-03,,-2.415598e-03,-0.000744
98,-1.975624e-07,-0.001795,0.000000e+00,0.000000e+00,-0.001595,-1.163465e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,-0.002384,-0.001259,0.000000e+00,-0.001388,6.639486e-04,0.000000e+00,-1.914659e-07,-0.002416,,0.027832


np.nanmean(norm_err)=np.float64(-0.00016765979989993051)
    np.nanmean(np.abs(norm_err))=np.float64(0.008015458018687659)
    np.nanmedian(norm_err)=np.float64(-4.388881801314654e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0007005034774298377)
    


In [7]:
# Reconstruction consistency on the "tail" genome dataset, evaluated once
# per dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_expr in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_expr)
    display(pd.DataFrame(norm_err))
    # Pieces are concatenated into the same text as before: one
    # `expr=value` entry per line, continuation lines indented four spaces.
    print(
        f"{np.nanmean(norm_err)=}\n"
        f"    {np.nanmean(np.abs(norm_err))=}\n"
        f"    {np.nanmedian(norm_err)=}\n"
        f"    {np.nanmedian(np.abs(norm_err))=}\n"
        "    ",
    )


100%|██████████| 100/100 [00:00<00:00, 31557.47it/s]
100%|██████████| 100/100 [00:00<00:00, 467.55it/s]
5915it [00:00, 647864.11it/s]
100%|██████████| 100/100 [00:00<00:00, 337433.95it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33146.07it/s]
100%|██████████| 100/100 [00:00<00:00, 1027.66it/s]
5960it [00:00, 658051.28it/s]
100%|██████████| 100/100 [00:00<00:00, 333675.74it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.035621e-07,-1.065296e-07,-2.074515e-07,-1.064049e-07,-1.035444e-07,-1.199939e-07,-1.036592e-07,-2.069682e-07,-2.252678e-07,...,-2.071303e-07,-1.160154e-07,-1.037117e-07,-2.070770e-07,-2.126413e-07,-2.073882e-07,-2.192004e-07,-2.189760e-07,-1.036697e-07,-1.034132e-07
1,-1.035621e-07,,0.000000e+00,-1.066722e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.064167e-07,-1.035160e-07,...,-1.095612e-07,0.000000e+00,0.000000e+00,-1.064742e-07,-1.034850e-07,-1.066388e-07,-1.036920e-07,-1.035916e-07,0.000000e+00,0.000000e+00
2,-1.065296e-07,0.000000e+00,,-1.038470e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036048e-07,-1.064809e-07,...,-1.036860e-07,0.000000e+00,0.000000e+00,-1.036593e-07,-1.127400e-07,-1.038153e-07,-1.066671e-07,-1.065608e-07,0.000000e+00,0.000000e+00
3,-2.074515e-07,-1.066722e-07,-1.038470e-07,,-1.037284e-07,-1.066534e-07,-1.038980e-07,-1.098500e-07,-2.257973e-07,-2.073591e-07,...,-2.133509e-07,-1.036434e-07,-1.068309e-07,-2.259268e-07,-2.072967e-07,-2.484009e-07,-2.077122e-07,-2.075106e-07,-1.098618e-07,-1.065142e-07
4,-1.064049e-07,0.000000e+00,0.000000e+00,-1.037284e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.034868e-07,-1.063563e-07,...,-1.035678e-07,0.000000e+00,0.000000e+00,-1.035412e-07,-1.093719e-07,-1.036968e-07,-1.065421e-07,-1.064360e-07,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.073882e-07,-1.066388e-07,-1.038153e-07,-2.484009e-07,-1.036968e-07,-1.066200e-07,-1.038662e-07,-1.098145e-07,-2.257224e-07,-2.072959e-07,...,-2.132840e-07,-1.036118e-07,-1.067974e-07,-2.258518e-07,-2.072336e-07,,-2.076488e-07,-2.074473e-07,-1.098263e-07,-1.064809e-07
96,-2.192004e-07,-1.036920e-07,-1.066671e-07,-2.077122e-07,-1.065421e-07,-1.036743e-07,-1.097925e-07,-1.037894e-07,-2.072277e-07,-2.190972e-07,...,-2.073902e-07,-1.095083e-07,-1.038420e-07,-2.073368e-07,-2.129152e-07,-2.076488e-07,,-2.257546e-07,-1.037999e-07,-1.035427e-07
97,-2.189760e-07,-1.035916e-07,-1.065608e-07,-2.075106e-07,-1.064360e-07,-1.035739e-07,-1.096799e-07,-1.036887e-07,-2.070271e-07,-2.188730e-07,...,-2.071893e-07,-1.093962e-07,-1.037412e-07,-2.071359e-07,-2.127034e-07,-2.074473e-07,-2.257546e-07,,-1.036992e-07,-1.034426e-07
98,-1.036697e-07,0.000000e+00,0.000000e+00,-1.098618e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.095908e-07,-1.036235e-07,...,-1.066162e-07,0.000000e+00,0.000000e+00,-1.096518e-07,-1.035924e-07,-1.098263e-07,-1.037999e-07,-1.036992e-07,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.130464758351772e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.130464758351772e-07)
    np.nanmedian(norm_err)=np.float64(-1.0639789140663147e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0639789140663147e-07)
    
