In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-08-18T21:37:02.340029+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
numpy                             : 2.1.2
hstrat                            : 1.20.10
pandas                            : 2.2.3

Watermark: 2.4.3



In [4]:
# Subdirectory label for this notebook's outputs; presumably consumed by
# teeplot when saving figures, but no use is visible in these cells
# -- TODO confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression: echoes the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _resolve_dstream_algo(dstream_algo: str):
    """Resolve a dotted ``"dstream.<name>"`` path on the ``dstream`` package."""
    root, *attr_path = str(dstream_algo).strip().split(".")
    if (
        root != "dstream"
        or not attr_path
        or not all(part.isidentifier() for part in attr_path)
    ):
        raise ValueError(
            "dstream_algo must be a dotted name like 'dstream.tilted_algo', "
            f"got {dstream_algo!r}",
        )
    resolved = dstream
    for part in attr_path:
        # plain attribute walk; an unknown name raises AttributeError
        resolved = getattr(resolved, part)
    return resolved


def _unique_value(frame: pd.DataFrame, column: str):
    """Return the single value shared by every row of ``frame[column]``."""
    distinct = frame[column].unique()
    if len(distinct) != 1:
        raise ValueError(
            f"expected exactly one distinct {column!r} value, "
            f"found {len(distinct)}",
        )
    return distinct.item()


def _pairwise_distance_frame(tree_df) -> pd.DataFrame:
    """Build the taxon-by-taxon distance table of a tree, sorted by label."""
    table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE: `_data` is a private dendropy attribute holding the nested mapping
    return pd.DataFrame(table._data).sort_index(axis=0).sort_index(axis=1)


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_sample: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare trie and searchtable reconstructions on shuffled genomes.

    A subsample of genomes is deserialized into surfaces, the stored content
    of each surface is shuffled in place, and a tree is reconstructed twice:
    with `hstrat.build_tree_trie` ("naive") and with
    `hstrat.build_tree_searchtable` ("shortcut"). The pairwise distance
    matrices of both trees are then compared elementwise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`. Every `dstream_*`
        column must hold one distinct value among the sampled rows. The
        caller's frame is not modified.
    dstream_algo : str
        Dotted name of the algorithm inside the `dstream` package, e.g.
        ``"dstream.tilted_algo"``. Resolved by attribute lookup, not `eval`.
    n_sample : int, default 100
        Maximum number of genomes to draw, keeping the problem at a size
        dendropy and the naive reconstruction can handle. Frames with fewer
        rows are used in full.
    seed : int, optional
        Seed for both the row sampling and the storage shuffling. When
        omitted, numpy's global random state is used, as before.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / mean(naive, shortcut)``.
        Entries where both distances are zero (e.g. the diagonal) are NaN.
        Rows and columns follow the sorted order of the distance-table
        labels; the taxon labels assigned here are strings, so "10" is
        expected to sort before "2".

    Raises
    ------
    ValueError
        If the frame is empty, `n_sample` is not positive, `dstream_algo` is
        not a dotted `dstream` name, a `dstream_*` column is not uniform, or
        the two reconstructions disagree on their taxon labels.
    """
    if n_sample < 1:
        raise ValueError(f"n_sample must be positive, got {n_sample}")
    if len(raw_genome_df) == 0:
        raise ValueError("raw_genome_df must contain at least one genome")

    algo = _resolve_dstream_algo(dstream_algo)

    if seed is None:
        # legacy behaviour: draw from numpy's global random state
        sample_state, shuffle = None, np.random.shuffle
    else:
        rng = np.random.default_rng(seed)
        sample_state, shuffle = rng, rng.shuffle

    # `sample` returns a new frame, so adding a column leaves the input as-is
    sampled_df = raw_genome_df.sample(
        min(n_sample, len(raw_genome_df)),
        random_state=sample_state,
    )
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    surface_kwargs = {"dstream_algo": algo}
    for column in (
        "dstream_S",
        "dstream_storage_bitoffset",
        "dstream_storage_bitwidth",
        "dstream_T_bitoffset",
        "dstream_T_bitwidth",
    ):
        surface_kwargs[column] = _unique_value(sampled_df, column)

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # the subtraction below is positional, so the labels must line up
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions have different taxon labels",
        )

    naive = naive_dist.to_numpy()
    shortcut = shortcut_dist.to_numpy()
    # 0/0 (both distances zero) is expected and deliberately yields NaN
    with np.errstate(invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Run the reconstruction comparison once per dstream algorithm on this
# data set, showing the error matrix and its summary statistics each time.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # the continuation lines of the literal keep their original four-space
    # indent so that the printed text stays exactly the same
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10251.76it/s]
100%|██████████| 100/100 [00:00<00:00, 400.95it/s]
6142it [00:00, 609016.91it/s]
100%|██████████| 100/100 [00:00<00:00, 166837.87it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.228573,0.000000,0.000000,0.00000,0.361531,0.484510,0.000000,...,-0.135576,-0.085156,0.000000,0.000000,0.000000,-0.281772,0.000000,0.000000,0.000000,0.000000
1,0.000000,,0.000000,0.000000,0.830448,-0.631763,0.00000,-0.554081,0.000000,0.236165,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.131715,0.194642,0.066004,0.129921,0.265766
2,0.000000,0.000000,,0.000000,0.361659,-0.415865,0.00000,-0.431338,0.000000,0.178445,...,0.000000,0.000000,0.000000,0.000000,-0.336314,0.478663,0.153674,0.042793,0.084704,0.194843
3,0.228573,0.000000,0.000000,,0.000000,0.000000,0.00000,0.462155,0.000000,0.000000,...,0.000000,0.000000,0.214678,0.000000,0.000000,-0.890530,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.830448,0.361659,0.000000,,0.000000,0.00000,-0.043946,0.000000,-0.133257,...,0.000000,0.000000,0.000000,0.266930,0.268268,0.778700,-0.112877,0.040381,0.000000,-0.147125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.281772,1.131715,0.478663,-0.890530,0.778700,0.079464,-0.01349,0.000000,-0.965359,0.269024,...,-0.471471,-0.279582,-0.267540,0.339235,0.341127,,0.227547,0.105332,0.202773,0.297320
96,0.000000,0.194642,0.153674,0.000000,-0.112877,0.000000,0.00000,-0.047800,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.113676,0.114243,0.227547,,0.047400,0.000000,0.000000
97,0.000000,0.066004,0.042793,0.000000,0.040381,0.123442,0.00000,-0.043161,0.000000,0.063786,...,0.000000,0.000000,0.000000,0.026950,0.027142,0.105332,0.047400,,0.272756,0.077835
98,0.000000,0.129921,0.084704,0.000000,0.000000,0.000000,0.00000,-0.081961,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.053549,0.053927,0.202773,0.000000,0.272756,,0.000000


np.nanmean(norm_err)=np.float64(0.019784796118665234)
    np.nanmean(np.abs(norm_err))=np.float64(0.10856740007468817)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31937.14it/s]
100%|██████████| 100/100 [00:00<00:00, 401.74it/s]
5969it [00:00, 599186.29it/s]
100%|██████████| 100/100 [00:00<00:00, 162004.79it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.193603e-07,0.011097,-2.407101e-07,0.007222,-3.017372e-07,-3.925623e-07,9.089751e-03,-1.297377e-07,0.007993,...,0.009555,0.008218,-4.201085e-07,2.705563e-03,0.008242,-1.859911e-07,-3.684925e-07,-0.001553,0.010546,-2.903361e-07
1,-2.193603e-07,,0.033416,0.000000e+00,0.012774,-3.147897e-07,-6.085788e-07,1.743810e-02,0.000000e+00,0.015405,...,0.022490,0.016262,-7.638751e-07,6.292996e-03,0.016358,0.000000e+00,-5.060835e-07,-0.004487,0.028873,-2.909507e-07
2,1.109664e-02,3.341647e-02,,7.512013e-03,0.000000,-2.563467e-02,1.205206e-02,1.374994e-02,1.763994e-03,0.000000,...,0.000000,0.000000,4.827264e-03,8.216105e-03,0.000000,1.058881e-02,-1.047041e-02,0.000000,0.000000,2.141598e-03
3,-2.407101e-07,0.000000e+00,0.007512,,0.002120,-3.957843e-07,-9.906282e-07,0.000000e+00,0.000000e+00,0.002645,...,0.004257,0.002824,-1.639703e-06,-2.537727e-02,0.002844,0.000000e+00,-7.542260e-07,-0.021053,0.006021,-3.609420e-07
4,7.221501e-03,1.277425e-02,0.000000,2.120351e-03,,-1.433033e-02,5.106958e-03,8.258681e-03,1.104490e-03,0.000000,...,0.000000,0.000000,1.832515e-03,4.263482e-03,0.000000,4.824564e-03,-4.799940e-03,0.000000,0.000000,1.241492e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.859911e-07,0.000000e+00,0.010589,0.000000e+00,0.004825,-2.655873e-07,-4.616480e-07,-1.507710e-02,0.000000e+00,0.005679,...,0.007790,0.005949,-5.269730e-07,2.593097e-02,0.005979,,-3.899434e-07,-0.003393,0.009485,-2.484148e-07
96,-3.684925e-07,-5.060835e-07,-0.010470,-7.542260e-07,-0.004800,-5.309258e-07,-8.761919e-07,-2.257128e-07,-2.049454e-07,-0.005645,...,-0.007726,-0.005911,-1.035109e-06,-6.183124e-07,-0.005941,-3.899434e-07,,0.010104,-0.009390,-4.926039e-07
97,-1.552525e-03,-4.487174e-03,0.000000,-2.105263e-02,0.000000,5.305733e-02,-3.842340e-03,-1.913975e-03,-3.343451e-02,0.000000,...,0.000000,0.000000,-1.385665e-02,-7.989640e-03,0.000000,-3.393307e-03,1.010355e-02,,0.000000,-4.932854e-02
98,1.054558e-02,2.887307e-02,0.000000,6.021086e-03,0.000000,-2.380597e-02,1.064266e-02,1.291380e-02,1.667061e-03,0.000000,...,0.000000,0.000000,4.164522e-03,7.535781e-03,0.000000,9.485243e-03,-9.390166e-03,0.000000,,2.000368e-03


np.nanmean(norm_err)=np.float64(0.0024535560732618115)
    np.nanmean(np.abs(norm_err))=np.float64(0.008756619693219064)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.001775016230034437)
    


In [7]:
# Same comparison as for the previous data set, repeated for each dstream
# algorithm: show the error matrix, then its summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # the continuation lines of the literal keep their original four-space
    # indent so that the printed text stays exactly the same
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 33770.56it/s]
100%|██████████| 100/100 [00:00<00:00, 418.81it/s]
5947it [00:00, 606219.95it/s]
100%|██████████| 100/100 [00:00<00:00, 329740.88it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33950.98it/s]
100%|██████████| 100/100 [00:00<00:00, 1036.95it/s]
5952it [00:00, 635228.94it/s]
100%|██████████| 100/100 [00:00<00:00, 279433.98it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.065426e-07,-1.067021e-07,-1.035879e-07,-1.036244e-07,-1.131208e-07,-1.035982e-07,0.000000e+00,-1.038193e-07,0.000000e+00,...,0.000000e+00,-1.037213e-07,0.000000e+00,-1.036723e-07,0.000000e+00,-1.035868e-07,0.000000e+00,-1.127517e-07,0.000000e+00,0.000000e+00
1,-1.065426e-07,,-2.326302e-07,-2.069212e-07,-2.069938e-07,-2.133042e-07,-2.069417e-07,-1.064502e-07,-2.073828e-07,-1.064363e-07,...,-1.035259e-07,-2.071872e-07,-1.064921e-07,-2.070895e-07,-1.095628e-07,-2.069188e-07,-1.066405e-07,-2.126480e-07,-1.036958e-07,-1.036133e-07
2,-1.067021e-07,-2.326302e-07,,-2.072220e-07,-2.072949e-07,-2.136239e-07,-2.072426e-07,-1.066095e-07,-2.076850e-07,-1.065955e-07,...,-1.036765e-07,-2.074888e-07,-1.066515e-07,-2.073908e-07,-1.097315e-07,-2.072196e-07,-1.068003e-07,-2.129657e-07,-1.038469e-07,-1.037642e-07
3,-1.035879e-07,-2.069212e-07,-2.072220e-07,,-2.251864e-07,-2.073829e-07,-2.319696e-07,-1.035006e-07,-2.256469e-07,-1.034875e-07,...,-1.093976e-07,-2.128384e-07,-1.035402e-07,-2.127353e-07,-1.035514e-07,-2.125552e-07,-1.036804e-07,-2.067625e-07,-1.065270e-07,-1.236974e-07
4,-1.036244e-07,-2.069938e-07,-2.072949e-07,-2.251864e-07,,-2.074558e-07,-2.252107e-07,-1.035369e-07,-2.649254e-07,-1.035238e-07,...,-1.094382e-07,-2.129152e-07,-1.035766e-07,-2.128121e-07,-1.035877e-07,-2.126318e-07,-1.037169e-07,-2.068351e-07,-1.065655e-07,-1.127741e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.035868e-07,-2.069188e-07,-2.072196e-07,-2.125552e-07,-2.126318e-07,-2.073805e-07,-2.125768e-07,-1.034994e-07,-2.130423e-07,-1.034863e-07,...,-1.063465e-07,-2.395687e-07,-1.035390e-07,-2.252969e-07,-1.035502e-07,,-1.036793e-07,-2.067602e-07,-1.095860e-07,-1.064388e-07
96,0.000000e+00,-1.066405e-07,-1.068003e-07,-1.036804e-07,-1.037169e-07,-1.099670e-07,-1.036907e-07,0.000000e+00,-1.039122e-07,0.000000e+00,...,0.000000e+00,-1.038140e-07,0.000000e+00,-1.037650e-07,0.000000e+00,-1.036793e-07,,-1.096182e-07,0.000000e+00,0.000000e+00
97,-1.127517e-07,-2.126480e-07,-2.129657e-07,-2.067625e-07,-2.068351e-07,-2.326321e-07,-2.067830e-07,-1.094172e-07,-2.072235e-07,-1.094025e-07,...,-1.034465e-07,-2.070281e-07,-1.094614e-07,-2.069306e-07,-1.064199e-07,-2.067602e-07,-1.096182e-07,,-1.036161e-07,-1.035337e-07
98,0.000000e+00,-1.036958e-07,-1.038469e-07,-1.065270e-07,-1.065655e-07,-1.039277e-07,-1.065379e-07,0.000000e+00,-1.067718e-07,0.000000e+00,...,0.000000e+00,-1.097365e-07,0.000000e+00,-1.096818e-07,0.000000e+00,-1.095860e-07,0.000000e+00,-1.036161e-07,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.0240408432968257e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0240408432968257e-07)
    np.nanmedian(norm_err)=np.float64(-1.0384471020511613e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0384471020511613e-07)
    
