In [1]:
# Load the `watermark` IPython extension so the `%watermark` magic used in a
# later cell is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance for this run: timestamp, interpreter and machine details,
# and the versions of the imported packages (see the cell output).
%watermark -diwmuv -iv


Last updated: 2025-07-20T00:28:27.194674+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1030-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

numpy                             : 2.1.2
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Subdirectory label for this notebook; presumably consumed by teeplot when
# saving figures -- it is not referenced in the visible cells (TODO confirm).
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression echoes the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(phylo_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance matrix of a reconstructed
    tree as a DataFrame with rows and columns sorted by label.

    The tree is converted to dendropy via `apc.RosettaTree`; sorting both
    axes gives two trees over the same taxa a common ordering.
    """
    return (
        pd.DataFrame(
            apc.RosettaTree(phylo_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data  # NOTE(review): private dendropy attribute; may change
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled, and a phylogeny is reconstructed twice: with
    `hstrat.build_tree_trie` and with `hstrat.build_tree_searchtable`. The
    pairwise tip-to-tip distances of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column and the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`, each holding a single
        unique value. The frame itself is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with only
        `dstream` bound, e.g. `"dstream.tilted_algo"`. This argument is used
        regardless of any `dstream_algo` column in `raw_genome_df`.

    Returns
    -------
    np.ndarray
        Square matrix of `(naive - shortcut) / mean(naive, shortcut)` for
        every taxon pair. Entries where both distances are zero (including
        the diagonal) are NaN.

    Raises
    ------
    ValueError
        If a surface-configuration column holds more than one distinct value
        or if the two reconstructions do not cover the same taxon labels.
    """
    # Cap the population at a size dendropy/naive can handle; frames with
    # fewer rows are used in full rather than making `sample` raise.
    raw_genome_df = raw_genome_df.sample(min(100, len(raw_genome_df)))
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)

    # NOTE(review): `eval` executes arbitrary expressions. Only `dstream` is
    # bound here, but builtins remain reachable, so pass trusted, hard-coded
    # algorithm names only.
    algo = eval(dstream_algo, {"dstream": dstream})

    # `.unique().item()` doubles as a check that each setting is homogeneous
    # across the sampled genomes.
    surface_config = {
        field: raw_genome_df[field].unique().item()
        for field in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            dstream_algo=algo,
            **surface_config,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        # Shuffles each surface's storage in place via private attributes.
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The arithmetic below works on bare arrays, so make sure both matrices
    # are laid out over the same labels before the labels are dropped.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions cover different taxon labels"
        )

    naive = naive_dist.values
    shortcut = shortcut_dist.values
    # 0/0 on the diagonal is expected and yields NaN; silence the resulting
    # RuntimeWarning instead of emitting it on every call.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same consistency check once per dstream algorithm instead of
# repeating the run/display/print stanza for each one.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # The f-string continuation lines stay at a 4-space indent on purpose so
    # the printed summary text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10659.24it/s]
100%|██████████| 100/100 [00:00<00:00, 405.76it/s]
6112it [00:00, 657238.46it/s]
100%|██████████| 100/100 [00:00<00:00, 233926.60it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.059259,-0.451842,-0.053032,-0.235016,-0.035203,-0.031603,-0.206000,-0.054352,0.000000,...,-0.132611,-0.121289,-0.234600,-0.151353,-0.109479,0.244959,-0.125844,-0.069518,-0.023262,-0.092210
1,-0.059259,,-0.092625,0.000000,-0.105795,-0.130612,0.000000,-0.062454,-0.376365,-0.748596,...,-0.041530,-0.038178,-0.070251,0.118317,-0.034645,-0.475116,-0.039531,-0.022406,-0.364958,-0.029411
2,-0.451842,-0.092625,,-0.078262,0.000000,-0.044787,-0.039118,0.000000,-0.081172,0.140407,...,0.000000,0.000000,0.000000,-0.196522,0.000000,0.000000,0.000000,0.000000,-0.027093,0.000000
3,-0.053032,0.000000,-0.078262,,-0.087461,-0.115661,0.000000,-0.055577,-0.274220,-0.685133,...,-0.038372,-0.035493,-0.061667,0.109750,-0.032420,-0.594271,-0.036659,-0.021453,-0.486921,-0.027791
4,-0.235016,-0.105795,0.000000,-0.087461,,-0.047655,-0.041288,0.000000,-0.091111,0.097448,...,0.000000,0.000000,0.000000,-0.153745,0.000000,0.000000,0.000000,0.000000,-0.028117,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.244959,-0.475116,0.000000,-0.594271,0.000000,-0.303048,-0.073475,0.000000,-0.386761,-0.327833,...,0.000000,0.000000,0.000000,-0.019125,0.000000,,0.000000,0.000000,-0.346233,0.000000
96,-0.125844,-0.039531,0.000000,-0.036659,0.000000,-0.027153,-0.024960,0.000000,-0.037285,0.502467,...,0.000000,0.000000,0.000000,-0.243610,0.000000,0.000000,,-0.110902,-0.019451,0.000000
97,-0.069518,-0.022406,0.000000,-0.021453,0.000000,-0.017805,-0.016835,0.000000,-0.021666,0.625461,...,-0.114773,-0.108216,0.000000,-0.153382,-0.100930,0.000000,-0.110902,,-0.014135,-0.089358
98,-0.023262,-0.364958,-0.027093,-0.486921,-0.028117,-0.220262,0.077469,-0.023738,-0.282470,-0.337102,...,-0.019923,-0.019118,-0.024784,0.827912,-0.018189,-0.346233,-0.019451,-0.014135,,-0.016635


np.nanmean(norm_err)=np.float64(-0.06159562383314079)
    np.nanmean(np.abs(norm_err))=np.float64(0.14137361495948653)
    np.nanmedian(norm_err)=np.float64(-0.022336703545435863)
    np.nanmedian(np.abs(norm_err))=np.float64(0.03766482253138131)
    


100%|██████████| 100/100 [00:00<00:00, 35772.32it/s]
100%|██████████| 100/100 [00:00<00:00, 446.14it/s]
5991it [00:00, 564358.79it/s]
100%|██████████| 100/100 [00:00<00:00, 183317.48it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.843403e-03,-2.268334e-03,-2.903633e-03,2.714757e-03,-2.495663e-01,-2.716972e-03,-3.230595e-03,6.521440e-02,1.794740e-02,...,-2.372418e-03,-1.069418e-02,-9.598364e-03,-2.960105e-03,1.713166e-02,-3.045216e-03,-3.429078e-03,7.332080e-02,0.000000e+00,0.000000e+00
1,-0.002843,,-2.834089e-03,-2.745649e-07,-4.454550e-07,-6.337018e-03,2.232179e-02,-3.251343e-07,-2.578880e-03,-2.675623e-02,...,-2.999851e-03,-5.202055e-03,2.176770e-02,-4.016309e-03,-2.988233e-02,-5.913267e-07,-4.070358e-07,-8.394215e-02,2.902161e-02,1.903415e-02
2,-0.002268,-2.834089e-03,,-2.893916e-03,1.093545e-02,-4.921184e-07,-2.708474e-03,-3.218549e-03,-1.413547e-01,-1.546381e-07,...,-3.244784e-07,-2.162875e-07,-4.764803e-03,-1.880934e-07,-4.130669e-07,-3.034522e-03,-3.415497e-03,-1.388843e-07,-2.624712e-07,-1.180878e-07
3,-0.002904,-2.745649e-07,-2.893916e-03,,-2.287870e-07,-6.644681e-03,2.294148e-02,0.000000e+00,-2.628310e-03,-2.742235e-02,...,-3.066976e-03,-5.407546e-03,2.227053e-02,-4.137645e-03,-3.100421e-02,-3.260829e-07,0.000000e+00,-3.285033e-01,3.063873e-02,1.936911e-02
4,0.002715,-4.454550e-07,1.093545e-02,-2.287870e-07,,1.154422e-02,-4.205410e-07,-2.628532e-07,2.472285e-03,3.112427e-03,...,1.154732e-02,9.637412e-03,-1.781767e-07,1.523726e-02,4.518351e-03,-4.865024e-07,-2.846393e-07,2.729035e-03,-6.377966e-03,-1.386047e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.003045,-5.913267e-07,-3.034522e-03,-3.260829e-07,-4.865024e-07,-7.435107e-03,2.442258e-02,-3.800399e-07,-2.743791e-03,-2.900109e-02,...,-3.225349e-03,-5.919807e-03,2.345402e-02,-4.431102e-03,-3.377694e-02,,-4.156876e-07,-1.314027e-01,3.488565e-02,2.014339e-02
96,-0.003429,-4.070358e-07,-3.415497e-03,0.000000e+00,-2.846393e-07,-1.023259e-02,2.868246e-02,0.000000e+00,-3.051540e-03,-3.342627e-02,...,-3.659187e-03,-7.566918e-03,2.672068e-02,-5.293504e-03,-4.252158e-02,-4.156876e-07,,-1.022117e-01,5.121595e-02,2.218349e-02
97,0.073321,-8.394215e-02,-1.388843e-07,-3.285033e-01,2.729035e-03,-2.481395e-07,-5.492118e-03,-1.104560e-01,1.043032e-01,0.000000e+00,...,-1.452785e-07,0.000000e+00,-2.387424e-03,0.000000e+00,-2.123388e-07,-1.314027e-01,-1.022117e-01,,0.000000e+00,0.000000e+00
98,0.000000,2.902161e-02,-2.624712e-07,3.063873e-02,-6.377966e-03,-1.563246e-06,2.595785e-02,4.170615e-02,-2.284409e-07,0.000000e+00,...,-2.862840e-07,0.000000e+00,1.937201e-02,0.000000e+00,-7.137359e-07,3.488565e-02,5.121595e-02,0.000000e+00,,1.313711e-02


np.nanmean(norm_err)=np.float64(-0.0004973750193175946)
    np.nanmean(np.abs(norm_err))=np.float64(0.012145267867655398)
    np.nanmedian(norm_err)=np.float64(-2.0674388537067818e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0028279107380754743)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same consistency check once per dstream algorithm instead of
# repeating the run/display/print stanza for each one.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # The f-string continuation lines stay at a 4-space indent on purpose so
    # the printed summary text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 26370.98it/s]
100%|██████████| 100/100 [00:00<00:00, 460.02it/s]
5962it [00:00, 631555.51it/s]
100%|██████████| 100/100 [00:00<00:00, 399077.45it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 30133.66it/s]
100%|██████████| 100/100 [00:00<00:00, 1007.83it/s]
5924it [00:00, 594968.08it/s]
100%|██████████| 100/100 [00:00<00:00, 310459.22it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,-1.097063e-07,-1.038688e-07,-1.130363e-07,-1.038450e-07,0.000000e+00,0.000000e+00,-1.036406e-07,0.000000e+00,...,0.000000e+00,-1.037305e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.064968e-07,0.000000e+00,0.000000e+00,-1.037935e-07,0.000000e+00
1,0.000000e+00,,-1.036456e-07,-1.129933e-07,-1.037143e-07,-1.097160e-07,0.000000e+00,0.000000e+00,-1.094879e-07,0.000000e+00,...,0.000000e+00,-1.065279e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.035109e-07,0.000000e+00,0.000000e+00,-1.163477e-07,0.000000e+00
2,-1.097063e-07,-1.036456e-07,,-2.074971e-07,-2.192977e-07,-2.074495e-07,-1.201277e-07,-1.095411e-07,-2.070416e-07,-1.066699e-07,...,-1.065664e-07,-2.072211e-07,-1.066676e-07,-1.096850e-07,-1.064732e-07,-2.127407e-07,-1.063802e-07,-1.037480e-07,-2.073468e-07,-1.130159e-07
3,-1.038688e-07,-1.129933e-07,-2.074971e-07,,-2.076347e-07,-2.196627e-07,-1.039157e-07,-1.037207e-07,-2.192054e-07,-1.038977e-07,...,-1.037996e-07,-2.132733e-07,-1.038956e-07,-1.038497e-07,-1.037110e-07,-2.072270e-07,-1.036229e-07,-1.098574e-07,-2.260526e-07,-1.039206e-07
4,-1.130363e-07,-1.037143e-07,-2.192977e-07,-2.076347e-07,,-2.075871e-07,-1.098356e-07,-1.238539e-07,-2.071787e-07,-1.067426e-07,...,-1.066391e-07,-2.073584e-07,-1.067404e-07,-1.130137e-07,-1.065456e-07,-2.128854e-07,-1.064526e-07,-1.038168e-07,-2.074843e-07,-1.098410e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.064968e-07,-1.035109e-07,-2.127407e-07,-2.072270e-07,-2.128854e-07,-2.071796e-07,-1.065461e-07,-1.063411e-07,-2.067728e-07,-1.095874e-07,...,-1.094783e-07,-2.069518e-07,-1.095851e-07,-1.064767e-07,-1.093798e-07,,-1.159236e-07,-1.036130e-07,-2.070772e-07,-1.065512e-07
96,0.000000e+00,0.000000e+00,-1.063802e-07,-1.036229e-07,-1.064526e-07,-1.035991e-07,0.000000e+00,0.000000e+00,-1.033957e-07,0.000000e+00,...,0.000000e+00,-1.034852e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.159236e-07,,0.000000e+00,-1.035479e-07,0.000000e+00
97,0.000000e+00,0.000000e+00,-1.037480e-07,-1.098574e-07,-1.038168e-07,-1.130867e-07,0.000000e+00,0.000000e+00,-1.324320e-07,0.000000e+00,...,0.000000e+00,-1.066361e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036130e-07,0.000000e+00,,-1.097732e-07,0.000000e+00
98,-1.037935e-07,-1.163477e-07,-2.073468e-07,-2.260526e-07,-2.074843e-07,-2.194943e-07,-1.038403e-07,-1.036456e-07,-2.190378e-07,-1.038223e-07,...,-1.037244e-07,-2.131146e-07,-1.038202e-07,-1.037745e-07,-1.036360e-07,-2.070772e-07,-1.035479e-07,-1.097732e-07,,-1.038452e-07


np.nanmean(norm_err)=np.float64(-8.324080284770178e-08)
    np.nanmean(np.abs(norm_err))=np.float64(8.324080284770178e-08)
    np.nanmedian(norm_err)=np.float64(-1.0368928014096228e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0368928014096228e-07)
    
