In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-09-15T15:25:44.447143+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
pandas                            : 2.2.3
numpy                             : 2.1.2
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Subdirectory name for this notebook's outputs; presumably picked up by
# teeplot when saving figures (no teeplot call is visible here) -- TODO confirm
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression: echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _calc_pairwise_dist_df(phylo_df: pd.DataFrame) -> pd.DataFrame:
    """Tabulate pairwise tip distances for one reconstructed phylogeny.

    Wraps `phylo_df` in `apc.RosettaTree`, converts it to dendropy, and
    returns the dendropy phylogenetic distance matrix as a DataFrame whose
    rows and columns are both sorted by label. Sorting lets matrices built
    from different reconstructions of the same taxa be compared
    element-wise by position.
    """
    dist_table = (
        apc.RosettaTree(phylo_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy table
    # object; presumably a nested mapping keyed by taxon label -- confirm
    # when upgrading dendropy.
    return (
        pd.DataFrame(dist_table._data).sort_index(axis=0).sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into hstrat surfaces, the
    surfaces' storage is shuffled, and a phylogeny is reconstructed twice:
    once with `hstrat.build_tree_trie` and once with
    `hstrat.build_tree_searchtable`. The pairwise distance matrices of the
    two trees are then compared entry by entry.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column and the
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth` columns, each holding
        a single distinct value across the sampled rows. Not modified.
    dstream_algo : str
        Python expression naming the dstream algorithm, evaluated with only
        `dstream` bound (e.g. `"dstream.tilted_algo"`).
    sample_size : int, default 100
        Number of genomes to sample (a size dendropy/naive can handle);
        capped at the number of rows available.
    seed : int, optional
        Seed for both the row sampling and the storage shuffle. If None
        (the default), numpy's global random state is used.

    Returns
    -------
    np.ndarray
        Square matrix of `(naive - shortcut) / ((naive + shortcut) / 2)`,
        with rows and columns ordered by sorted taxon label. Entries where
        both distances are zero (e.g. the diagonal) are NaN.

    Raises
    ------
    ValueError
        If no rows can be sampled, or if any of the `dstream_*`
        configuration columns holds more than one distinct value.
    """
    num_samples = min(sample_size, len(raw_genome_df))
    if num_samples < 1:
        raise ValueError("need at least one genome to sample")

    # With no seed, fall back on numpy's global random state for both the
    # sampling and the shuffle below.
    rng = None if seed is None else np.random.RandomState(seed)
    shuffle = np.random.shuffle if rng is None else rng.shuffle

    # `sample` returns a new frame, so the caller's frame stays untouched.
    genome_df = raw_genome_df.sample(num_samples, random_state=rng)
    genome_df["taxon_label"] = np.arange(len(genome_df)).astype(str)

    # NOTE: `eval` executes arbitrary Python. Only pass trusted, hard-coded
    # expressions here -- never text taken from external data.
    surface_kwargs = {
        "dstream_algo": eval(dstream_algo, {"dstream": dstream}),
    }
    for field in (
        "dstream_S",
        "dstream_storage_bitoffset",
        "dstream_storage_bitwidth",
        "dstream_T_bitoffset",
        "dstream_T_bitwidth",
    ):
        # `.item()` raises ValueError unless exactly one distinct value exists,
        # guarding against mixing differently-configured genomes.
        surface_kwargs[field] = genome_df[field].unique().item()

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _calc_pairwise_dist_df(naive_df).values
    shortcut_dist = _calc_pairwise_dist_df(shortcut_df).values

    # Where both distances are zero (always true on the diagonal) this is
    # 0 / 0; the resulting NaN is intentional, so silence the RuntimeWarning
    # numpy would otherwise emit on every call.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
# Load the genome records for this comparison (remote parquet file).
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the identical analysis once per dstream algorithm: display the full
# normalized-error matrix, then print NaN-ignoring mean/median summaries of
# the signed and absolute error.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # The continuation lines' leading spaces belong to the string literal.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10006.45it/s]
100%|██████████| 100/100 [00:00<00:00, 404.63it/s]
6117it [00:00, 638286.34it/s]
100%|██████████| 100/100 [00:00<00:00, 255127.98it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.031681,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.037460,0.000000,0.000000
1,0.00000,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.118816,0.647299,...,0.000000,0.000000,-0.130858,0.000000,0.000000,0.000000,0.00000,0.581702,0.000000,0.000000
2,0.00000,0.000000,,0.000000,0.000000,-0.160789,0.000000,0.000000,0.000000,-0.365133,...,0.000000,0.145861,0.000000,0.000000,0.000000,0.000000,0.00000,-0.349359,0.000000,-0.179674
3,0.00000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,-0.031338,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,-0.036982,0.000000,0.000000
4,0.00000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.000000,0.072619,0.298568,...,0.000000,0.000000,-0.096909,0.000000,0.000000,0.000000,0.00000,0.330612,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.062777,0.274940,...,0.000000,0.000000,-0.087731,0.000000,0.000000,,0.00000,0.301884,0.000000,0.000000
96,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.316326,...,0.000000,0.097202,0.000000,0.000000,0.000000,0.000000,,-1.138560,0.000000,0.000000
97,0.03746,0.581702,-0.349359,-0.036982,0.330612,-0.289702,-0.259623,-0.285285,0.105718,0.142596,...,-0.371451,-0.154822,0.223991,0.271262,-0.311822,0.301884,-1.13856,,0.309205,-0.307627
98,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.065184,0.280998,...,0.000000,0.000000,-0.090055,0.000000,0.000000,0.000000,0.00000,0.309205,,0.000000


np.nanmean(norm_err)=np.float64(0.003119050522863299)
    np.nanmean(np.abs(norm_err))=np.float64(0.1248436661300111)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33104.21it/s]
100%|██████████| 100/100 [00:00<00:00, 435.62it/s]
5983it [00:00, 601686.07it/s]
100%|██████████| 100/100 [00:00<00:00, 292898.32it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.022532,0.000000e+00,-1.060078e-02,-1.674825e-07,0.000000e+00,0.000000e+00,-0.005508,0.000000e+00,-0.021257,...,-1.631389e-07,0.000000e+00,0.064413,0.000000e+00,0.000000e+00,1.565064e-03,-1.162931e-07,0.000000e+00,-1.424953e-07,0.000000e+00
1,-2.253249e-02,,-7.923369e-02,-1.206538e-02,1.569839e-03,1.395321e-02,9.554572e-04,0.007399,0.000000e+00,-0.134320,...,1.688818e-02,1.803469e-03,0.000000,-3.674900e-02,1.250364e-03,3.696235e-03,7.331604e-03,-2.693551e-02,-2.709921e-02,7.813720e-03
2,0.000000e+00,-0.079234,,-3.432215e-02,-5.859719e-07,0.000000e+00,0.000000e+00,-0.012910,0.000000e+00,-0.070467,...,-5.360388e-07,0.000000e+00,0.048165,0.000000e+00,0.000000e+00,9.099736e-03,-2.306936e-07,0.000000e+00,-2.990976e-07,0.000000e+00
3,-1.060078e-02,-0.012065,-3.432215e-02,,-2.719833e-03,-1.898701e-03,-1.275409e-03,-0.011065,1.045857e-01,-0.011044,...,-2.581390e-03,-3.521969e-03,-0.001531,-2.228475e-02,-1.875474e-03,-4.665645e-07,-1.441615e-03,-1.361771e-02,-1.373868e-02,-1.577531e-03
4,-1.674825e-07,0.001570,-5.859719e-07,-2.719833e-03,,-2.494004e-07,-1.652444e-07,0.001941,-5.704036e-07,0.007182,...,-6.927115e-07,8.077277e-03,-0.008167,-3.655088e-07,1.293454e-01,2.586590e-02,-3.733835e-07,-2.172079e-07,-4.384362e-07,-2.051718e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.565064e-03,0.003696,9.099736e-03,-4.665645e-07,2.586590e-02,5.090066e-03,1.247429e-02,0.004782,-5.423898e-02,0.013419,...,7.848925e-03,5.398365e-02,0.001912,4.208688e-03,1.518079e-02,,3.579758e-03,2.130552e-03,2.154329e-03,4.005685e-03
96,-1.162931e-07,0.007332,-2.306936e-07,-1.441615e-03,-3.733835e-07,-1.517761e-07,-1.147763e-07,0.008405,-2.282411e-07,0.000981,...,-3.658821e-07,-2.136274e-07,-0.001070,-1.864245e-07,-1.494683e-07,3.579758e-03,,-1.382730e-07,-2.781697e-07,-1.457051e-07
97,0.000000e+00,-0.026936,0.000000e+00,-1.361771e-02,-2.172079e-07,0.000000e+00,0.000000e+00,-0.006748,0.000000e+00,-0.025133,...,-2.099581e-07,0.000000e+00,0.067468,0.000000e+00,0.000000e+00,2.130552e-03,-1.382730e-07,,-1.672642e-07,0.000000e+00
98,-1.424953e-07,-0.027099,-2.990976e-07,-1.373868e-02,-4.384362e-07,-1.711435e-07,-1.262971e-07,-0.006795,-2.794557e-07,-0.025276,...,-4.236716e-07,-2.573148e-07,0.078159,-2.432739e-07,-1.696173e-07,2.154329e-03,-2.781697e-07,-1.672642e-07,,-1.490891e-07


np.nanmean(norm_err)=np.float64(0.0004346011748598278)
    np.nanmean(np.abs(norm_err))=np.float64(0.011195507483071326)
    np.nanmedian(norm_err)=np.float64(-1.444740246404429e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0011118845999666506)
    


In [7]:
# Load the genome records for this comparison (remote parquet file).
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the identical analysis once per dstream algorithm: display the full
# normalized-error matrix, then print NaN-ignoring mean/median summaries of
# the signed and absolute error.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # The continuation lines' leading spaces belong to the string literal.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 32953.36it/s]
100%|██████████| 100/100 [00:00<00:00, 457.69it/s]
5929it [00:00, 655958.12it/s]
100%|██████████| 100/100 [00:00<00:00, 435093.78it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31345.22it/s]
100%|██████████| 100/100 [00:00<00:00, 1020.07it/s]
5948it [00:00, 47307.17it/s]
100%|██████████| 100/100 [00:00<00:00, 406819.01it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.036348e-07,-2.075365e-07,-2.130138e-07,-2.070851e-07,-1.035832e-07,-2.072244e-07,-1.036357e-07,-2.075407e-07,-1.036182e-07,...,-2.072799e-07,-1.098434e-07,-2.072678e-07,-2.256570e-07,-2.131785e-07,-2.255530e-07,-1.064161e-07,-2.072862e-07,-1.036335e-07,-1.130336e-07
1,-1.036348e-07,,-1.096489e-07,-1.035370e-07,-1.063472e-07,0.000000e+00,-1.064207e-07,0.000000e+00,-1.065875e-07,0.000000e+00,...,-1.064499e-07,0.000000e+00,-1.064435e-07,-1.035559e-07,-1.036147e-07,-1.035121e-07,0.000000e+00,-1.064532e-07,0.000000e+00,0.000000e+00
2,-2.075365e-07,-1.096489e-07,,-2.073404e-07,-2.129754e-07,-1.095912e-07,-2.131228e-07,-1.065863e-07,-2.134574e-07,-1.065677e-07,...,-2.131815e-07,-1.039185e-07,-2.131687e-07,-2.073783e-07,-2.074963e-07,-2.072904e-07,-1.035842e-07,-2.131881e-07,-1.096475e-07,-1.038623e-07
3,-2.130138e-07,-1.035370e-07,-2.073404e-07,,-2.068899e-07,-1.034855e-07,-2.070289e-07,-1.035379e-07,-2.073446e-07,-1.035204e-07,...,-2.070843e-07,-1.066652e-07,-2.070722e-07,-2.128472e-07,-2.255647e-07,-2.127547e-07,-1.160126e-07,-2.070906e-07,-1.035357e-07,-1.066061e-07
4,-2.070851e-07,-1.063472e-07,-2.129754e-07,-2.068899e-07,,-1.062929e-07,-2.252004e-07,-1.093980e-07,-2.190961e-07,-1.093785e-07,...,-2.188055e-07,-1.036921e-07,-2.642623e-07,-2.069276e-07,-2.070451e-07,-2.068401e-07,-1.033593e-07,-2.188125e-07,-1.063459e-07,-1.036363e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.255530e-07,-1.035121e-07,-2.072904e-07,-2.127547e-07,-2.068401e-07,-1.034606e-07,-2.069792e-07,-1.035130e-07,-2.072947e-07,-1.034955e-07,...,-2.070345e-07,-1.097056e-07,-2.070224e-07,-2.472795e-07,-2.129189e-07,,-1.062868e-07,-2.070407e-07,-1.035108e-07,-1.163302e-07
96,-1.064161e-07,0.000000e+00,-1.035842e-07,-1.160126e-07,-1.033593e-07,0.000000e+00,-1.034287e-07,0.000000e+00,-1.035863e-07,0.000000e+00,...,-1.034563e-07,0.000000e+00,-1.034503e-07,-1.063330e-07,-1.126805e-07,-1.062868e-07,,-1.034595e-07,0.000000e+00,0.000000e+00
97,-2.072862e-07,-1.064532e-07,-2.131881e-07,-2.070906e-07,-2.188125e-07,-1.063988e-07,-2.189680e-07,-1.476610e-07,-2.258126e-07,-1.161587e-07,...,-2.255039e-07,-1.037930e-07,-2.190165e-07,-2.071284e-07,-2.072461e-07,-2.070407e-07,-1.034595e-07,,-1.064519e-07,-1.037370e-07
98,-1.036335e-07,0.000000e+00,-1.096475e-07,-1.035357e-07,-1.063459e-07,0.000000e+00,-1.064194e-07,0.000000e+00,-1.065862e-07,0.000000e+00,...,-1.064486e-07,0.000000e+00,-1.064422e-07,-1.035547e-07,-1.036135e-07,-1.035108e-07,0.000000e+00,-1.064519e-07,,0.000000e+00


np.nanmean(norm_err)=np.float64(-8.969082927514254e-08)
    np.nanmean(np.abs(norm_err))=np.float64(8.969082927514254e-08)
    np.nanmedian(norm_err)=np.float64(-1.0371987708987237e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0371987708987237e-07)
    
