In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-08-18T21:37:52.605921+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

hstrat                            : 1.20.10
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Name bound here and echoed as the cell's output; it is not referenced by any
# other cell visible in this notebook.
# NOTE(review): presumably intended as an output subdirectory for saved
# figures, but no plotting/saving code is visible here — confirm or remove.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _sole_value(column: pd.Series):
    """Return the one distinct value stored in ``column``.

    ``ndarray.item`` raises ``ValueError`` unless exactly one distinct value
    is present, so this doubles as a check that the column is homogeneous.
    """
    return column.unique().item()


def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Tabulate pairwise phylogenetic distances for a reconstructed tree.

    The tree is converted to dendropy through ``apc.RosettaTree`` and its
    phylogenetic distance matrix is loaded into a DataFrame whose rows and
    columns are both sorted by label, so that two trees built over the same
    taxon labels yield positionally comparable tables.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE: `_data` is a private attribute of the dendropy data table; it is
    # read here because it holds the nested label -> label -> distance mapping.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    random_state=None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is decoded into hstrat surfaces, each surface's
    storage is shuffled in place, and a tree is reconstructed from the same
    population with ``hstrat.build_tree_trie`` and with
    ``hstrat.build_tree_searchtable``. Pairwise phylogenetic distances from
    the two trees are then compared entry by entry.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns ``data_hex``, ``dstream_S``,
        ``dstream_storage_bitoffset``, ``dstream_storage_bitwidth``,
        ``dstream_T_bitoffset`` and ``dstream_T_bitwidth``; each ``dstream_*``
        column must hold a single distinct value among the sampled rows.
        The caller's frame is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, e.g. ``"dstream.tilted_algo"``.
        It is evaluated with only ``dstream`` in its global namespace.
    sample_size : int, default 100
        Number of genomes to sample (sized to what dendropy and the naive
        reconstruction can handle). If the frame has fewer rows, all rows are
        used instead of raising.
    random_state : int or None, default None
        Seed for the row sampling and the storage shuffle. With ``None`` the
        global NumPy random state is used, as before.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / ((naive + shortcut) / 2)``.
        Entries where both distances are zero (including the diagonal) are
        ``NaN``, so callers should aggregate with ``np.nanmean`` and friends.

    Raises
    ------
    ValueError
        If a ``dstream_*`` column is not homogeneous, or if the two distance
        tables do not share identical row/column labels.
    """
    if random_state is None:
        rng = np.random  # legacy global stream; keeps the unseeded behaviour
        sample_seed = None
    else:
        rng = np.random.RandomState(random_state)
        sample_seed = rng

    # `sample` returns a new frame, so the caller's data is left untouched.
    sampled_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=sample_seed,
    )
    # Plain list of positional labels: avoids handing hstrat a Series whose
    # index was shuffled by `sample`.
    taxon_labels = [str(i) for i in range(len(sampled_df))]

    surface_kwargs = dict(
        # SECURITY: `eval` runs an arbitrary expression. Globals are limited
        # to `dstream`, but only pass trusted, hard-coded algorithm names.
        dstream_algo=eval(dstream_algo, {"dstream": dstream}),
        dstream_S=_sole_value(sampled_df["dstream_S"]),
        dstream_storage_bitoffset=_sole_value(
            sampled_df["dstream_storage_bitoffset"]
        ),
        dstream_storage_bitwidth=_sole_value(
            sampled_df["dstream_storage_bitwidth"]
        ),
        dstream_T_bitoffset=_sole_value(sampled_df["dstream_T_bitoffset"]),
        dstream_T_bitwidth=_sole_value(sampled_df["dstream_T_bitwidth"]),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        rng.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=taxon_labels,
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=taxon_labels,
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The subtraction below is positional, so the labels must line up.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance matrices have mismatched labels"
        )

    naive_values = naive_dist.values
    shortcut_values = shortcut_dist.values
    # 0/0 (e.g. on the diagonal) is expected and yields NaN; silence the
    # RuntimeWarning rather than letting it clutter every call's output.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_values - shortcut_values) / (
            naive_values / 2 + shortcut_values / 2
        )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Produce the same report for each dstream algorithm instead of repeating the
# block by hand: the normalized-error matrix, then its signed and absolute
# mean/median (NaN entries, e.g. the diagonal, are ignored).
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10147.59it/s]
100%|██████████| 100/100 [00:00<00:00, 364.46it/s]
6156it [00:00, 614399.42it/s]
100%|██████████| 100/100 [00:00<00:00, 222745.83it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.308387,0.424569,0.548467,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.240109,0.0,0.000000,0.554150,0.000000,0.0,0.0,0.000000
1,0.000000,,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000
2,0.000000,0.0,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000
3,0.308387,0.0,0.0,,0.223125,0.338995,-0.672223,0.056143,-0.061974,0.000000,...,-0.055027,-0.053522,0.438205,0.0,-0.453389,0.641604,0.000000,0.0,0.0,-0.064068
4,0.424569,0.0,0.0,0.223125,,0.326550,0.114108,0.261133,0.074213,-0.074782,...,0.064467,0.062410,-0.115107,0.0,0.130541,0.328557,-0.172346,0.0,0.0,0.077236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.554150,0.0,0.0,0.641604,0.328557,0.000000,-0.490905,0.108889,-0.084592,0.000000,...,-0.072158,-0.069591,0.554737,0.0,-0.599037,,0.000000,0.0,0.0,-0.088540
96,0.000000,0.0,0.0,0.000000,-0.172346,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,-0.062910,0.0,0.000000,0.000000,,0.0,0.0,0.000000
97,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0.0,0.000000
98,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,,0.000000


np.nanmean(norm_err)=np.float64(-0.009144975973820241)
    np.nanmean(np.abs(norm_err))=np.float64(0.090165525815515)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33675.66it/s]
100%|██████████| 100/100 [00:00<00:00, 396.19it/s]
5976it [00:00, 559977.67it/s]
100%|██████████| 100/100 [00:00<00:00, 267153.12it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.042463,0.032879,5.211370e-04,0.055126,0.064024,0.165579,3.441954e-04,3.716993e-02,0.002428,...,0.055287,4.976468e-02,-1.032030e-02,3.858762e-04,-9.912001e-03,-3.740986e-02,-2.739290e-02,-0.006284,3.740874e-04,0.001925
1,0.042463,,0.038476,6.562576e-04,-0.163693,0.197546,0.073317,3.984220e-04,-2.480263e-07,0.002937,...,0.064269,7.045113e-02,-1.363998e-02,4.553068e-04,-1.202430e-02,-4.926390e-02,-3.325156e-02,-0.007377,4.389892e-04,0.002646
2,0.032879,0.038476,,-1.044732e-02,0.048596,0.015194,0.014983,-1.447242e-02,4.446226e-02,-0.006989,...,0.022508,4.479165e-02,3.143304e-03,-3.296263e-02,-1.427462e-07,3.071825e-03,2.304408e-03,0.001975,-1.567650e-02,0.003507
3,0.000521,0.000656,-0.010447,,0.000960,-0.009871,-0.009685,-3.132193e-07,8.249062e-04,-0.185686,...,-0.020092,-1.022105e-06,1.081900e-03,-1.829529e-07,-2.200386e-07,1.036007e-03,6.390323e-04,0.000509,-1.752570e-07,0.001348
4,0.055126,-0.163693,0.048596,9.603481e-04,,0.093441,0.091459,4.932429e-04,-3.825914e-07,0.003950,...,0.080209,1.433354e-01,-2.249027e-02,5.834366e-04,-1.626463e-02,-8.044744e-02,-4.503407e-02,-0.009372,5.569102e-04,0.004881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.037410,-0.049264,0.003072,1.036007e-03,-0.080447,0.002882,0.002821,5.123765e-04,-6.573870e-02,0.183507,...,0.002844,1.314311e-02,-1.032798e-06,6.104288e-04,-1.722643e-02,,-5.838981e-07,0.006581,5.814427e-04,0.063774
96,-0.027393,-0.033252,0.002304,6.390323e-04,-0.045034,0.002196,0.002160,3.919553e-04,-4.002142e-02,0.084451,...,0.002174,5.420141e-03,-5.408073e-07,4.469054e-04,-1.176673e-02,-5.838981e-07,,0.004861,4.311688e-04,0.028384
97,-0.006284,-0.007377,0.001975,5.089380e-04,-0.009372,0.001895,0.001868,3.388698e-04,-8.554346e-03,0.014462,...,0.001878,3.892203e-03,-2.725105e-02,3.791821e-04,-2.945611e-02,6.580670e-03,4.860874e-03,,3.677965e-04,0.194181
98,0.000374,0.000439,-0.015676,-1.752570e-07,0.000557,-0.022819,-0.022497,-1.653390e-07,5.084426e-04,-0.009189,...,-0.007244,-2.394908e-07,5.956178e-04,0.000000e+00,0.000000e+00,5.814427e-04,4.311688e-04,0.000368,,0.000668


np.nanmean(norm_err)=np.float64(0.0011506866619268288)
    np.nanmean(np.abs(norm_err))=np.float64(0.012160266797574471)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0009481161253665249)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Produce the same report for each dstream algorithm instead of repeating the
# block by hand: the normalized-error matrix, then its signed and absolute
# mean/median (NaN entries, e.g. the diagonal, are ignored).
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35723.57it/s]
100%|██████████| 100/100 [00:00<00:00, 444.75it/s]
5949it [00:00, 593161.09it/s]
100%|██████████| 100/100 [00:00<00:00, 367277.06it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33093.77it/s]
100%|██████████| 100/100 [00:00<00:00, 1003.93it/s]
5905it [00:00, 629236.18it/s]
100%|██████████| 100/100 [00:00<00:00, 339070.65it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,-1.064341e-07,0.000000e+00,-1.063866e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.128425e-07,-1.037230e-07,...,0.000000e+00,-1.096017e-07,-1.126811e-07,-1.163144e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.097657e-07,-1.035447e-07,0.000000e+00
1,0.000000e+00,,-1.035708e-07,0.000000e+00,-1.035258e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036705e-07,-1.066463e-07,...,0.000000e+00,-1.036716e-07,-1.035342e-07,-1.036961e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.038183e-07,-1.127510e-07,0.000000e+00
2,-1.064341e-07,-1.035708e-07,,-1.037165e-07,-2.252860e-07,-1.095180e-07,-1.064878e-07,-1.035614e-07,-2.130286e-07,-2.073984e-07,...,-1.035131e-07,-2.130310e-07,-2.127410e-07,-2.130827e-07,-1.036477e-07,-1.035375e-07,-1.064130e-07,-2.133408e-07,-2.070420e-07,-1.035589e-07
3,0.000000e+00,0.000000e+00,-1.037165e-07,,-1.036713e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.038165e-07,-1.068008e-07,...,0.000000e+00,-1.038176e-07,-1.036798e-07,-1.038421e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.039648e-07,-1.163685e-07,0.000000e+00
4,-1.063866e-07,-1.035258e-07,-2.252860e-07,-1.036713e-07,,-1.094677e-07,-1.064402e-07,-1.035164e-07,-2.129334e-07,-2.073082e-07,...,-1.034681e-07,-2.129358e-07,-2.126460e-07,-2.129874e-07,-1.036027e-07,-1.034926e-07,-1.063655e-07,-2.132453e-07,-2.069520e-07,-1.035139e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,0.000000e+00,-1.035375e-07,0.000000e+00,-1.034926e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036372e-07,-1.674476e-07,...,0.000000e+00,-1.036383e-07,-1.035010e-07,-1.036628e-07,0.000000e+00,,0.000000e+00,-1.037849e-07,-1.064227e-07,0.000000e+00
96,0.000000e+00,0.000000e+00,-1.064130e-07,0.000000e+00,-1.063655e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.095780e-07,-1.037030e-07,...,0.000000e+00,-1.162585e-07,-1.094258e-07,-1.096066e-07,0.000000e+00,0.000000e+00,,-1.164430e-07,-1.035247e-07,0.000000e+00
97,-1.097657e-07,-1.038183e-07,-2.133408e-07,-1.039648e-07,-2.132453e-07,-1.067232e-07,-1.130782e-07,-1.038089e-07,-2.197020e-07,-2.078948e-07,...,-1.037604e-07,-2.404799e-07,-2.193960e-07,-2.197594e-07,-1.038957e-07,-1.037849e-07,-1.164430e-07,,-2.075366e-07,-1.038065e-07
98,-1.035447e-07,-1.127510e-07,-2.070420e-07,-1.163685e-07,-2.069520e-07,-1.035707e-07,-1.035955e-07,-1.095036e-07,-2.072412e-07,-2.131870e-07,...,-1.197577e-07,-2.072434e-07,-2.069690e-07,-2.072923e-07,-1.065391e-07,-1.064227e-07,-1.035247e-07,-2.075366e-07,,-1.064453e-07


np.nanmean(norm_err)=np.float64(-1.0697248708547971e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0697248708547971e-07)
    np.nanmedian(norm_err)=np.float64(-1.0395136282756849e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0395136282756849e-07)
    
