In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-08-20T21:19:13.149856+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Subdirectory name tagged with this notebook's date and topic
# (presumably where teeplot saves figures -- TODO confirm; no plotting call
# is visible in this part of the notebook).
teeplot_subdir = "2025-05-09-reconstruction-consistency"
# Bare expression: echo the value as the cell's output.
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(phylo_df: pd.DataFrame) -> pd.DataFrame:
    """Build a label-sorted pairwise distance table for one reconstructed tree.

    The tree dataframe is converted to dendropy via ``apc.RosettaTree`` and
    dendropy's phylogenetic distance matrix is unpacked into a DataFrame whose
    rows and columns are both sorted by taxon label.
    """
    distance_table = (
        apc.RosettaTree(phylo_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE: `_data` is a private attribute of dendropy's data table (a nested
    # label -> label -> distance mapping); may break on dendropy upgrades.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A subsample of genomes is deserialized into hstrat surfaces, their storage
    is shuffled, and two phylogenies are reconstructed from the same
    population: one with ``hstrat.build_tree_trie`` and one with
    ``hstrat.build_tree_searchtable``. The pairwise taxon distance matrices of
    both trees are then compared entrywise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a ``data_hex`` column plus the columns
        ``dstream_S``, ``dstream_storage_bitoffset``,
        ``dstream_storage_bitwidth``, ``dstream_T_bitoffset`` and
        ``dstream_T_bitwidth``, each holding a single unique value across the
        sampled rows. The frame itself is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, e.g. ``"dstream.tilted_algo"``.
        It is evaluated with ``eval``, so only pass trusted strings.
    sample_size : int, default 100
        Maximum number of genomes to compare; clamped to the number of rows
        available. Kept small because the naive reconstruction and the dense
        distance matrix do not scale to large populations.
    seed : int, optional
        Seed for both the row subsample and the storage shuffle. When omitted,
        numpy's global random state is used, as before.

    Returns
    -------
    np.ndarray
        Square matrix ``(naive - shortcut) / ((naive + shortcut) / 2)`` with
        rows and columns ordered by sorted (string) taxon label. Entries where
        both distances are zero -- notably the diagonal -- are NaN.

    Raises
    ------
    ValueError
        If ``raw_genome_df`` is empty, or if a configuration column holds more
        than one distinct value among the sampled rows.
    """
    if len(raw_genome_df) == 0:
        raise ValueError("raw_genome_df must contain at least one genome")

    if seed is None:
        # legacy behavior: draw from numpy's global random state
        sample_state, shuffle = None, np.random.shuffle
    else:
        rng = np.random.default_rng(seed)
        sample_state, shuffle = rng, rng.shuffle

    # `sample` returns a new frame, so the caller's data is left untouched
    genome_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=sample_state,
    )  # sample to size dendropy/naive can handle
    genome_df["taxon_label"] = np.arange(len(genome_df)).astype(str)

    def sole_value(column: str):
        # `.item()` raises ValueError unless exactly one distinct value exists
        return genome_df[column].unique().item()

    surf_kwargs = dict(
        # SECURITY NOTE: eval runs arbitrary Python; the globals mapping only
        # pre-binds `dstream`, it does not sandbox the expression.
        dstream_algo=eval(str(dstream_algo), {"dstream": dstream}),
        **{
            column: sole_value(column)
            for column in (
                "dstream_S",
                "dstream_storage_bitoffset",
                "dstream_storage_bitwidth",
                "dstream_T_bitoffset",
                "dstream_T_bitwidth",
            )
        },
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        # ensure synthetic data: scramble each surface's storage in place
        # (reaches into private hstrat attributes)
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    # Both tables are sorted by label, so entries line up positionally
    # (presumably both trees expose the same taxon label set -- not checked).
    naive = _pairwise_distance_frame(naive_df).values
    shortcut = _pairwise_distance_frame(shortcut_df).values

    # 0 / 0 (e.g. on the diagonal) is expected and intentionally yields NaN;
    # silence the RuntimeWarning numpy would otherwise emit for it.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# "sample" dataset: run the reconstruction comparison once per dstream
# algorithm, showing the normalized error matrix and its summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # nan-aware reductions: the matrix holds NaN wherever both distances are 0
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10825.69it/s]
100%|██████████| 100/100 [00:00<00:00, 381.20it/s]
6128it [00:00, 670895.95it/s]
100%|██████████| 100/100 [00:00<00:00, 256375.55it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,-1.050855,-0.665792,0.165583,0.000000,-0.051547,-0.053268,-0.085305,-0.695706,...,0.165619,0.000000,0.00000,0.000000,-0.730038,0.000000,0.000000,0.000000,0.00000,-0.096240
1,0.000000,,0.334400,-0.070905,-0.712784,0.000000,-0.145140,-0.150618,-0.261778,0.235102,...,-0.062544,0.000000,0.00000,0.000000,-0.076655,0.000000,0.000000,0.000000,0.00000,0.000000
2,-1.050855,0.334400,,-0.583022,0.000000,0.000000,-0.047379,-0.048829,-0.074464,0.000000,...,0.000000,0.985123,0.00000,0.382307,-0.631703,0.000000,0.373774,-0.218469,0.00000,-0.088492
3,-0.665792,-0.070905,-0.583022,,-0.040785,0.070436,0.271364,0.283404,0.761354,-0.399258,...,-0.040793,-0.075093,0.15884,-0.109752,0.000000,0.353485,-0.079825,-0.468019,0.39306,-0.161195
4,0.165583,-0.712784,0.000000,-0.040785,,0.000000,-0.090457,-0.092555,-0.125233,0.000000,...,0.328791,-0.233075,0.00000,-0.225365,-0.042624,0.000000,-0.622762,0.127949,0.00000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.000000,0.353485,0.000000,0.000000,0.228349,0.242206,0.763701,0.000000,...,0.000000,0.000000,0.00000,0.000000,0.415637,,0.000000,0.000000,0.00000,0.000000
96,0.000000,0.000000,0.373774,-0.079825,-0.622762,0.000000,-0.159752,-0.166412,-0.313493,0.253907,...,-0.066940,0.000000,0.00000,0.000000,-0.087187,0.000000,,0.000000,0.00000,0.000000
97,0.000000,0.000000,-0.218469,-0.468019,0.127949,0.000000,-0.083689,-0.088320,-0.234078,-0.135427,...,0.127987,0.000000,0.00000,0.000000,-0.552098,0.000000,0.000000,,0.00000,-0.155787
98,0.000000,0.000000,0.000000,0.393060,0.000000,0.000000,0.260687,0.275693,0.782545,0.000000,...,0.000000,0.000000,0.00000,0.000000,0.456282,0.000000,0.000000,0.000000,,0.000000


np.nanmean(norm_err)=np.float64(-0.05805683335377688)
    np.nanmean(np.abs(norm_err))=np.float64(0.13299276334857477)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.053782649002555)
    


100%|██████████| 100/100 [00:00<00:00, 32574.59it/s]
100%|██████████| 100/100 [00:00<00:00, 661.59it/s]
5968it [00:00, 668882.94it/s]
100%|██████████| 100/100 [00:00<00:00, 295790.13it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-3.671383e-07,-2.206846e-07,-2.852788e-07,-2.739931e-07,-2.735629e-07,-1.446314e-07,-1.897149e-07,-5.870582e-07,-2.879384e-07,...,-3.486305e-07,-2.443585e-07,-3.076046e-07,-1.596329e-07,-1.327498e-07,-5.799877e-07,-1.413425e-07,-1.117383e-02,0.012267,-3.879549e-07
1,-3.671383e-07,,-2.478638e-07,-3.070402e-07,-2.940063e-07,-2.951627e-07,-1.519506e-07,-2.094598e-07,-6.873001e-07,-3.101232e-07,...,-3.816901e-07,-2.660064e-07,-3.312497e-07,-1.714375e-07,-1.408128e-07,-6.776287e-07,-1.483246e-07,-2.385911e-03,0.006727,-4.293357e-07
2,-2.206846e-07,-2.478638e-07,,-1.791718e-07,-5.677268e-03,0.000000e+00,0.000000e+00,0.000000e+00,-5.078265e-07,-1.812750e-07,...,-2.326072e-07,0.000000e+00,-1.952531e-07,0.000000e+00,0.000000e+00,-4.973371e-07,0.000000e+00,0.000000e+00,-0.124320,-2.683994e-07
3,-2.852788e-07,-3.070402e-07,-1.791718e-07,,-2.395400e-07,-2.019750e-07,-1.227863e-07,-1.582041e-07,-4.487626e-07,-2.767874e-07,...,-2.946977e-07,-1.878832e-07,-2.631272e-07,-1.352050e-07,-1.154098e-07,-4.446192e-07,-1.204077e-07,5.852860e-03,0.002537,-3.679803e-07
4,-2.739931e-07,-2.940063e-07,-5.677268e-03,-2.395400e-07,,-1.908441e-07,-1.185817e-07,-1.514801e-07,-4.221835e-07,-2.414123e-07,...,-2.829978e-07,-1.782142e-07,-2.534964e-07,-1.301246e-07,-1.116876e-07,-4.185143e-07,-1.163618e-07,-1.147958e-07,-0.014791,-3.079818e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-5.799877e-07,-6.776287e-07,-4.973371e-07,-4.446192e-07,-4.185143e-07,-7.200603e-07,-2.182502e-07,-5.070530e-07,-2.321276e-06,-4.511134e-07,...,-6.266624e-07,-5.681436e-07,-4.952271e-07,-2.608352e-07,-1.959848e-07,,-2.108468e-07,-2.057607e-07,0.002838,-7.567774e-07
96,-1.413425e-07,-1.483246e-07,0.000000e+00,-1.204077e-07,-1.163618e-07,0.000000e+00,0.000000e+00,0.000000e+00,-2.127095e-07,-1.213539e-07,...,-1.422240e-07,0.000000e+00,-1.282662e-07,0.000000e+00,0.000000e+00,-2.108468e-07,,-9.496936e-03,0.009871,-1.550467e-07
97,-1.117383e-02,-2.385911e-03,0.000000e+00,5.852860e-03,-1.147958e-07,-1.553890e-02,-9.677432e-03,0.000000e+00,-2.075342e-07,5.898343e-03,...,-1.398915e-07,-1.451639e-02,-6.192040e-03,-3.218828e-02,-2.760208e-02,-2.057607e-07,-9.496936e-03,,0.002401,7.512769e-03
98,1.226748e-02,6.727499e-03,-1.243204e-01,2.536767e-03,-1.479115e-02,1.999018e-02,1.011666e-02,1.728629e-03,2.880092e-03,2.562673e-03,...,1.571452e-03,1.795256e-02,4.499392e-02,1.144045e-02,9.362786e-03,2.837964e-03,9.871012e-03,2.401303e-03,,3.571158e-03


np.nanmean(norm_err)=np.float64(-0.000672680365212673)
    np.nanmean(np.abs(norm_err))=np.float64(0.009885939987868422)
    np.nanmedian(norm_err)=np.float64(-2.2744801671146547e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(6.773006692862554e-07)
    


In [7]:
# "tail" dataset: run the reconstruction comparison once per dstream
# algorithm, showing the normalized error matrix and its summary statistics.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # nan-aware reductions: the matrix holds NaN wherever both distances are 0
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35705.32it/s]
100%|██████████| 100/100 [00:00<00:00, 458.27it/s]
5937it [00:00, 679294.64it/s]
100%|██████████| 100/100 [00:00<00:00, 330520.41it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 36198.36it/s]
100%|██████████| 100/100 [00:00<00:00, 910.61it/s]
5950it [00:00, 648665.53it/s]
100%|██████████| 100/100 [00:00<00:00, 417343.68it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.070411e-07,-2.189288e-07,-1.033235e-07,-2.186989e-07,-2.127763e-07,-1.127036e-07,-2.067270e-07,-2.069048e-07,-2.071850e-07,...,-2.071042e-07,-1.035182e-07,-1.035515e-07,-2.072494e-07,-2.072561e-07,-1.033244e-07,-2.125560e-07,-1.034035e-07,-1.033879e-07,-2.127222e-07
1,-2.070411e-07,,-2.074347e-07,-1.126877e-07,-2.072283e-07,-2.073915e-07,-1.037218e-07,-2.128877e-07,-2.191981e-07,-2.133734e-07,...,-2.132877e-07,-1.066079e-07,-1.066433e-07,-2.134417e-07,-2.330053e-07,-1.322178e-07,-2.071822e-07,-1.095442e-07,-1.095267e-07,-2.073401e-07
2,-2.189288e-07,-2.074347e-07,,-1.035195e-07,-2.475833e-07,-2.131920e-07,-1.096894e-07,-2.071195e-07,-2.072979e-07,-2.075792e-07,...,-2.074980e-07,-1.037150e-07,-1.037485e-07,-2.076438e-07,-2.076505e-07,-1.035205e-07,-2.129709e-07,-1.035999e-07,-1.035842e-07,-2.131377e-07
3,-1.033235e-07,-1.126877e-07,-1.035195e-07,,-1.034167e-07,-1.034980e-07,0.000000e+00,-1.062355e-07,-1.093782e-07,-1.064774e-07,...,-1.064347e-07,0.000000e+00,0.000000e+00,-1.065114e-07,-1.128151e-07,0.000000e+00,-1.033938e-07,0.000000e+00,0.000000e+00,-1.034724e-07
4,-2.186989e-07,-2.072283e-07,-2.475833e-07,-1.034167e-07,,-2.129740e-07,-1.095740e-07,-2.069136e-07,-2.070918e-07,-2.073724e-07,...,-2.072915e-07,-1.036118e-07,-1.036452e-07,-2.074369e-07,-2.074436e-07,-1.034176e-07,-2.127533e-07,-1.034969e-07,-1.034813e-07,-2.129198e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.033244e-07,-1.322178e-07,-1.035205e-07,0.000000e+00,-1.034176e-07,-1.034989e-07,0.000000e+00,-1.062365e-07,-1.093792e-07,-1.064784e-07,...,-1.064357e-07,0.000000e+00,0.000000e+00,-1.065124e-07,-1.162543e-07,,-1.033947e-07,0.000000e+00,0.000000e+00,-1.034733e-07
96,-2.125560e-07,-2.071822e-07,-2.129709e-07,-1.033938e-07,-2.127533e-07,-2.396821e-07,-1.064901e-07,-2.068677e-07,-2.070458e-07,-2.073263e-07,...,-2.072454e-07,-1.035887e-07,-1.036221e-07,-2.073908e-07,-2.073975e-07,-1.033947e-07,,-1.034739e-07,-1.034583e-07,-2.189811e-07
97,-1.034035e-07,-1.095442e-07,-1.035999e-07,0.000000e+00,-1.034969e-07,-1.035783e-07,0.000000e+00,-1.063202e-07,-1.127021e-07,-1.065624e-07,...,-1.065197e-07,0.000000e+00,0.000000e+00,-1.065965e-07,-1.096646e-07,0.000000e+00,-1.034739e-07,,0.000000e+00,-1.035527e-07
98,-1.033879e-07,-1.095267e-07,-1.035842e-07,0.000000e+00,-1.034813e-07,-1.035627e-07,0.000000e+00,-1.063037e-07,-1.126835e-07,-1.065459e-07,...,-1.065031e-07,0.000000e+00,0.000000e+00,-1.065799e-07,-1.096470e-07,0.000000e+00,-1.034583e-07,0.000000e+00,,-1.035370e-07


np.nanmean(norm_err)=np.float64(-1.0662916109393571e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0662916109393571e-07)
    np.nanmedian(norm_err)=np.float64(-1.0507711864347053e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0507711864347053e-07)
    
