In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record run timestamp, interpreter/machine details, and imported package
# versions so the environment behind the outputs below can be reproduced.
%watermark -diwmuv -iv


Last updated: 2025-06-29T00:24:35.077342+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1029-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3
numpy                             : 2.1.2
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# NOTE(review): presumably the output subdirectory name used by teeplot when
# saving figures; no cell visible in this notebook reads it --- confirm usage.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise distance table of a reconstructed tree.

    The tree is converted to dendropy through ``apc.RosettaTree`` and its
    phylogenetic distance matrix is loaded into a DataFrame whose rows and
    columns are both sorted by label, so that tables built from different
    reconstructions of the same taxa line up element-wise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private dendropy attribute, presumably a
    # nested {label: {label: distance}} mapping --- re-check on dendropy upgrade.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled (so the data is synthetic), and a tree is built from
    the same population with both ``hstrat.build_tree_trie`` and
    ``hstrat.build_tree_searchtable``. The pairwise distance matrices of the
    two trees are then compared element-wise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a ``data_hex`` column plus the columns
        ``dstream_S``, ``dstream_storage_bitoffset``,
        ``dstream_storage_bitwidth``, ``dstream_T_bitoffset`` and
        ``dstream_T_bitwidth``, each holding a single unique value.
        The frame is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, e.g. ``"dstream.tilted_algo"``.
    sample_size : int, default 100
        Maximum number of genomes to sample; kept small so the naive/dendropy
        steps stay tractable. Capped at ``len(raw_genome_df)``.
    seed : int, optional
        Seed for the sampling and storage shuffling. ``None`` (default) draws
        fresh entropy, i.e. results differ from run to run.

    Returns
    -------
    np.ndarray
        Square matrix of ``(naive - shortcut) / ((naive + shortcut) / 2)``.
        Entries where both distances are zero (e.g. the diagonal) are NaN.
        Rows/columns follow sorted taxon-label order, not sampling order.

    Raises
    ------
    ValueError
        If a dstream parameter column holds more than one distinct value, or
        if the two reconstructions yield different taxon label sets.
    """
    rng = np.random.default_rng(seed)

    # Work on a labelled copy so the caller's frame is left untouched.
    n_rows = min(sample_size, len(raw_genome_df))
    genome_df = raw_genome_df.sample(n_rows, random_state=rng).assign(
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    # `.item()` raises unless the column is homogeneous, which is required
    # because one set of deserialization parameters is shared by all genomes.
    surface_kwargs = {
        column: genome_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(review): `eval` executes arbitrary expressions; only pass trusted,
    # hard-coded algorithm names here.
    surface_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        rng.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The comparison below is positional, so the labels must agree exactly.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions have differing taxon labels",
        )

    naive_vals = naive_dist.to_numpy()
    shortcut_vals = shortcut_dist.to_numpy()
    # 0/0 (e.g. self-distances) is expected and intentionally yields NaN;
    # silence the resulting RuntimeWarning instead of letting it leak out.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_vals - shortcut_vals) / (
            naive_vals / 2 + shortcut_vals / 2
        )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the identical consistency check for each dstream algorithm instead of
# repeating the call/display/print sequence once per algorithm.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # Signed statistics expose systematic bias; absolute ones expose magnitude.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10822.06it/s]
100%|██████████| 100/100 [00:00<00:00, 390.04it/s]
6135it [00:00, 613515.21it/s]
100%|██████████| 100/100 [00:00<00:00, 232758.27it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.0,0.000000,-0.554145,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,-0.083622,0.000000,0.000000,0.000000,-1.223671,0.000000,0.000000,-0.261542,-0.689248
1,0.000000,,0.0,0.000000,0.149757,0.000000,0.000000,0.000000,-0.186501,0.000000,...,0.000000,0.317849,0.000000,0.000000,0.000000,0.149814,0.000000,0.000000,0.000000,0.094560
2,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,0.345983,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.0,,0.730873,0.000000,0.000000,0.000000,-0.095155,0.000000,...,0.000000,0.368098,0.000000,0.000000,0.000000,0.084586,0.000000,0.000000,0.000000,0.063603
4,-0.554145,0.149757,0.0,0.730873,,0.590942,0.379387,0.581209,0.000000,0.086278,...,0.678476,0.235669,-0.034825,-0.051824,-0.046355,-0.702218,-0.030879,0.089921,-0.273825,-0.626120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.223671,0.149814,0.0,0.084586,-0.702218,0.066608,0.112816,0.065393,0.000000,0.362780,...,0.077746,0.046192,-0.053087,-0.106180,-0.085510,,-0.044432,0.249770,-0.498432,0.000000
96,0.000000,0.000000,0.0,0.000000,-0.030879,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,-0.311242,0.000000,0.000000,0.000000,-0.044432,,0.000000,0.000000,-0.034667
97,0.000000,0.000000,0.0,0.000000,0.089921,0.000000,0.000000,0.000000,-0.340508,0.000000,...,0.000000,0.217366,0.000000,0.000000,0.000000,0.249770,0.000000,,0.000000,0.139419
98,-0.261542,0.000000,0.0,0.000000,-0.273825,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,-0.072335,0.000000,0.000000,0.000000,-0.498432,0.000000,0.000000,,-0.326541


np.nanmean(norm_err)=np.float64(-0.017538952517575895)
    np.nanmean(np.abs(norm_err))=np.float64(0.09723529604163612)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 34295.21it/s]
100%|██████████| 100/100 [00:00<00:00, 440.49it/s]
5985it [00:00, 576985.53it/s]
100%|██████████| 100/100 [00:00<00:00, 202135.13it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,1.367019e-02,1.755989e-02,-1.422374e-02,1.643937e-02,1.908347e-02,4.143693e-03,4.019766e-03,-3.229076e-02,-2.020597e-07,...,-1.615537e-07,-8.356674e-04,-8.469112e-04,-1.317892e-03,-1.046916e-07,-1.595285e-07,1.341443e-02,-1.494795e-07,-8.725935e-04,4.839744e-03
1,1.367019e-02,,1.615759e-01,1.601619e-03,1.974864e-02,2.538408e-02,6.707176e-03,6.388488e-03,3.073004e-03,-1.031645e-06,...,-6.290275e-07,-7.155528e-02,-7.301400e-02,-1.054909e-02,-3.057410e-07,2.701210e-02,9.152322e-03,-5.435526e-07,-7.640749e-02,8.742414e-03
2,1.755989e-02,1.615759e-01,,3.951084e-03,-2.553130e-07,-7.109546e-07,-2.601331e-07,-4.893861e-07,9.765654e-03,-8.674031e-03,...,-9.198232e-07,-1.775110e-07,-1.816397e-07,-9.500948e-07,-5.912695e-03,4.204709e-02,7.395012e-02,-7.478545e-07,-3.826691e-07,-3.688060e-07
3,-1.422374e-02,1.601619e-03,3.951084e-03,,2.498840e-03,3.009346e-03,2.553055e-03,2.459190e-03,-2.476355e-07,-2.915042e-07,...,-2.140712e-07,-9.918247e-04,-1.007703e-03,-1.753140e-03,-1.244816e-07,-4.476418e-02,1.563373e-03,-1.933739e-07,-1.044248e-03,3.105691e-03
4,1.643937e-02,1.974864e-02,-2.553130e-07,2.498840e-03,,-2.064648e-07,0.000000e+00,-1.543800e-07,4.007878e-03,-2.427858e-03,...,-2.190532e-07,0.000000e+00,0.000000e+00,-2.224287e-07,-4.125648e-03,4.201433e-02,1.926157e-02,-1.974300e-07,-1.312816e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.595285e-07,2.701210e-02,4.204709e-02,-4.476418e-02,4.201433e-02,5.499254e-02,7.063562e-03,6.711008e-03,-4.863432e-02,-1.181225e-06,...,-6.816593e-07,-1.261678e-03,-1.287482e-03,-2.818182e-03,-3.176625e-07,,2.603110e-02,-5.824107e-07,-1.347687e-03,9.357823e-03
96,1.341443e-02,9.152322e-03,7.395012e-02,1.563373e-03,1.926157e-02,2.458492e-02,6.538396e-03,6.235188e-03,2.934897e-03,-4.844012e-07,...,-3.025476e-07,-2.449789e-02,-2.497649e-02,-1.013676e-02,-1.499871e-07,2.603110e-02,,-2.627948e-07,-2.608689e-02,8.457762e-03
97,-1.494795e-07,-5.435526e-07,-7.478545e-07,-1.933739e-07,-1.974300e-07,-5.048663e-07,-2.029642e-07,-3.868814e-07,-7.346796e-07,8.225770e-03,...,-7.063641e-07,-1.496198e-07,-1.525423e-07,-6.338728e-07,-3.036533e-07,-5.824107e-07,-2.627948e-07,,-3.186438e-07,-2.635572e-07
98,-8.725935e-04,-7.640749e-02,-3.826691e-07,-1.044248e-03,-1.312816e-07,-3.070510e-07,-1.337058e-07,-2.590112e-07,-1.529682e-03,-4.409409e-07,...,-3.462235e-07,-1.400812e-07,-1.374984e-07,-3.524415e-07,8.990949e-02,-1.347687e-03,-2.608689e-02,-3.186438e-07,,-1.575703e-07


np.nanmean(norm_err)=np.float64(-0.00040967146970717795)
    np.nanmean(np.abs(norm_err))=np.float64(0.011794576780827324)
    np.nanmedian(norm_err)=np.float64(-1.9977935727322062e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0016815602802713215)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the identical consistency check for each dstream algorithm instead of
# repeating the call/display/print sequence once per algorithm.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # Signed statistics expose systematic bias; absolute ones expose magnitude.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 36427.86it/s]
100%|██████████| 100/100 [00:00<00:00, 448.63it/s]
5954it [00:00, 643846.80it/s]
100%|██████████| 100/100 [00:00<00:00, 412825.20it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 36647.48it/s]
100%|██████████| 100/100 [00:00<00:00, 1012.05it/s]
5961it [00:00, 42208.36it/s]
100%|██████████| 100/100 [00:00<00:00, 407213.98it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.067856e-07,-2.190945e-07,-1.039000e-07,-2.071985e-07,-2.134085e-07,-2.326955e-07,-1.067872e-07,-1.064592e-07,-1.036210e-07,...,-2.191450e-07,-2.134098e-07,-2.071600e-07,-2.397931e-07,-1.037066e-07,-2.072436e-07,-2.133368e-07,-2.073223e-07,-1.038693e-07,-2.074438e-07
1,-1.067856e-07,,-1.065938e-07,0.000000e+00,-1.036983e-07,-1.131454e-07,-1.066993e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.066178e-07,-1.166047e-07,-1.036790e-07,-1.066112e-07,0.000000e+00,-1.037209e-07,-1.202350e-07,-1.037603e-07,0.000000e+00,-1.038212e-07
2,-2.190945e-07,-1.065938e-07,,-1.037185e-07,-2.068376e-07,-2.130256e-07,-2.189128e-07,-1.065955e-07,-1.062686e-07,-1.034404e-07,...,-2.320473e-07,-2.130269e-07,-2.067992e-07,-2.187274e-07,-1.035258e-07,-2.068825e-07,-2.129542e-07,-2.069609e-07,-1.036879e-07,-2.070820e-07
3,-1.039000e-07,0.000000e+00,-1.037185e-07,,-1.129290e-07,-1.039226e-07,-1.038183e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.037412e-07,-1.039232e-07,-1.129061e-07,-1.037349e-07,0.000000e+00,-1.066404e-07,-1.038885e-07,-1.130025e-07,0.000000e+00,-1.098195e-07
4,-2.071985e-07,-1.036983e-07,-2.068376e-07,-1.129290e-07,,-2.072433e-07,-2.070361e-07,-1.036999e-07,-1.033905e-07,-1.063227e-07,...,-2.068826e-07,-2.072445e-07,-2.640564e-07,-2.068702e-07,-1.064129e-07,-2.126472e-07,-2.071757e-07,-2.471922e-07,-1.065842e-07,-2.189671e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.072436e-07,-1.037209e-07,-2.068825e-07,-1.066404e-07,-2.126472e-07,-2.072884e-07,-2.070811e-07,-1.037225e-07,-1.034130e-07,-1.196938e-07,...,-2.069276e-07,-2.072896e-07,-2.126066e-07,-2.069152e-07,-1.094917e-07,,-2.072208e-07,-2.127775e-07,-1.163640e-07,-2.129055e-07
96,-2.133368e-07,-1.202350e-07,-2.129542e-07,-1.038885e-07,-2.071757e-07,-2.260277e-07,-2.131646e-07,-1.098498e-07,-1.095027e-07,-1.036095e-07,...,-2.130020e-07,-2.329299e-07,-2.071371e-07,-2.129888e-07,-1.036952e-07,-2.072208e-07,,-2.072994e-07,-1.038578e-07,-2.074209e-07
97,-2.073223e-07,-1.037603e-07,-2.069609e-07,-1.130025e-07,-2.471922e-07,-2.073671e-07,-2.071596e-07,-1.037619e-07,-1.034521e-07,-1.063879e-07,...,-2.070060e-07,-2.073683e-07,-2.471374e-07,-2.069936e-07,-1.064781e-07,-2.127775e-07,-2.072994e-07,,-1.066497e-07,-2.191053e-07
98,-1.038693e-07,0.000000e+00,-1.036879e-07,0.000000e+00,-1.065842e-07,-1.038918e-07,-1.037877e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,-1.037106e-07,-1.038924e-07,-1.065638e-07,-1.037043e-07,0.000000e+00,-1.163640e-07,-1.038578e-07,-1.066497e-07,,-1.067140e-07


np.nanmean(norm_err)=np.float64(-1.2582875318808535e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.2582875318808535e-07)
    np.nanmedian(norm_err)=np.float64(-1.0665103020859392e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0665103020859392e-07)
    
