In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-09-15T23:51:52.163111+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

hstrat                            : 1.20.10
alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3
downstream                        : 1.14.3
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Label identifying this notebook's outputs; echoed below as the cell result.
# NOTE(review): presumably the output subdirectory used by teeplot when saving
# figures -- no such call is visible in this section, confirm against the rest
# of the notebook.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(alife_df) -> pd.DataFrame:
    """Build a label-sorted pairwise phylogenetic distance table for a tree.

    Parameters
    ----------
    alife_df
        Reconstructed tree as returned by the ``hstrat.build_tree_*`` calls
        in ``make_norm_err_matrix``; anything ``apc.RosettaTree`` accepts.

    Returns
    -------
    pd.DataFrame
        Distance table with rows and columns sorted by label, so that tables
        built from different reconstructions can be compared positionally.
    """
    dist_table = (
        apc.RosettaTree(alife_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private dendropy attribute; this may break on
    # a dendropy upgrade.
    return (
        pd.DataFrame(dist_table._data).sort_index(axis=0).sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_sample: int = 100,
    seed: int | None = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into hstrat surfaces, their
    stored contents are shuffled, and two trees are reconstructed from the
    same population with ``hstrat.build_tree_trie`` and
    ``hstrat.build_tree_searchtable``. The pairwise phylogenetic distances of
    the two trees are then compared entry by entry.

    Parameters
    ----------
    raw_genome_df
        Genome records. Must provide a ``data_hex`` column plus the
        ``dstream_S``, ``dstream_storage_bitoffset``,
        ``dstream_storage_bitwidth``, ``dstream_T_bitoffset`` and
        ``dstream_T_bitwidth`` columns, each holding exactly one distinct
        value within the sampled rows. The frame itself is not modified.
    dstream_algo
        Expression naming the dstream algorithm, evaluated with only
        ``dstream`` in scope (e.g. ``"dstream.tilted_algo"``).
    n_sample
        Number of genomes to sample; the default of 100 is the size the
        dendropy/naive pipeline can handle.
    seed
        Optional seed making the sampling and shuffling reproducible. When
        ``None`` (default), NumPy's global random state is used, as before.

    Returns
    -------
    np.ndarray
        Square array whose ``[i, j]`` entry is
        ``(naive - shortcut) / ((naive + shortcut) / 2)`` for the pairwise
        distance between taxa ``i`` and ``j`` (in sorted-label order).
        Entries where both distances are zero, such as the diagonal, are NaN.

    Raises
    ------
    ValueError
        If fewer than ``n_sample`` rows are available, if a configuration
        column holds more than one distinct value, or if the two distance
        matrices do not cover the same labels.
    """
    rng = None if seed is None else np.random.default_rng(seed)

    # `sample` returns a new frame, so adding the label column below does not
    # touch the caller's data.
    sampled_df = raw_genome_df.sample(n=n_sample, random_state=rng).assign(
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    # NOTE(review): `eval` runs arbitrary code from `dstream_algo`; only the
    # `dstream` module is exposed by name, but builtins remain reachable.
    # Only pass trusted strings here.
    surf_kwargs = {
        "dstream_algo": eval(dstream_algo, {"dstream": dstream}),
        # `.unique().item()` doubles as validation: it raises unless the
        # column holds exactly one distinct value.
        **{
            column: sampled_df[column].unique().item()
            for column in (
                "dstream_S",
                "dstream_storage_bitoffset",
                "dstream_storage_bitwidth",
                "dstream_T_bitoffset",
                "dstream_T_bitwidth",
            )
        },
    }

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]

    # Scramble each surface's storage in place to ensure synthetic data.
    # NOTE(review): reaches into private hstrat attributes.
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The comparison below is positional, so both tables must cover the same
    # labels in the same order.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance matrices cover different labels",
        )

    difference = naive_dist.values - shortcut_dist.values
    mean_dist = naive_dist.values / 2 + shortcut_dist.values / 2
    # 0/0 entries (e.g. the diagonal) are expected and left as NaN for the
    # nan-aware summaries downstream; silence the resulting RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return difference / mean_dist


In [6]:
# Load the "sample" genome dataset from OSF (network access required).
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same reconstruction-consistency check for each dstream algorithm
# and report the full normalized-error matrix plus nan-aware summaries.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # The f-string body keeps its original indentation so the printed text
    # is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10308.20it/s]
100%|██████████| 100/100 [00:00<00:00, 368.73it/s]
6138it [00:00, 629037.99it/s]
100%|██████████| 100/100 [00:00<00:00, 230836.76it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,-0.063571,0.000000,0.755125,0.000000,0.000000,0.000000,0.395005,0.000000,...,0.000000,0.000000,0.000000,-0.035514,-0.066409,0.000000,0.000000,0.000000,0.401960,0.591642
1,0.000000,,0.132157,0.000000,-0.910276,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.887784,0.000000,0.375370,0.137639,0.000000,0.000000,0.000000,-0.075933,-0.102758
2,-0.063571,0.132157,,0.268683,0.000000,-0.046171,-0.009374,-0.041442,-0.753028,0.494561,...,-0.060242,0.083138,0.558234,-0.623757,0.126772,0.121417,-0.037749,-0.039845,0.000000,0.000000
3,0.000000,0.000000,0.268683,,-0.232049,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.107609,0.292355,0.000000,0.000000,0.000000,-0.107243,-0.169874
4,0.755125,-0.910276,0.000000,-0.232049,,0.515228,0.008083,0.454979,0.416618,-0.147408,...,0.482276,-0.079266,-0.162034,0.000000,0.000000,-0.104747,0.409265,0.306624,0.742831,1.209290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.121417,0.000000,-0.104747,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.048394,0.132209,,0.000000,0.000000,-0.048229,-0.076538
96,0.000000,0.000000,-0.037749,0.000000,0.409265,0.000000,0.000000,0.000000,0.554056,0.000000,...,0.000000,0.000000,0.000000,-0.025695,-0.038732,0.000000,,0.000000,0.277242,0.355956
97,0.000000,0.000000,-0.039845,0.000000,0.306624,0.000000,0.000000,0.000000,0.189889,0.000000,...,0.000000,0.000000,0.000000,-0.026649,-0.040942,0.000000,0.000000,,0.207245,0.266442
98,0.401960,-0.075933,0.000000,-0.107243,0.742831,0.322119,0.003715,0.297490,0.280597,-0.564395,...,0.274923,-0.269797,-0.318041,0.000000,0.000000,-0.048229,0.277242,0.207245,,0.726399


np.nanmean(norm_err)=np.float64(0.017686485798375767)
    np.nanmean(np.abs(norm_err))=np.float64(0.13500957133864516)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0059199559856386)
    


100%|██████████| 100/100 [00:00<00:00, 33117.28it/s]
100%|██████████| 100/100 [00:00<00:00, 435.86it/s]
5971it [00:00, 580398.36it/s]
100%|██████████| 100/100 [00:00<00:00, 279993.59it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.663090e-07,0.000000e+00,0.000000e+00,-2.026805e-07,-2.231192e-07,0.000000e+00,-2.564306e-07,-3.127474e-07,-1.824755e-07,...,-6.468998e-07,0.000000e+00,-2.257956e-07,0.000000e+00,0.003029,0.000000e+00,-4.810536e-07,0.000000e+00,0.000000,-2.442010e-07
1,-2.663090e-07,,-2.841748e-07,-1.832096e-07,-3.350698e-07,-3.460757e-07,-2.408910e-07,-3.848474e-07,-4.762210e-07,-3.085254e-07,...,-7.557434e-07,-2.024045e-07,-3.492865e-07,-2.019391e-07,0.010231,-1.553611e-07,-6.423018e-07,-1.623868e-07,0.024302,6.407578e-03
2,0.000000e+00,-2.841748e-07,,0.000000e+00,-2.128657e-07,-2.251015e-07,0.000000e+00,-2.590524e-07,-3.376790e-07,-1.906901e-07,...,-1.018249e-06,0.000000e+00,-2.278259e-07,0.000000e+00,0.003168,0.000000e+00,-5.426836e-07,0.000000e+00,0.000000,-2.591405e-07
3,0.000000e+00,-1.832096e-07,0.000000e+00,,3.021936e-02,-1.551050e-07,0.000000e+00,-1.705022e-07,-2.040541e-07,-1.392103e-07,...,-3.016989e-07,0.000000e+00,-1.563937e-07,0.000000e+00,0.004611,0.000000e+00,-2.667240e-07,2.936541e-02,-0.004383,-5.667618e-03
4,-2.026805e-07,-3.350698e-07,-2.128657e-07,3.021936e-02,,-2.874422e-07,-1.864677e-07,-3.136909e-07,-3.695944e-07,-2.599410e-07,...,-5.228431e-07,-1.634136e-07,-2.896537e-07,-1.510039e-07,0.004301,-1.313119e-07,-3.125025e-02,-1.650432e-07,-0.008223,-1.044249e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-1.553611e-07,0.000000e+00,0.000000e+00,-1.313119e-07,-1.358678e-07,0.000000e+00,-1.475388e-07,-1.700956e-07,-1.225225e-07,...,-2.564277e-07,0.000000e+00,-1.368556e-07,0.000000e+00,0.002019,,-2.100686e-07,0.000000e+00,0.000000,-1.475674e-07
96,-4.810536e-07,-6.423018e-07,-5.426836e-07,-2.667240e-07,-3.125025e-02,-4.874700e-07,-3.987628e-07,-5.680851e-07,-7.824013e-07,-4.133127e-07,...,-2.061569e-06,-3.063410e-07,-4.938646e-07,-2.654461e-07,0.006895,-2.100686e-07,,-2.990101e-02,-0.012813,-1.915765e-02
97,0.000000e+00,-1.623868e-07,0.000000e+00,2.936541e-02,-1.650432e-07,-1.399159e-07,0.000000e+00,-1.523245e-07,-1.785534e-07,-1.268507e-07,...,-2.490990e-07,0.000000e+00,-1.409637e-07,0.000000e+00,0.004196,0.000000e+00,-2.990101e-02,,-0.008032,-1.013670e-02
98,0.000000e+00,2.430162e-02,0.000000e+00,-4.383331e-03,-8.223078e-03,-1.279946e-07,5.283978e-03,-1.383008e-07,5.242834e-03,3.840221e-03,...,-2.136683e-07,0.000000e+00,-1.288708e-07,2.215450e-02,0.007762,0.000000e+00,-1.281325e-02,-8.032243e-03,,-3.744968e-02


np.nanmean(norm_err)=np.float64(0.001068553487061211)
    np.nanmean(np.abs(norm_err))=np.float64(0.010043350311156619)
    np.nanmedian(norm_err)=np.float64(-1.4670243677373475e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(3.143869618510051e-07)
    


In [7]:
# Load the "tail" genome dataset from OSF (network access required).
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same reconstruction-consistency check for each dstream algorithm
# and report the full normalized-error matrix plus nan-aware summaries.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # The f-string body keeps its original indentation so the printed text
    # is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34376.72it/s]
100%|██████████| 100/100 [00:00<00:00, 421.66it/s]
5960it [00:00, 632958.22it/s]
100%|██████████| 100/100 [00:00<00:00, 371506.11it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33808.67it/s]
100%|██████████| 100/100 [00:00<00:00, 988.41it/s]
5945it [00:00, 605706.93it/s]
100%|██████████| 100/100 [00:00<00:00, 393831.36it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.063907e-07,-1.037442e-07,-1.035082e-07,-2.127775e-07,-1.063908e-07,-2.126939e-07,-1.063744e-07,-2.073329e-07,-2.130620e-07,...,-1.160840e-07,-1.036601e-07,-2.394485e-07,-2.192483e-07,-2.075402e-07,-1.035599e-07,-1.097361e-07,-2.128073e-07,-1.064808e-07,-1.037099e-07
1,-1.063907e-07,,0.000000e+00,0.000000e+00,-1.125477e-07,0.000000e+00,-1.125009e-07,0.000000e+00,-1.035600e-07,-1.236684e-07,...,0.000000e+00,0.000000e+00,-1.062584e-07,-1.064493e-07,-1.036634e-07,0.000000e+00,0.000000e+00,-1.234968e-07,0.000000e+00,0.000000e+00
2,-1.037442e-07,0.000000e+00,,0.000000e+00,-1.036357e-07,0.000000e+00,-1.035960e-07,0.000000e+00,-1.201429e-07,-1.037707e-07,...,0.000000e+00,0.000000e+00,-1.036183e-07,-1.037999e-07,-1.166054e-07,0.000000e+00,0.000000e+00,-1.036498e-07,0.000000e+00,0.000000e+00
3,-1.035082e-07,0.000000e+00,0.000000e+00,,-1.034002e-07,0.000000e+00,-1.033607e-07,0.000000e+00,-1.127437e-07,-1.035346e-07,...,0.000000e+00,0.000000e+00,-1.033830e-07,-1.035637e-07,-1.128663e-07,0.000000e+00,0.000000e+00,-1.034143e-07,0.000000e+00,0.000000e+00
4,-2.127775e-07,-1.125477e-07,-1.036357e-07,-1.034002e-07,,-1.125478e-07,-2.391000e-07,-1.125295e-07,-2.071162e-07,-2.254095e-07,...,-1.062589e-07,-1.035518e-07,-2.125129e-07,-2.128948e-07,-2.073231e-07,-1.034517e-07,-1.065529e-07,-2.251244e-07,-1.094174e-07,-1.036015e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.035599e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.034517e-07,0.000000e+00,-1.034122e-07,0.000000e+00,-1.128049e-07,-1.035863e-07,...,0.000000e+00,0.000000e+00,-1.034345e-07,-1.036154e-07,-1.129277e-07,,0.000000e+00,-1.034659e-07,0.000000e+00,0.000000e+00
96,-1.097361e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.065529e-07,0.000000e+00,-1.065110e-07,0.000000e+00,-1.038223e-07,-1.066956e-07,...,0.000000e+00,0.000000e+00,-1.095953e-07,-1.130525e-07,-1.039263e-07,0.000000e+00,,-1.065679e-07,0.000000e+00,0.000000e+00
97,-2.128073e-07,-1.234968e-07,-1.036498e-07,-1.034143e-07,-2.251244e-07,-1.159871e-07,-2.250308e-07,-1.276053e-07,-2.071445e-07,-3.071193e-07,...,-1.062738e-07,-1.035659e-07,-2.125426e-07,-2.129246e-07,-2.073514e-07,-1.034659e-07,-1.065679e-07,,-1.094332e-07,-1.036156e-07
98,-1.064808e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.094174e-07,0.000000e+00,-1.093732e-07,0.000000e+00,-1.036453e-07,-1.095679e-07,...,0.000000e+00,0.000000e+00,-1.063483e-07,-1.065395e-07,-1.037489e-07,0.000000e+00,0.000000e+00,-1.094332e-07,,0.000000e+00


np.nanmean(norm_err)=np.float64(-9.828085996307527e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.828085996307527e-08)
    np.nanmedian(norm_err)=np.float64(-1.0382544326622101e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0382544326622101e-07)
    
