In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-13T00:07:33.491705+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
numpy                             : 2.1.2
pandas                            : 2.2.3
downstream                        : 1.14.3
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> pd.DataFrame:
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    # raw_genome_df = raw_genome_df[raw_genome_df["layer"] > 1700].sample(20).copy()
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
        dstream_S=raw_genome_df["dstream_S"].unique().item(),
        dstream_storage_bitoffset=raw_genome_df["dstream_storage_bitoffset"]
        .unique()
        .item(),
        dstream_storage_bitwidth=raw_genome_df["dstream_storage_bitwidth"]
        .unique()
        .item(),
        dstream_T_bitoffset=raw_genome_df["dstream_T_bitoffset"]
        .unique()
        .item(),
        dstream_T_bitwidth=raw_genome_df["dstream_T_bitwidth"].unique().item(),
    )

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = (
        pd.DataFrame(
            apc.RosettaTree(naive_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    shortcut_dist = (
        pd.DataFrame(
            apc.RosettaTree(shortcut_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    return (naive_dist.values - shortcut_dist.values) / (
        naive_dist.values / 2 + shortcut_dist.values / 2
    )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 7791.90it/s]
100%|██████████| 100/100 [00:00<00:00, 350.46it/s]
6119it [00:00, 581931.98it/s]
100%|██████████| 100/100 [00:00<00:00, 234843.45it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.062778,-0.557466,-0.029547,-0.442335,-0.026957,0.172551,0.051898,-0.036531,-0.026533,...,-0.030182,0.057162,0.250475,0.059812,0.321010,-0.000008,0.311252,0.233437,-0.000002,0.279053
1,0.062778,,-0.405737,-0.036417,-0.458227,-0.032562,0.238124,0.000000,-0.047643,-0.031944,...,-0.037387,0.000000,0.532985,0.000000,0.263973,-0.000010,0.255201,0.487553,-0.000002,0.327697
2,-0.557466,-0.405737,,0.442105,-0.511373,0.620259,-0.304174,-0.325547,0.293557,0.601796,...,0.460211,-0.363745,-0.023168,-0.383396,-0.021233,-0.324095,-0.020551,-0.021432,-0.030289,-0.026139
3,-0.029547,-0.036417,0.442105,,0.235441,0.000000,-0.072547,-0.029293,0.000000,0.000000,...,0.000000,-0.032691,-0.025966,-0.034436,-0.023559,-0.078282,-0.022722,-0.023804,-0.035257,-0.029757
4,-0.442335,-0.458227,-0.511373,0.235441,,0.197621,-0.375033,-0.358497,0.380214,0.191991,...,0.245738,-0.405376,-0.024964,-0.429935,-0.022732,-0.405780,-0.021952,-0.022960,-0.033435,-0.028448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.000008,-0.000010,-0.324095,-0.078282,-0.405780,-0.062398,0.000000,-0.000007,-0.158613,-0.060169,...,-0.082902,-0.000009,0.129691,-0.000009,0.115019,,0.110073,0.116485,-0.000004,0.154223
96,0.311252,0.255201,-0.020551,-0.022722,-0.021952,-0.021159,0.226325,0.223464,-0.026638,-0.020896,...,-0.023096,0.239274,0.073971,0.246905,0.000000,0.110073,,0.069479,-0.160242,0.000000
97,0.233437,0.487553,-0.021432,-0.023804,-0.022960,-0.022094,0.239900,0.419294,-0.028138,-0.021808,...,-0.024215,0.452992,0.000000,0.469474,0.071417,0.116485,0.069479,,-0.738279,0.084802
98,-0.000002,-0.000002,-0.030289,-0.035257,-0.033435,-0.031631,-0.000004,-0.000002,-0.045676,-0.031048,...,-0.036165,-0.000002,-0.813260,-0.000002,-0.165420,-0.000004,-0.160242,-0.738279,,-0.202419


np.nanmean(norm_err)=np.float64(0.011905194494487025)
    np.nanmean(np.abs(norm_err))=np.float64(0.1264487970511644)
    np.nanmedian(norm_err)=np.float64(-1.6071141047275627e-06)
    np.nanmedian(np.abs(norm_err))=np.float64(0.03186734710153634)
    


100%|██████████| 100/100 [00:00<00:00, 32742.42it/s]
100%|██████████| 100/100 [00:00<00:00, 371.71it/s]
5985it [00:00, 579743.87it/s]
100%|██████████| 100/100 [00:00<00:00, 241468.28it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-4.067959e-07,0.012528,0.006608,-1.818582e-02,-4.929862e-07,6.174253e-03,5.940361e-02,0.007728,2.191059e-02,...,5.071278e-02,-3.079390e-07,-6.821866e-07,-2.084544e-07,0.005789,-4.838553e-07,-4.063330e-07,-1.845681e-06,0.007846,-4.859439e-07
1,-4.067959e-07,,-0.005884,-0.004141,-1.396812e-07,-3.491398e-07,-3.966885e-03,-8.475203e-02,-0.004555,-4.717227e-02,...,-1.643223e-01,-2.819274e-03,-2.403078e-07,-2.225740e-03,-0.003804,-2.964279e-07,-2.188753e-03,-4.929028e-07,-0.004596,-2.936359e-07
2,1.252819e-02,-5.883593e-03,,-0.047210,1.360851e-02,-7.143568e-03,-2.287799e-07,0.000000e+00,0.000000,0.000000e+00,...,0.000000e+00,-1.927273e-02,1.869859e-02,-1.296821e-02,0.000000,1.497538e-02,-1.263543e-02,2.005466e-02,0.000000,6.853837e-03
3,6.607947e-03,-4.141500e-03,-0.047210,,2.961944e-02,-4.728549e-03,-6.123738e-02,-2.384497e-07,-0.005782,-1.273658e-07,...,-2.192950e-07,-1.172959e-02,8.000671e-03,-9.051499e-03,-0.058423,3.162472e-02,-8.888156e-03,8.238887e-03,-0.248731,4.599415e-03
4,-1.818582e-02,-1.396812e-07,0.013609,0.029619,,-1.587403e-07,8.943372e-03,6.473205e-03,0.010350,3.545451e-03,...,5.978757e-03,0.000000e+00,-2.627515e-02,0.000000e+00,0.008558,3.663279e-02,-1.395720e-07,-2.247005e-02,0.010448,-2.576769e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-4.838553e-07,-2.964279e-07,0.014975,0.031625,3.663279e-02,-3.397062e-07,9.514024e-03,7.137095e-03,0.011122,3.735720e-03,...,6.540678e-03,-1.969307e-07,-2.960045e-07,-1.508809e-07,0.009079,,-2.961821e-07,-6.107633e-07,0.011235,-3.317766e-07
96,-4.063330e-07,-2.188753e-03,-0.012635,-0.008888,-1.395720e-07,-2.472652e-03,-8.512853e-03,-6.002161e-03,-0.009778,-3.399386e-03,...,-5.574693e-03,-2.342418e-07,-2.399848e-07,-1.513926e-07,-0.008163,-2.961821e-07,,-4.922235e-07,-0.009865,-2.933946e-07
97,-1.845681e-06,-4.929028e-07,0.020055,0.008239,-2.247005e-02,-6.253844e-07,7.575435e-03,9.930624e-02,0.010057,2.572288e-02,...,7.719171e-02,-4.186692e-07,-1.647451e-06,-2.539143e-07,0.007003,-6.107633e-07,-4.922235e-07,,0.010256,-6.140950e-07
98,7.845601e-03,-4.595571e-03,0.000000,-0.248731,1.044772e-02,-5.329845e-03,-1.804960e-07,0.000000e+00,0.000000,0.000000e+00,...,0.000000e+00,-1.349200e-02,9.889406e-03,-1.006615e-02,0.000000,1.123497e-02,-9.864518e-03,1.025602e-02,,5.166793e-03


np.nanmean(norm_err)=np.float64(0.001846417150864281)
    np.nanmean(np.abs(norm_err))=np.float64(0.012111257257580811)
    np.nanmedian(norm_err)=np.float64(-1.35571891679506e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0035450175735105057)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 34141.67it/s]
100%|██████████| 100/100 [00:00<00:00, 413.90it/s]
5948it [00:00, 638718.87it/s]
100%|██████████| 100/100 [00:00<00:00, 284745.69it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32268.84it/s]
100%|██████████| 100/100 [00:00<00:00, 1000.52it/s]
5937it [00:00, 531994.16it/s]
100%|██████████| 100/100 [00:00<00:00, 386571.80it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.036938e-07,-1.201142e-07,0.000000e+00,0.000000e+00,-1.064876e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,-1.064913e-07,-1.037009e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.035586e-07,0.000000e+00
1,-1.036938e-07,,-2.077958e-07,-1.066384e-07,-1.098766e-07,-2.074343e-07,-1.065058e-07,-1.066651e-07,-1.097652e-07,-1.037220e-07,...,-1.065881e-07,-1.038984e-07,-2.074414e-07,-2.134485e-07,-1.038878e-07,-1.036127e-07,-1.036332e-07,-1.036655e-07,-2.131471e-07,-1.035679e-07
2,-1.201142e-07,-2.077958e-07,,-1.038236e-07,-1.039771e-07,-2.134058e-07,-1.036979e-07,-1.038489e-07,-1.038774e-07,-1.067080e-07,...,-1.037760e-07,-1.203888e-07,-2.134133e-07,-2.078100e-07,-1.068836e-07,-1.163453e-07,-1.129262e-07,-1.129646e-07,-2.075243e-07,-1.096062e-07
3,0.000000e+00,-1.066384e-07,-1.038236e-07,,0.000000e+00,-1.036431e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,-1.036466e-07,-1.129619e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.127930e-07,0.000000e+00
4,0.000000e+00,-1.098766e-07,-1.039771e-07,0.000000e+00,,-1.037961e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,-1.037996e-07,-1.068078e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.066569e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-1.036127e-07,-1.163453e-07,0.000000e+00,0.000000e+00,-1.064021e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,-1.064058e-07,-1.036198e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-1.034777e-07,0.000000e+00
96,0.000000e+00,-1.036332e-07,-1.129262e-07,0.000000e+00,0.000000e+00,-1.064237e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,-1.064274e-07,-1.036402e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.034981e-07,0.000000e+00
97,0.000000e+00,-1.036655e-07,-1.129646e-07,0.000000e+00,0.000000e+00,-1.064578e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,-1.064615e-07,-1.036726e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.035303e-07,0.000000e+00
98,-1.035586e-07,-2.131471e-07,-2.075243e-07,-1.127930e-07,-1.066569e-07,-2.071637e-07,-1.094139e-07,-1.095820e-07,-1.065520e-07,-1.035867e-07,...,-1.095007e-07,-1.037627e-07,-2.071707e-07,-2.399819e-07,-1.037521e-07,-1.034777e-07,-1.034981e-07,-1.035303e-07,,-1.034330e-07


np.nanmean(norm_err)=np.float64(-9.83170665558711e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.83170665558711e-08)
    np.nanmedian(norm_err)=np.float64(-1.0382776096065631e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0382776096065631e-07)
    
