In [1]:
%load_ext watermark


In [2]:
from typing import Optional

import alifedata_phyloinformatics_convert as apc
from downstream import dstream
from hstrat import hstrat
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-25T00:23:49.813275+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
pandas                            : 2.2.3
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Date-stamped label for this analysis. Presumably intended as the output
# subdirectory for teeplot-saved figures; no teeplot call is visible in this
# part of the notebook -- TODO confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare trailing expression echoes the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build a label-sorted pairwise phylogenetic distance table for a tree.

    Parameters
    ----------
    tree_df : pd.DataFrame
        Tree dataframe as returned by the ``hstrat.build_tree_*`` calls below.

    Returns
    -------
    pd.DataFrame
        Distance table with rows and columns sorted by label, so that tables
        built from different trees over the same labels line up positionally.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the data table object;
    # it is read here exactly as the original code did.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_sample: int = 100,
    seed: Optional[int] = None,
) -> np.ndarray:
    """Compare trie vs. searchtable reconstructions on a sample of genomes.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled in place, and two trees are built from the same
    population: one with ``hstrat.build_tree_trie`` ("naive") and one with
    ``hstrat.build_tree_searchtable`` ("shortcut"). Pairwise phylogenetic
    distances from both trees are then compared entrywise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns ``data_hex``, ``dstream_S``,
        ``dstream_storage_bitoffset``, ``dstream_storage_bitwidth``,
        ``dstream_T_bitoffset`` and ``dstream_T_bitwidth``; each ``dstream_*``
        column must hold a single unique value among the sampled rows.
    dstream_algo : str
        Expression naming the dstream algorithm, e.g.
        ``"dstream.tilted_algo"``; evaluated with only ``dstream`` in scope.
    n_sample : int, default 100
        Maximum number of genomes to sample (kept small enough for the
        dendropy/naive path to handle). Capped at ``len(raw_genome_df)``.
    seed : int, optional
        Seeds both the row sampling and the storage shuffle. When ``None``
        (default) the global NumPy random state is used, as before.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / mean(naive, shortcut)`` over
        label-sorted taxa. Entries where both distances are zero (including
        the diagonal) are NaN.

    Raises
    ------
    ValueError
        If ``raw_genome_df`` is empty, if a ``dstream_*`` column holds more
        than one value in the sample, or if the two reconstructions do not
        cover the same taxon labels.
    """
    if len(raw_genome_df) == 0:
        raise ValueError("raw_genome_df is empty; nothing to sample")

    # `.sample` returns a new frame, so the caller's data is left untouched;
    # random_state=None falls back to the global NumPy random state.
    sampled_df = raw_genome_df.sample(
        min(n_sample, len(raw_genome_df)),
        random_state=seed,
    )
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    def unique_item(column: str):
        # `.item()` raises unless the column holds exactly one unique value
        return sampled_df[column].unique().item()

    # CAUTION: `eval` runs arbitrary code; the restricted globals dict is not
    # a sandbox. Only pass notebook-authored literals as `dstream_algo`.
    kwargs = dict(
        dstream_algo=eval(str(dstream_algo), {"dstream": dstream}),
        dstream_S=unique_item("dstream_S"),
        dstream_storage_bitoffset=unique_item("dstream_storage_bitoffset"),
        dstream_storage_bitwidth=unique_item("dstream_storage_bitwidth"),
        dstream_T_bitoffset=unique_item("dstream_T_bitoffset"),
        dstream_T_bitwidth=unique_item("dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]

    # With no seed, keep using the global RNG so behavior matches callers
    # that seed NumPy globally (or not at all).
    if seed is None:
        shuffle = np.random.shuffle
    else:
        shuffle = np.random.default_rng(seed).shuffle
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The arrays are compared positionally below, so both tables must be
    # indexed by exactly the same labels in the same (sorted) order.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions cover different taxon labels"
        )

    naive_values = naive_dist.values
    shortcut_values = shortcut_dist.values
    # Zero-distance pairs (e.g. the diagonal) divide 0 by 0; the resulting
    # NaNs are intended, so silence the RuntimeWarning instead of emitting it.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_values - shortcut_values) / (
            naive_values / 2 + shortcut_values / 2
        )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the identical comparison for each dstream algorithm instead of
# repeating the call/display/print sequence once per algorithm.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # nan-aware statistics: entries with zero distance in both trees are NaN
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10798.65it/s]
100%|██████████| 100/100 [00:00<00:00, 361.27it/s]
6128it [00:00, 630338.80it/s]
100%|██████████| 100/100 [00:00<00:00, 216984.17it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.0,,0.028055,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.0,0.028055,,-0.042113,0.055014,-0.036699,-0.077267,-0.067881,-0.072956,-0.097786,...,-0.024960,-0.042245,-0.033793,-0.041292,-0.055986,-0.089963,-0.037525,-0.080368,-0.034321,-0.025560
3,0.0,0.000000,-0.042113,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,-0.247614,0.000000,0.000000,-0.301894,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.0,0.000000,0.055014,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.000000,-0.089963,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,-0.179965,0.000000,,0.000000,0.000000,0.000000,0.000000
96,0.0,0.000000,-0.037525,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,-0.227203,0.000000,0.000000,-0.272092,0.000000,,0.000000,0.000000,0.000000
97,0.0,0.000000,-0.080368,0.000000,0.000000,0.062200,0.000000,0.000000,0.000000,0.000000,...,0.034618,0.000000,0.000000,-0.169826,0.000000,0.000000,0.000000,,0.000000,0.035782
98,0.0,0.000000,-0.034321,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,-0.212209,0.000000,0.000000,-0.250865,0.000000,0.000000,0.000000,,0.000000


np.nanmean(norm_err)=np.float64(-0.0015649705988213282)
    np.nanmean(np.abs(norm_err))=np.float64(0.06295530331990813)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 35861.01it/s]
100%|██████████| 100/100 [00:00<00:00, 410.65it/s]
5999it [00:00, 570480.88it/s]
100%|██████████| 100/100 [00:00<00:00, 275759.63it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.606758e-02,0.000000e+00,0.000000,0.008727,-1.443367e-01,-1.213957e-07,9.869784e-03,-0.018004,-1.670250e-07,...,-1.251940e-07,0.007909,0.002220,-1.571441e-02,-0.049645,0.000000,0.013917,0.004479,5.597451e-03,-1.104302e-02
1,-0.016068,,0.000000e+00,-0.001830,0.000698,-1.956788e-02,4.494782e-03,8.234209e-04,-0.047106,6.944938e-03,...,4.678072e-03,0.001726,0.000723,-3.780861e-02,-0.015969,0.008228,0.001374,0.000000,-3.037641e-07,0.000000e+00
2,0.000000,0.000000e+00,,0.000000,-0.033513,-2.409905e-07,-1.760243e-07,-4.075426e-02,0.000000,-2.914914e-07,...,-1.841245e-07,-0.010324,-0.247103,-3.690216e-07,0.000000,0.000000,-0.007511,0.000000,-3.875977e-07,0.000000e+00
3,0.000000,-1.829700e-03,0.000000e+00,,0.002092,-3.297863e-07,-2.189212e-07,2.699300e-03,0.000000,-4.315079e-07,...,-2.315924e-07,0.017185,0.002206,-6.279062e-07,0.000000,0.000000,0.007614,0.001493,2.795664e-03,-8.348307e-04
4,0.008727,6.977976e-04,-3.351335e-02,0.002092,,2.511637e-03,4.948852e-04,-1.693898e-07,0.003721,6.883174e-04,...,5.108279e-04,0.004215,-0.021047,3.230423e-03,0.002151,0.000775,0.071914,0.005288,6.670353e-03,4.748161e-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,8.227548e-03,0.000000e+00,0.000000,0.000775,-2.328337e-07,-1.774972e-07,9.333085e-04,0.000000,-2.955528e-07,...,-1.857367e-07,0.002292,0.000807,-3.502334e-07,0.000000,,0.001709,-0.049252,-2.280115e-02,4.660110e-03
96,0.013917,1.373635e-03,-7.511444e-03,0.007614,0.071914,4.421648e-03,7.602203e-04,8.994770e-02,0.010333,1.337693e-03,...,7.985028e-04,0.000000,0.000000,7.269012e-03,0.003414,0.001709,,0.009664,1.555834e-02,7.137790e-04
97,0.004479,0.000000e+00,0.000000e+00,0.001493,0.005288,2.291848e-03,-3.811773e-02,6.152604e-03,0.003771,-5.709821e-02,...,-3.957889e-02,0.047814,0.002717,3.133236e-03,0.001906,-0.049252,0.009664,,8.808451e-03,0.000000e+00
98,0.005597,-3.037641e-07,-3.875977e-07,0.002796,0.006670,3.010108e-03,-1.099437e-02,8.108505e-03,0.006209,-1.840630e-02,...,-1.150907e-02,0.066543,0.003457,4.650425e-03,0.002378,-0.022801,0.015558,0.008808,,-1.677910e-07


np.nanmean(norm_err)=np.float64(-0.0005544234144061377)
    np.nanmean(np.abs(norm_err))=np.float64(0.009263538435599065)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.000740547532685463)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the identical comparison for each dstream algorithm instead of
# repeating the call/display/print sequence once per algorithm.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_name,
    )
    display(pd.DataFrame(norm_err))
    # nan-aware statistics: entries with zero distance in both trees are NaN
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 36999.86it/s]
100%|██████████| 100/100 [00:00<00:00, 431.38it/s]
5957it [00:00, 624371.37it/s]
100%|██████████| 100/100 [00:00<00:00, 318958.48it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 28581.29it/s]
100%|██████████| 100/100 [00:00<00:00, 1008.54it/s]
5925it [00:00, 600271.77it/s]
100%|██████████| 100/100 [00:00<00:00, 330520.41it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,-1.037261e-07,0.000000e+00,0.000000e+00,-1.096310e-07,-1.036702e-07,-1.038529e-07,-1.068146e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.036491e-07,0.000000e+00,0.000000e+00,-1.067207e-07,0.000000e+00,-1.037497e-07,0.000000e+00
1,0.000000e+00,,-1.037219e-07,0.000000e+00,0.000000e+00,-1.065639e-07,-1.036660e-07,-1.038488e-07,-1.202815e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.036449e-07,0.000000e+00,0.000000e+00,-1.130409e-07,0.000000e+00,-1.037455e-07,0.000000e+00
2,-1.037261e-07,-1.037219e-07,,-1.036950e-07,-1.035483e-07,-2.071380e-07,-2.557272e-07,-2.194287e-07,-2.076032e-07,-1.094495e-07,...,-1.036709e-07,-1.063558e-07,-1.037528e-07,-2.189737e-07,-1.037504e-07,-1.065646e-07,-2.074258e-07,-1.063805e-07,-2.256823e-07,-1.129637e-07
3,0.000000e+00,0.000000e+00,-1.036950e-07,,0.000000e+00,-1.095962e-07,-1.036391e-07,-1.038218e-07,-1.067816e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.036180e-07,0.000000e+00,0.000000e+00,-1.066878e-07,0.000000e+00,-1.037186e-07,0.000000e+00
4,0.000000e+00,0.000000e+00,-1.035483e-07,0.000000e+00,,-1.063806e-07,-1.034926e-07,-1.036747e-07,-1.096921e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.034715e-07,0.000000e+00,0.000000e+00,-1.095931e-07,0.000000e+00,-1.035719e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,0.000000e+00,-1.065646e-07,0.000000e+00,0.000000e+00,-1.036175e-07,-1.065056e-07,-1.066985e-07,-1.038504e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.064833e-07,0.000000e+00,,-1.037616e-07,0.000000e+00,-1.065896e-07,0.000000e+00
96,-1.067207e-07,-1.130409e-07,-2.074258e-07,-1.066878e-07,-1.095931e-07,-2.131087e-07,-2.073140e-07,-2.076794e-07,-2.262711e-07,-1.036024e-07,...,-1.066623e-07,-1.035636e-07,-1.130775e-07,-2.072718e-07,-1.098195e-07,-1.037616e-07,,-1.035869e-07,-2.074730e-07,-1.038400e-07
97,0.000000e+00,0.000000e+00,-1.063805e-07,0.000000e+00,0.000000e+00,-1.034434e-07,-1.063216e-07,-1.065139e-07,-1.036754e-07,0.000000e+00,...,0.000000e+00,0.000000e+00,0.000000e+00,-1.062994e-07,0.000000e+00,0.000000e+00,-1.035869e-07,,-1.064053e-07,0.000000e+00
98,-1.037497e-07,-1.037455e-07,-2.256823e-07,-1.037186e-07,-1.035719e-07,-2.071851e-07,-2.255500e-07,-2.194816e-07,-2.076505e-07,-1.094758e-07,...,-1.036945e-07,-1.063807e-07,-1.037764e-07,-2.190263e-07,-1.037741e-07,-1.065896e-07,-2.074730e-07,-1.064053e-07,,-1.164407e-07


np.nanmean(norm_err)=np.float64(-8.773349659297506e-08)
    np.nanmean(np.abs(norm_err))=np.float64(8.773349659297506e-08)
    np.nanmedian(norm_err)=np.float64(-1.0368828026452138e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0368828026452138e-07)
    
