In [1]:
# Load the `watermark` IPython extension so the `%watermark` magic is available below.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance for this run: timestamp, Python/IPython versions, machine
# details, and the versions of the imported packages (see the output below).
%watermark -diwmuv -iv


Last updated: 2025-09-15T15:25:10.531134+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3
hstrat                            : 1.20.10
numpy                             : 2.1.2
downstream                        : 1.14.3

Watermark: 2.4.3



In [4]:
# Subdirectory name associated with this notebook's outputs; presumably consumed
# by teeplot when saving figures (not referenced in the cells visible here --
# TODO confirm).
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build the pairwise phylogenetic distance table for a reconstructed tree.

    The tree is converted to dendropy via `apc.RosettaTree`, its phylogenetic
    distance matrix is extracted, and the resulting table is sorted by label on
    both axes so that tables from different reconstructions line up
    element-wise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # it is used here because it is directly consumable by `pd.DataFrame`.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    seed=None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of 100 genomes is decoded into hstrat surfaces, each
    surface's storage is shuffled, and a tree is reconstructed from the same
    population with both `hstrat.build_tree_trie` and
    `hstrat.build_tree_searchtable`. Pairwise distances between taxa are then
    compared between the two trees.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome table. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`; each `dstream_*`
        column must hold a single unique value within the sampled rows. The
        caller's frame is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, e.g. `"dstream.tilted_algo"`.
        It is evaluated with only `dstream` bound in the namespace.
    seed : int, optional
        When given, seeds both the row sampling and the storage shuffle so the
        result is reproducible. When omitted, NumPy's global random state is
        used, as before.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / mean(naive, shortcut)` pairwise
        distances, with rows and columns in sorted taxon-label order. Entries
        where both distances are zero (e.g. the diagonal) are NaN.

    Raises
    ------
    ValueError
        If fewer than 100 rows are available to sample, or if a `dstream_*`
        column holds more than one unique value among the sampled rows.
    """
    if seed is None:
        sample_state = None
        shuffle = np.random.shuffle  # legacy global RNG, as before
    else:
        rng = np.random.default_rng(seed)
        sample_state = rng
        shuffle = rng.shuffle

    # sample to size dendropy/naive can handle
    sampled_df = raw_genome_df.sample(100, random_state=sample_state)
    taxon_labels = pd.Series(
        np.arange(len(sampled_df)).astype(str),
        index=sampled_df.index,
        name="taxon_label",
    )

    # `.unique().item()` raises unless the column holds exactly one value,
    # which doubles as a sanity check that the sample is homogeneous.
    surf_kwargs = {
        field: sampled_df[field].unique().item()
        for field in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(review): `eval` executes arbitrary expressions; only pass trusted,
    # hard-coded algorithm names here.
    surf_kwargs["dstream_algo"] = eval(str(dstream_algo), {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=taxon_labels,
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=taxon_labels,
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive = _pairwise_distance_frame(naive_df).values
    shortcut = _pairwise_distance_frame(shortcut_df).values

    # 0/0 on the diagonal (and for any other zero-distance pair) is expected;
    # keep the NaN result but silence the RuntimeWarning it would emit.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Fetch the genome table once; both algorithm runs below sample from it.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same comparison for each dstream algorithm: show the normalized
# error matrix, then print its NaN-aware summary statistics.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The f-string layout (including its leading spaces) is kept verbatim so
    # the printed text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10655.99it/s]
100%|██████████| 100/100 [00:00<00:00, 358.80it/s]
6132it [00:00, 609856.36it/s]
100%|██████████| 100/100 [00:00<00:00, 257161.50it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.0,,0.00000,0.105467,-0.241232,0.000000,-0.045169,-0.068251,0.000000,-0.058333,...,-0.266591,0.141325,-0.061966,0.000000,-0.170931,-0.115982,-0.420754,0.000000,0.000000,-0.459948
2,0.0,0.000000,,0.000000,0.431080,0.000000,0.201966,0.000000,0.000000,0.298926,...,0.519365,0.000000,0.330873,0.000000,0.248464,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,0.105467,0.00000,,-0.049412,0.082326,-0.260497,0.000000,0.000000,-0.323693,...,-0.052601,0.000000,-0.340297,0.131014,-0.039151,0.000000,0.568803,0.119566,-0.318883,0.607768
4,0.0,-0.241232,0.43108,-0.049412,,-0.167960,0.448914,-0.130152,0.457008,0.642490,...,0.000000,-0.065967,0.703502,-0.345920,0.163591,-0.427610,-0.150452,-0.295272,-0.074163,-0.161716
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,-0.115982,0.00000,0.000000,-0.427610,-0.088597,-0.037579,0.722652,0.000000,-0.046265,...,-0.464131,0.000000,-0.048521,-0.147640,-1.113866,,-0.081248,-0.133263,0.355148,-0.086015
96,0.0,-0.420754,0.00000,0.568803,-0.150452,-0.315671,-0.033422,-0.047577,0.000000,-0.040121,...,-0.159941,0.734953,-0.041807,-0.240639,-0.119738,-0.081248,,-0.103319,0.000000,0.000000
97,0.0,0.000000,0.00000,0.119566,-0.295272,0.000000,-0.050559,-0.078612,0.000000,-0.067648,...,-0.334182,0.167848,-0.072581,0.000000,-0.196401,-0.133263,-0.103319,,0.000000,-0.112757
98,0.0,0.000000,0.00000,-0.318883,-0.074163,0.000000,-0.742082,0.372387,0.000000,-1.073888,...,-0.081586,-0.446492,-1.180002,0.000000,-0.053225,0.355148,0.000000,0.000000,,0.000000


np.nanmean(norm_err)=np.float64(-0.028489721623014747)
    np.nanmean(np.abs(norm_err))=np.float64(0.14478357712267634)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.06691556165164175)
    


100%|██████████| 100/100 [00:00<00:00, 33557.12it/s]
100%|██████████| 100/100 [00:00<00:00, 443.64it/s]
5963it [00:00, 543461.35it/s]
100%|██████████| 100/100 [00:00<00:00, 274316.81it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.000613,-1.631512e-07,-1.797433e-07,-2.067919e-07,6.222244e-02,-2.228261e-07,-3.018245e-07,-3.544249e-07,-4.563713e-07,...,-6.279046e-07,-3.574783e-07,-2.962873e-07,-3.910207e-07,-8.588420e-07,-3.360576e-07,-1.738507e-07,-2.675626e-07,-0.010906,-8.054899e-07
1,-6.126638e-04,,1.818384e-03,3.176514e-02,-4.272237e-03,-1.397081e-07,2.726750e-02,8.689225e-02,-2.048900e-01,3.250869e-03,...,2.711511e-03,2.363624e-02,2.628389e-03,4.123371e-03,6.363516e-03,2.809913e-03,2.319734e-02,-5.403631e-04,0.003918,1.241930e-02
2,-1.631512e-07,0.001818,,-2.045731e-03,0.000000e+00,-4.802665e-03,-2.363126e-03,-1.806156e-03,0.000000e+00,0.000000e+00,...,-1.778603e-07,-3.100912e-02,0.000000e+00,-2.167269e-03,-2.091046e-07,0.000000e+00,-1.998671e-03,0.000000e+00,0.002328,-2.025711e-07
3,-1.797433e-07,0.031765,-2.045731e-03,,0.000000e+00,-1.052660e-02,0.000000e+00,-1.209426e-07,0.000000e+00,-1.212311e-02,...,-9.736347e-03,2.218771e-02,-9.383183e-03,-4.701548e-03,3.893015e-03,-3.374478e-03,0.000000e+00,0.000000e+00,0.005094,1.130905e-02
4,-2.067919e-07,-0.004272,0.000000e+00,0.000000e+00,,7.180915e-04,0.000000e+00,-1.280958e-07,0.000000e+00,0.000000e+00,...,-2.290696e-07,-1.476008e-07,0.000000e+00,-1.588535e-07,-2.849811e-07,0.000000e+00,0.000000e+00,0.000000e+00,-0.000696,-2.729818e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-3.360576e-07,0.002810,0.000000e+00,-3.374478e-03,0.000000e+00,-8.961527e-03,-4.334866e-03,-2.768602e-03,0.000000e+00,0.000000e+00,...,-4.050573e-07,-9.465406e-02,0.000000e+00,-3.718286e-03,-6.139902e-07,,-3.248317e-03,0.000000e+00,0.004246,-5.608731e-07
96,-1.738507e-07,0.023197,-1.998671e-03,0.000000e+00,0.000000e+00,-1.021652e-02,0.000000e+00,-1.173365e-07,0.000000e+00,-1.158432e-02,...,-9.385763e-03,3.042098e-02,-9.057138e-03,-4.577696e-03,3.726052e-03,-3.248317e-03,,0.000000e+00,0.004948,1.083877e-02
97,-2.675626e-07,-0.000540,0.000000e+00,0.000000e+00,0.000000e+00,5.162631e-02,0.000000e+00,-1.334997e-07,0.000000e+00,0.000000e+00,...,-2.469450e-07,-1.548219e-07,0.000000e+00,-1.672491e-07,-3.131847e-07,0.000000e+00,0.000000e+00,,-0.009147,-2.987528e-07
98,-1.090617e-02,0.003918,2.327856e-03,5.093558e-03,-6.958945e-04,-1.922402e-07,6.121249e-03,4.368680e-03,-1.076452e-03,5.340428e-03,...,4.025253e-03,5.068675e-03,3.844645e-03,4.483590e-02,2.072335e-02,4.245856e-03,4.947932e-03,-9.147118e-03,,9.810060e-03


np.nanmean(norm_err)=np.float64(0.002364338101644699)
    np.nanmean(np.abs(norm_err))=np.float64(0.011964260805812284)
    np.nanmedian(norm_err)=np.float64(-1.8383109312399543e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0036356063852872626)
    


In [7]:
# Fetch the genome table once; both algorithm runs below sample from it.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same comparison for each dstream algorithm: show the normalized
# error matrix, then print its NaN-aware summary statistics.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # The f-string layout (including its leading spaces) is kept verbatim so
    # the printed text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35095.84it/s]
100%|██████████| 100/100 [00:00<00:00, 454.85it/s]
5947it [00:00, 642890.95it/s]
100%|██████████| 100/100 [00:00<00:00, 266643.61it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31140.43it/s]
100%|██████████| 100/100 [00:00<00:00, 1031.22it/s]
5944it [00:00, 647405.62it/s]
100%|██████████| 100/100 [00:00<00:00, 366955.73it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,0.000000e+00,-1.067176e-07,0.000000e+00,-1.097576e-07,0.000000e+00,-1.035630e-07,0.000000e+00,-1.066199e-07,...,0.000000e+00,0.000000e+00,-1.037785e-07,-1.036863e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.065772e-07,0.000000e+00,-1.067418e-07
1,0.000000e+00,,0.000000e+00,-1.099199e-07,0.000000e+00,-1.068115e-07,0.000000e+00,-1.036795e-07,0.000000e+00,-1.201968e-07,...,0.000000e+00,0.000000e+00,-1.038954e-07,-1.038030e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.097709e-07,0.000000e+00,-1.099456e-07
2,0.000000e+00,0.000000e+00,,-1.131774e-07,0.000000e+00,-1.068081e-07,0.000000e+00,-1.036763e-07,0.000000e+00,-1.098126e-07,...,0.000000e+00,0.000000e+00,-1.038922e-07,-1.037998e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.164702e-07,0.000000e+00,-1.329285e-07
3,-1.067176e-07,-1.099199e-07,-1.131774e-07,,-1.066489e-07,-2.137256e-07,-1.096437e-07,-2.074556e-07,-1.067541e-07,-2.197408e-07,...,-1.037703e-07,-1.039598e-07,-2.078879e-07,-2.077029e-07,-1.038786e-07,-1.037914e-07,-1.038769e-07,-2.261614e-07,-1.068140e-07,-2.265320e-07
4,0.000000e+00,0.000000e+00,0.000000e+00,-1.066489e-07,,-1.096849e-07,0.000000e+00,-1.034983e-07,0.000000e+00,-1.065513e-07,...,0.000000e+00,0.000000e+00,-1.037135e-07,-1.036215e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.065087e-07,0.000000e+00,-1.066731e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,0.000000e+00,0.000000e+00,-1.037914e-07,0.000000e+00,-1.037633e-07,0.000000e+00,-1.161483e-07,0.000000e+00,-1.036990e-07,...,0.000000e+00,0.000000e+00,-1.097222e-07,-1.065572e-07,0.000000e+00,,0.000000e+00,-1.036586e-07,0.000000e+00,-1.038143e-07
96,0.000000e+00,0.000000e+00,0.000000e+00,-1.038769e-07,0.000000e+00,-1.038488e-07,0.000000e+00,-1.162554e-07,0.000000e+00,-1.037843e-07,...,0.000000e+00,0.000000e+00,-1.098178e-07,-1.066473e-07,0.000000e+00,0.000000e+00,,-1.037439e-07,0.000000e+00,-1.038999e-07
97,-1.065772e-07,-1.097709e-07,-1.164702e-07,-2.261614e-07,-1.065087e-07,-2.134440e-07,-1.094955e-07,-2.071903e-07,-1.066136e-07,-2.194432e-07,...,-1.036375e-07,-1.038266e-07,-2.076215e-07,-2.074370e-07,-1.037456e-07,-1.036586e-07,-1.037439e-07,,-1.066734e-07,-2.331281e-07
98,0.000000e+00,0.000000e+00,0.000000e+00,-1.068140e-07,0.000000e+00,-1.165740e-07,0.000000e+00,-1.036538e-07,0.000000e+00,-1.067161e-07,...,0.000000e+00,0.000000e+00,-1.038697e-07,-1.037773e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.066734e-07,,-1.068383e-07


np.nanmean(norm_err)=np.float64(-8.96621023759856e-08)
    np.nanmean(np.abs(norm_err))=np.float64(8.96621023759856e-08)
    np.nanmedian(norm_err)=np.float64(-1.0376908514529429e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0376908514529429e-07)
    
