In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance for this run: timestamp, Python/IPython and machine
# details, the watermark version, and versions of the imported packages.
%watermark -diwmuv -iv


Last updated: 2025-06-22T00:23:59.126915+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1029-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Name identifying this notebook's outputs.
# NOTE(review): presumably the subdirectory used when saving figures with
# teeplot; it is not referenced by any of the cells visible here -- confirm.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return a tree's pairwise phylogenetic distance matrix as a DataFrame.

    The tree is converted to dendropy via `apc.RosettaTree`, and the
    resulting distance table is returned with rows and columns both sorted
    by label so that two matrices built this way can be compared.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data
    # table; this may break on dendropy upgrades.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed=None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into hstrat surfaces, each
    surface's storage is shuffled to yield synthetic data, and a tree is
    built from the same population with `hstrat.build_tree_trie` and with
    `hstrat.build_tree_searchtable`. The pairwise distance matrices of the
    two trees are then compared entry by entry.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column and the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`, each holding exactly
        one distinct value among the sampled rows. Not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with `dstream`
        in scope (e.g. `"dstream.tilted_algo"`).
    sample_size : int, default 100
        Maximum number of genomes to sample; kept small so the
        dendropy/naive steps remain tractable. Clamped to the number of
        available rows.
    seed : optional
        Seed passed to `np.random.default_rng` to make the sampling and
        shuffling reproducible. If None (default), NumPy's global random
        state is used, as before.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / ((naive + shortcut) / 2)`,
        with rows and columns ordered by sorted taxon label. Entries where
        both distances are zero (e.g. self-distances) are NaN.

    Raises
    ------
    ValueError
        If a `dstream_*` column does not hold exactly one distinct value,
        or if the two reconstructions yield different taxon label sets.
    """
    rng = None if seed is None else np.random.default_rng(seed)

    # sample to size dendropy/naive can handle; `sample` returns a new frame,
    # so the caller's data is left untouched
    genome_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=rng,
    )
    genome_df = genome_df.assign(
        taxon_label=np.arange(len(genome_df)).astype(str),
    )

    # each surface parameter must be uniform across the sample; `.item()`
    # raises ValueError otherwise
    surface_kwargs = {
        column: genome_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(review): eval executes arbitrary code -- only pass trusted,
    # hard-coded algorithm specs here, never externally supplied strings.
    surface_kwargs["dstream_algo"] = eval(
        str(dstream_algo),
        {"dstream": dstream},
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # the arithmetic below is positional, so make sure both matrices
    # describe the same taxa in the same order before dropping the labels
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance matrices have mismatched taxon labels"
        )

    naive_values = naive_dist.values
    shortcut_values = shortcut_dist.values
    # 0/0 entries (e.g. the diagonal) are expected and intentionally left as
    # NaN for the nan-aware summaries downstream; silence the warning only
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_values - shortcut_values) / (
            naive_values / 2 + shortcut_values / 2
        )


In [6]:
# Compare the naive and shortcut reconstructions on the "sample" genome
# dataset, once per dstream algorithm. Each pass shows the normalized error
# matrix followed by nan-aware summary statistics (signed and absolute).
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_,
    )
    display(pd.DataFrame(norm_err))
    # continuation lines of the f-string are deliberately not re-indented,
    # so the printed text is unchanged
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 9737.21it/s]
100%|██████████| 100/100 [00:00<00:00, 396.28it/s]
6118it [00:00, 618540.03it/s]
100%|██████████| 100/100 [00:00<00:00, 220636.72it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.057114,0.054396,-0.016551,0.112121,0.000000,-0.051355,-0.057056,0.417533,-0.017716,...,0.246680,0.997594,0.112414,0.058328,-0.046232,0.000000,0.231627,-0.063269,0.047791,-0.055827
1,-0.057114,,0.074590,-0.024685,0.147364,0.000000,0.000000,0.000000,0.000000,-0.027370,...,0.324342,0.349811,0.147871,0.082186,0.000000,0.000000,0.000000,0.000000,0.062705,0.000000
2,0.054396,0.074590,,-0.224487,-1.307915,0.281889,0.067956,0.074524,0.067055,-0.245701,...,0.086608,-0.086388,-1.314281,0.000000,0.061905,0.084605,0.090600,0.081491,0.000000,0.073123
3,-0.016551,-0.024685,-0.224487,,-0.176923,-0.044129,-0.021860,-0.024656,-0.021489,0.000000,...,-0.030239,-0.485109,-0.177583,-0.251578,-0.019418,-0.029272,-0.032222,-0.027802,-0.183885,-0.024046
4,0.112121,0.147364,-1.307915,-0.176923,,0.083071,0.136114,0.147254,0.134563,-0.189840,...,0.166984,0.000000,0.000000,-0.601146,0.125577,0.662058,0.173294,0.158748,-0.677047,0.144903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.084605,-0.029272,0.662058,0.000000,0.000000,0.000000,0.000000,-0.033125,...,0.000000,0.197730,0.665177,0.094514,0.000000,,0.000000,0.000000,0.069635,0.000000
96,0.231627,0.000000,0.090600,-0.032222,0.173294,0.000000,0.000000,0.000000,0.000000,-0.036955,...,0.769843,0.702262,0.173995,0.102058,0.000000,0.000000,,0.000000,0.073646,0.000000
97,-0.063269,0.000000,0.081491,-0.027802,0.158748,0.000000,0.000000,0.000000,0.000000,-0.031255,...,0.397541,0.385109,0.159336,0.090645,0.000000,0.000000,0.000000,,0.067512,0.000000
98,0.047791,0.062705,0.000000,-0.183885,-0.677047,0.215681,0.057950,0.062659,0.057294,-0.197880,...,0.070986,-0.074176,-0.679389,0.000000,0.053491,0.069635,0.073646,0.067512,,0.061666


np.nanmean(norm_err)=np.float64(-0.004115058674122239)
    np.nanmean(np.abs(norm_err))=np.float64(0.13139455114770757)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.042526085759289206)
    


100%|██████████| 100/100 [00:00<00:00, 33740.68it/s]
100%|██████████| 100/100 [00:00<00:00, 407.94it/s]
5964it [00:00, 596955.64it/s]
100%|██████████| 100/100 [00:00<00:00, 267323.39it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-6.448570e-04,-7.419940e-04,-1.438882e-07,-4.997385e-04,4.001483e-03,0.002376,-3.665351e-02,-2.612600e-03,3.359111e-03,...,0.001803,-7.248107e-04,0.001874,-6.325180e-04,2.892916e-03,-4.791810e-04,-6.849398e-04,-2.455539e-03,0.000000e+00,-0.040358
1,-6.448570e-04,,-4.000844e-07,-6.598474e-04,-1.956962e-07,-4.351575e-07,-0.048375,-7.304201e-04,-1.066401e-06,-3.051787e-07,...,-0.009854,-7.239046e-07,-0.021671,-5.574803e-07,-2.347815e-07,-4.049081e-07,-3.379177e-07,-9.189787e-07,-1.554664e-03,-0.000845
2,-7.419940e-04,-4.000844e-07,,-7.618866e-04,0.000000e+00,0.000000e+00,-0.127654,-8.576189e-04,-9.593426e-07,0.000000e+00,...,-0.011373,-5.182130e-07,-0.025262,-3.630443e-07,0.000000e+00,-2.129413e-07,0.000000e+00,-7.444672e-07,-2.272765e-03,-0.001020
3,-1.438882e-07,-6.598474e-04,-7.618866e-04,,-5.087250e-04,4.116148e-03,0.002427,-3.729779e-02,-2.694630e-03,3.439518e-03,...,0.001832,-7.437804e-04,0.001905,-6.469368e-04,2.952335e-03,-4.874402e-04,-7.018694e-04,-2.527878e-03,-2.488661e-07,-0.041140
4,-4.997385e-04,-1.956962e-07,0.000000e+00,-5.087250e-04,,0.000000e+00,-0.033063,-5.496482e-04,-2.689324e-07,0.000000e+00,...,-0.007606,-2.171206e-07,-0.016485,-1.841448e-07,0.000000e+00,-1.368622e-07,0.000000e+00,-2.488015e-07,-9.146459e-04,-0.000612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-4.791810e-04,-4.049081e-07,-2.129413e-07,-4.874402e-04,-1.368622e-07,-2.224854e-07,-0.031170,-5.248689e-04,-4.911441e-07,-1.827009e-07,...,-0.007287,-4.032703e-07,-0.015762,-3.457678e-07,-1.548962e-07,,-1.939504e-07,-4.573533e-07,-8.478971e-04,-0.000582
96,-6.849398e-04,-3.379177e-07,0.000000e+00,-7.018694e-04,0.000000e+00,0.000000e+00,-0.151108,-7.823002e-04,-6.656866e-07,0.000000e+00,...,-0.010481,-4.184912e-07,-0.023145,-3.111085e-07,0.000000e+00,-1.939504e-07,,-5.546095e-07,-1.810756e-03,-0.000915
97,-2.455539e-03,-9.189787e-07,-7.444672e-07,-2.527878e-03,-2.488015e-07,-1.048908e-06,-0.001502,-2.884499e-03,-3.950656e-06,-5.175655e-07,...,-0.000833,-1.355479e-06,-0.000895,-8.694635e-07,-3.430964e-07,-4.573533e-07,-5.546095e-07,,-9.551869e-03,-0.003510
98,0.000000e+00,-1.554664e-03,-2.272765e-03,-2.488661e-07,-9.146459e-04,1.462803e-02,0.005159,-6.659592e-02,-1.246738e-02,8.609376e-03,...,0.003053,-2.117903e-03,0.003262,-1.484831e-03,6.092859e-03,-8.478971e-04,-1.810756e-03,-9.551869e-03,,-0.079925


np.nanmean(norm_err)=np.float64(-0.0008710114200895826)
    np.nanmean(np.abs(norm_err))=np.float64(0.009243531204740715)
    np.nanmedian(norm_err)=np.float64(-3.3904219111938517e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(9.698982987737237e-07)
    


In [7]:
# Compare the naive and shortcut reconstructions on the "tail" genome
# dataset, once per dstream algorithm. Each pass shows the normalized error
# matrix followed by nan-aware summary statistics (signed and absolute).
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        dstream_algo_,
    )
    display(pd.DataFrame(norm_err))
    # continuation lines of the f-string are deliberately not re-indented,
    # so the printed text is unchanged
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35517.86it/s]
100%|██████████| 100/100 [00:00<00:00, 426.19it/s]
5936it [00:00, 646752.61it/s]
100%|██████████| 100/100 [00:00<00:00, 391259.70it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 28127.04it/s]
100%|██████████| 100/100 [00:00<00:00, 1011.80it/s]
5949it [00:00, 635960.61it/s]
100%|██████████| 100/100 [00:00<00:00, 287675.17it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.037562e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.129250e-07,0.000000e+00,0.000000e+00,-1.036948e-07,...,-1.034606e-07,-1.064934e-07,-1.201344e-07,0.000000e+00,0.000000e+00,-1.094578e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,-1.037562e-07,,-1.068167e-07,-1.038532e-07,-1.036297e-07,-1.099232e-07,-2.076949e-07,-1.097503e-07,-1.068016e-07,-2.261123e-07,...,-2.129635e-07,-2.074681e-07,-2.078489e-07,-1.098084e-07,-1.037260e-07,-2.072996e-07,-1.038811e-07,-1.127851e-07,-1.036178e-07,-1.095720e-07
2,0.000000e+00,-1.068167e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.038681e-07,0.000000e+00,0.000000e+00,-1.067517e-07,...,-1.095624e-07,-1.037546e-07,-1.039451e-07,0.000000e+00,0.000000e+00,-1.036704e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
3,0.000000e+00,-1.038532e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-1.067154e-07,0.000000e+00,0.000000e+00,-1.037917e-07,...,-1.035571e-07,-1.163493e-07,-1.067968e-07,0.000000e+00,0.000000e+00,-1.065067e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,0.000000e+00,-1.036297e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.127752e-07,0.000000e+00,0.000000e+00,-1.035684e-07,...,-1.033348e-07,-1.063602e-07,-1.199649e-07,0.000000e+00,0.000000e+00,-1.093170e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.094578e-07,-2.072996e-07,-1.036704e-07,-1.065067e-07,-1.093170e-07,-1.036965e-07,-2.191187e-07,-1.035426e-07,-1.036561e-07,-2.071770e-07,...,-2.067096e-07,-2.127627e-07,-2.192902e-07,-1.035942e-07,-1.063729e-07,,-1.095968e-07,-1.033609e-07,-1.125281e-07,-1.033838e-07
96,0.000000e+00,-1.038811e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.130730e-07,0.000000e+00,0.000000e+00,-1.038196e-07,...,-1.035848e-07,-1.066251e-07,-1.166240e-07,0.000000e+00,0.000000e+00,-1.095968e-07,,0.000000e+00,0.000000e+00,0.000000e+00
97,0.000000e+00,-1.127851e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.035575e-07,0.000000e+00,0.000000e+00,-1.161442e-07,...,-1.061769e-07,-1.034447e-07,-1.036340e-07,0.000000e+00,0.000000e+00,-1.033609e-07,0.000000e+00,,0.000000e+00,0.000000e+00
98,0.000000e+00,-1.036178e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.095236e-07,0.000000e+00,0.000000e+00,-1.035565e-07,...,-1.033230e-07,-1.063477e-07,-1.096093e-07,0.000000e+00,0.000000e+00,-1.125281e-07,0.000000e+00,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.0664302719106886e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0664302719106886e-07)
    np.nanmedian(norm_err)=np.float64(-1.062285466386818e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.062285466386818e-07)
    
