In [1]:
# Load the watermark IPython extension so the %watermark magic can be used below.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record run provenance in the cell output: last-updated timestamp,
# Python/IPython versions, machine details, versions of the imported
# packages, and the watermark version itself.
%watermark -diwmuv -iv


Last updated: 2025-06-15T00:25:00.793834+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1029-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

hstrat                            : 1.20.10
alifedata_phyloinformatics_convert: 0.19.3
numpy                             : 2.1.2
pandas                            : 2.2.3
downstream                        : 1.14.3

Watermark: 2.4.3



In [4]:
# Subdirectory name, presumably for teeplot-saved figures -- TODO confirm;
# it is not referenced by any other cell visible in this notebook.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression: echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into surfaces, the surface
    storage is shuffled, and a tree is built from the same population with
    both `hstrat.build_tree_trie` and `hstrat.build_tree_searchtable`. The
    pairwise phylogenetic distance matrices of the two trees are then
    compared entry by entry.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`; each of the
        `dstream_*` columns must hold exactly one distinct value among the
        sampled rows.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with `dstream`
        in scope (e.g. `"dstream.tilted_algo"`). It takes precedence over
        any `dstream_algo` column present in `raw_genome_df`.
    sample_size : int, default 100
        Upper bound on the number of genomes compared; clamped to the number
        of available rows. Kept small so the dendropy/naive path stays
        tractable.
    seed : int, optional
        If given, row sampling and storage shuffling draw from a private
        `np.random.RandomState(seed)` so the result is reproducible. If
        omitted, numpy's global random state is used.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / mean(naive, shortcut)` for
        every taxon pair, ordered by sorted taxon label. Entries where both
        distances are zero (such as the diagonal) are NaN.

    Raises
    ------
    ValueError
        If a `dstream_*` column does not hold exactly one distinct value, or
        if the two reconstructions do not cover the same taxon labels.
    """
    # Unseeded calls keep drawing from numpy's global RNG; seeded calls get
    # an isolated stream shared by the sampling and shuffling steps.
    rng = np.random if seed is None else np.random.RandomState(seed)

    sampled_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=None if seed is None else rng,
    )  # sample to size dendropy/naive can handle
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    def sole_value(column: str):
        """Return the single distinct value of `column` (`.item()` raises otherwise)."""
        return sampled_df[column].unique().item()

    surf_kwargs = dict(
        # NOTE(security): eval executes arbitrary Python. Only `dstream` is
        # injected into the namespace, but builtins remain reachable, so pass
        # trusted algorithm names only.
        dstream_algo=eval(str(dstream_algo), {"dstream": dstream}),
        dstream_S=sole_value("dstream_S"),
        dstream_storage_bitoffset=sole_value("dstream_storage_bitoffset"),
        dstream_storage_bitwidth=sole_value("dstream_storage_bitwidth"),
        dstream_T_bitoffset=sole_value("dstream_T_bitoffset"),
        dstream_T_bitwidth=sole_value("dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        # ensure synthetic data: scramble each surface's storage in place
        rng.shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    def distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
        """Pairwise phylogenetic distances of a tree, sorted on both axes."""
        table = (
            apc.RosettaTree(tree_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
        )
        return pd.DataFrame(table._data).sort_index(axis=0).sort_index(axis=1)

    naive_dist = distance_frame(naive_df)
    shortcut_dist = distance_frame(shortcut_df)

    # The comparison below is positional (`.values`), so both matrices must
    # list the same taxa in the same order.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance matrices cover different taxon labels"
        )

    # Zero-distance pairs yield 0/0 -> NaN; that is expected, so silence the
    # "invalid value" RuntimeWarning instead of emitting it on every call.
    with np.errstate(invalid="ignore"):
        return (naive_dist.values - shortcut_dist.values) / (
            naive_dist.values / 2 + shortcut_dist.values / 2
        )


In [6]:
# "sample" dataset, fetched from OSF
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Evaluate each dstream algorithm in turn: show the normalized error matrix,
# then print its mean/median signed and absolute error (NaNs ignored).
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10581.52it/s]
100%|██████████| 100/100 [00:00<00:00, 422.19it/s]
6124it [00:00, 665231.47it/s]
100%|██████████| 100/100 [00:00<00:00, 255283.26it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,-0.080550,0.000000,0.162992,0.000000,-0.469916,-0.420686,-0.066775,0.198021,...,0.165192,0.000000,0.000000,0.0,-0.086338,-0.463066,0.000000,0.0,0.0,0.000000
1,0.000000,,-0.805182,0.000000,-0.056717,0.000000,-0.118549,-0.103301,-0.570080,-0.075244,...,-0.057789,0.000000,0.000000,0.0,-0.929803,-0.116378,0.000000,0.0,0.0,0.000000
2,-0.080550,-0.805182,,0.386274,-0.116123,0.340246,0.044299,0.039897,-0.597200,-0.139577,...,-0.117611,0.435153,0.231883,0.0,0.000000,0.043689,0.377546,0.0,0.0,-0.471670
3,0.000000,0.000000,0.386274,,0.000000,-0.136958,0.126787,0.114714,0.613161,0.000000,...,0.000000,0.000000,0.000000,0.0,0.420033,0.125122,0.000000,0.0,0.0,0.000000
4,0.162992,-0.056717,-0.116123,0.000000,,0.000000,-0.074666,-0.068315,-0.096910,0.683446,...,-0.508918,0.000000,0.000000,0.0,-0.124120,-0.073799,0.000000,0.0,0.0,-0.062765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.463066,-0.116378,0.043689,0.125122,-0.073799,0.112031,0.000000,0.000000,0.035701,-0.087876,...,-0.074700,0.138567,0.302951,0.0,0.047116,,0.122673,0.0,0.0,-0.129144
96,0.000000,0.000000,0.377546,0.000000,0.000000,-1.190609,0.124273,0.112651,0.600547,0.000000,...,0.000000,0.000000,0.000000,0.0,0.409734,0.122673,,0.0,0.0,0.000000
97,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0.0,0.000000
98,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,,0.000000


np.nanmean(norm_err)=np.float64(0.0016840823348858098)
    np.nanmean(np.abs(norm_err))=np.float64(0.10777906995078464)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 29196.05it/s]
100%|██████████| 100/100 [00:00<00:00, 755.20it/s]
5937it [00:00, 604480.71it/s]
100%|██████████| 100/100 [00:00<00:00, 286692.00it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,2.777895e-03,-9.034372e-04,0.001516,-2.313358e-07,-2.273254e-02,3.247549e-03,-0.007260,-1.245016e-07,-0.004129,...,1.510091e-03,-1.669170e-07,-1.363361e-07,2.867521e-02,1.227734e-03,-4.197643e-03,-0.003712,-1.590315e-03,3.203081e-03,-1.068271e-07
1,2.777895e-03,,0.000000e+00,0.000000,3.987772e-03,-1.312070e-07,-1.493527e-07,0.001357,3.271173e-03,0.074175,...,-2.146389e-07,4.658797e-03,3.605813e-03,-8.026757e-03,0.000000e+00,0.000000e+00,0.015273,-2.275410e-07,0.000000e+00,2.779509e-03
2,-9.034372e-04,0.000000e+00,,0.000000,-1.283527e-03,-1.277830e-07,-1.422302e-07,-0.001311,-1.059458e-03,-0.001104,...,-2.049351e-07,-1.490836e-03,-1.164442e-03,0.000000e+00,0.000000e+00,0.000000e+00,-0.000985,-2.213803e-07,0.000000e+00,-9.040956e-04
3,1.516455e-03,0.000000e+00,0.000000e+00,,3.013972e-03,-2.398980e-07,-2.978722e-07,0.009539,2.013677e-03,0.002179,...,-8.406890e-07,4.475823e-03,2.430223e-03,0.000000e+00,0.000000e+00,0.000000e+00,0.001762,-1.061900e-06,0.000000e+00,1.517814e-03
4,-2.313358e-07,3.987772e-03,-1.283527e-03,0.003014,,-3.362188e-02,5.032720e-03,-0.012031,-3.739978e-07,-0.006240,...,2.989292e-03,-6.048500e-07,-4.300691e-07,4.114729e-02,2.053883e-03,-7.752223e-03,-0.005335,-3.320890e-03,4.926613e-03,-2.995573e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-4.197643e-03,0.000000e+00,0.000000e+00,0.000000,-7.752223e-03,-2.171697e-07,-2.613581e-07,-0.008092,-5.437001e-03,-0.005836,...,-5.970475e-07,-1.076666e-02,-6.428532e-03,0.000000e+00,0.000000e+00,,-0.004816,-7.171808e-07,0.000000e+00,-4.201759e-03
96,-3.712157e-03,1.527286e-02,-9.851601e-04,0.001762,-5.334524e-03,-7.543637e-03,4.791637e-02,-0.041517,-4.373385e-03,-0.018628,...,1.753216e-03,-2.132406e-01,-4.822082e-03,2.990273e-03,1.383723e-03,-4.816442e-03,,-1.862208e-03,7.732169e-02,-3.714583e-03
97,-1.590315e-03,-2.275410e-07,-2.213803e-07,-0.000001,-3.320890e-03,-5.115114e-07,-6.420399e-07,-0.003510,-2.146478e-03,-0.002336,...,-2.075096e-06,-5.186769e-03,-2.626108e-03,-2.213042e-07,-4.520615e-07,-7.171808e-07,-0.001862,,-3.105167e-07,-1.592208e-03
98,3.203081e-03,0.000000e+00,0.000000e+00,0.000000,4.926613e-03,-1.551069e-07,-1.878083e-07,0.001684,3.877271e-03,0.020548,...,-2.869758e-07,5.993036e-03,4.356489e-03,-9.375253e-03,0.000000e+00,0.000000e+00,0.077322,-3.105167e-07,,3.205248e-03


np.nanmean(norm_err)=np.float64(0.0020300861899585207)
    np.nanmean(np.abs(norm_err))=np.float64(0.013373519188944723)
    np.nanmedian(norm_err)=np.float64(-1.421627405220473e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0025651477510296563)
    


In [7]:
# "tail" dataset, fetched from OSF
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Evaluate each dstream algorithm in turn: show the normalized error matrix,
# then print its mean/median signed and absolute error (NaNs ignored).
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 36282.91it/s]
100%|██████████| 100/100 [00:00<00:00, 434.43it/s]
5936it [00:00, 692652.35it/s]
100%|██████████| 100/100 [00:00<00:00, 378205.95it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31585.99it/s]
100%|██████████| 100/100 [00:00<00:00, 1013.42it/s]
5962it [00:00, 630934.06it/s]
100%|██████████| 100/100 [00:00<00:00, 406819.01it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.036653e-07,0.000000e+00,-1.094560e-07,0.000000e+00,-1.034353e-07,0.000000e+00,-1.095178e-07,0.000000e+00,-1.063305e-07,...,-1.096023e-07,-1.036736e-07,0.000000e+00,-1.036213e-07,0.000000e+00,-1.035740e-07,-1.064880e-07,-1.063448e-07,-1.035739e-07,-1.036461e-07
1,-1.036653e-07,,-1.039168e-07,-2.075257e-07,-1.037813e-07,-2.257668e-07,-1.039627e-07,-2.076366e-07,-1.038955e-07,-2.073879e-07,...,-2.077886e-07,-2.136579e-07,-1.038081e-07,-2.135467e-07,-1.166217e-07,-2.134463e-07,-2.076874e-07,-2.074151e-07,-2.195894e-07,-2.135994e-07
2,0.000000e+00,-1.039168e-07,,-1.066680e-07,0.000000e+00,-1.036857e-07,0.000000e+00,-1.067266e-07,0.000000e+00,-1.096594e-07,...,-1.068069e-07,-1.039252e-07,0.000000e+00,-1.038726e-07,0.000000e+00,-1.038251e-07,-1.165372e-07,-1.163658e-07,-1.038249e-07,-1.038975e-07
3,-1.094560e-07,-2.075257e-07,-1.066680e-07,,-1.065252e-07,-2.070648e-07,-1.097877e-07,-2.399394e-07,-1.066456e-07,-2.128663e-07,...,-2.259203e-07,-2.075424e-07,-1.065535e-07,-2.074375e-07,-1.037668e-07,-2.073427e-07,-2.131819e-07,-2.128950e-07,-2.073425e-07,-2.074872e-07
4,0.000000e+00,-1.037813e-07,0.000000e+00,-1.065252e-07,,-1.035508e-07,0.000000e+00,-1.065837e-07,0.000000e+00,-1.095085e-07,...,-1.066638e-07,-1.037897e-07,0.000000e+00,-1.037372e-07,0.000000e+00,-1.036898e-07,-1.129222e-07,-1.127612e-07,-1.036897e-07,-1.037621e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.035740e-07,-2.134463e-07,-1.038251e-07,-2.073427e-07,-1.036898e-07,-2.129588e-07,-1.038709e-07,-2.074535e-07,-1.038038e-07,-2.072052e-07,...,-2.076052e-07,-2.261171e-07,-1.037166e-07,-2.194910e-07,-1.067273e-07,,-2.075042e-07,-2.072323e-07,-2.132525e-07,-2.402907e-07
96,-1.064880e-07,-2.076874e-07,-1.165372e-07,-2.131819e-07,-1.129222e-07,-2.072258e-07,-1.068019e-07,-2.132990e-07,-1.098032e-07,-2.191560e-07,...,-2.134594e-07,-2.077041e-07,-1.129539e-07,-2.075991e-07,-1.038477e-07,-2.075042e-07,,-2.476449e-07,-2.075039e-07,-2.076488e-07
97,-1.063448e-07,-2.074151e-07,-1.163658e-07,-2.128950e-07,-1.127612e-07,-2.069547e-07,-1.066579e-07,-2.130118e-07,-1.096509e-07,-2.188528e-07,...,-2.131717e-07,-2.074318e-07,-1.127928e-07,-2.073270e-07,-1.037115e-07,-2.072323e-07,-2.476449e-07,,-2.072320e-07,-2.073766e-07
98,-1.035739e-07,-2.195894e-07,-1.038249e-07,-2.073425e-07,-1.036897e-07,-2.190735e-07,-1.038708e-07,-2.074532e-07,-1.038037e-07,-2.072049e-07,...,-2.076049e-07,-2.134637e-07,-1.037164e-07,-2.133527e-07,-1.097992e-07,-2.132525e-07,-2.075039e-07,-2.072320e-07,,-2.134053e-07


np.nanmean(norm_err)=np.float64(-1.2582055154715233e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.2582055154715233e-07)
    np.nanmedian(norm_err)=np.float64(-1.0665299806086373e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0665299806086373e-07)
    
