In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-10T17:46:06.505594+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
hstrat                            : 1.20.10
pandas                            : 2.2.3
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Name of the output subdirectory associated with this notebook (presumably
# consumed by teeplot when saving figures -- it is not referenced again in the
# visible cells; confirm before removing).
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression: echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build a label-sorted pairwise phylogenetic distance table for a tree.

    Parameters
    ----------
    tree_df : pd.DataFrame
        Tree dataframe as returned by one of the ``hstrat.build_tree_*``
        calls below.

    Returns
    -------
    pd.DataFrame
        Distance table with both rows and columns sorted by label, so that
        tables built from trees carrying the same labels can be compared
        position-by-position.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # this may break on a dendropy upgrade.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare pairwise distances between two tree reconstructions.

    A random subset of genomes is deserialized into surfaces, each surface's
    storage is shuffled, and a tree is reconstructed twice: once with
    ``hstrat.build_tree_trie`` ("naive") and once with
    ``hstrat.build_tree_searchtable`` ("shortcut"). The pairwise phylogenetic
    distance matrices of the two trees are then compared entry-by-entry.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a ``data_hex`` column plus the
        ``dstream_S``, ``dstream_storage_bitoffset``,
        ``dstream_storage_bitwidth``, ``dstream_T_bitoffset`` and
        ``dstream_T_bitwidth`` columns, each holding a single unique value
        within the sampled rows. The frame is not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, evaluated with only the
        ``dstream`` module in scope (e.g. ``"dstream.tilted_algo"``).
    sample_size : int, default 100
        Number of genomes to draw without replacement; the default keeps the
        problem at a size dendropy/naive can handle.
    seed : int, optional
        Seed for the random generator driving both the row sampling and the
        storage shuffling. ``None`` (default) draws fresh entropy, so results
        differ from call to call.

    Returns
    -------
    np.ndarray
        Square array whose entry ``(i, j)`` is
        ``(naive - shortcut) / (naive / 2 + shortcut / 2)`` for the pairwise
        distance between the i-th and j-th labels in sorted order. Entries
        where both distances are zero (e.g. the diagonal) are NaN.

    Raises
    ------
    ValueError
        If ``sample_size`` exceeds the number of rows, or if any of the
        ``dstream_*`` metadata columns holds more than one distinct value
        among the sampled rows.
    """
    rng = np.random.default_rng(seed)

    # sample to size dendropy/naive can handle; `.sample` returns a new frame,
    # so the caller's dataframe is left untouched
    genome_df = raw_genome_df.sample(sample_size, random_state=rng).assign(
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    def sole_value(column: str):
        # `.item()` raises unless the column holds exactly one distinct value,
        # which doubles as a homogeneity check on the metadata.
        return genome_df[column].unique().item()

    surf_kwargs = dict(
        # NOTE(review): `eval` executes arbitrary code -- only pass trusted,
        # hard-coded algorithm strings here.
        dstream_algo=eval(str(dstream_algo), {"dstream": dstream}),
        dstream_S=sole_value("dstream_S"),
        dstream_storage_bitoffset=sole_value("dstream_storage_bitoffset"),
        dstream_storage_bitwidth=sole_value("dstream_storage_bitwidth"),
        dstream_T_bitoffset=sole_value("dstream_T_bitoffset"),
        dstream_T_bitwidth=sole_value("dstream_T_bitwidth"),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        # ensure synthetic data: scramble each surface's storage in place
        rng.shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive = _pairwise_distance_frame(naive_df).values
    shortcut = _pairwise_distance_frame(shortcut_df).values

    # 0/0 (e.g. self-distances on the diagonal) intentionally yields NaN;
    # silence the RuntimeWarning that would otherwise clutter cell output.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Fetch the "sample" genome dataset, then report the normalized error between
# the two reconstructions once per dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_spec in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_spec)
    display(pd.DataFrame(norm_err))
    # Summary statistics skip NaN entries (e.g. the matrix diagonal).
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10666.83it/s]
100%|██████████| 100/100 [00:00<00:00, 385.09it/s]
6125it [00:00, 541722.62it/s]
100%|██████████| 100/100 [00:00<00:00, 179858.66it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,-0.176615,-0.092411,0.000000,0.000000,-0.133490,0.000000,-0.142578,-0.072619,...,-0.067346,-0.071748,0.910590,-0.064024,0.000000,-0.106494,0.000000,0.873629,0.000000,0.000000
1,0.000000,,-0.476737,-0.167599,0.000000,0.149516,-0.288239,0.000000,-0.321411,-0.122272,...,-0.111272,-0.120427,-0.217677,-0.104551,0.000000,-0.204354,0.000000,-0.208807,0.000000,0.122307
2,-0.176615,-0.476737,,0.000000,-0.096888,-0.332754,0.000000,-0.531548,0.000000,-0.076914,...,-0.068408,-0.075460,-0.139018,-0.063398,-0.307332,0.000000,-0.098513,-0.134458,-0.126639,-0.277744
3,-0.092411,-0.167599,0.000000,,-0.064598,-0.321657,0.000000,-0.173900,0.000000,0.537707,...,0.633837,0.529776,-0.090397,0.461313,-0.140393,0.000000,-0.065316,-0.088446,-0.076595,-0.282962
4,0.000000,0.000000,-0.096888,-0.064598,,0.000000,-0.082302,0.000000,-0.085668,-0.054260,...,-0.051261,-0.053772,0.261687,-0.049314,0.000000,-0.071177,0.000000,0.255750,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.106494,-0.204354,0.000000,0.000000,-0.071177,-0.242518,0.000000,-0.213801,0.000000,-0.809303,...,-0.723276,-0.794653,-0.100108,-0.672218,-0.165298,,-0.072051,-0.097721,-0.086024,-0.211926
96,0.000000,0.000000,-0.098513,-0.065316,0.000000,0.000000,-0.083472,0.000000,-0.086936,-0.054766,...,-0.051713,-0.054269,0.379513,-0.049732,0.000000,-0.072051,,0.370370,0.000000,0.000000
97,0.873629,-0.208807,-0.134458,-0.088446,0.255750,-0.019760,-0.113520,-0.039868,-0.118329,-0.073974,...,-0.069800,-0.073295,0.000000,-0.067094,-0.529117,-0.097721,0.370370,,0.307789,-0.017681
98,0.000000,0.000000,-0.126639,-0.076595,0.000000,0.000000,-0.102821,0.000000,-0.108130,-0.062481,...,-0.058537,-0.061835,0.316428,-0.056012,0.000000,-0.086024,0.000000,0.307789,,0.000000


np.nanmean(norm_err)=np.float64(-0.04218371551471767)
    np.nanmean(np.abs(norm_err))=np.float64(0.13567571645019705)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.06823279065605026)
    


100%|██████████| 100/100 [00:00<00:00, 28434.03it/s]
100%|██████████| 100/100 [00:00<00:00, 448.66it/s]
5978it [00:00, 564795.90it/s]
100%|██████████| 100/100 [00:00<00:00, 269210.78it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.846498e-07,-0.006238,-0.006119,-3.232959e-07,-3.382956e-07,-1.414862e-07,-0.004723,-3.250617e-07,-3.672959e-07,...,-0.010691,-4.726716e-03,-1.754264e-07,-4.311299e-07,-2.744549e-07,-1.330728e-07,-3.026529e-07,-0.033402,-2.053455e-07,-0.001139
1,-2.846498e-07,,-0.007436,-0.007266,-3.899453e-07,-4.119778e-07,-1.663763e-07,-0.005379,-3.255263e-07,-4.558048e-07,...,-0.012402,-5.383573e-03,-2.153762e-07,-6.025056e-07,-3.210368e-07,-1.548629e-07,-3.603038e-07,-0.082329,-2.622961e-07,-0.001336
2,-6.238191e-03,-7.435511e-03,,0.013445,-4.299469e-02,2.997120e-02,2.338010e-02,0.004793,-7.206485e-03,-1.086733e-01,...,-0.006895,-3.914627e-07,3.160243e-02,-1.312142e-02,-7.306651e-03,-3.299335e-02,-8.313476e-03,-0.007164,-2.623222e-03,0.021316
3,-6.118648e-03,-7.266332e-03,0.013445,,-1.253164e-02,9.397859e-02,7.329934e-02,0.047840,-7.047451e-03,-1.495332e-02,...,-0.006749,9.397218e-03,6.427755e-02,-1.260381e-02,-7.143218e-03,-9.712050e-03,-8.102575e-03,-0.007007,-2.519542e-03,0.032914
4,-3.232959e-07,-3.899453e-07,-0.042995,-0.012532,,-5.260742e-07,-2.017111e-07,-0.013783,-3.770737e-07,-6.274508e-07,...,-0.005876,-2.906749e-02,-2.785397e-07,-7.294688e-07,-3.826959e-07,-1.733725e-01,-4.398372e-07,-0.006121,-3.623716e-07,-0.001612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.330728e-07,-1.548629e-07,-0.032993,-0.009712,-1.733725e-01,-1.949262e-07,0.000000e+00,-0.011455,-1.507749e-07,-2.214397e-07,...,-0.004745,-2.412359e-02,0.000000e+00,-2.456929e-07,-1.525673e-07,,-1.701972e-07,-0.004904,0.000000e+00,-0.001278
96,-3.026529e-07,-3.603038e-07,-0.008313,-0.008103,-4.398372e-07,-4.680724e-07,-1.842068e-07,-0.005824,-3.492871e-07,-5.254785e-07,...,-0.087410,-5.829302e-03,-2.462296e-07,-6.321778e-07,-4.032340e-07,-1.701972e-07,,-0.014123,-3.095308e-07,-0.001476
97,-3.340230e-02,-8.232946e-02,-0.007164,-0.007007,-6.120632e-03,-1.975043e-07,0.000000e+00,-0.005235,-3.795345e-02,-7.103964e-03,...,-0.012022,-5.239822e-03,0.000000e+00,-3.819504e-01,-1.264345e-02,-4.903730e-03,-1.412338e-02,,2.058253e-02,-0.001292
98,-2.053455e-07,-2.622961e-07,-0.002623,-0.002520,-3.623716e-07,-4.077411e-07,0.000000e+00,-0.001566,-2.507798e-07,-4.954133e-07,...,0.000000,-1.568642e-03,0.000000e+00,-7.016163e-07,-2.557780e-07,0.000000e+00,-3.095308e-07,0.020583,,-0.024408


np.nanmean(norm_err)=np.float64(-0.002694922879586356)
    np.nanmean(np.abs(norm_err))=np.float64(0.012443734937437553)
    np.nanmedian(norm_err)=np.float64(-4.574290527541277e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0011788023461584405)
    


In [7]:
# Fetch the "tail" genome dataset, then report the normalized error between
# the two reconstructions once per dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_spec in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_spec)
    display(pd.DataFrame(norm_err))
    # Summary statistics skip NaN entries (e.g. the matrix diagonal).
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 28653.53it/s]
100%|██████████| 100/100 [00:00<00:00, 432.01it/s]
5941it [00:00, 602708.01it/s]
100%|██████████| 100/100 [00:00<00:00, 305262.30it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32209.37it/s]
100%|██████████| 100/100 [00:00<00:00, 1030.19it/s]
5954it [00:00, 623076.00it/s]
100%|██████████| 100/100 [00:00<00:00, 346064.69it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.072262e-07,-1.066132e-07,-2.067992e-07,-2.130632e-07,-2.254286e-07,-2.189624e-07,-1.036656e-07,-1.034858e-07,-2.126403e-07,...,-2.132048e-07,-1.097106e-07,-1.036082e-07,-2.320397e-07,-1.035947e-07,-2.074152e-07,-2.127400e-07,-2.070257e-07,-2.192731e-07,-2.128818e-07
1,-2.072262e-07,,-1.038244e-07,-2.188904e-07,-2.074940e-07,-2.072921e-07,-2.072952e-07,-1.066744e-07,-1.127803e-07,-2.070930e-07,...,-2.076283e-07,-1.038532e-07,-1.096790e-07,-2.070910e-07,-1.065993e-07,-2.134377e-07,-2.071876e-07,-2.191441e-07,-2.075736e-07,-2.073221e-07
2,-1.066132e-07,-1.038244e-07,,-1.036101e-07,-1.130844e-07,-1.066481e-07,-1.066497e-07,0.000000e+00,0.000000e+00,-1.096039e-07,...,-1.099038e-07,0.000000e+00,0.000000e+00,-1.065416e-07,0.000000e+00,-1.039193e-07,-1.239037e-07,-1.037237e-07,-1.067971e-07,-1.129822e-07
3,-2.067992e-07,-2.188904e-07,-1.036101e-07,,-2.070659e-07,-2.068648e-07,-2.068679e-07,-1.064481e-07,-1.093032e-07,-2.066665e-07,...,-2.071997e-07,-1.036387e-07,-1.126722e-07,-2.066646e-07,-1.063734e-07,-2.129847e-07,-2.067607e-07,-2.319633e-07,-2.071452e-07,-2.068947e-07
4,-2.130632e-07,-2.074940e-07,-1.130844e-07,-2.070659e-07,,-2.131328e-07,-2.131361e-07,-1.037996e-07,-1.036194e-07,-2.190352e-07,...,-2.196341e-07,-1.067854e-07,-1.037421e-07,-2.129202e-07,-1.037286e-07,-2.076834e-07,-2.256216e-07,-2.072930e-07,-2.134304e-07,-2.326665e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.074152e-07,-2.134377e-07,-1.039193e-07,-2.129847e-07,-2.076834e-07,-2.074811e-07,-2.074842e-07,-1.131062e-07,-1.065838e-07,-2.072817e-07,...,-2.078180e-07,-1.039481e-07,-1.067136e-07,-2.072797e-07,-1.240478e-07,,-2.073764e-07,-2.132249e-07,-2.077632e-07,-2.075111e-07
96,-2.127400e-07,-2.071876e-07,-1.239037e-07,-2.067607e-07,-2.256216e-07,-2.128094e-07,-2.128127e-07,-1.036462e-07,-1.034665e-07,-2.186937e-07,...,-2.192908e-07,-1.066230e-07,-1.035889e-07,-2.125975e-07,-1.035754e-07,-2.073764e-07,,-2.069871e-07,-2.131062e-07,-2.254182e-07
97,-2.070257e-07,-2.191441e-07,-1.037237e-07,-2.319633e-07,-2.072930e-07,-2.070914e-07,-2.070945e-07,-1.065681e-07,-1.094297e-07,-2.068927e-07,...,-2.074270e-07,-1.037524e-07,-1.128067e-07,-2.068907e-07,-1.064932e-07,-2.132249e-07,-2.069871e-07,,-2.073724e-07,-2.071213e-07
98,-2.192731e-07,-2.075736e-07,-1.067971e-07,-2.071452e-07,-2.134304e-07,-2.193468e-07,-2.478542e-07,-1.038394e-07,-1.036591e-07,-2.130061e-07,...,-2.135725e-07,-1.131658e-07,-1.037819e-07,-2.191217e-07,-1.037684e-07,-2.077632e-07,-2.131062e-07,-2.073724e-07,,-2.132485e-07


np.nanmean(norm_err)=np.float64(-1.0673836890149258e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0673836890149258e-07)
    np.nanmedian(norm_err)=np.float64(-1.0395935979755291e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0395935979755291e-07)
    
