In [1]:
# Load the `watermark` IPython extension; it provides the %watermark magic
# that a later cell uses to record the execution environment.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record when this notebook was run together with the Python, machine, and
# imported-package versions, so the results below can be traced to an
# environment.
%watermark -diwmuv -iv


Last updated: 2025-06-29T00:23:57.563760+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1029-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10
pandas                            : 2.2.3
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Subdirectory name tagged with this analysis' date and topic.
# NOTE(review): presumably intended as the teeplot output subdirectory, but
# no cell visible here reads it -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _sole_value(column: pd.Series):
    """Return the one distinct value stored in ``column``.

    Raises
    ------
    ValueError
        If ``column`` does not hold exactly one distinct value.
    """
    distinct = column.unique()
    if len(distinct) != 1:
        raise ValueError(
            f"column {column.name!r} must hold exactly one distinct value, "
            f"found {len(distinct)}",
        )
    return distinct.item()


def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise distance table of the tree in ``tree_df``.

    The tree is converted to dendropy through ``apc.RosettaTree`` and its
    phylogenetic distance matrix is loaded into a DataFrame. Rows and
    columns are both sorted by label so that tables built from different
    reconstructions over the same labels can be compared positionally.
    """
    data_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the data table; this
    # may break across dendropy versions -- confirm when upgrading.
    return (
        pd.DataFrame(data_table._data).sort_index(axis=0).sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare trie and searchtable tree reconstructions on sampled genomes.

    Rows sampled from ``raw_genome_df`` are decoded into surfaces with
    ``hstrat.surf_from_hex``, each surface's storage is shuffled, and a tree
    is built from the population twice: once with ``hstrat.build_tree_trie``
    ("naive") and once with ``hstrat.build_tree_searchtable`` ("shortcut").
    The two trees' pairwise distance matrices are then compared cell by cell.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a ``data_hex`` column and the columns
        ``dstream_S``, ``dstream_storage_bitoffset``,
        ``dstream_storage_bitwidth``, ``dstream_T_bitoffset`` and
        ``dstream_T_bitwidth``, each holding a single distinct value among
        the sampled rows. The frame is not modified.
    dstream_algo : str
        Python expression naming the dstream algorithm, e.g.
        ``"dstream.tilted_algo"``. It is evaluated with ``dstream`` in scope
        and is used regardless of any ``dstream_algo`` column in
        ``raw_genome_df``.
    sample_size : int, default 100
        Maximum number of rows to sample. Frames with fewer rows are used
        in full rather than raising.
    seed : int, optional
        Seed for the row sampling and storage shuffling. With the default
        ``None``, NumPy's global random state is used, so results differ
        between calls.

    Returns
    -------
    np.ndarray
        Square array of ``(naive - shortcut) / mean(naive, shortcut)`` over
        pairwise distances, rows and columns ordered by sorted taxon label.
        Cells where both distances are zero (e.g. the diagonal) are NaN.

    Raises
    ------
    ValueError
        If one of the ``dstream_*`` configuration columns does not hold
        exactly one distinct value among the sampled rows.
    """
    rng = None if seed is None else np.random.RandomState(seed)
    shuffle = np.random.shuffle if rng is None else rng.shuffle

    # sample to size dendropy/naive can handle; cap at the frame's length so
    # small inputs do not raise
    n_sampled = min(sample_size, len(raw_genome_df))
    sampled_df = raw_genome_df.sample(n_sampled, random_state=rng)
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    surface_kwargs = dict(
        # NOTE: eval runs `dstream_algo` as a Python expression; pass only
        # trusted strings.
        dstream_algo=eval(dstream_algo, {"dstream": dstream}),
        dstream_S=_sole_value(sampled_df["dstream_S"]),
        dstream_storage_bitoffset=_sole_value(
            sampled_df["dstream_storage_bitoffset"],
        ),
        dstream_storage_bitwidth=_sole_value(
            sampled_df["dstream_storage_bitwidth"],
        ),
        dstream_T_bitoffset=_sole_value(sampled_df["dstream_T_bitoffset"]),
        dstream_T_bitwidth=_sole_value(sampled_df["dstream_T_bitwidth"]),
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df).values
    shortcut_dist = _pairwise_distance_frame(shortcut_df).values

    # 0 / 0 is expected wherever both distances are zero (e.g. the
    # diagonal); keep the resulting NaN but silence the RuntimeWarning.
    with np.errstate(invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the identical comparison and report once per dstream algorithm rather
# than repeating the display/print boilerplate for each one.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # nan-aware reducers: norm_err holds NaN entries (see displayed diagonal)
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10773.41it/s]
100%|██████████| 100/100 [00:00<00:00, 397.21it/s]
6129it [00:00, 669171.42it/s]
100%|██████████| 100/100 [00:00<00:00, 204400.78it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.115964,0.000000,-0.106817,-0.246955,-0.095145,-0.088965,-0.134362,-0.081489,-0.070873,...,-0.123058,-0.090169,-0.077895,0.031080,-0.104847,-0.080182,-0.307715,-0.078633,-0.105568,-0.098347
1,-0.115964,,-0.110194,0.000000,-0.236337,0.000000,0.142227,0.000000,0.264474,0.112353,...,0.000000,0.000000,0.131082,-0.335358,0.187658,0.121036,0.000000,0.249758,0.000000,0.167813
2,0.000000,-0.110194,,-0.101902,-0.226735,-0.091226,-0.085529,-0.126677,-0.078597,-0.068675,...,-0.116581,-0.086642,-0.075248,0.029429,-0.100107,-0.077380,-0.285084,-0.075937,-0.100765,-0.094165
3,-0.106817,0.000000,-0.101902,,-0.201222,0.000000,0.275118,0.000000,0.113640,-0.347310,...,0.000000,0.000000,-0.407263,-0.298415,0.359239,0.235278,0.000000,0.108163,0.000000,0.322710
4,-0.246955,-0.236337,-0.226735,-0.201222,,-0.163448,-0.146023,-0.327825,-0.126913,-0.102906,...,-0.267804,-0.149296,-0.118403,0.495306,-0.194341,-0.123770,-1.350843,-0.120119,-0.196836,-0.173133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.080182,0.121036,-0.077380,0.235278,-0.123770,0.207279,0.000000,-0.303884,0.083973,0.072744,...,0.128784,0.195528,0.080158,-0.203841,0.000000,,0.000000,0.080944,0.232253,0.000000
96,-0.307715,0.000000,-0.285084,0.000000,-1.350843,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,-0.256137,0.000000,0.000000,,0.000000,0.000000,0.000000
97,-0.078633,0.249758,-0.075937,0.108163,-0.120119,0.096215,0.089901,0.000006,0.000000,0.416934,...,0.517721,0.091131,0.466385,-0.313925,0.106144,0.080944,0.000000,,0.106883,0.099489
98,-0.105568,0.000000,-0.100765,0.000000,-0.196836,0.000000,0.270991,0.000000,0.112228,-0.562603,...,0.000000,0.000000,-0.668949,-0.293565,0.352234,0.232253,0.000000,0.106883,,0.317046


np.nanmean(norm_err)=np.float64(-0.027311301674468547)
    np.nanmean(np.abs(norm_err))=np.float64(0.13899883755288867)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.08426415678318741)
    


100%|██████████| 100/100 [00:00<00:00, 31998.05it/s]
100%|██████████| 100/100 [00:00<00:00, 412.61it/s]
5961it [00:00, 606217.93it/s]
100%|██████████| 100/100 [00:00<00:00, 261490.27it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-6.989173e-04,-1.215975e-07,-5.820527e-07,-2.714783e-03,-2.409665e-03,-4.996264e-07,-3.746016e-07,-0.001747,-7.253206e-04,...,1.031954e-03,-2.638127e-07,-3.708557e-07,-1.707273e-07,-2.444888e-07,1.530360e-02,-1.888737e-03,-2.593458e-03,-1.281061e-07,-2.334988e-07
1,-6.989173e-04,,-1.459864e-07,-1.053177e-03,-1.745920e-03,-1.499757e-03,-7.607180e-07,-7.250303e-03,-0.001231,-5.057551e-07,...,6.155383e-04,-1.682474e-03,-4.976316e-07,-8.644561e-04,-1.378408e-03,1.352007e-03,6.421312e-02,-1.646144e-03,-1.554694e-07,-1.403553e-03
2,-1.215975e-07,-1.459864e-07,,-1.599596e-07,3.828034e-02,1.277442e-02,-3.410877e-02,-1.580365e-07,0.000000,-1.506815e-07,...,-1.122744e-02,0.000000e+00,-1.618897e-07,0.000000e+00,0.000000e+00,-1.874304e-02,-1.333744e-07,1.362174e-02,0.000000e+00,0.000000e+00
3,-5.820527e-07,-1.053177e-03,-1.599596e-07,,-4.048495e-03,-3.405359e-03,-9.848728e-07,-5.940477e-07,-0.003016,-1.114301e-03,...,1.380688e-03,-5.499653e-07,-5.846823e-07,-2.573989e-07,-4.721671e-07,2.841881e-02,-2.710200e-03,-3.784370e-03,-1.714161e-07,-4.328246e-07
4,-2.714783e-03,-1.745920e-03,3.828034e-02,-4.048495e-03,,-3.076777e-02,3.628413e-02,-1.968619e-03,-0.004707,-1.830543e-03,...,-1.523407e-07,-6.351911e-03,4.855540e-02,-3.342044e-03,-5.249751e-03,-3.270057e-07,-3.799905e-02,-3.380218e-02,4.076905e-02,-8.974132e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.530360e-02,1.352007e-03,-1.874304e-02,2.841881e-02,-3.270057e-07,-5.240353e-07,-9.653493e-02,3.348975e-03,0.009211,1.458216e-03,...,-3.089294e-01,2.822959e-02,-4.078821e-02,2.076499e-02,4.730605e-02,,1.107263e-03,-5.990702e-07,-2.036776e-02,1.200331e-02
96,-1.888737e-03,6.421312e-02,-1.333744e-07,-2.710200e-03,-3.799905e-02,-3.313812e-02,-6.103462e-07,-8.879950e-04,-0.003093,8.141389e-02,...,5.592588e-04,-3.991565e-03,-4.285618e-07,-2.282913e-03,-3.398593e-03,1.107263e-03,,-3.604418e-02,-1.412456e-07,-5.543857e-02
97,-2.593458e-03,-1.646144e-03,1.362174e-02,-3.784370e-03,-3.380218e-02,-4.000534e-07,1.890149e-01,-1.842657e-03,-0.004353,-1.721154e-03,...,-2.921973e-07,-5.724789e-03,2.243364e-02,-3.160004e-03,-4.813960e-03,-5.990702e-07,-3.604418e-02,,1.446006e-02,-2.181563e-01
98,-1.281061e-07,-1.554694e-07,0.000000e+00,-1.714161e-07,4.076905e-02,1.350888e-02,-3.735829e-02,-1.692095e-07,0.000000,-1.608055e-07,...,-1.179085e-02,0.000000e+00,-1.736345e-07,0.000000e+00,0.000000e+00,-2.036776e-02,-1.412456e-07,1.446006e-02,,0.000000e+00


np.nanmean(norm_err)=np.float64(-0.0013234660089533488)
    np.nanmean(np.abs(norm_err))=np.float64(0.011186410191936924)
    np.nanmedian(norm_err)=np.float64(-3.483822125253808e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0012132841124405714)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the identical comparison and report once per dstream algorithm rather
# than repeating the display/print boilerplate for each one.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # nan-aware reducers: norm_err holds NaN entries (see displayed diagonal)
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34932.16it/s]
100%|██████████| 100/100 [00:00<00:00, 415.96it/s]
5953it [00:00, 629219.59it/s]
100%|██████████| 100/100 [00:00<00:00, 339344.98it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32398.46it/s]
100%|██████████| 100/100 [00:00<00:00, 441.31it/s]
5967it [00:00, 587704.87it/s]
100%|██████████| 100/100 [00:00<00:00, 247890.31it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.130405e-07,-1.036687e-07,-1.165318e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.097411e-07,-1.038125e-07,0.000000e+00,...,-1.038753e-07,-1.036683e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.066715e-07,0.000000e+00,0.000000e+00,-1.065525e-07
1,-1.130405e-07,,-2.070276e-07,-2.257866e-07,-1.037619e-07,-1.063914e-07,-1.127736e-07,-2.191351e-07,-2.073144e-07,-1.066632e-07,...,-2.074396e-07,-2.070269e-07,-1.035095e-07,-1.096161e-07,-1.037076e-07,-1.128552e-07,-2.130151e-07,-1.036454e-07,-1.037110e-07,-2.127778e-07
2,-1.036687e-07,-2.070276e-07,,-2.070898e-07,-1.199000e-07,-1.033625e-07,-1.034442e-07,-2.069457e-07,-2.395205e-07,-1.036190e-07,...,-2.255177e-07,-2.468748e-07,-1.159296e-07,-1.035162e-07,-1.064520e-07,-1.035128e-07,-2.069442e-07,-1.094385e-07,-1.198320e-07,-2.067202e-07
3,-1.165318e-07,-2.257866e-07,-2.070898e-07,,-1.037932e-07,-1.064243e-07,-1.477701e-07,-2.192047e-07,-2.073768e-07,-1.066962e-07,...,-2.075020e-07,-2.070891e-07,-1.035406e-07,-1.096510e-07,-1.037388e-07,-1.163350e-07,-2.130809e-07,-1.036766e-07,-1.037422e-07,-2.128435e-07
4,0.000000e+00,-1.037619e-07,-1.199000e-07,-1.037932e-07,,0.000000e+00,0.000000e+00,-1.037208e-07,-1.239961e-07,0.000000e+00,...,-1.130533e-07,-1.198996e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.037200e-07,0.000000e+00,0.000000e+00,-1.036075e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-1.128552e-07,-1.035128e-07,-1.163350e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.095665e-07,-1.036563e-07,0.000000e+00,...,-1.037189e-07,-1.035125e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.065065e-07,0.000000e+00,0.000000e+00,-1.063879e-07
96,-1.066715e-07,-2.130151e-07,-2.069442e-07,-2.130809e-07,-1.037200e-07,-1.093971e-07,-1.064339e-07,-2.129283e-07,-2.072308e-07,-1.096845e-07,...,-2.073558e-07,-2.069435e-07,-1.034678e-07,-1.065100e-07,-1.036657e-07,-1.065065e-07,,-1.036036e-07,-1.036691e-07,-2.252484e-07
97,0.000000e+00,-1.036454e-07,-1.094385e-07,-1.036766e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036043e-07,-1.095988e-07,0.000000e+00,...,-1.096688e-07,-1.094381e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036036e-07,,0.000000e+00,-1.034913e-07
98,0.000000e+00,-1.037110e-07,-1.198320e-07,-1.037422e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.036699e-07,-1.239233e-07,0.000000e+00,...,-1.129929e-07,-1.198316e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036691e-07,0.000000e+00,,-1.035567e-07


np.nanmean(norm_err)=np.float64(-9.391600134785824e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.391600134785824e-08)
    np.nanmedian(norm_err)=np.float64(-1.0375043828076776e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0375043828076776e-07)
    
