In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record the run timestamp, interpreter and machine details, and the versions
# of the imported packages so the outputs below can be tied to an environment.
%watermark -diwmuv -iv


Last updated: 2025-07-13T00:25:50.495370+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1030-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
hstrat                            : 1.20.10
downstream                        : 1.14.3
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# NOTE(review): presumably the output subdirectory handed to teeplot calls;
# nothing in the visible cells reads this name — confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _resolve_dstream_algo(qualified_name: str):
    """Look up a dstream algorithm from a dotted name like "dstream.tilted_algo".

    Walks attributes on the imported `dstream` package rather than calling
    `eval`, so only objects reachable from `dstream` can be resolved.

    Raises
    ------
    ValueError
        If the name is not of the form "dstream.<attr>[.<attr>...]".
    AttributeError
        If an attribute along the path does not exist.
    """
    root, *attrs = str(qualified_name).strip().split(".")
    if root != "dstream" or not attrs:
        raise ValueError(
            "expected a name of the form 'dstream.<algo>', "
            f"got {qualified_name!r}"
        )
    resolved = dstream
    for attr in attrs:
        resolved = getattr(resolved, attr)
    return resolved


def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build the pairwise phylogenetic distance matrix of a reconstructed tree.

    The tree is converted to dendropy via `apc.RosettaTree`; the result is
    returned as a DataFrame sorted by label along both axes so that two
    matrices built from the same taxa can be compared elementwise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table
    # (presumably a nested mapping keyed by taxon label) — confirm it is
    # stable across dendropy versions.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A subsample of genomes is deserialized into surfaces, their storage is
    shuffled, and a tree is reconstructed from the same population with both
    `hstrat.build_tree_trie` and `hstrat.build_tree_searchtable`. The pairwise
    distance matrices of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column plus `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset`, and `dstream_T_bitwidth`, each holding a single
        unique value. The frame itself is not modified.
    dstream_algo : str
        Dotted name of the dstream algorithm, e.g. "dstream.tilted_algo".
        This argument is what gets used; any `dstream_algo` column already
        present in `raw_genome_df` is ignored.
    sample_size : int, default 100
        Maximum number of genomes to reconstruct from (kept small so that the
        dendropy/naive path stays tractable). Clamped to the frame length.
    seed : int, optional
        Seed for row sampling and storage shuffling. When None, numpy's global
        random state is used, as before.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / mean(naive, shortcut)` pairwise
        distances, ordered by sorted taxon label. Entries where both distances
        are zero (such as the diagonal) are NaN.

    Raises
    ------
    ValueError
        If the frame is empty, a configuration column is not single-valued,
        `dstream_algo` is malformed, or the two reconstructions disagree on
        taxon labels.
    """
    if seed is None:
        # legacy behaviour: draw from numpy's global random state
        sample_state = None
        shuffle = np.random.shuffle
    else:
        rng = np.random.default_rng(seed)
        sample_state = rng
        shuffle = rng.shuffle

    if len(raw_genome_df) == 0:
        raise ValueError("raw_genome_df has no rows to sample")

    # sample to size dendropy/naive can handle
    sampled_df = raw_genome_df.sample(
        min(sample_size, len(raw_genome_df)),
        random_state=sample_state,
    )
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    config_columns = (
        "dstream_S",
        "dstream_storage_bitoffset",
        "dstream_storage_bitwidth",
        "dstream_T_bitoffset",
        "dstream_T_bitwidth",
    )
    # `.unique().item()` doubles as validation: it raises unless the column
    # holds exactly one distinct value
    surf_kwargs = {
        column: sampled_df[column].unique().item()
        for column in config_columns
    }
    surf_kwargs["dstream_algo"] = _resolve_dstream_algo(dstream_algo)

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)
    # the arithmetic below is positional, so make sure both matrices are
    # laid out over the same labels before dropping down to raw arrays
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut reconstructions disagree on taxon labels"
        )

    naive_vals = naive_dist.values
    shortcut_vals = shortcut_dist.values
    # 0/0 (e.g. on the diagonal) is expected and intentionally left as NaN for
    # the nan-aware summaries downstream; silence the RuntimeWarning only
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_vals - shortcut_vals) / (
            naive_vals / 2 + shortcut_vals / 2
        )


In [6]:
# Load the "sample" genome dataset, then report the reconstruction error
# matrix and its summary statistics once per dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # the continuation lines of this literal carry their own four-space
    # indent; it is part of the printed text, not code indentation
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10526.55it/s]
100%|██████████| 100/100 [00:00<00:00, 371.50it/s]
6148it [00:00, 669920.53it/s]
100%|██████████| 100/100 [00:00<00:00, 238719.64it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.137407,0.000000,0.000000,0.311626,0.000000,0.000000,0.269957,0.372383,-0.072019,...,-0.112822,-0.146262,-0.076637,-0.098872,-0.083615,0.000000,-0.142254,0.044628,0.000000,-0.165694
1,-0.137407,,-0.189717,-0.176040,-0.110334,-0.093011,-0.115211,-0.090536,-0.143504,0.000000,...,0.000000,0.000000,-0.039086,-0.061741,-0.045248,0.435170,-0.158005,0.000000,0.501424,0.000000
2,0.000000,-0.189717,,0.000000,0.369343,0.000000,0.000000,0.312223,0.457887,-0.080773,...,-0.135895,-0.187541,-0.086627,-0.116154,-0.095651,0.000000,-0.181001,0.061037,0.000000,-0.220735
3,0.000000,-0.176040,0.000000,,0.223978,0.000000,0.000000,0.192024,0.271785,-0.078817,...,-0.130450,-0.177328,-0.084382,-0.112153,-0.092921,0.000000,-0.171469,0.056777,0.000000,-0.206721
4,0.311626,-0.110334,0.369343,0.223978,,0.157093,0.280949,0.000000,0.000000,-0.065682,...,-0.098011,-0.336709,-0.069502,-0.087309,-0.075193,0.000000,-0.119487,0.036012,0.000000,-0.290981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.435170,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.045761,...,0.096425,0.174305,0.183177,0.285464,0.211271,,0.691114,-0.326231,0.000000,0.256165
96,-0.142254,-0.158005,-0.181001,-0.171469,-0.119487,-0.103790,-0.123745,-0.101466,-0.147103,-0.213659,...,-0.445210,-0.167576,0.213206,0.365700,0.252240,0.691114,,0.000000,0.310805,-0.220154
97,0.044628,0.000000,0.061037,0.056777,0.036012,0.030455,0.037571,0.029658,0.046557,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,-0.326231,0.000000,,-0.378768,0.000000
98,0.000000,0.501424,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.046405,...,0.099333,0.184046,0.088942,0.136419,0.102133,0.000000,0.310805,-0.378768,,0.277770


np.nanmean(norm_err)=np.float64(-0.038373862288034175)
    np.nanmean(np.abs(norm_err))=np.float64(0.15723881570560713)
    np.nanmedian(norm_err)=np.float64(-0.045768652649526745)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0996384146195439)
    


100%|██████████| 100/100 [00:00<00:00, 32974.09it/s]
100%|██████████| 100/100 [00:00<00:00, 414.81it/s]
5986it [00:00, 557551.55it/s]
100%|██████████| 100/100 [00:00<00:00, 292693.93it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-4.559287e-07,7.280505e-02,-8.441643e-07,1.168410e-03,-3.473546e-03,-3.568436e-07,-3.021420e-03,1.341856e-01,-0.000001,...,1.087265e-01,-1.937796e-07,1.325044e-03,-4.021642e-07,-3.853754e-07,-4.411186e-03,-0.000002,-3.263962e-03,-2.725001e-03,-1.701423e-01
1,-4.559287e-07,,7.676714e-04,-4.162567e-07,1.258686e-02,1.708521e-03,-1.909174e-07,-8.552854e-04,1.108036e-03,0.021549,...,9.867087e-04,1.085470e-02,1.370011e-02,-2.031667e-07,-2.559613e-07,-1.080143e-03,0.004806,-8.994266e-04,1.504826e-03,-9.312146e-04
2,7.280505e-02,7.676714e-04,,-9.131409e-02,-6.489992e-03,1.027782e-03,-9.948602e-03,-1.195688e-07,-3.226040e-07,0.001221,...,-1.453033e-07,7.000833e-04,-6.980926e-03,-4.358490e-02,2.186507e-02,-2.911786e-07,0.001334,-1.248318e-07,9.270615e-04,-1.285751e-07
3,-8.441643e-07,-4.162567e-07,-9.131409e-02,,-1.125714e-02,-3.596014e-07,-3.173805e-07,4.371665e-03,-4.562557e-01,-0.000001,...,-2.442844e-01,-1.792571e-07,-1.265100e-02,-3.790201e-07,-3.450515e-07,6.157658e-03,-0.000001,4.692782e-03,-5.832361e-07,3.135921e-03
4,1.168410e-03,1.258686e-02,-6.489992e-03,-1.125714e-02,,2.530777e-03,-8.936759e-02,-1.241496e-07,-8.990669e-03,0.056620,...,-8.121375e-03,3.118978e-02,-2.695904e-07,-1.102754e-02,7.072985e-04,-3.048756e-07,0.007172,-1.298331e-07,2.270713e-03,-1.338872e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-4.411186e-03,-1.080143e-03,-2.911786e-07,6.157658e-03,-3.048756e-07,-1.126118e-01,5.506006e-03,-1.946624e-07,-4.534159e-07,-0.002261,...,-1.963261e-07,-9.506732e-04,-3.367766e-07,5.966550e-03,-2.147242e-03,,-0.002680,-1.883672e-07,-9.490057e-02,-1.834058e-07
96,-2.020743e-06,4.805697e-03,1.334085e-03,-1.420645e-06,7.172024e-03,5.862068e-03,-5.431971e-07,-1.622305e-03,2.861766e-03,-0.000005,...,2.171567e-03,-2.395479e-07,8.389153e-03,-6.556730e-07,-4.528201e-07,-2.680246e-03,,-1.788826e-03,4.003764e-03,-1.919118e-03
97,-3.263962e-03,-8.994266e-04,-1.248318e-07,4.692782e-03,-1.298331e-07,-1.456765e-01,4.304583e-03,0.000000e+00,-1.800795e-07,-0.001592,...,0.000000e+00,-8.077852e-04,-1.412270e-07,4.581013e-03,-1.833483e-03,-1.883672e-07,-0.001789,,-3.360604e-01,0.000000e+00
98,-2.725001e-03,1.504826e-03,9.270615e-04,-5.832361e-07,2.270713e-03,-3.602891e-07,-2.589141e-07,-6.964466e-02,1.473764e-03,0.003315,...,1.266555e-03,1.317656e-03,2.518644e-03,-2.819695e-07,-1.273547e-03,-9.490057e-02,0.004004,-3.360604e-01,,-2.746958e-03


np.nanmean(norm_err)=np.float64(-0.0011248923355853435)
    np.nanmean(np.abs(norm_err))=np.float64(0.01012883764356253)
    np.nanmedian(norm_err)=np.float64(-2.1008426445274583e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(2.4018522211056195e-06)
    


In [7]:
# Load the "tail" genome dataset, then report the reconstruction error
# matrix and its summary statistics once per dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # the continuation lines of this literal carry their own four-space
    # indent; it is part of the printed text, not code indentation
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 34729.68it/s]
100%|██████████| 100/100 [00:00<00:00, 479.42it/s]
5943it [00:00, 689155.34it/s]
100%|██████████| 100/100 [00:00<00:00, 419011.39it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32716.88it/s]
100%|██████████| 100/100 [00:00<00:00, 1021.23it/s]
5959it [00:00, 651696.33it/s]
100%|██████████| 100/100 [00:00<00:00, 423667.07it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.095705e-07,-1.037214e-07,0.000000e+00,-1.131426e-07,-1.038635e-07,-1.038230e-07,0.000000e+00,0.000000e+00,-1.037689e-07,...,0.000000e+00,0.000000e+00,-1.068283e-07,0.000000e+00,-1.037432e-07,-1.038469e-07,0.000000e+00,-1.036320e-07,-1.098590e-07,0.000000e+00
1,-1.095705e-07,,-2.068456e-07,-1.036165e-07,-2.190968e-07,-2.071281e-07,-2.070475e-07,-1.092605e-07,-1.092622e-07,-2.069400e-07,...,-1.064944e-07,-1.064610e-07,-2.130230e-07,-1.093691e-07,-2.068889e-07,-2.070951e-07,-1.033197e-07,-2.066678e-07,-2.474684e-07,-1.062173e-07
2,-1.037214e-07,-2.068456e-07,,-1.163734e-07,-2.074033e-07,-2.131207e-07,-2.191548e-07,-1.034436e-07,-1.034452e-07,-2.129215e-07,...,-1.037056e-07,-1.036739e-07,-2.074436e-07,-1.035410e-07,-2.254479e-07,-2.256928e-07,-1.063016e-07,-2.251854e-07,-2.073597e-07,-1.034428e-07
3,0.000000e+00,-1.036165e-07,-1.163734e-07,,-1.038965e-07,-1.067660e-07,-1.097949e-07,0.000000e+00,0.000000e+00,-1.066661e-07,...,0.000000e+00,0.000000e+00,-1.039167e-07,0.000000e+00,-1.129541e-07,-1.130771e-07,0.000000e+00,-1.128224e-07,-1.038746e-07,0.000000e+00
4,-1.131426e-07,-2.190968e-07,-2.074033e-07,-1.038965e-07,,-2.076874e-07,-2.076063e-07,-1.128122e-07,-1.237973e-07,-2.074983e-07,...,-1.067901e-07,-1.067565e-07,-2.136146e-07,-1.163730e-07,-2.074469e-07,-2.076542e-07,-1.035980e-07,-2.072246e-07,-2.196737e-07,-1.065115e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.038469e-07,-2.070951e-07,-2.256928e-07,-1.130771e-07,-2.076542e-07,-2.133856e-07,-2.194349e-07,-1.035685e-07,-1.035700e-07,-2.131860e-07,...,-1.038310e-07,-1.037993e-07,-2.076946e-07,-1.036660e-07,-2.326275e-07,,-1.064334e-07,-2.323481e-07,-2.076105e-07,-1.035676e-07
96,0.000000e+00,-1.033197e-07,-1.063016e-07,0.000000e+00,-1.035980e-07,-1.095067e-07,-1.064083e-07,0.000000e+00,0.000000e+00,-1.321392e-07,...,0.000000e+00,0.000000e+00,-1.036181e-07,0.000000e+00,-1.063245e-07,-1.064334e-07,,-1.062077e-07,-1.035762e-07,0.000000e+00
97,-1.036320e-07,-2.066678e-07,-2.251854e-07,-1.128224e-07,-2.072246e-07,-2.129320e-07,-2.189552e-07,-1.033547e-07,-1.033562e-07,-2.127332e-07,...,-1.036162e-07,-1.035846e-07,-2.072648e-07,-1.034519e-07,-2.393702e-07,-2.323481e-07,-1.062077e-07,,-2.071810e-07,-1.033539e-07
98,-1.098590e-07,-2.474684e-07,-2.073597e-07,-1.038746e-07,-2.196737e-07,-2.076436e-07,-2.075626e-07,-1.095474e-07,-1.095491e-07,-2.074546e-07,...,-1.067669e-07,-1.067334e-07,-2.135683e-07,-1.096566e-07,-2.074032e-07,-2.076105e-07,-1.035762e-07,-2.071810e-07,,-1.064884e-07


np.nanmean(norm_err)=np.float64(-9.818004285435153e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.818004285435153e-08)
    np.nanmedian(norm_err)=np.float64(-1.0382134713285151e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0382134713285151e-07)
    
