In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-18T00:24:21.857003+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
numpy                             : 2.1.2
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame, dstream_algo: str
) -> pd.DataFrame:
    raw_genome_df = raw_genome_df.sample(
        100
    )  # sample to size dendropy/naive can handle
    # raw_genome_df = raw_genome_df[raw_genome_df["layer"] > 1700].sample(20).copy()
    raw_genome_df["taxon_label"] = np.arange(len(raw_genome_df)).astype(str)
    raw_genome_df["dstream_algo"] = dstream_algo

    kwargs = dict(
        dstream_algo=eval(
            raw_genome_df["dstream_algo"].unique().astype(str).item(),
            {"dstream": dstream},
        ),
        dstream_S=raw_genome_df["dstream_S"].unique().item(),
        dstream_storage_bitoffset=raw_genome_df["dstream_storage_bitoffset"]
        .unique()
        .item(),
        dstream_storage_bitwidth=raw_genome_df["dstream_storage_bitwidth"]
        .unique()
        .item(),
        dstream_T_bitoffset=raw_genome_df["dstream_T_bitoffset"]
        .unique()
        .item(),
        dstream_T_bitwidth=raw_genome_df["dstream_T_bitwidth"].unique().item(),
    )

    population = [
        hstrat.surf_from_hex(
            genome_hex,
            **kwargs,
        )
        for genome_hex in tqdm(raw_genome_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )

    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=raw_genome_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = (
        pd.DataFrame(
            apc.RosettaTree(naive_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    shortcut_dist = (
        pd.DataFrame(
            apc.RosettaTree(shortcut_df)
            .as_dendropy.phylogenetic_distance_matrix()
            .as_data_table()
            ._data
        )
        .sort_index(axis=0)
        .sort_index(axis=1)
    )

    return (naive_dist.values - shortcut_dist.values) / (
        naive_dist.values / 2 + shortcut_dist.values / 2
    )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 10827.08it/s]
100%|██████████| 100/100 [00:00<00:00, 367.33it/s]
6146it [00:00, 676788.37it/s]
100%|██████████| 100/100 [00:00<00:00, 230077.02it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.216432,-0.228380,0.000000,-0.042740,-0.056599,0.000000,0.674669,0.480224,-0.176123,...,-0.060407,-0.040634,0.000000,-0.000005,-0.057516,0.000000,0.000000,0.000000,0.457507,0.000000
1,0.216432,,0.029428,0.178717,0.365439,0.525429,-0.095519,0.000000,0.000000,0.021689,...,0.574315,0.343307,0.052639,0.309821,0.536993,0.069381,0.064046,-0.087339,0.000000,-0.161969
2,-0.228380,0.029428,,-0.193064,-0.609941,-0.876135,0.000000,0.027510,0.020427,0.000000,...,-0.957370,-0.573077,0.442185,-0.174561,-0.895354,0.064967,0.196168,0.000000,0.019560,0.000000
3,0.000000,0.178717,-0.193064,,-0.038167,-0.048848,0.000000,0.544017,0.410117,-0.154349,...,-0.051658,-0.036478,0.000000,-0.000004,-0.049529,0.000000,0.000000,0.000000,0.393433,0.000000
4,-0.042740,0.365439,-0.609941,-0.038167,,-0.127421,0.000000,0.132478,0.108786,-0.127731,...,-0.137153,0.000000,0.000000,-0.000003,0.000000,0.000000,0.000000,0.000000,0.105535,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.069381,0.064967,0.000000,0.000000,0.000000,0.000000,0.059583,0.034029,0.040839,...,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,0.000000,0.031688,0.000000
96,0.000000,0.064046,0.196168,0.000000,0.000000,0.000000,0.000000,0.055605,0.032694,0.123016,...,0.000000,0.000000,0.327313,0.000000,0.000000,0.000000,,0.000000,0.030526,0.000000
97,0.000000,-0.087339,0.000000,0.000000,0.000000,0.000000,0.000000,-0.075602,-0.044092,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,-0.041138,0.000000
98,0.457507,0.000000,0.019560,0.393433,0.105535,0.133635,-0.042868,0.000000,0.000000,0.015810,...,0.140931,0.101037,0.027669,0.150225,0.135409,0.031688,0.030526,-0.041138,,-0.073955


np.nanmean(norm_err)=np.float64(0.024878473509046976)
    np.nanmean(np.abs(norm_err))=np.float64(0.09807628181243444)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.02345427300736594)
    


100%|██████████| 100/100 [00:00<00:00, 33088.55it/s]
100%|██████████| 100/100 [00:00<00:00, 472.57it/s]
5958it [00:00, 635610.52it/s]
100%|██████████| 100/100 [00:00<00:00, 302183.29it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-3.189725e-07,0.003363,0.000000e+00,0.000000e+00,0.000000e+00,1.286724e-01,-3.897368e-07,1.907566e-02,9.313617e-02,...,-1.431745e-01,2.332736e-02,2.272502e-02,0.000000e+00,-2.025969e-07,0.000000e+00,4.489517e-02,-4.400168e-07,-2.580921e-07,2.619885e-02
1,-3.189725e-07,,0.002542,-2.899213e-07,-2.113155e-07,-3.430183e-07,8.516680e-02,-4.846888e-07,1.463711e-02,6.799503e-02,...,-9.128753e-02,1.701707e-02,1.669422e-02,-1.737926e-07,-3.078613e-07,-3.422116e-07,2.619837e-02,-5.217620e-07,-3.679778e-07,1.849586e-02
2,3.363316e-03,2.542134e-03,,4.319800e-02,2.729505e-03,3.928232e-03,-2.745212e-03,-6.168305e-03,-2.070746e-03,-2.296546e-03,...,-1.759447e-07,-2.349077e-03,-2.312254e-03,-4.717067e-03,-2.119216e-03,6.569262e-02,-3.307961e-03,2.637248e-02,2.441173e-03,-2.515601e-03
3,0.000000e+00,-2.899213e-07,0.043198,,0.000000e+00,0.000000e+00,5.328947e-03,-4.215716e-07,3.254364e-03,3.855581e-03,...,-5.931417e-03,4.007249e-03,3.899783e-03,0.000000e+00,-2.108748e-07,0.000000e+00,7.990576e-03,-5.091146e-07,-2.692807e-07,4.521262e-03
4,0.000000e+00,-2.113155e-07,0.002730,0.000000e+00,,0.000000e+00,1.127132e-02,-2.711576e-07,7.789406e-03,8.889098e-03,...,-1.213515e-02,9.153921e-03,8.967022e-03,0.000000e+00,-1.650719e-07,0.000000e+00,1.470602e-02,-2.945770e-07,-2.021226e-07,1.001659e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-3.422116e-07,0.065693,0.000000e+00,0.000000e+00,0.000000e+00,6.442351e-03,-5.419959e-07,3.638372e-03,4.406552e-03,...,-7.343962e-03,4.605785e-03,4.464425e-03,0.000000e+00,-2.372419e-07,,1.078540e-02,-6.958213e-07,-3.138188e-07,5.298134e-03
96,4.489517e-02,2.619837e-02,-0.003308,7.990576e-03,1.470602e-02,3.232733e-02,-3.039177e-07,-3.699294e-02,2.177997e-01,0.000000e+00,...,-3.247681e-07,0.000000e+00,-2.320377e-07,-2.292958e-02,-3.208970e-03,1.078540e-02,,6.721265e-03,1.214490e-02,-2.676057e-07
97,-4.400168e-07,-5.217620e-07,0.026372,-5.091146e-07,-2.945770e-07,-6.338202e-07,4.732748e-03,-7.256818e-07,3.021819e-03,3.533486e-03,...,-5.202767e-03,3.660463e-03,3.570560e-03,-2.281202e-07,-3.901599e-07,-6.958213e-07,6.721265e-03,,-4.880967e-07,4.084656e-03
98,-2.580921e-07,-3.679778e-07,0.002441,-2.692807e-07,-2.021226e-07,-3.697480e-07,9.702974e-03,-4.555038e-07,7.006690e-03,7.884074e-03,...,-1.033688e-02,8.091705e-03,7.945295e-03,-1.661580e-07,-2.958222e-07,-3.138188e-07,1.214490e-02,-4.880967e-07,,8.758480e-03


np.nanmean(norm_err)=np.float64(0.00044942753027669074)
    np.nanmean(np.abs(norm_err))=np.float64(0.013545928287970225)
    np.nanmedian(norm_err)=np.float64(-2.1440794192511707e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.003163789450187947)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.tilted_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)

norm_err = make_norm_err_matrix(
    raw_genome_df_,
    "dstream.hybrid_0_steady_1_tilted_2_algo",
)
display(pd.DataFrame(norm_err))
print(
    f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
)


100%|██████████| 100/100 [00:00<00:00, 35267.00it/s]
100%|██████████| 100/100 [00:00<00:00, 471.41it/s]
5946it [00:00, 674984.62it/s]
100%|██████████| 100/100 [00:00<00:00, 347498.26it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 28856.58it/s]
100%|██████████| 100/100 [00:00<00:00, 1036.33it/s]
5950it [00:00, 683391.99it/s]
100%|██████████| 100/100 [00:00<00:00, 358794.18it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.068438e-07,-1.066183e-07,-1.038302e-07,-1.098948e-07,-2.197212e-07,-1.099491e-07,-2.076917e-07,-2.072588e-07,-2.073673e-07,...,-2.194412e-07,-1.065927e-07,-1.038335e-07,-1.039053e-07,-2.075556e-07,-1.099399e-07,-2.330912e-07,-1.097249e-07,-2.134725e-07,-1.066813e-07
1,-1.068438e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.067791e-07,0.000000e+00,-1.038400e-07,-1.036236e-07,-1.036779e-07,...,-1.066469e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.037720e-07,0.000000e+00,-1.067543e-07,0.000000e+00,-1.130564e-07,0.000000e+00
2,-1.066183e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-1.065538e-07,0.000000e+00,-1.036270e-07,-1.034114e-07,-1.034655e-07,...,-1.064221e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.035592e-07,0.000000e+00,-1.065291e-07,0.000000e+00,-1.128039e-07,0.000000e+00
3,-1.038302e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.037691e-07,0.000000e+00,-1.097010e-07,-1.197696e-07,-1.095200e-07,...,-1.036442e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.096251e-07,0.000000e+00,-1.037456e-07,0.000000e+00,-1.037228e-07,0.000000e+00
4,-1.098948e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.130820e-07,0.000000e+00,-1.038152e-07,-1.035989e-07,-1.036532e-07,...,-1.163791e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.037472e-07,0.000000e+00,-1.098001e-07,0.000000e+00,-1.067039e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.099399e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.165873e-07,0.000000e+00,-1.038555e-07,-1.036390e-07,-1.036933e-07,...,-1.129813e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.037874e-07,,-1.098451e-07,0.000000e+00,-1.067464e-07,0.000000e+00
96,-2.330912e-07,-1.067543e-07,-1.065291e-07,-1.037456e-07,-1.098001e-07,-2.195319e-07,-1.098543e-07,-2.075225e-07,-2.070903e-07,-2.071987e-07,...,-2.192524e-07,-1.065036e-07,-1.037490e-07,-1.038206e-07,-2.073866e-07,-1.098451e-07,,-1.096305e-07,-2.132938e-07,-1.065921e-07
97,-1.097249e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.129021e-07,0.000000e+00,-1.036636e-07,-1.034479e-07,-1.035020e-07,...,-1.161886e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.035958e-07,0.000000e+00,-1.096305e-07,,-1.065437e-07,0.000000e+00
98,-2.134725e-07,-1.130564e-07,-1.128039e-07,-1.037228e-07,-1.067039e-07,-2.133433e-07,-1.067551e-07,-2.074768e-07,-2.070448e-07,-2.071531e-07,...,-2.130793e-07,-1.095370e-07,-1.037261e-07,-1.037977e-07,-2.073410e-07,-1.067464e-07,-2.132938e-07,-1.065437e-07,,-1.238702e-07


np.nanmean(norm_err)=np.float64(-9.186309108009915e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.186309108009915e-08)
    np.nanmedian(norm_err)=np.float64(-1.0375635890308903e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0375635890308903e-07)
    
