In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-10T15:27:41.544640+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
hstrat                            : 1.20.10
numpy                             : 2.1.2
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Identifier for this notebook's outputs; the bare expression below echoes it.
# NOTE(review): presumably meant as the teeplot output subdirectory, but no
# use of this name is visible in this notebook -- confirm or remove.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _sorted_distance_matrix(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return pairwise phylogenetic distances for the tree in `tree_df`.

    The tree is converted to dendropy through `apc.RosettaTree`. Rows and
    columns of the resulting table are sorted by label so that matrices built
    from different reconstructions of the same taxa line up element-wise.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy table;
    # this may break across dendropy versions.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstructions.

    A random subset of genomes is deserialized into surfaces, the stored
    contents of each surface are shuffled, and two trees are built from the
    same population: one with `hstrat.build_tree_trie` ("naive") and one with
    `hstrat.build_tree_searchtable` ("shortcut"). Pairwise phylogenetic
    distances from both trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns `data_hex`, `dstream_S`,
        `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`. The frame itself is
        not modified.
    dstream_algo : str
        Expression naming the dstream algorithm, e.g. `"dstream.tilted_algo"`.
        It is evaluated with only the name `dstream` in scope.
    sample_size : int, default 100
        Maximum number of genomes to compare. Frames with fewer rows are used
        in full instead of raising.
    seed : int, optional
        Seeds both the row sampling and the storage shuffling. When omitted,
        NumPy's global random state is used.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / ((naive + shortcut) / 2)`, with
        rows and columns ordered by sorted taxon label. Entries whose
        distances are zero in both trees (e.g., the diagonal) are NaN.

    Raises
    ------
    ValueError
        Raised by `.item()` if a surface-configuration column does not hold
        exactly one distinct value among the sampled rows.
    """
    rng = None if seed is None else np.random.RandomState(seed)

    # cap at a size dendropy/naive can handle, without failing on small inputs
    num_taxa = min(sample_size, len(raw_genome_df))
    sampled_df = raw_genome_df.sample(num_taxa, random_state=rng)
    sampled_df["taxon_label"] = np.arange(len(sampled_df)).astype(str)

    # every sampled genome must share one surface configuration
    surface_kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(review): eval() executes arbitrary expressions; only pass trusted,
    # hard-coded algorithm names here.
    surface_kwargs["dstream_algo"] = eval(
        str(dstream_algo),
        {"dstream": dstream},
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]

    # ensure synthetic data: scramble each surface's stored contents in place
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _sorted_distance_matrix(naive_df).values
    shortcut_dist = _sorted_distance_matrix(shortcut_df).values

    # 0/0 entries (at least the diagonal) are expected and are skipped by the
    # nan-aware statistics downstream, so silence the RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same consistency report for each dstream algorithm on this dataset.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        algo_name,
    )
    display(pd.DataFrame(norm_err))
    # nan-aware statistics: the matrix holds NaN wherever both distances are 0
    print(
        f"{np.nanmean(norm_err)=}\n"
        f"    {np.nanmean(np.abs(norm_err))=}\n"
        f"    {np.nanmedian(norm_err)=}\n"
        f"    {np.nanmedian(np.abs(norm_err))=}\n"
        "    ",
    )


100%|██████████| 100/100 [00:00<00:00, 10475.28it/s]
100%|██████████| 100/100 [00:00<00:00, 383.79it/s]
6136it [00:00, 680889.18it/s]
100%|██████████| 100/100 [00:00<00:00, 258907.65it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.213476,-0.034606,-0.163697,-0.212900,-0.178756,0.000000,-0.053012,0.000000,-0.055816,...,0.000000,0.000000,-0.152793,-0.066385,-0.221628,0.000000,-0.205311,-0.169811,0.634210,0.094489
1,-0.213476,,0.482822,-0.164595,0.000000,0.000000,0.000000,0.561223,0.000000,0.337669,...,-0.209668,0.111349,0.181473,0.071652,-0.501025,0.000000,-0.270072,-0.170778,-0.518184,-0.127279
2,-0.034606,0.482822,,-0.044057,0.481481,0.402465,0.000000,0.197140,0.000000,-0.135401,...,-0.053253,0.088572,0.238158,0.000000,-0.035797,0.000000,-0.041690,-0.045377,-0.044620,-0.035659
3,-0.163697,-0.164595,-0.044057,,-0.164024,-0.131717,0.262191,-0.078960,0.552386,-0.085346,...,-0.518744,0.000000,-0.502674,-0.112810,0.208879,0.436693,-0.223581,0.000000,-0.253312,-0.375664
4,-0.212900,0.000000,0.481481,-0.164024,,0.000000,0.000000,0.558736,0.000000,0.336228,...,-0.208743,0.110827,0.180779,0.071328,-0.499437,0.000000,-0.269150,-0.170163,-0.516064,-0.126937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.000000,0.000000,0.436693,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.247826,,0.000000,0.483092,0.000000,0.000000
96,-0.205311,-0.270072,-0.041690,-0.223581,-0.269150,-0.216799,0.000000,-0.071667,0.000000,-0.076888,...,0.000000,0.000000,-0.203724,-0.098490,-0.283252,0.000000,,-0.235144,0.000000,0.935467
97,-0.169811,-0.170778,-0.045377,0.000000,-0.170163,-0.135647,0.287020,-0.083301,0.607763,-0.090441,...,-0.567284,0.000000,-0.778480,-0.121885,0.840910,0.483092,-0.235144,,-0.268257,-0.558972
98,0.634210,-0.518184,-0.044620,-0.253312,-0.516064,-0.400245,0.000000,-0.080788,0.000000,-0.087485,...,0.000000,0.000000,-0.228120,-0.116578,-0.548807,0.000000,0.000000,-0.268257,,0.136200


np.nanmean(norm_err)=np.float64(-0.03389349429367191)
    np.nanmean(np.abs(norm_err))=np.float64(0.15903861635373595)
    np.nanmedian(norm_err)=np.float64(-0.02941667763339309)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0908754858128166)
    


100%|██████████| 100/100 [00:00<00:00, 35953.23it/s]
100%|██████████| 100/100 [00:00<00:00, 421.74it/s]
5973it [00:00, 603880.29it/s]
100%|██████████| 100/100 [00:00<00:00, 285715.53it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.004931,-1.411272e-07,1.780840e-02,0.002434,-3.051193e-07,3.953983e-03,-2.285251e-07,1.844092e-02,4.913324e-03,...,0.002473,-7.288994e-07,0.019127,-4.348744e-07,4.709537e-03,-3.375595e-07,-4.785386e-07,1.593235e-02,-3.124300e-07,2.424951e-03
1,4.930896e-03,,-4.253421e-03,-5.999830e-02,-0.044027,-4.686603e-03,2.370524e-02,1.070219e-02,7.164350e-03,3.343129e-02,...,-0.038636,-2.329457e-02,-0.122280,-7.497943e-03,3.115237e-02,-5.330324e-03,5.521594e-03,-5.179187e-02,-4.828587e-03,6.080746e-03
2,-1.411272e-07,-0.004253,,-3.053524e-03,-0.002678,-4.188276e-02,0.000000e+00,0.000000e+00,-1.650591e-07,-1.732758e-07,...,-0.002709,1.589125e-01,-0.003206,-5.961359e-02,-1.690829e-07,-1.627625e-02,-1.485167e-07,-2.824631e-03,-1.372799e-07,0.000000e+00
3,1.780840e-02,-0.059998,-3.053524e-03,,-0.050850,-3.270592e-03,3.223564e-03,3.057414e-03,1.820140e-02,3.833755e-03,...,-0.051579,-8.919317e-03,-0.022153,-4.429727e-03,3.708550e-03,-3.571611e-03,2.545627e-02,-1.606430e-07,-3.339115e-03,2.129129e-03
4,2.434409e-03,-0.044027,-2.678168e-03,-5.085046e-02,,-2.843718e-03,5.302020e-03,3.620938e-02,2.877215e-03,6.096934e-03,...,-0.075115,-6.826261e-03,-0.034634,-3.681278e-03,5.938168e-03,-3.068587e-03,2.570146e-03,-4.675554e-02,-2.895380e-03,1.505441e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-3.375595e-07,-0.005330,-1.627625e-02,-3.571611e-03,-0.003069,-1.023425e-01,-1.847432e-07,-1.758427e-07,-4.083709e-07,-4.338187e-07,...,-0.003109,6.098634e-02,-0.003782,-3.635896e-07,-4.207574e-07,,-3.589166e-07,-3.262380e-03,-1.778807e-02,-1.249302e-07
96,-4.785386e-07,0.005522,-1.485167e-07,2.545627e-02,0.002570,-3.224633e-07,4.324977e-03,-2.485503e-07,2.047188e-02,5.499569e-03,...,0.002613,-8.363627e-07,0.114772,-4.709791e-07,5.245508e-03,-3.589166e-07,,2.260382e-02,-3.306399e-07,2.559607e-03
97,1.593235e-02,-0.051792,-2.824631e-03,-1.606430e-07,-0.046756,-3.009392e-03,2.855724e-03,2.724549e-03,1.590395e-02,3.324484e-03,...,-0.047371,-7.575805e-03,-0.019963,-3.963712e-03,3.229917e-03,-3.262380e-03,2.260382e-02,,-3.067308e-03,1.962154e-03
98,-3.124300e-07,-0.004829,-1.372799e-07,-3.339115e-03,-0.002895,-4.541401e-02,-1.697945e-07,-1.622467e-07,-3.721580e-07,-3.931766e-07,...,-0.002932,1.853747e-01,-0.003522,-6.651496e-02,-3.824176e-07,-1.778807e-02,-3.306399e-07,-3.067308e-03,,-1.179104e-07


np.nanmean(norm_err)=np.float64(-0.0006730261283963386)
    np.nanmean(np.abs(norm_err))=np.float64(0.012550363171803039)
    np.nanmedian(norm_err)=np.float64(-1.5978089433549816e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.002693181609371509)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same consistency report for each dstream algorithm on this dataset.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        algo_name,
    )
    display(pd.DataFrame(norm_err))
    # nan-aware statistics: the matrix holds NaN wherever both distances are 0
    print(
        f"{np.nanmean(norm_err)=}\n"
        f"    {np.nanmean(np.abs(norm_err))=}\n"
        f"    {np.nanmedian(norm_err)=}\n"
        f"    {np.nanmedian(np.abs(norm_err))=}\n"
        "    ",
    )


100%|██████████| 100/100 [00:00<00:00, 34940.89it/s]
100%|██████████| 100/100 [00:00<00:00, 469.72it/s]
5949it [00:00, 567604.97it/s]
100%|██████████| 100/100 [00:00<00:00, 298314.65it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 22687.86it/s]
100%|██████████| 100/100 [00:00<00:00, 1001.23it/s]
5926it [00:00, 648391.65it/s]
100%|██████████| 100/100 [00:00<00:00, 366314.76it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.038667e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036160e-07,0.000000e+00,0.000000e+00,-1.037423e-07,...,0.000000e+00,-1.036915e-07,0.000000e+00,-1.036843e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036159e-07,-1.038706e-07
1,-1.038667e-07,,-1.039707e-07,-1.039382e-07,-1.038777e-07,-1.097733e-07,-2.193628e-07,-1.036653e-07,-1.066713e-07,-2.134994e-07,...,-1.037889e-07,-2.329374e-07,-1.037723e-07,-2.195159e-07,-1.239031e-07,-1.039383e-07,-1.037261e-07,-1.036325e-07,-2.193626e-07,-2.137710e-07
2,0.000000e+00,-1.039707e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.037195e-07,0.000000e+00,0.000000e+00,-1.038461e-07,...,0.000000e+00,-1.037952e-07,0.000000e+00,-1.037880e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.037194e-07,-1.039746e-07
3,0.000000e+00,-1.039382e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-1.036872e-07,0.000000e+00,0.000000e+00,-1.038137e-07,...,0.000000e+00,-1.037628e-07,0.000000e+00,-1.037556e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036871e-07,-1.039421e-07
4,0.000000e+00,-1.038777e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.036269e-07,0.000000e+00,0.000000e+00,-1.037533e-07,...,0.000000e+00,-1.037025e-07,0.000000e+00,-1.036953e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036269e-07,-1.038816e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-1.039383e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036872e-07,0.000000e+00,0.000000e+00,-1.038137e-07,...,0.000000e+00,-1.037629e-07,0.000000e+00,-1.037557e-07,0.000000e+00,,0.000000e+00,0.000000e+00,-1.036872e-07,-1.039422e-07
96,0.000000e+00,-1.037261e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.034760e-07,0.000000e+00,0.000000e+00,-1.036020e-07,...,0.000000e+00,-1.035514e-07,0.000000e+00,-1.035442e-07,0.000000e+00,0.000000e+00,,0.000000e+00,-1.034759e-07,-1.037299e-07
97,0.000000e+00,-1.036325e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.033829e-07,0.000000e+00,0.000000e+00,-1.035086e-07,...,0.000000e+00,-1.034581e-07,0.000000e+00,-1.034509e-07,0.000000e+00,0.000000e+00,0.000000e+00,,-1.033828e-07,-1.036363e-07
98,-1.036159e-07,-2.193626e-07,-1.037194e-07,-1.036871e-07,-1.036269e-07,-1.161616e-07,-2.471563e-07,-1.034155e-07,-1.064067e-07,-2.129695e-07,...,-1.035384e-07,-2.189719e-07,-1.035219e-07,-2.254253e-07,-1.093769e-07,-1.036872e-07,-1.034759e-07,-1.033828e-07,,-2.132398e-07


np.nanmean(norm_err)=np.float64(-1.1534751958997246e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1534751958997246e-07)
    np.nanmedian(norm_err)=np.float64(-1.0647447314667982e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0647447314667982e-07)
    
