In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
from IPython.display import display
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-09-21T00:23:14.296119+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
pandas                            : 2.2.3
hstrat                            : 1.20.10
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Subdirectory label for this notebook; the bare name on the last line echoes
# the value as the cell's output.
# NOTE(review): no cell visible in this notebook reads `teeplot_subdir` —
# presumably intended as the teeplot output subdirectory; confirm it is still
# needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _sorted_distance_matrix(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise distance table of `tree_df`, sorted on both axes."""
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE: `_data` is a private attribute of the data table object; there is
    # no public accessor used in this notebook, so this may break on upgrade.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare pairwise distances between two reconstructions of one sample.

    Draws `sample_size` rows from `raw_genome_df`, deserializes each row's
    `data_hex` into a surface via `hstrat.surf_from_hex`, shuffles every
    surface's storage in place, then builds one tree with
    `hstrat.build_tree_trie` ("naive") and one with
    `hstrat.build_tree_searchtable` ("shortcut") and compares their pairwise
    phylogenetic distance matrices.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Must provide `data_hex`, `dstream_S`, `dstream_storage_bitoffset`,
        `dstream_storage_bitwidth`, `dstream_T_bitoffset`, and
        `dstream_T_bitwidth` columns. Each `dstream_*` column must hold a
        single unique value within the sampled rows. The caller's frame is
        not modified.
    dstream_algo : str
        Python expression naming the algorithm, evaluated with only
        `dstream` supplied as a global (e.g., "dstream.tilted_algo").
    sample_size : int, default 100
        Number of rows to sample; kept small so the dendropy/naive pipeline
        can handle it.
    seed : int, optional
        If given, seeds both the row sampling and the storage shuffle so the
        result is reproducible. If None, the legacy unseeded behavior
        (pandas default sampling, global `np.random.shuffle`) is used.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / mean(naive, shortcut)`. Rows
        and columns follow the sorted order of the distance-table labels
        (presumably the string taxon labels, so the order is lexicographic
        -- confirm). Entries where both distances are zero, including the
        diagonal, are NaN.

    Raises
    ------
    ValueError
        If a `dstream_*` column is not single-valued in the sample, if
        `sample_size` exceeds the number of rows, or if the two distance
        matrices are not labelled identically.
    """
    rng = None if seed is None else np.random.default_rng(seed)
    shuffle = np.random.shuffle if rng is None else rng.shuffle

    # `sample` returns a new frame, so the caller's frame is left untouched
    sampled_df = raw_genome_df.sample(sample_size, random_state=rng).assign(
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    # each deserialization parameter must be uniform; `.item()` raises if not
    surf_kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # SECURITY: `eval` runs arbitrary Python (builtins remain reachable even
    # with the explicit globals dict); only pass trusted, hard-coded strings.
    surf_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _sorted_distance_matrix(naive_df)
    shortcut_dist = _sorted_distance_matrix(shortcut_df)

    # the comparison below is positional, so the labels must line up exactly
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance matrices have mismatched labels",
        )

    naive = naive_dist.to_numpy()
    shortcut = shortcut_dist.to_numpy()
    # 0/0 (e.g., on the diagonal) is expected and deliberately left as NaN;
    # silence the RuntimeWarning it would otherwise emit
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Load the "sample" genome dataset.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Evaluate and summarize the normalized error for each algorithm in turn.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # The `=` specifier echoes each expression's text, and the literal's
    # continuation-line indentation is part of the printed output, so the
    # f-string is kept exactly as-is.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 9452.38it/s]
100%|██████████| 100/100 [00:00<00:00, 313.49it/s]
6137it [00:00, 542120.93it/s]
100%|██████████| 100/100 [00:00<00:00, 227456.83it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000,0.000000,0.077377,0.086437,0.080479,0.000000,0.000000,0.0,0.000000,...,0.000000,0.058202,0.000000,0.000000,0.117367,0.000000,0.000000,0.000000,0.195129,0.000000
1,0.000000,,0.000000,-0.036292,-0.038169,-0.036960,-1.028749,-0.667936,0.0,-0.067562,...,-0.184786,0.000000,0.122671,-0.642337,0.000000,0.062666,-0.062782,-0.179797,0.000000,0.401532
2,0.000000,0.000000,,-0.038808,-0.040961,-0.039573,-1.162170,-0.741645,0.0,-0.076831,...,-0.207624,0.000000,0.138901,-0.717106,0.000000,0.070561,-0.070713,-0.201346,0.000000,0.456021
3,0.077377,-0.036292,-0.038808,,0.000000,0.000000,-0.048581,-0.047626,0.0,-0.044776,...,-0.067745,0.105923,0.000000,-0.050876,0.152562,-0.064572,-0.154499,-0.065738,0.184391,-0.058919
4,0.086437,-0.038169,-0.040961,0.000000,,0.000000,-0.052003,-0.050910,0.0,-0.047667,...,-0.074590,0.111242,0.000000,-0.054642,0.163846,-0.070761,-0.172551,-0.072164,0.201132,-0.064029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.062666,0.070561,-0.064572,-0.070761,-0.066718,-0.190596,-0.183382,0.0,0.093114,...,-0.428094,0.000000,0.582294,-0.209095,0.000000,,-0.259030,-0.390440,0.000000,0.000000
96,0.000000,-0.062782,-0.070713,-0.154499,-0.172551,-0.160680,0.000000,0.000000,0.0,-0.093392,...,0.000000,0.000000,0.000000,0.000000,0.000000,-0.259030,,0.000000,0.000000,-0.187038
97,0.000000,-0.179797,-0.201346,-0.065738,-0.072164,-0.067963,0.129588,0.123010,0.0,-0.261629,...,0.000000,0.000000,0.000000,0.147307,0.000000,-0.390440,0.000000,,0.000000,-0.302663
98,0.195129,0.000000,0.000000,0.184391,0.201132,0.190213,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,-0.087712,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000


np.nanmean(norm_err)=np.float64(-0.0258122391412347)
    np.nanmean(np.abs(norm_err))=np.float64(0.13048855349148328)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0463759670284022)
    


100%|██████████| 100/100 [00:00<00:00, 35696.20it/s]
100%|██████████| 100/100 [00:00<00:00, 353.18it/s]
5953it [00:00, 563017.31it/s]
100%|██████████| 100/100 [00:00<00:00, 231857.60it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-4.165352e-07,-5.115402e-07,-0.020477,-5.522889e-07,-4.183042e-07,-1.309572e-06,-6.002803e-07,-0.082237,-0.001770,...,-2.758926e-02,-2.663906e-07,-0.003030,-7.700215e-06,-0.001225,-2.456089e-07,-7.903698e-07,-5.664338e-07,-1.274537e-06,-1.260874e-06
1,-4.165352e-07,,-2.417624e-07,-0.001006,-1.617152e-07,-2.184772e-07,-1.950674e-07,-1.655916e-07,-0.000706,-0.125423,...,-1.079662e-03,-1.232026e-07,-0.015554,-4.326610e-07,-0.073165,-1.185629e-07,-1.195536e-02,-2.533670e-07,-1.942720e-07,-3.361895e-07
2,-5.115402e-07,-2.417624e-07,,-0.001182,-1.889660e-07,-2.420569e-07,-2.389186e-07,-1.942804e-07,-0.000789,-0.002365,...,-1.285493e-03,-1.393572e-07,-0.002976,-5.432345e-07,-0.001939,-1.334502e-07,8.223251e-03,-2.876547e-07,-2.377264e-07,-3.993515e-07
3,-2.047701e-02,-1.005642e-03,-1.181879e-03,,5.456541e-03,2.731633e-03,-3.095727e-03,5.735305e-03,-0.005406,0.027717,...,-5.218121e-07,-1.219025e-03,0.008468,-4.180805e-03,0.020761,-1.145277e-03,-1.621371e-03,-1.276964e-03,-3.062631e-03,-2.175087e-03
4,-5.522889e-07,-1.617152e-07,-1.889660e-07,0.005457,,-1.659351e-07,0.000000e+00,0.000000e+00,0.000718,0.003408,...,6.330297e-03,0.000000e+00,0.004843,-6.090399e-07,0.002589,0.000000e+00,-2.555808e-07,-2.035392e-07,0.000000e+00,-3.368822e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.456089e-07,-1.185629e-07,-1.334502e-07,-0.001145,0.000000e+00,-1.187046e-07,0.000000e+00,0.000000e+00,-0.000772,-0.002310,...,-1.242319e-03,0.000000e+00,-0.002890,-2.612999e-07,-0.001902,,2.421441e-02,9.384313e-03,0.000000e+00,-4.033038e-02
96,-7.903698e-07,-1.195536e-02,8.223251e-03,-0.001621,-2.555808e-07,-2.905617e-07,8.993930e-02,-2.653999e-07,-0.000963,-0.018004,...,-1.823070e-03,2.554692e-02,-0.066188,2.826282e-02,-0.014099,2.421441e-02,,2.657353e-02,5.261288e-02,4.091123e-02
97,-5.664338e-07,-2.533670e-07,-2.876547e-07,-0.001277,-2.035392e-07,-2.536905e-07,-8.875820e-03,-2.097183e-07,-0.000830,-0.002503,...,-1.398800e-03,1.478363e-02,-0.003198,-6.085744e-07,-0.002031,9.384313e-03,2.657353e-02,,2.248074e-01,1.456910e-02
98,-1.274537e-06,-1.942720e-07,-2.377264e-07,-0.003063,0.000000e+00,-1.946526e-07,0.000000e+00,0.000000e+00,-0.001336,-0.004337,...,-3.872663e-03,0.000000e+00,-0.006958,-1.851494e-06,-0.003091,0.000000e+00,5.261288e-02,2.248074e-01,,-5.816941e-07


np.nanmean(norm_err)=np.float64(-0.001375124166989129)
    np.nanmean(np.abs(norm_err))=np.float64(0.012741963367999338)
    np.nanmedian(norm_err)=np.float64(-1.4950652325719702e-06)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0016526019315418641)
    


In [7]:
# Load the "tail" genome dataset (rebinds the name used for the prior dataset).
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Evaluate and summarize the normalized error for each algorithm in turn.
for dstream_algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_name)
    display(pd.DataFrame(norm_err))
    # The `=` specifier echoes each expression's text, and the literal's
    # continuation-line indentation is part of the printed output, so the
    # f-string is kept exactly as-is.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35499.82it/s]
100%|██████████| 100/100 [00:00<00:00, 407.30it/s]
5940it [00:00, 580641.51it/s]
100%|██████████| 100/100 [00:00<00:00, 303056.65it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 26662.67it/s]
100%|██████████| 100/100 [00:00<00:00, 1002.03it/s]
5946it [00:00, 594274.69it/s]
100%|██████████| 100/100 [00:00<00:00, 234449.64it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.072094e-07,-2.072385e-07,-2.186240e-07,-1.035934e-07,-1.036659e-07,-2.073816e-07,-1.062377e-07,-1.036658e-07,-2.129634e-07,...,-1.034325e-07,-1.035029e-07,-2.257169e-07,-1.064524e-07,-1.063247e-07,-1.127199e-07,-1.062712e-07,-1.093651e-07,-2.475860e-07,-1.036921e-07
1,-2.072094e-07,,-2.134365e-07,-2.071670e-07,-1.164626e-07,-1.098421e-07,-2.405224e-07,-1.035558e-07,-1.165542e-07,-2.075753e-07,...,-1.065202e-07,-1.065949e-07,-2.077119e-07,-1.037598e-07,-1.036384e-07,-1.037386e-07,-1.035876e-07,-1.036312e-07,-2.076305e-07,-1.098715e-07
2,-2.072385e-07,-2.134365e-07,,-2.071961e-07,-1.067062e-07,-1.067832e-07,-2.136192e-07,-1.035703e-07,-1.067831e-07,-2.076044e-07,...,-1.128382e-07,-1.096754e-07,-2.077411e-07,-1.037744e-07,-1.036530e-07,-1.037532e-07,-1.036021e-07,-1.036457e-07,-2.076597e-07,-1.068110e-07
3,-2.186240e-07,-2.071670e-07,-2.071961e-07,,-1.035722e-07,-1.036447e-07,-2.073392e-07,-1.062154e-07,-1.036446e-07,-2.129187e-07,...,-1.034114e-07,-1.034818e-07,-2.191835e-07,-1.064301e-07,-1.063024e-07,-1.094611e-07,-1.062489e-07,-1.196283e-07,-2.190928e-07,-1.036709e-07
4,-1.035934e-07,-1.164626e-07,-1.067062e-07,-1.035722e-07,,0.000000e+00,-1.165714e-07,0.000000e+00,0.000000e+00,-1.037763e-07,...,0.000000e+00,0.000000e+00,-1.038446e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.038039e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.127199e-07,-1.037386e-07,-1.037532e-07,-1.094611e-07,0.000000e+00,0.000000e+00,-1.038249e-07,0.000000e+00,0.000000e+00,-1.066232e-07,...,0.000000e+00,0.000000e+00,-1.164680e-07,0.000000e+00,0.000000e+00,,0.000000e+00,0.000000e+00,-1.129692e-07,0.000000e+00
96,-1.062712e-07,-1.035876e-07,-1.036021e-07,-1.062489e-07,0.000000e+00,0.000000e+00,-1.036736e-07,0.000000e+00,0.000000e+00,-1.095202e-07,...,0.000000e+00,0.000000e+00,-1.065355e-07,0.000000e+00,0.000000e+00,0.000000e+00,,0.000000e+00,-1.064927e-07,0.000000e+00
97,-1.093651e-07,-1.036312e-07,-1.036457e-07,-1.196283e-07,0.000000e+00,0.000000e+00,-1.037173e-07,0.000000e+00,0.000000e+00,-1.065097e-07,...,0.000000e+00,0.000000e+00,-1.096451e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,,-1.095997e-07,0.000000e+00
98,-2.475860e-07,-2.076305e-07,-2.076597e-07,-2.190928e-07,-1.038039e-07,-1.038767e-07,-2.078034e-07,-1.064591e-07,-1.038766e-07,-2.134082e-07,...,-1.036424e-07,-1.037131e-07,-2.262166e-07,-1.066747e-07,-1.065464e-07,-1.129692e-07,-1.064927e-07,-1.095997e-07,,-1.039030e-07


np.nanmean(norm_err)=np.float64(-1.1961816994305648e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1961816994305648e-07)
    np.nanmedian(norm_err)=np.float64(-1.0656417750333044e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0656417750333044e-07)
    
