In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-13T00:07:26.319487+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
downstream                        : 1.14.3
alifedata_phyloinformatics_convert: 0.19.3
numpy                             : 2.1.2
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Subdirectory name associated with this notebook's outputs. No visible cell
# reads it -- presumably meant for teeplot output paths (TODO confirm).
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression echoes the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance table of a tree.

    The tree is converted to dendropy via `apc.RosettaTree`, and the resulting
    distance table is returned with rows and columns sorted by index so that
    tables built from different trees line up positionally.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the data table object;
    # presumably a nested mapping keyed by taxon label -- confirm on upgrade.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    *,
    sample_size: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare pairwise distances from two tree reconstructions of one sample.

    A random sample of genomes is deserialized into surfaces, each surface's
    storage is shuffled, and a tree is built from the same population with
    both `hstrat.build_tree_trie` ("naive") and
    `hstrat.build_tree_searchtable` ("shortcut"). The pairwise phylogenetic
    distances of the two trees are then compared elementwise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column and the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`, each holding a single
        unique value across the sampled rows. The frame is not modified.
    dstream_algo : str
        Python expression naming the dstream algorithm, evaluated with only
        `dstream` in scope (e.g. `"dstream.tilted_algo"`).
    sample_size : int, default 100
        Number of rows to sample; the default is the size the original
        comment says dendropy/naive can handle.
    seed : int, optional
        If given, seeds both the row sampling and the storage shuffling so
        the result is reproducible. If None, NumPy's global random state is
        used, as before.

    Returns
    -------
    np.ndarray
        Square matrix of `(naive - shortcut) / ((naive + shortcut) / 2)`.
        Entries where both distances are zero evaluate to NaN.

    Raises
    ------
    ValueError
        If `sample_size` exceeds the number of rows, or if any of the dstream
        configuration columns holds more than one unique value in the sample.
    """
    if seed is None:
        random_state = None  # pandas falls back to NumPy's global state
        shuffle = np.random.shuffle
    else:
        random_state = np.random.RandomState(seed)
        shuffle = random_state.shuffle

    # `sample` returns a new frame, so the caller's data is left untouched.
    sampled_df = raw_genome_df.sample(
        sample_size, random_state=random_state
    ).assign(
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    # `.unique().item()` doubles as a check that each setting is homogeneous.
    surface_kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # SECURITY: `eval` executes arbitrary expressions; only pass trusted,
    # hard-coded algorithm names here.
    surface_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive = _pairwise_distance_frame(naive_df).values
    shortcut = _pairwise_distance_frame(shortcut_df).values

    # Zero-distance pairs give 0/0; keep the NaN result but silence the
    # RuntimeWarning it would otherwise raise.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the same consistency report for each dstream algorithm in turn:
# show the full normalized-error matrix, then its NaN-ignoring summary stats.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # The literal keeps its original inner indentation so that the printed
    # text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 11676.14it/s]
100%|██████████| 100/100 [00:00<00:00, 359.53it/s]
6102it [00:00, 656365.07it/s]
100%|██████████| 100/100 [00:00<00:00, 219942.53it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.385030,0.000000,-0.420094,0.476559,0.233526,-0.238975,-0.634015,-0.061251,0.000000,...,-0.665867,0.402591,0.250424,0.308677,0.000000,-0.143669,0.245783,-0.059311,-0.398566,-0.618972
1,0.385030,,0.296247,0.000000,0.298447,0.000000,0.000000,0.000000,0.000000,0.370604,...,0.000000,0.000000,0.000000,0.000000,0.289740,0.000000,0.697946,0.000000,0.000000,0.000000
2,0.000000,0.296247,,-0.334919,0.332613,0.163980,-0.166646,-0.458164,-0.050104,0.000000,...,-0.474568,0.306535,0.172136,0.197792,0.000000,-0.094404,0.197657,-0.048799,-0.292649,-0.396252
3,-0.420094,0.000000,-0.334919,,-0.632964,0.000000,0.000000,0.000000,0.085916,-0.406661,...,0.000000,0.000000,0.000000,0.000000,-0.328433,0.000000,-0.046175,0.083615,0.000000,0.000000
4,0.476559,0.298447,0.332613,-0.632964,,0.165556,-0.168274,-0.910742,-0.050397,0.279779,...,-0.949907,0.308891,0.173873,0.200089,0.603561,-0.095451,0.198901,-0.049077,-0.295157,-0.400863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.143669,0.000000,-0.094404,0.000000,-0.095451,0.000000,0.000000,0.000000,-0.100907,-0.134549,...,0.000000,0.000000,0.000000,0.000000,-0.091353,,-0.176447,-0.095750,0.000000,0.000000
96,0.245783,0.697946,0.197657,-0.046175,0.198901,0.148823,-0.072763,-0.059390,-0.598774,0.238248,...,-0.061031,1.016130,0.155511,0.176152,0.193958,-0.176447,,-0.796585,-0.060937,-0.077902
97,-0.059311,0.000000,-0.048799,0.083615,-0.049077,0.000000,0.080952,0.110276,0.000000,-0.057697,...,0.113681,0.000000,0.000000,0.000000,-0.047970,-0.095750,-0.796585,,0.113486,0.149982
98,-0.398566,0.000000,-0.292649,0.000000,-0.295157,0.000000,0.000000,0.000000,0.117766,-0.380671,...,0.000000,0.000000,0.000000,0.000000,-0.285266,0.000000,-0.060937,0.113486,,0.000000


np.nanmean(norm_err)=np.float64(-0.0032925851671622808)
    np.nanmean(np.abs(norm_err))=np.float64(0.14997309790465793)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.06086237427248858)
    


100%|██████████| 100/100 [00:00<00:00, 33885.15it/s]
100%|██████████| 100/100 [00:00<00:00, 662.33it/s]
5989it [00:00, 624355.30it/s]
100%|██████████| 100/100 [00:00<00:00, 274316.81it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.002476,-1.758618e-07,-7.923902e-07,-7.877123e-03,2.549614e-03,-5.638888e-02,-1.237414e-02,0.003589,-3.972235e-07,...,-3.442215e-07,-2.239004e-03,-2.737153e-03,-4.728453e-03,-1.852204e-07,-8.094561e-07,-0.001373,-1.076880e-02,-8.560240e-07,0.002232
1,2.476089e-03,,1.585944e-03,2.605044e-03,1.143902e-02,9.681016e-03,-1.264845e-07,1.578536e-02,0.012195,2.683555e-03,...,2.474095e-03,-7.182944e-02,-3.075944e-07,-4.337302e-07,1.577723e-03,2.740700e-03,-0.054974,1.436045e-02,2.600092e-03,0.008791
2,-1.758618e-07,0.001586,,-1.838012e-07,-2.011426e-02,1.615786e-03,-7.098700e-03,-3.690332e-02,0.001979,0.000000e+00,...,0.000000e+00,-8.699744e-04,-1.689060e-03,-2.282191e-03,0.000000e+00,-1.902452e-07,-0.000699,-3.388985e-02,-1.834983e-07,0.001482
3,-7.923902e-07,0.002605,-1.838012e-07,,-8.199893e-03,2.686553e-03,-3.463480e-02,-1.318971e-02,0.003867,-4.401694e-07,...,-3.760127e-07,-2.599566e-03,-2.895549e-03,-5.221928e-03,-1.869173e-07,-8.988065e-07,-0.001500,-1.138124e-02,-8.743670e-07,0.002337
4,-7.877123e-03,0.011439,-2.011426e-02,-8.199893e-03,,1.572229e-02,-6.689738e-03,-9.147943e-03,0.019007,-4.267825e-02,...,-1.548745e-01,-7.794609e-04,-1.574553e-03,-2.078004e-03,-5.421624e-03,-1.728464e-07,-0.000639,-8.449527e-03,-8.187625e-03,0.014493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-8.094561e-07,0.002741,-1.902452e-07,-8.988065e-07,-1.728464e-07,2.831065e-03,-2.269419e-03,-5.685321e-07,0.004173,-4.790267e-07,...,-4.040080e-07,-3.069321e-03,-8.393148e-02,-1.357730e-02,-1.887952e-07,,-0.001645,-4.866181e-07,-8.951930e-07,0.002445
96,-1.372658e-03,-0.054974,-6.986951e-04,-1.500191e-03,-6.391064e-04,-6.888969e-03,-6.804077e-04,-1.007540e-03,-0.009765,-1.582167e-03,...,-1.370224e-03,-6.963757e-07,-8.966049e-04,-1.566075e-03,-6.937598e-04,-1.645490e-03,,-8.757888e-04,-1.495111e-03,-0.006018
97,-1.076880e-02,0.014360,-3.388985e-02,-1.138124e-02,-8.449527e-03,4.073279e-02,-8.182608e-03,-4.110112e-07,0.052241,-4.378944e-02,...,-4.002020e-02,-1.162729e-03,-2.005132e-03,-2.899704e-03,-6.650896e-03,-4.866181e-07,-0.000876,,-1.135762e-02,0.036757
98,-8.560240e-07,0.002600,-1.834983e-07,-8.743670e-07,-8.187625e-03,2.681286e-03,-5.881423e-02,-1.315800e-02,0.003856,-4.384360e-07,...,-3.747471e-07,-2.584347e-03,-2.889435e-03,-5.202075e-03,-2.510869e-07,-8.951930e-07,-0.001495,-1.135762e-02,,0.002333


np.nanmean(norm_err)=np.float64(-0.0002539148445734348)
    np.nanmean(np.abs(norm_err))=np.float64(0.010657042652222138)
    np.nanmedian(norm_err)=np.float64(-0.00048173610276263463)
    np.nanmedian(np.abs(norm_err))=np.float64(0.002173096922053662)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the same consistency report for each dstream algorithm in turn:
# show the full normalized-error matrix, then its NaN-ignoring summary stats.
for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # The literal keeps its original inner indentation so that the printed
    # text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 36364.70it/s]
100%|██████████| 100/100 [00:00<00:00, 436.48it/s]
5945it [00:00, 637759.92it/s]
100%|██████████| 100/100 [00:00<00:00, 387643.62it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 30888.17it/s]
100%|██████████| 100/100 [00:00<00:00, 999.34it/s]
5955it [00:00, 610164.41it/s]
100%|██████████| 100/100 [00:00<00:00, 389443.27it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.039008e-07,-1.164668e-07,0.000000e+00,-1.036742e-07,-1.036497e-07,-1.037362e-07,-1.131691e-07,-1.098732e-07,-1.039554e-07,...,0.000000e+00,0.000000e+00,-1.097347e-07,0.000000e+00,-1.067511e-07,0.000000e+00,0.000000e+00,-1.037219e-07,-1.038485e-07,-1.036450e-07
1,-1.039008e-07,,-2.075463e-07,-1.037025e-07,-2.192197e-07,-2.130449e-07,-2.258519e-07,-2.078041e-07,-2.077410e-07,-2.263717e-07,...,-1.035975e-07,-1.067826e-07,-2.074934e-07,-1.035961e-07,-2.076538e-07,-1.067344e-07,-1.129986e-07,-2.193265e-07,-2.134651e-07,-2.130351e-07
2,-1.164668e-07,-2.075463e-07,,-1.064852e-07,-2.070941e-07,-2.070452e-07,-2.072177e-07,-2.260353e-07,-2.194609e-07,-2.076552e-07,...,-1.063745e-07,-1.037683e-07,-2.191846e-07,-1.160840e-07,-2.132326e-07,-1.037227e-07,-1.036700e-07,-2.071894e-07,-2.074420e-07,-2.070359e-07
3,0.000000e+00,-1.037025e-07,-1.064852e-07,,-1.034767e-07,-1.034523e-07,-1.035385e-07,-1.066209e-07,-1.065877e-07,-1.037569e-07,...,0.000000e+00,0.000000e+00,-1.064573e-07,0.000000e+00,-1.096029e-07,0.000000e+00,0.000000e+00,-1.035243e-07,-1.036504e-07,-1.034477e-07
4,-1.036742e-07,-2.192197e-07,-2.070941e-07,-1.034767e-07,,-2.125684e-07,-2.188532e-07,-2.073507e-07,-2.072879e-07,-2.193412e-07,...,-1.033722e-07,-1.065432e-07,-2.070414e-07,-1.033707e-07,-2.072011e-07,-1.064952e-07,-1.094948e-07,-2.252829e-07,-2.129867e-07,-2.125586e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000e+00,-1.067344e-07,-1.037227e-07,0.000000e+00,-1.064952e-07,-1.095262e-07,-1.065606e-07,-1.038515e-07,-1.038199e-07,-1.067920e-07,...,0.000000e+00,0.000000e+00,-1.036963e-07,0.000000e+00,-1.037764e-07,,0.000000e+00,-1.065456e-07,-1.097483e-07,-1.127583e-07
96,0.000000e+00,-1.129986e-07,-1.036700e-07,0.000000e+00,-1.094948e-07,-1.064138e-07,-1.162412e-07,-1.037987e-07,-1.037672e-07,-1.165166e-07,...,0.000000e+00,0.000000e+00,-1.036436e-07,0.000000e+00,-1.037237e-07,0.000000e+00,,-1.095481e-07,-1.066234e-07,-1.064089e-07
97,-1.037219e-07,-2.193265e-07,-2.071894e-07,-1.035243e-07,-2.252829e-07,-2.126688e-07,-2.189596e-07,-2.074462e-07,-2.073834e-07,-2.194481e-07,...,-1.034197e-07,-1.065936e-07,-2.071367e-07,-1.034182e-07,-2.072965e-07,-1.065456e-07,-1.095481e-07,,-2.130875e-07,-2.126590e-07
98,-1.038485e-07,-2.134651e-07,-2.074420e-07,-1.036504e-07,-2.129867e-07,-2.323931e-07,-2.131175e-07,-2.076994e-07,-2.076364e-07,-2.135803e-07,...,-1.035455e-07,-1.240857e-07,-2.073891e-07,-1.035441e-07,-2.075493e-07,-1.097483e-07,-1.066234e-07,-2.130875e-07,,-2.190382e-07


np.nanmean(norm_err)=np.float64(-1.1511489451531262e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1511489451531262e-07)
    np.nanmedian(norm_err)=np.float64(-1.0646469038079706e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0646469038079706e-07)
    
