In [1]:
# Load the watermark IPython extension so the %watermark magic is available.
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance for this run: timestamp, interpreter, machine details,
# and the versions of the imported packages.
%watermark -diwmuv -iv


Last updated: 2025-07-27T00:24:52.190016+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
hstrat                            : 1.20.10
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Subdirectory name associated with this notebook's outputs.
# NOTE(review): presumably consumed by teeplot when saving figures; it is not
# referenced by any other cell visible here -- confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build a label-sorted pairwise phylogenetic distance table for a tree.

    Parameters
    ----------
    tree_df : pd.DataFrame
        Reconstructed tree, as returned by the `hstrat.build_tree_*` calls in
        `make_norm_err_matrix`.

    Returns
    -------
    pd.DataFrame
        Square table of pairwise distances with rows and columns sorted by
        label, so that two tables built over the same labels line up
        position by position.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # this may break across dendropy versions.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random sample of genomes is deserialized into surfaces, their storage
    is shuffled, and a tree is reconstructed with both
    `hstrat.build_tree_trie` and `hstrat.build_tree_searchtable`. The pairwise
    phylogenetic distances from the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a "data_hex" column and the columns
        "dstream_S", "dstream_storage_bitoffset", "dstream_storage_bitwidth",
        "dstream_T_bitoffset", and "dstream_T_bitwidth", each holding exactly
        one unique value within the sampled rows.
    dstream_algo : str
        Expression naming the dstream algorithm to deserialize with, e.g.
        "dstream.tilted_algo". It is evaluated with only `dstream` in scope
        and is used regardless of any "dstream_algo" column in
        `raw_genome_df`.
    sample_size : int, default 100
        Number of rows to sample; keep this small enough for the
        dendropy/naive reconstruction to handle.

    Returns
    -------
    np.ndarray
        Square array whose entry (i, j) is
        `(naive - shortcut) / ((naive + shortcut) / 2)` for the pairwise
        distance between taxa i and j. Entries are NaN where both distances
        are zero, which includes the diagonal.

    Raises
    ------
    ValueError
        If a `dstream_*` column does not hold exactly one unique value, if
        `sample_size` exceeds the number of available rows, or if the two
        reconstructed trees do not yield distance tables over identical
        labels.
    """
    # sample to size dendropy/naive can handle
    sampled_df = raw_genome_df.sample(sample_size)
    sampled_df["taxon_label"] = np.arange(len(sampled_df)).astype(str)

    # NOTE(review): `eval` executes arbitrary code. The namespace is limited
    # to `dstream`, but `dstream_algo` must still come from a trusted source.
    surf_kwargs = dict(
        dstream_algo=eval(str(dstream_algo), {"dstream": dstream}),
        **{
            # .item() raises unless the column holds exactly one unique value
            field: sampled_df[field].unique().item()
            for field in (
                "dstream_S",
                "dstream_storage_bitoffset",
                "dstream_storage_bitwidth",
                "dstream_T_bitoffset",
                "dstream_T_bitwidth",
            )
        },
    )

    population = [
        hstrat.surf_from_hex(genome_hex, **surf_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        np.random.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The arithmetic below is positional, so refuse to proceed if the two
    # tables are not laid out over the same labels.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance matrices have mismatched labels",
        )

    # 0/0 is expected wherever both distances are zero (e.g. the diagonal);
    # silence only that "invalid value" warning and let NaN mark the entries.
    with np.errstate(invalid="ignore"):
        return (naive_dist.values - shortcut_dist.values) / (
            naive_dist.values / 2 + shortcut_dist.values / 2
        )


In [6]:
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

# Run the identical comparison once per dstream algorithm.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        algo_name,
    )
    display(pd.DataFrame(norm_err))
    # Signed and absolute summary statistics, skipping NaN entries.
    # The continuation lines of the literal keep their original indentation
    # so the printed text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10819.54it/s]
100%|██████████| 100/100 [00:00<00:00, 404.53it/s]
6108it [00:00, 664750.22it/s]
100%|██████████| 100/100 [00:00<00:00, 206108.30it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.762859,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,-0.113163,0.000000,0.000000,-0.202214,0.000000,-0.091302,0.766534
1,0.762859,,0.227388,-0.609475,0.621006,0.554233,0.091248,-0.054831,-0.762260,0.229573,...,0.324982,-0.348706,0.055330,0.000000,0.686614,0.219911,0.504378,-1.114826,0.000000,0.000000
2,0.000000,0.227388,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,-0.087014,0.000000,-0.254157,0.000000,0.000000,-0.073485,0.228133
3,0.000000,-0.609475,0.000000,,0.000000,0.000000,0.000000,0.000000,-0.740404,0.000000,...,0.000000,0.000000,0.000000,0.298795,0.000000,0.000000,0.000000,-0.820080,0.227035,-0.612604
4,0.000000,0.621006,0.000000,0.000000,,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,-0.089022,0.000000,0.000000,-0.136222,0.000000,-0.074912,0.623440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.219911,-0.254157,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,-0.083224,0.000000,,0.000000,0.000000,-0.070763,0.220608
96,-0.202214,0.504378,0.000000,0.000000,-0.136222,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,-0.108154,-0.163674,0.000000,,0.000000,-0.088013,0.506519
97,0.000000,-1.114826,0.000000,-0.820080,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,-0.295873,0.000000,0.586612,0.000000,0.000000,0.000000,,0.317987,-1.121111
98,-0.091302,0.000000,-0.073485,0.227035,-0.074912,-0.125207,-0.622996,0.082221,0.301227,-0.074285,...,-0.111296,0.258020,-0.083349,0.137955,-0.082526,-0.070763,-0.088013,0.317987,,0.000000


np.nanmean(norm_err)=np.float64(0.005032476144390257)
    np.nanmean(np.abs(norm_err))=np.float64(0.09138319342232133)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33460.74it/s]
100%|██████████| 100/100 [00:00<00:00, 428.93it/s]
5987it [00:00, 601266.59it/s]
100%|██████████| 100/100 [00:00<00:00, 254354.40it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-3.664273e-03,-4.006705e-07,-8.318094e-07,1.037299e-02,-6.406914e-03,-5.587357e-02,-2.985707e-07,-2.101789e-07,-2.873790e-06,...,-1.173512e-01,-4.756269e-02,-4.037018e-02,-5.287138e-07,-5.079740e-07,-0.000001,-0.000001,4.714158e-03,-1.363004e-06,-9.281589e-02
1,-3.664273e-03,,-2.817318e-03,-3.638336e-03,-1.559794e-07,-4.461178e-07,-2.527855e-03,4.051557e-02,2.484431e-02,-5.086684e-02,...,-2.090935e-03,-2.332675e-03,-2.135081e-03,-3.050277e-03,-3.100682e-03,0.033576,-0.004110,-2.368079e-07,4.254761e-02,-2.190131e-03
2,-4.006705e-07,-2.817318e-03,,0.000000e+00,4.468016e-02,-4.199628e-03,-1.823071e-03,0.000000e+00,0.000000e+00,-4.928287e-07,...,-1.403416e-03,-1.628156e-03,-1.443151e-03,0.000000e+00,0.000000e+00,0.000000,0.000000,3.750097e-03,-3.570785e-07,-1.493493e-03
3,-8.318094e-07,-3.638336e-03,0.000000e+00,,7.682941e-03,-6.328324e-03,3.388213e-02,0.000000e+00,0.000000e+00,-1.374963e-06,...,3.862888e-02,1.038408e-01,2.462048e-02,0.000000e+00,0.000000e+00,0.000000,0.000000,1.055424e-01,-6.672474e-07,4.194474e-02
4,1.037299e-02,-1.559794e-07,4.468016e-02,7.682941e-03,,-2.216191e-07,4.751382e-03,-3.061599e-03,-2.419569e-03,6.083681e-03,...,3.767850e-03,4.301523e-03,3.863565e-03,8.080790e-03,2.504817e-02,-0.006074,0.009157,1.146580e-03,-4.743470e-03,3.984053e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.395345e-06,3.357553e-02,0.000000e+00,0.000000e+00,-6.073888e-03,6.543755e-02,-3.688920e-07,0.000000e+00,0.000000e+00,-4.281363e-06,...,0.000000e+00,-3.078033e-07,0.000000e+00,0.000000e+00,0.000000e+00,,0.000000,-2.814161e-02,-1.165799e-06,0.000000e+00
96,-1.449291e-06,-4.109537e-03,0.000000e+00,0.000000e+00,9.157155e-03,-7.904815e-03,6.122306e-03,0.000000e+00,0.000000e+00,-4.649292e-06,...,4.076561e-03,5.097449e-03,4.246370e-03,0.000000e+00,0.000000e+00,0.000000,,8.694446e-03,-1.013697e-06,4.467940e-03
97,4.714158e-03,-2.368079e-07,3.750097e-03,1.055424e-01,1.146580e-03,-3.054924e-07,1.837543e-02,-2.708986e-02,-1.972520e-01,3.453182e-03,...,2.328989e-02,5.218378e-02,1.576038e-02,4.021970e-03,4.080122e-03,-0.028142,0.008694,,-2.120275e-01,2.430417e-02
98,-1.363004e-06,4.254761e-02,-3.570785e-07,-6.672474e-07,-4.743470e-03,7.226266e-02,-5.777844e-07,-2.919822e-07,-2.511642e-07,-2.032006e-06,...,-2.081330e-07,-5.000616e-07,-2.153362e-07,-4.237449e-07,-4.398906e-07,-0.000001,-0.000001,-2.120275e-01,,-2.245899e-07


np.nanmean(norm_err)=np.float64(0.002696220758624051)
    np.nanmean(np.abs(norm_err))=np.float64(0.014074260758851818)
    np.nanmedian(norm_err)=np.float64(-1.68443571596291e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0025601035858680028)
    


In [7]:
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

# Run the identical comparison once per dstream algorithm.
for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(
        raw_genome_df_,
        algo_name,
    )
    display(pd.DataFrame(norm_err))
    # Signed and absolute summary statistics, skipping NaN entries.
    # The continuation lines of the literal keep their original indentation
    # so the printed text is unchanged.
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35897.84it/s]
100%|██████████| 100/100 [00:00<00:00, 437.43it/s]
5938it [00:00, 647037.75it/s]
100%|██████████| 100/100 [00:00<00:00, 350108.85it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 28336.06it/s]
100%|██████████| 100/100 [00:00<00:00, 988.97it/s]
5952it [00:00, 43627.29it/s]
100%|██████████| 100/100 [00:00<00:00, 363773.11it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.000000e+00,0.000000e+00,-1.036626e-07,0.000000e+00,-1.035521e-07,-1.037865e-07,0.000000e+00,-1.065826e-07,-1.035872e-07,...,0.000000e+00,-1.424330e-07,0.000000e+00,0.000000e+00,-1.064476e-07,-1.034487e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,0.000000e+00,,0.000000e+00,-1.036212e-07,0.000000e+00,-1.035108e-07,-1.037450e-07,0.000000e+00,-1.065389e-07,-1.035459e-07,...,0.000000e+00,-1.096270e-07,0.000000e+00,0.000000e+00,-1.064040e-07,-1.034075e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
2,0.000000e+00,0.000000e+00,,-1.036653e-07,0.000000e+00,-1.035548e-07,-1.037892e-07,0.000000e+00,-1.065855e-07,-1.035899e-07,...,0.000000e+00,-1.096763e-07,0.000000e+00,0.000000e+00,-1.064505e-07,-1.034514e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
3,-1.036626e-07,-1.036212e-07,-1.036653e-07,,-1.036990e-07,-2.130106e-07,-2.261649e-07,-1.065832e-07,-2.074981e-07,-2.325718e-07,...,-1.094894e-07,-2.075466e-07,-1.038204e-07,-1.064901e-07,-2.072422e-07,-2.127918e-07,-1.038135e-07,-1.065899e-07,-1.097433e-07,-1.066505e-07
4,0.000000e+00,0.000000e+00,0.000000e+00,-1.036990e-07,,-1.035884e-07,-1.038229e-07,0.000000e+00,-1.096869e-07,-1.036236e-07,...,0.000000e+00,-1.066467e-07,0.000000e+00,0.000000e+00,-1.095438e-07,-1.034849e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.034487e-07,-1.034075e-07,-1.034514e-07,-2.127918e-07,-1.034849e-07,-2.251020e-07,-2.130529e-07,-1.126380e-07,-2.070696e-07,-2.126331e-07,...,-1.062090e-07,-2.071179e-07,-1.036059e-07,-1.159546e-07,-2.068147e-07,,-1.035990e-07,-1.094145e-07,-1.064480e-07,-1.094783e-07
96,0.000000e+00,0.000000e+00,0.000000e+00,-1.038135e-07,0.000000e+00,-1.037027e-07,-1.039378e-07,0.000000e+00,-1.098150e-07,-1.037379e-07,...,0.000000e+00,-1.067679e-07,0.000000e+00,0.000000e+00,-1.096717e-07,-1.035990e-07,,0.000000e+00,0.000000e+00,0.000000e+00
97,0.000000e+00,0.000000e+00,0.000000e+00,-1.065899e-07,0.000000e+00,-1.095302e-07,-1.067209e-07,0.000000e+00,-1.037185e-07,-1.065103e-07,...,0.000000e+00,-1.037427e-07,0.000000e+00,0.000000e+00,-1.035906e-07,-1.094145e-07,0.000000e+00,,0.000000e+00,0.000000e+00
98,0.000000e+00,0.000000e+00,0.000000e+00,-1.097433e-07,0.000000e+00,-1.065575e-07,-1.098822e-07,0.000000e+00,-1.037985e-07,-1.096589e-07,...,0.000000e+00,-1.038228e-07,0.000000e+00,0.000000e+00,-1.036704e-07,-1.064480e-07,0.000000e+00,0.000000e+00,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.0876838276966081e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.0876838276966081e-07)
    np.nanmedian(norm_err)=np.float64(-1.0628656889046816e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0628656889046816e-07)
    
