In [1]:
# Load the `watermark` IPython extension so the `%watermark` magic is available below.
%load_ext watermark


In [2]:
import typing

import alifedata_phyloinformatics_convert as apc
from downstream import dstream
from hstrat import hstrat
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
# Record provenance with the results: timestamp, Python/IPython versions,
# machine details, and the versions of the imported packages.
%watermark -diwmuv -iv


Last updated: 2025-06-22T00:24:24.740586+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1029-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

downstream                        : 1.14.3
hstrat                            : 1.20.10
pandas                            : 2.2.3
alifedata_phyloinformatics_convert: 0.19.3
numpy                             : 2.1.2

Watermark: 2.4.3



In [4]:
# Name of the output subdirectory for this analysis.
# NOTE(review): presumably consumed by teeplot when saving figures; no use of
# this variable is visible in this part of the notebook -- confirm or remove.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _phylo_distance_frame(phylogeny_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance table of a reconstructed tree.

    Rows and columns are sorted by label so that tables built from different
    reconstructions can be compared element-by-element.
    """
    distance_table = (
        apc.RosettaTree(phylogeny_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the data table object;
    # presumably a nested label -> label -> distance mapping -- confirm
    # against the installed dendropy version.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_sample: int = 100,
    seed: typing.Optional[int] = None,
) -> np.ndarray:
    """Compare trie-based and searchtable-based tree reconstructions.

    A random subset of genomes is deserialized into surfaces, their storage
    is shuffled, and two trees are built from the same population: one with
    `hstrat.build_tree_trie` ("naive") and one with
    `hstrat.build_tree_searchtable` ("shortcut"). The pairwise tip-to-tip
    distances of the two trees are then compared.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column and the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`, each holding a single
        distinct value across the sampled rows (`.item()` raises otherwise).
        The caller's frame is not modified.
    dstream_algo : str
        Dotted name of the algorithm inside the `dstream` package, e.g.
        `"dstream.tilted_algo"`.
    n_sample : int, default 100
        Maximum number of genomes to sample; capped at the number of rows
        available so that small inputs do not raise.
    seed : int, optional
        If given, seeds a dedicated random generator used for both the row
        sampling and the storage shuffle, making the result reproducible.
        If omitted, numpy's global random state is used, as before.

    Returns
    -------
    np.ndarray
        Square matrix of `(naive - shortcut) / mean(naive, shortcut)` for
        every pair of taxa. Entries where both distances are zero (such as
        the diagonal) are NaN.
    """
    if seed is None:
        # Backward-compatible default: draw from numpy's global random state.
        sample_state, shuffle = None, np.random.shuffle
    else:
        rng = np.random.default_rng(seed)
        sample_state, shuffle = rng, rng.shuffle

    # sample to size dendropy/naive can handle
    sampled_df = raw_genome_df.sample(
        min(n_sample, len(raw_genome_df)),
        random_state=sample_state,
    )
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    surface_kwargs = {
        # NOTE(security): eval() resolves the dotted name against the
        # `dstream` package only, but still executes arbitrary expressions;
        # pass trusted, hard-coded algorithm names only.
        "dstream_algo": eval(dstream_algo, {"dstream": dstream}),
        **{
            column: sampled_df[column].unique().item()
            for column in (
                "dstream_S",
                "dstream_storage_bitoffset",
                "dstream_storage_bitwidth",
                "dstream_T_bitoffset",
                "dstream_T_bitwidth",
            )
        },
    }

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        # ensure synthetic data: scramble each surface's stored contents
        # in place (reaches into private attributes of the surface object)
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _phylo_distance_frame(naive_df).to_numpy()
    shortcut_dist = _phylo_distance_frame(shortcut_df).to_numpy()

    # 0 / 0 (e.g. on the diagonal) is expected and yields NaN; silence the
    # RuntimeWarning instead of letting it leak into the notebook output.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive_dist - shortcut_dist) / (
            naive_dist / 2 + shortcut_dist / 2
        )


In [6]:
# Fetch the genome records once, then report reconstruction error statistics
# for each dstream algorithm in turn.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # nan-aware summaries, since the error matrix contains NaN entries
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10702.21it/s]
100%|██████████| 100/100 [00:00<00:00, 388.82it/s]
6121it [00:00, 634473.48it/s]
100%|██████████| 100/100 [00:00<00:00, 246289.14it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.115115,0.000000,0.000000,0.000000,0.000000,0.120502,0.148349,0.106477,0.101090,...,0.000000,0.000000,0.000000,0.000000,0.136422,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.115115,,-0.008763,-0.019714,0.111483,0.060448,0.542876,0.063417,0.500889,0.291011,...,-0.022407,-0.019435,-0.015879,-0.775141,0.586428,-0.015697,-0.310077,-0.043975,-0.020566,0.402539
2,0.000000,-0.008763,,0.000000,0.000000,0.000000,-0.009170,-0.011271,-0.008109,-0.007701,...,0.000000,0.000000,0.000000,0.000000,-0.010372,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,-0.019714,0.000000,,0.000000,0.000000,-0.273236,-0.308830,-0.252792,-0.244419,...,0.000000,0.000000,-0.211687,0.000000,-0.294324,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.111483,0.000000,0.000000,,0.000000,0.116527,0.142372,0.103362,0.098278,...,0.000000,0.000000,0.000000,0.000000,0.131350,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,-0.015697,0.000000,0.000000,0.000000,0.000000,-0.210946,-0.231550,-0.198550,-0.193348,...,0.000000,0.000000,-0.169411,0.000000,-0.223298,,0.000000,0.000000,0.000000,0.000000
96,0.000000,-0.310077,0.000000,0.000000,0.000000,0.000000,-0.028153,-0.032836,-0.025591,-0.024568,...,0.000000,0.000000,0.230566,0.000000,-0.030892,0.000000,,0.000000,0.000000,0.000000
97,0.000000,-0.043975,0.000000,0.000000,0.000000,0.000000,-0.036639,-0.044990,-0.032417,-0.030792,...,0.000000,0.000000,0.007104,0.000000,-0.041419,0.000000,0.000000,,0.000000,0.000000
98,0.000000,-0.020566,0.000000,0.000000,0.000000,0.000000,-0.286965,-0.326484,-0.264500,-0.255347,...,0.000000,0.000000,-0.220607,0.000000,-0.310317,0.000000,0.000000,0.000000,,0.000000


np.nanmean(norm_err)=np.float64(-0.016002619248503158)
    np.nanmean(np.abs(norm_err))=np.float64(0.11192340821963496)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 29901.65it/s]
100%|██████████| 100/100 [00:00<00:00, 423.10it/s]
5991it [00:00, 625154.25it/s]
100%|██████████| 100/100 [00:00<00:00, 231729.50it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.001287,-1.681903e-02,-1.749499e-03,-1.286808e-03,-2.663902e-03,-1.774396e-02,-2.182912e-03,-3.506593e-03,-0.001358,...,-0.001212,-0.035761,-2.921316e-03,-1.516921e-02,-0.040379,-1.668277e-02,-0.046873,-1.352784e-03,-0.079152,-2.861800e-03
1,-0.001287,,-5.110603e-04,-1.900492e-02,-7.339763e-03,1.907361e-02,-5.310965e-04,-2.202201e-02,-1.850379e-03,-0.001523,...,-0.001399,0.001665,-1.262838e-02,-4.735587e-04,-0.000577,-5.080389e-04,0.001644,-1.580893e-02,0.001916,-1.248116e-02
2,-0.016819,-0.000511,,-3.145873e-07,-2.495485e-07,-1.565011e-03,-9.396101e-03,-1.837515e-07,-1.011550e-03,0.106643,...,0.038775,-0.028341,-2.217919e-07,-8.281187e-03,-0.013522,-3.012600e-07,-0.020654,-1.297323e-07,-0.024454,-2.190045e-07
3,-0.001749,-0.019005,-3.145873e-07,,-2.992590e-07,-6.225011e-04,-1.652015e-07,-2.554784e-07,-4.432995e-03,-0.001279,...,-0.001151,-0.001254,-3.147197e-07,-2.860255e-07,0.047839,-3.122458e-07,-0.001230,-6.571510e-02,-0.001562,-3.091365e-07
4,-0.001287,-0.007340,-2.495485e-07,-2.992590e-07,,-4.973264e-04,-1.296991e-07,-1.733772e-07,-2.774600e-03,-0.001016,...,-0.000933,-0.000832,-2.082638e-07,-2.312320e-07,0.008118,-2.480728e-07,-0.000821,-1.244738e-07,-0.000957,-2.397587e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.016683,-0.000508,-3.012600e-07,-3.122458e-07,-2.480728e-07,-1.555567e-03,-9.332311e-03,-1.821557e-07,-9.997817e-04,0.105950,...,0.038552,-0.028043,-2.194712e-07,-8.231598e-03,-0.013426,,-0.020440,-1.289349e-07,-0.024155,-2.167414e-07
96,-0.046873,0.001644,-2.065402e-02,-1.229730e-03,-8.213447e-04,-4.149642e-07,-2.213028e-02,-1.695051e-03,-9.794065e-07,-0.000879,...,-0.000762,-0.000001,-2.761462e-03,-1.813060e-02,-0.035049,-2.044038e-02,,-8.748511e-04,-0.000001,-2.658628e-03
97,-0.001353,-0.015809,-1.297323e-07,-6.571510e-02,-1.244738e-07,-5.164005e-04,0.000000e+00,1.048005e-01,-2.980491e-03,-0.001056,...,-0.000967,-0.000887,0.000000e+00,-1.198606e-07,0.058394,-1.289349e-07,-0.000875,,-0.001031,0.000000e+00
98,-0.079152,0.001916,-2.445431e-02,-1.561624e-03,-9.571333e-04,-2.432614e-07,-2.655145e-02,-2.397919e-03,0.000000e+00,-0.001036,...,-0.000878,-0.000001,-5.286894e-03,-2.099459e-02,-0.055130,-2.415537e-02,-0.000001,-1.030649e-03,,-4.922296e-03


np.nanmean(norm_err)=np.float64(-0.0015745423593073393)
    np.nanmean(np.abs(norm_err))=np.float64(0.010948390819365831)
    np.nanmedian(norm_err)=np.float64(-1.89584265203464e-06)
    np.nanmedian(np.abs(norm_err))=np.float64(0.001018825177889976)
    


In [7]:
# Same comparison as for the previous dataset: load once, then summarize the
# normalized reconstruction error for each dstream algorithm.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for algo_name in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_name)
    display(pd.DataFrame(norm_err))
    # nan-aware summaries, since the error matrix contains NaN entries
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 35481.80it/s]
100%|██████████| 100/100 [00:00<00:00, 439.71it/s]
5947it [00:00, 650333.10it/s]
100%|██████████| 100/100 [00:00<00:00, 253892.49it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 29131.16it/s]
100%|██████████| 100/100 [00:00<00:00, 401.45it/s]
5954it [00:00, 677047.20it/s]
100%|██████████| 100/100 [00:00<00:00, 375161.36it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.426482e-07,-1.038682e-07,-1.038187e-07,-1.036632e-07,0.000000e+00,-1.130196e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.037313e-07,-1.095356e-07,-1.097294e-07,0.000000e+00,-1.036408e-07,-1.038360e-07,0.000000e+00,0.000000e+00,-1.067256e-07
1,-1.426482e-07,,-2.076512e-07,-2.075523e-07,-2.072416e-07,-1.037280e-07,-2.259383e-07,-1.037180e-07,-1.038012e-07,-1.064577e-07,...,-1.038313e-07,-2.073776e-07,-2.189764e-07,-2.193638e-07,-1.067624e-07,-2.071968e-07,-2.075868e-07,-1.067891e-07,-1.036240e-07,-2.133613e-07
2,-1.038682e-07,-2.076512e-07,,-2.134440e-07,-2.131154e-07,-1.066711e-07,-2.075917e-07,-1.129784e-07,-1.067485e-07,-1.036115e-07,...,-1.165693e-07,-2.193918e-07,-2.071770e-07,-2.075237e-07,-1.039001e-07,-2.191894e-07,-2.261357e-07,-1.039254e-07,-1.065611e-07,-2.076454e-07
3,-1.038187e-07,-2.075523e-07,-2.134440e-07,,-2.397909e-07,-1.096845e-07,-2.074928e-07,-1.066083e-07,-1.097664e-07,-1.035622e-07,...,-1.067280e-07,-2.131550e-07,-2.070785e-07,-2.074249e-07,-1.038506e-07,-2.129639e-07,-2.133760e-07,-1.038758e-07,-1.095682e-07,-2.075464e-07
4,-1.036632e-07,-2.072416e-07,-2.131154e-07,-2.397909e-07,,-1.095110e-07,-2.071823e-07,-1.064444e-07,-1.095926e-07,-1.034075e-07,...,-1.065637e-07,-2.128273e-07,-2.067692e-07,-2.071146e-07,-1.036950e-07,-2.126368e-07,-2.130476e-07,-1.037202e-07,-1.093950e-07,-2.072357e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.036408e-07,-2.071968e-07,-2.191894e-07,-2.129639e-07,-2.126368e-07,-1.064313e-07,-2.071375e-07,-1.094748e-07,-1.065083e-07,-1.033852e-07,...,-1.096010e-07,-2.322086e-07,-2.067246e-07,-2.070698e-07,-1.036726e-07,,-2.191176e-07,-1.036977e-07,-1.063217e-07,-2.071909e-07
96,-1.038360e-07,-2.075868e-07,-2.261357e-07,-2.133760e-07,-2.130476e-07,-1.066372e-07,-2.075273e-07,-1.163861e-07,-1.067145e-07,-1.035794e-07,...,-1.130746e-07,-2.193200e-07,-2.071129e-07,-2.074594e-07,-1.038679e-07,-2.191176e-07,,-1.038931e-07,-1.065271e-07,-2.075810e-07
97,0.000000e+00,-1.067891e-07,-1.039254e-07,-1.038758e-07,-1.037202e-07,0.000000e+00,-1.067576e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.037883e-07,-1.065382e-07,-1.067216e-07,0.000000e+00,-1.036977e-07,-1.038931e-07,,0.000000e+00,-1.098614e-07
98,0.000000e+00,-1.036240e-07,-1.065611e-07,-1.095682e-07,-1.093950e-07,0.000000e+00,-1.035943e-07,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,-1.064170e-07,-1.033877e-07,-1.035604e-07,0.000000e+00,-1.063217e-07,-1.065271e-07,0.000000e+00,,-1.036210e-07


np.nanmean(norm_err)=np.float64(-1.108539695932004e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.108539695932004e-07)
    np.nanmedian(norm_err)=np.float64(-1.0642430871705767e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0642430871705767e-07)
    
