In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-05-18T00:25:36.556134+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1027-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3
downstream                        : 1.14.3
hstrat                            : 1.20.10

Watermark: 2.4.3



In [4]:
# Notebook-level label; the bare expression on the last line echoes it as the
# cell's displayed result.
# NOTE(review): presumably the output subdirectory handed to teeplot when
# saving figures, but no cell visible here imports teeplot or reads this
# variable — confirm it is still needed.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir


'2025-05-09-reconstruction-consistency'

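## Prep Data

`make_norm_err_matrix` draws a random sample of genomes, reconstructs their phylogeny with both the naive trie builder and the searchtable ("shortcut") builder, and returns the difference between the two trees' pairwise distances, normalized by their mean.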


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Build a label-sorted pairwise phylogenetic distance table for a tree.

    `tree_df` is wrapped in `apc.RosettaTree`, viewed through its
    `as_dendropy` accessor, and reduced to a phylogenetic distance matrix.
    Rows and columns of the resulting frame are both sorted by label so that
    two frames built over the same labels line up position-by-position.
    """
    data_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the table object and
    # may change between library versions.
    return (
        pd.DataFrame(data_table._data).sort_index(axis=0).sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    sample_size: int = 100,
    seed: int | None = None,
) -> np.ndarray:
    """Compare naive and shortcut tree reconstructions on sampled genomes.

    A random sample of genomes is deserialized into surfaces, each surface's
    storage is shuffled in place ("ensure synthetic data"), and a phylogeny
    is reconstructed twice: with `hstrat.build_tree_trie` (naive) and with
    `hstrat.build_tree_searchtable` (shortcut). The pairwise distance
    matrices of the two trees are then compared entry by entry.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide a `data_hex` column plus the columns
        `dstream_S`, `dstream_storage_bitoffset`, `dstream_storage_bitwidth`,
        `dstream_T_bitoffset` and `dstream_T_bitwidth`, each of which must
        hold exactly one distinct value within the sample. The caller's frame
        is not modified.
    dstream_algo : str
        Python expression naming the dstream algorithm, for example
        `"dstream.tilted_algo"`. It is evaluated with `dstream` as the only
        explicitly supplied name.
    sample_size : int, default 100
        Number of genomes to sample; kept small so the dendropy/naive
        reconstruction stays tractable.
    seed : int or None, default None
        When given, sampling and shuffling draw from a dedicated
        `np.random.Generator` seeded with this value, making the result
        reproducible. When None, NumPy's global random state is used, as
        before.

    Returns
    -------
    np.ndarray
        Square array holding `(naive - shortcut) / ((naive + shortcut) / 2)`
        for each pair of taxa. Entries where both distances are zero (such as
        the diagonal) are NaN, so summarize with `np.nanmean` and friends.

    Raises
    ------
    ValueError
        If the two reconstructions do not yield distance matrices over the
        same labels, or if a configuration column is not single-valued.
    """
    rng = None if seed is None else np.random.default_rng(seed)

    # `sample` and `assign` both return new frames, so labelling the sample
    # never writes into the caller's data.
    sample_df = raw_genome_df.sample(sample_size, random_state=rng).assign(
        taxon_label=lambda df: np.arange(len(df)).astype(str),
    )

    # `.unique().item()` doubles as a check that each setting is homogeneous
    # across the sample: `.item()` fails unless exactly one value remains.
    config_columns = (
        "dstream_S",
        "dstream_storage_bitoffset",
        "dstream_storage_bitwidth",
        "dstream_T_bitoffset",
        "dstream_T_bitwidth",
    )
    surface_kwargs = {
        column: sample_df[column].unique().item() for column in config_columns
    }
    # NOTE(security): `eval` executes arbitrary expressions. That is
    # acceptable for the hard-coded algorithm names used in this notebook,
    # but never pass externally supplied text here.
    surface_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sample_df["data_hex"].astype(str))
    ]

    # Shuffle each surface's storage in place to ensure synthetic data. This
    # reaches into private attributes of the surface object.
    shuffle = np.random.shuffle if rng is None else rng.shuffle
    for surface in population:
        shuffle(surface._surface._storage)

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sample_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sample_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # The arithmetic below is positional, so refuse to continue if the two
    # matrices are not indexed by the same labels in the same order.
    if not (
        naive_dist.index.equals(shortcut_dist.index)
        and naive_dist.columns.equals(shortcut_dist.columns)
    ):
        raise ValueError(
            "naive and shortcut distance matrices cover different labels"
        )

    naive = naive_dist.values
    shortcut = shortcut_dist.values
    # 0/0 entries (e.g. the diagonal) are expected and intentionally left as
    # NaN; silence the RuntimeWarning they would otherwise raise.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (naive - shortcut) / (naive / 2 + shortcut / 2)


In [6]:
# Genome records for the "sample" dataset, fetched from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")

# Produce the same report for each dstream algorithm in turn: the full
# normalized-error matrix followed by its NaN-aware summary statistics.
for algo_expr in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_expr)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 10341.24it/s]
100%|██████████| 100/100 [00:00<00:00, 413.97it/s]
6108it [00:00, 616266.36it/s]
100%|██████████| 100/100 [00:00<00:00, 246578.72it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.000000,0.000000,0.000000,-0.314717,0.000000,0.0,0.000000,0.0,...,0.000000,0.160366,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.165001
1,0.000000,,0.000000,0.000000,0.000000,0.437391,0.000000,0.0,0.000000,0.0,...,0.000000,-0.386629,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.000000
2,0.000000,0.0,,0.000000,0.000000,-0.420759,0.000000,0.0,0.000000,0.0,...,0.000000,0.206854,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.037464
3,0.000000,0.0,0.000000,,0.000000,0.000000,-0.404202,0.0,0.506932,0.0,...,-0.133563,0.000000,0.000000,0.000000,0.549825,0.313957,0.000000,0.0000,0.099673,0.000000
4,0.000000,0.0,0.000000,0.000000,,-0.249948,0.000000,0.0,0.000000,0.0,...,0.000000,0.130265,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.405469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.0,0.000000,0.313957,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.227710,0.000000,,0.000000,0.0000,0.000000,0.000000
96,0.000000,0.0,0.000000,0.000000,0.000000,-0.248033,0.000000,0.0,0.000000,0.0,...,0.000000,0.129355,0.000000,0.000000,0.000000,0.000000,,0.0000,0.000000,-1.194503
97,0.000000,0.0,0.000000,0.000000,0.000000,-0.268708,0.000000,0.0,0.000000,0.0,...,0.000000,0.139124,0.000000,0.000000,0.000000,0.000000,0.000000,,0.000000,-0.146600
98,0.000000,0.0,0.000000,0.099673,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.073249,0.000000,0.000000,0.000000,0.0000,,0.000000


np.nanmean(norm_err)=np.float64(-0.0031877682956515316)
    np.nanmean(np.abs(norm_err))=np.float64(0.09746312461873559)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 32678.64it/s]
100%|██████████| 100/100 [00:00<00:00, 406.05it/s]
5961it [00:00, 617247.97it/s]
100%|██████████| 100/100 [00:00<00:00, 271827.87it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,3.120984e-03,-3.456283e-07,-5.523060e-07,1.809647e-02,-8.488327e-07,-4.451643e-07,7.500637e-03,3.149827e-02,-5.926303e-07,...,-5.207255e-07,-1.078611e-02,0.005125,-1.016859e-02,2.499331e-02,-2.556654e-07,4.664838e-03,-3.512403e-07,-3.699917e-07,-6.377232e-07
1,3.120984e-03,,-2.625565e-07,-2.746169e-07,2.964562e-03,-4.776664e-07,-3.162771e-07,-2.516293e-07,-3.627900e-07,3.122408e-03,...,-3.526316e-07,-1.559458e-07,0.026215,-2.989836e-07,-1.547864e-07,-1.741490e-07,-3.772381e-07,2.165238e-03,-2.763816e-07,3.271190e-03
2,-3.456283e-07,-2.625565e-07,,-2.470391e-07,-3.298414e-07,-4.353951e-07,-3.008353e-07,-2.196528e-07,-3.301092e-07,-3.457714e-07,...,-3.299370e-07,-1.440534e-07,-0.024491,-2.770576e-07,-1.427290e-07,-1.630701e-07,-3.401228e-07,-2.468072e-07,-3.102292e-07,-3.606501e-07
3,-5.523060e-07,-2.746169e-07,-2.470391e-07,,-4.790308e-07,-1.759310e-06,-3.630947e-07,0.000000e+00,-4.801624e-07,-5.530372e-07,...,-4.756979e-07,0.000000e+00,-0.028171,-3.083809e-07,0.000000e+00,0.000000e+00,-5.251393e-07,-2.422763e-07,-2.727094e-07,-6.371178e-07
4,1.809647e-02,2.964562e-03,-3.298414e-07,-4.790308e-07,,-7.595509e-07,-4.193153e-07,6.656583e-03,6.379270e-02,-9.014240e-03,...,-4.857018e-07,-1.700440e-02,0.004883,-1.608054e-02,5.119840e-02,-2.387592e-07,4.323839e-03,-5.503234e-03,-3.519587e-07,-9.647511e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.556654e-07,-1.741490e-07,-1.630701e-07,0.000000e+00,-2.387592e-07,-3.678731e-07,-2.066760e-07,0.000000e+00,-2.390400e-07,-2.558220e-07,...,-2.426580e-07,0.000000e+00,-0.021098,-1.871428e-07,0.000000e+00,,-2.496861e-07,-1.605577e-07,-1.738739e-07,-2.724543e-07
96,4.664838e-03,-3.772381e-07,-3.401228e-07,-5.251393e-07,4.323839e-03,-8.163788e-07,-4.360729e-07,-5.063864e-07,-5.297100e-07,4.668020e-03,...,-5.083288e-07,-2.138898e-07,0.010832,-4.038651e-07,-2.117148e-07,-2.496861e-07,,2.810559e-03,-3.636897e-07,5.008587e-03
97,-3.512403e-07,2.165238e-03,-2.468072e-07,-2.422763e-07,-5.503234e-03,-4.279810e-07,-2.937008e-07,3.639743e-03,8.204355e-03,-2.357174e-02,...,-3.247954e-07,-2.647077e-02,0.003624,-2.543661e-02,7.083905e-03,-1.605577e-07,2.810559e-03,,-2.589850e-07,-3.804830e-07
98,-3.699917e-07,-2.763816e-07,-3.102292e-07,-2.727094e-07,-3.519587e-07,-4.747782e-07,-3.191259e-07,-2.397160e-07,-3.522637e-07,-3.701556e-07,...,-3.520676e-07,-1.524196e-07,-0.025738,-2.924969e-07,-1.509377e-07,-1.738739e-07,-3.636897e-07,-2.589850e-07,,-3.872588e-07


np.nanmean(norm_err)=np.float64(-0.0009646201157617004)
    np.nanmean(np.abs(norm_err))=np.float64(0.009681584436486717)
    np.nanmedian(norm_err)=np.float64(-2.1173819634621006e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(6.887971742849105e-07)
    


In [7]:
# Genome records for the "tail" dataset, fetched from OSF.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")

# Produce the same report for each dstream algorithm in turn: the full
# normalized-error matrix followed by its NaN-aware summary statistics.
for algo_expr in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, algo_expr)
    display(pd.DataFrame(norm_err))
    print(
        f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """,
    )


100%|██████████| 100/100 [00:00<00:00, 36117.32it/s]
100%|██████████| 100/100 [00:00<00:00, 460.51it/s]
5948it [00:00, 667640.44it/s]
100%|██████████| 100/100 [00:00<00:00, 404465.19it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 31240.16it/s]
100%|██████████| 100/100 [00:00<00:00, 1013.82it/s]
5924it [00:00, 632917.03it/s]
100%|██████████| 100/100 [00:00<00:00, 387285.69it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.078249e-07,-1.038240e-07,-2.261988e-07,-1.065784e-07,-1.038764e-07,-1.038295e-07,-1.065454e-07,-2.131537e-07,-2.132962e-07,...,-2.135756e-07,-1.038623e-07,-2.194575e-07,-2.257279e-07,-1.067348e-07,-2.130458e-07,-1.039141e-07,-2.262650e-07,-2.399041e-07,-1.202317e-07
1,-2.078249e-07,,-1.098687e-07,-2.078500e-07,-1.037450e-07,-1.099274e-07,-1.098748e-07,-1.037137e-07,-2.074870e-07,-2.076220e-07,...,-2.078868e-07,-1.068334e-07,-2.076459e-07,-2.074523e-07,-1.038932e-07,-2.073848e-07,-1.068882e-07,-2.079059e-07,-2.074367e-07,-1.039274e-07
2,-1.038240e-07,-1.098687e-07,,-1.038365e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.036553e-07,-1.037227e-07,...,-1.038549e-07,0.000000e+00,-1.037346e-07,-1.036380e-07,0.000000e+00,-1.036043e-07,0.000000e+00,-1.038644e-07,-1.036302e-07,0.000000e+00
3,-2.261988e-07,-2.078500e-07,-1.038365e-07,,-1.065916e-07,-1.038889e-07,-1.038420e-07,-1.065586e-07,-2.131801e-07,-2.133226e-07,...,-2.136021e-07,-1.038748e-07,-2.194855e-07,-2.399584e-07,-1.067481e-07,-2.130722e-07,-1.039266e-07,-2.332120e-07,-2.257390e-07,-1.131171e-07
4,-1.065784e-07,-1.037450e-07,0.000000e+00,-1.065916e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.094536e-07,-1.095287e-07,...,-1.096761e-07,0.000000e+00,-1.064842e-07,-1.063824e-07,0.000000e+00,-1.093967e-07,0.000000e+00,-1.066210e-07,-1.063742e-07,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.130458e-07,-2.073848e-07,-1.036043e-07,-2.130722e-07,-1.093967e-07,-1.036565e-07,-1.036098e-07,-1.235274e-07,-2.252498e-07,-2.322713e-07,...,-2.257210e-07,-1.036425e-07,-2.128578e-07,-2.126543e-07,-1.128013e-07,,-1.036940e-07,-2.131310e-07,-2.126379e-07,-1.065386e-07
96,-1.039141e-07,-1.068882e-07,0.000000e+00,-1.039266e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.037451e-07,-1.038126e-07,...,-1.039450e-07,0.000000e+00,-1.038246e-07,-1.037278e-07,0.000000e+00,-1.036940e-07,,-1.039546e-07,-1.037200e-07,0.000000e+00
97,-2.262650e-07,-2.079059e-07,-1.038644e-07,-2.332120e-07,-1.066210e-07,-1.039169e-07,-1.038699e-07,-1.065880e-07,-2.132389e-07,-2.133815e-07,...,-2.136612e-07,-1.039027e-07,-2.195479e-07,-2.327115e-07,-1.067776e-07,-2.131310e-07,-1.039546e-07,,-2.258049e-07,-1.131502e-07
98,-2.399041e-07,-2.074367e-07,-1.036302e-07,-2.257390e-07,-1.063742e-07,-1.036824e-07,-1.036357e-07,-1.063414e-07,-2.127453e-07,-2.128873e-07,...,-2.131656e-07,-1.036684e-07,-2.190247e-07,-2.252699e-07,-1.065301e-07,-2.126379e-07,-1.037200e-07,-2.258049e-07,,-1.238676e-07


np.nanmean(norm_err)=np.float64(-9.823540687177366e-08)
    np.nanmean(np.abs(norm_err))=np.float64(9.823540687177366e-08)
    np.nanmedian(norm_err)=np.float64(-1.0382873118579437e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0382873118579437e-07)
    
