In [1]:
%load_ext watermark


In [2]:
from downstream import dstream
from hstrat import hstrat
import alifedata_phyloinformatics_convert as apc
import numpy as np
import pandas as pd
from tqdm import tqdm




In [3]:
%watermark -diwmuv -iv


Last updated: 2025-08-03T00:25:21.423306+00:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.31.1

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.8.0-1031-azure
Machine     : x86_64
Processor   : x86_64
CPU cores   : 4
Architecture: 64bit

pandas                            : 2.2.3
downstream                        : 1.14.3
hstrat                            : 1.20.10
numpy                             : 2.1.2
alifedata_phyloinformatics_convert: 0.19.3

Watermark: 2.4.3



In [4]:
# Subdirectory name associated with this notebook's saved outputs.
# NOTE(review): presumably consumed by teeplot when writing figures; no use of
# this name is visible in the cells shown here -- confirm before removing.
teeplot_subdir = "2025-05-09-reconstruction-consistency"
teeplot_subdir  # bare expression: echo the value as the cell output


'2025-05-09-reconstruction-consistency'

## Prep Data


In [5]:
def _pairwise_distance_frame(tree_df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise phylogenetic distance table for a tree dataframe.

    The tree is converted to dendropy via `apc.RosettaTree`, and dendropy's
    phylogenetic distance matrix is loaded into a DataFrame whose rows and
    columns are both sorted by label, so that tables built from different
    trees over the same labels line up positionally.
    """
    distance_table = (
        apc.RosettaTree(tree_df)
        .as_dendropy.phylogenetic_distance_matrix()
        .as_data_table()
    )
    # NOTE(review): `_data` is a private attribute of the dendropy data table;
    # it may change between dendropy releases.
    return (
        pd.DataFrame(distance_table._data)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )


def make_norm_err_matrix(
    raw_genome_df: pd.DataFrame,
    dstream_algo: str,
    n_samples: int = 100,
    seed: "int | None" = None,
) -> np.ndarray:
    """Compare naive (trie) and shortcut (searchtable) tree reconstructions.

    A random subset of genomes is deserialized into hstrat surfaces, the
    surfaces' storage is shuffled in place, and the same population is
    reconstructed with both `hstrat.build_tree_trie` and
    `hstrat.build_tree_searchtable`. The pairwise distance tables of the two
    reconstructions are then compared elementwise.

    Parameters
    ----------
    raw_genome_df : pd.DataFrame
        Genome records. Must provide the columns "data_hex", "dstream_S",
        "dstream_storage_bitoffset", "dstream_storage_bitwidth",
        "dstream_T_bitoffset", and "dstream_T_bitwidth"; each of the
        "dstream_*" columns must hold exactly one distinct value within the
        sampled rows. The caller's frame is not modified.
    dstream_algo : str
        Python expression naming the dstream algorithm, evaluated with only
        `dstream` in scope (e.g., "dstream.tilted_algo").
    n_samples : int, default 100
        Number of genomes to sample, kept small enough for the naive/dendropy
        pipeline to handle. Capped at the number of available rows.
    seed : int, optional
        If given, seeds both the row sampling and the storage shuffle so the
        result is reproducible. If None, numpy's global random state is used,
        matching the previous behavior.

    Returns
    -------
    np.ndarray
        Square array of `(naive - shortcut) / mean(naive, shortcut)` for each
        pair of labels, ordered by sorted label (labels are strings, so the
        order is lexicographic rather than numeric). Entries are NaN where
        both distances are zero, which includes the diagonal.

    Raises
    ------
    ValueError
        If a "dstream_*" column does not hold exactly one distinct value
        (raised by `.item()`).
    """
    if seed is None:
        # fall back to numpy's global RNG, as the unseeded code always did
        rng, sample_state = np.random, None
    else:
        rng = sample_state = np.random.RandomState(seed)

    # sample to size dendropy/naive can handle
    sampled_df = raw_genome_df.sample(
        min(n_samples, len(raw_genome_df)),
        random_state=sample_state,
    )
    sampled_df = sampled_df.assign(
        taxon_label=np.arange(len(sampled_df)).astype(str),
    )

    # each surface-layout setting must be uniform across the sampled genomes;
    # `.item()` raises if a column holds more than one distinct value
    surface_kwargs = {
        column: sampled_df[column].unique().item()
        for column in (
            "dstream_S",
            "dstream_storage_bitoffset",
            "dstream_storage_bitwidth",
            "dstream_T_bitoffset",
            "dstream_T_bitwidth",
        )
    }
    # NOTE(review): eval executes arbitrary code; only pass trusted,
    # hard-coded algorithm names here, never external input.
    surface_kwargs["dstream_algo"] = eval(dstream_algo, {"dstream": dstream})

    population = [
        hstrat.surf_from_hex(genome_hex, **surface_kwargs)
        for genome_hex in tqdm(sampled_df["data_hex"].astype(str))
    ]
    for surface in population:
        rng.shuffle(surface._surface._storage)  # ensure synthetic data

    naive_df = hstrat.build_tree_trie(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
        bias_adjustment=hstrat.AssignOriginTimeNodeRankTriePostprocessor(),
    )
    shortcut_df = hstrat.build_tree_searchtable(
        population,
        taxon_labels=sampled_df["taxon_label"],
        force_common_ancestry=True,
        progress_wrap=tqdm,
    )

    naive_dist = _pairwise_distance_frame(naive_df)
    shortcut_dist = _pairwise_distance_frame(shortcut_df)

    # 0/0 occurs wherever both distances are zero (e.g., the diagonal); the
    # resulting NaNs are intended, so silence only that expected warning
    with np.errstate(invalid="ignore"):
        return (naive_dist.values - shortcut_dist.values) / (
            naive_dist.values / 2 + shortcut_dist.values / 2
        )


In [6]:
# Load the "sample" genome dataset, then report reconstruction consistency
# for each dstream algorithm in turn.
raw_genome_df_ = pd.read_parquet("https://osf.io/fe73v/download")  # sample

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # summary statistics skip NaN entries in the error matrix
    summary_ = f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """
    print(summary_)


100%|██████████| 100/100 [00:00<00:00, 10600.78it/s]
100%|██████████| 100/100 [00:00<00:00, 378.93it/s]
6134it [00:00, 657648.34it/s]
100%|██████████| 100/100 [00:00<00:00, 275941.05it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-0.077050,-0.066182,0.042904,0.243707,0.193107,-0.105447,-0.198419,0.259993,0.000000,...,-0.355386,-0.364647,-0.455060,-0.023065,0.249159,-0.234478,0.191326,0.189442,-0.135258,-0.085964
1,-0.077050,,0.000000,-0.351774,0.046003,-0.019851,-0.042429,-0.085327,0.050830,-0.060471,...,0.297015,0.000000,0.400261,0.000000,-0.024358,0.184797,-0.019700,-0.019540,-0.268460,0.000000
2,-0.066182,0.000000,,-0.181897,0.035516,-0.019045,-0.038910,-0.072198,0.038325,-0.053567,...,0.140405,0.000000,0.180832,0.000000,-0.023156,0.091991,-0.018907,-0.018759,-0.187795,0.000000
3,0.042904,-0.351774,-0.181897,,0.105288,0.019472,0.618560,0.490128,0.151907,0.038496,...,-0.020781,-0.038810,-0.023951,-0.439941,0.022540,-0.015844,0.019363,0.019246,-0.054984,-0.208238
4,0.243707,0.046003,0.035516,0.105288,,0.055145,0.123810,0.275403,0.000000,0.183879,...,0.011865,0.059000,0.015362,0.008831,0.068319,0.007726,0.054709,0.054246,0.000000,0.056548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.234478,0.184797,0.091991,-0.015844,0.007726,-0.236664,-0.087217,-0.109942,0.007851,-0.216954,...,-0.342023,-0.065963,-0.395923,0.525166,-0.269030,,-0.235491,-0.234240,-0.032712,0.099986
96,0.191326,-0.019700,-0.018907,0.019363,0.054709,0.000000,-0.042280,-0.052061,0.055488,0.000000,...,-0.291661,-0.076649,-0.324544,-0.012325,0.000000,-0.235491,,0.000000,-0.027966,-0.020237
97,0.189442,-0.019540,-0.018759,0.019246,0.054246,0.000000,-0.042003,-0.051641,0.055012,0.000000,...,-0.289744,-0.075988,-0.322172,-0.012262,0.000000,-0.234240,0.000000,,-0.027724,-0.020068
98,-0.135258,-0.268460,-0.187795,-0.054984,0.000000,-0.028194,-0.065172,-0.155084,0.000000,-0.099357,...,-0.051777,-0.395040,-0.068811,-0.037692,-0.035123,-0.032712,-0.027966,-0.027724,,-0.368376


np.nanmean(norm_err)=np.float64(-0.02482893951560009)
    np.nanmean(np.abs(norm_err))=np.float64(0.14130434938286637)
    np.nanmedian(norm_err)=np.float64(-0.015011264074871604)
    np.nanmedian(np.abs(norm_err))=np.float64(0.060043953012857684)
    


100%|██████████| 100/100 [00:00<00:00, 34444.48it/s]
100%|██████████| 100/100 [00:00<00:00, 437.56it/s]
5971it [00:00, 569250.81it/s]
100%|██████████| 100/100 [00:00<00:00, 289461.97it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-2.149200e-07,-4.503351e-07,-0.002803,-2.231055e-07,-8.263749e-07,-8.020388e-03,-4.921307e-07,-2.680365e-06,-1.338725e-06,...,-3.356952e-07,-2.556744e-06,-1.983097e-07,-7.598573e-07,-5.283864e-07,-4.097523e-03,0.021264,2.646506e-02,-6.132951e-07,-3.874414e-03
1,-2.149200e-07,,0.000000e+00,-0.001290,0.000000e+00,0.000000e+00,-2.980977e-03,-1.335220e-07,-2.435416e-07,-2.032767e-07,...,0.000000e+00,-2.376602e-07,0.000000e+00,0.000000e+00,0.000000e+00,-2.198597e-03,0.024932,7.572447e-02,0.000000e+00,-2.132696e-03
2,-4.503351e-07,0.000000e+00,,0.000000,0.000000e+00,-4.391564e-02,1.120275e-02,-2.009986e-07,-5.749491e-07,-2.437959e-02,...,0.000000e+00,-5.905131e-07,0.000000e+00,-4.175978e-02,0.000000e+00,6.724502e-03,0.000000,-2.132173e-02,0.000000e+00,6.422248e-03
3,-2.803300e-03,-1.290471e-03,0.000000e+00,,-1.326133e-03,0.000000e+00,-5.541547e-02,-1.014482e-02,-3.355940e-03,-3.222565e-07,...,-1.255459e-02,-5.255465e-02,-1.215828e-03,0.000000e+00,0.000000e+00,-3.581675e-02,-0.010460,1.502526e-03,-1.781303e-02,-6.999733e-02
4,-2.231055e-07,0.000000e+00,0.000000e+00,-0.001326,,0.000000e+00,-3.077334e-03,-1.366364e-07,-2.541061e-07,-2.105843e-07,...,0.000000e+00,-2.477100e-07,0.000000e+00,0.000000e+00,0.000000e+00,-2.250572e-03,0.073672,2.841287e-02,0.000000e+00,-2.181567e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-4.097523e-03,-2.198597e-03,6.724502e-03,-0.035817,-2.250572e-03,2.090726e-03,0.000000e+00,-9.766960e-03,-4.663356e-03,1.947101e-03,...,-1.165538e-02,-3.871620e-02,-2.088487e-03,2.034813e-03,7.211926e-03,,-0.010022,-1.560659e-07,-1.534177e-02,0.000000e+00
96,2.126435e-02,2.493172e-02,0.000000e+00,-0.010460,7.367171e-02,0.000000e+00,-1.426203e-02,-4.794855e-02,4.416991e-02,-2.452207e-07,...,-3.575992e-01,-1.470912e-02,6.813309e-02,0.000000e+00,0.000000e+00,-1.002212e-02,,3.146486e-02,-7.669490e-02,-9.683198e-03
97,2.646506e-02,7.572447e-02,-2.132173e-02,0.001503,2.841287e-02,-5.542763e-02,-2.238404e-07,2.498090e-03,5.123230e-02,-1.679601e-02,...,2.998395e-03,5.041491e-03,2.628275e-02,-1.503243e-01,-2.295719e-02,-1.560659e-07,0.031465,,3.991360e-03,-1.506940e-07
98,-6.132951e-07,0.000000e+00,0.000000e+00,-0.017813,0.000000e+00,0.000000e+00,-2.815437e-02,-3.535260e-07,-8.701308e-07,-5.509264e-07,...,0.000000e+00,-9.485340e-07,0.000000e+00,0.000000e+00,0.000000e+00,-1.534177e-02,-0.076695,3.991360e-03,,-1.456158e-02


np.nanmean(norm_err)=np.float64(-0.0007517814643375275)
    np.nanmean(np.abs(norm_err))=np.float64(0.014396505863327174)
    np.nanmedian(norm_err)=np.float64(-3.127006657254947e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(0.002979667586970366)
    


In [7]:
# Load the "tail" genome dataset, then report reconstruction consistency
# for each dstream algorithm in turn.
raw_genome_df_ = pd.read_parquet("https://osf.io/ebtpd/download")  # tail

for dstream_algo_ in (
    "dstream.tilted_algo",
    "dstream.hybrid_0_steady_1_tilted_2_algo",
):
    norm_err = make_norm_err_matrix(raw_genome_df_, dstream_algo_)
    display(pd.DataFrame(norm_err))
    # summary statistics skip NaN entries in the error matrix
    summary_ = f"""{np.nanmean(norm_err)=}
    {np.nanmean(np.abs(norm_err))=}
    {np.nanmedian(norm_err)=}
    {np.nanmedian(np.abs(norm_err))=}
    """
    print(summary_)


100%|██████████| 100/100 [00:00<00:00, 35833.44it/s]
100%|██████████| 100/100 [00:00<00:00, 473.65it/s]
5941it [00:00, 697446.26it/s]
100%|██████████| 100/100 [00:00<00:00, 387285.69it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0


np.nanmean(norm_err)=np.float64(0.0)
    np.nanmean(np.abs(norm_err))=np.float64(0.0)
    np.nanmedian(norm_err)=np.float64(0.0)
    np.nanmedian(np.abs(norm_err))=np.float64(0.0)
    


100%|██████████| 100/100 [00:00<00:00, 33341.05it/s]
100%|██████████| 100/100 [00:00<00:00, 1010.73it/s]
5940it [00:00, 656967.16it/s]
100%|██████████| 100/100 [00:00<00:00, 426250.41it/s]
  return (naive_dist.values - shortcut_dist.values) / (


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,-1.064323e-07,0.000000e+00,-1.034584e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.096465e-07,-1.035970e-07,...,-1.161903e-07,0.000000e+00,-1.126297e-07,0.000000e+00,0.000000e+00,-1.093431e-07,0.000000e+00,-1.036589e-07,0.000000e+00,0.000000e+00
1,-1.064323e-07,,-1.064319e-07,-2.069708e-07,-1.035931e-07,-1.035916e-07,-1.036709e-07,-1.034684e-07,-2.132232e-07,-2.072481e-07,...,-2.129815e-07,-1.127536e-07,-2.127564e-07,-1.063456e-07,-1.094172e-07,-2.126495e-07,-1.064631e-07,-2.073720e-07,-1.065165e-07,-1.066474e-07
2,0.000000e+00,-1.064319e-07,,-1.034581e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.199930e-07,-1.035967e-07,...,-1.095183e-07,0.000000e+00,-1.093993e-07,0.000000e+00,0.000000e+00,-1.666715e-07,0.000000e+00,-1.036586e-07,0.000000e+00,0.000000e+00
3,-1.034584e-07,-2.069708e-07,-1.034581e-07,,-1.161067e-07,-1.063904e-07,-1.064741e-07,-1.093053e-07,-2.072557e-07,-2.128493e-07,...,-2.070273e-07,-1.034848e-07,-2.068146e-07,-1.033765e-07,-1.033960e-07,-2.067136e-07,-1.034875e-07,-2.129800e-07,-1.035380e-07,-1.036617e-07
4,0.000000e+00,-1.035931e-07,0.000000e+00,-1.161067e-07,,0.000000e+00,0.000000e+00,0.000000e+00,-1.037359e-07,-1.065386e-07,...,-1.036214e-07,0.000000e+00,-1.035149e-07,0.000000e+00,0.000000e+00,-1.034642e-07,0.000000e+00,-1.066041e-07,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-1.093431e-07,-2.126495e-07,-1.666715e-07,-2.067136e-07,-1.034642e-07,-1.034627e-07,-1.035419e-07,-1.033399e-07,-2.397136e-07,-2.069902e-07,...,-2.188096e-07,-1.063241e-07,-2.185721e-07,-1.092516e-07,-1.062304e-07,,-1.160292e-07,-2.071138e-07,-1.126640e-07,-1.128105e-07
96,0.000000e+00,-1.064631e-07,0.000000e+00,-1.034875e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.163709e-07,-1.036262e-07,...,-1.095513e-07,0.000000e+00,-1.094322e-07,0.000000e+00,0.000000e+00,-1.160292e-07,,-1.036881e-07,0.000000e+00,0.000000e+00
97,-1.036589e-07,-2.073720e-07,-1.036586e-07,-2.129800e-07,-1.066041e-07,-1.096671e-07,-1.130074e-07,-1.064720e-07,-2.076580e-07,-2.194070e-07,...,-2.074287e-07,-1.036854e-07,-2.072153e-07,-1.035767e-07,-1.035963e-07,-2.071138e-07,-1.036881e-07,,-1.037388e-07,-1.038630e-07
98,0.000000e+00,-1.065165e-07,0.000000e+00,-1.035380e-07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-1.129861e-07,-1.036768e-07,...,-1.096079e-07,0.000000e+00,-1.094887e-07,0.000000e+00,0.000000e+00,-1.126640e-07,0.000000e+00,-1.037388e-07,,0.000000e+00


np.nanmean(norm_err)=np.float64(-1.1743167349396121e-07)
    np.nanmean(np.abs(norm_err))=np.float64(1.1743167349396121e-07)
    np.nanmedian(norm_err)=np.float64(-1.0647075480265912e-07)
    np.nanmedian(np.abs(norm_err))=np.float64(1.0647075480265912e-07)
    
