## Set Up Dependencies and Data


In [None]:
import random

import alifedata_phyloinformatics_convert as apc
from hstrat import hstrat
from hsurf import hsurf
import joblib
import more_itertools as mit
import pandas as pd

from pylib._make_hamming_distance_matrix import make_hamming_distance_matrix


In [None]:
# Load the dataset directly from its OSF download URL (requires network access).
df = pd.read_csv("https://osf.io/u89ft/download")


## Reproducibility


In [None]:
# Record environment details for reproducibility via the watermark extension.
# NOTE(review): the combined short flags select which details are printed and
# -iv should report versions of imported packages — confirm against the
# watermark documentation.
%load_ext watermark
%watermark -iwbmuvg -iv


In [None]:
# Preview the first rows of the loaded data.
df.head()


In [None]:
# Summarize column dtypes and non-null counts.
df.info()


In [None]:
# Show descriptive statistics for the loaded data.
df.describe()


In [None]:
# Fingerprint the loaded dataframe; presumably recorded so that reruns can
# confirm they operate on identical input data.
joblib.hash(df)


## Data Prep


In [None]:
# Number of leading bits of each bitfield that are not part of the surface.
exclude_leading = 16

# Coerce each raw value to a Python int, as required by int.bit_length below.
df["bitfield"] = df["bitfield"].apply(int)

# Bits needed to represent each value, rounded up to whole 32-bit words.
df["bitfield value bitlengths"] = df["bitfield"].apply(int.bit_length)
df["bitfield wordlengths"] = (
    df["bitfield value bitlengths"].add(31).floordiv(32)
)

# Every row is expected to span exactly three words; mit.one additionally
# requires that the column holds a single distinct value.
assert mit.one(df["bitfield wordlengths"].unique()) == 3

# Derive the padded bitfield width, then the surface size in bits and bytes.
df["bitfield bitlengths"] = df["bitfield wordlengths"].mul(32)
df["surface bitlengths"] = df["bitfield bitlengths"].sub(exclude_leading)
df["surface bytelengths"] = df["surface bitlengths"].floordiv(8)

df


In [None]:
# Single bitfield width shared by all rows (mit.one rejects anything else).
bitfield_bitlength = int(mit.one(df["bitfield bitlengths"].unique()))

# All-ones mask covering every bit except the `exclude_leading` leading bits.
surface_mask = (1 << (bitfield_bitlength - exclude_leading)) - 1
assert surface_mask.bit_count() == bitfield_bitlength - exclude_leading

# Keep only the surface portion of each bitfield.
df["bitfield surface"] = df["bitfield"].to_numpy() & surface_mask

df


In [None]:
# Inspect column dtypes after the derived columns have been added.
df.dtypes


## Deserialize Columns


In [None]:
# Deserialization settings applied uniformly to every surface.
site_selection_algo = hsurf.tilted_sticky_algo
differentia_bitwidth = 1

# Single surface size (in bytes) shared by all rows; mit.one rejects
# anything other than exactly one distinct value.
surface_bytelength = int(mit.one(df["surface bytelengths"].unique()))
print(f"{surface_bytelength=}")

# Decode each masked surface integer into a column object.
hstrat_columns = [
    hsurf.col_from_surf_int(
        value=surface_int,
        differentia_bit_width=differentia_bitwidth,
        site_selection_algo=site_selection_algo,
        differentiae_byte_bit_order="little",
        num_strata_deposited_byte_width=2,  # u16
        num_strata_deposited_byte_order="little",
        value_byte_width=surface_bytelength,
    )
    for surface_int in df["bitfield surface"].to_numpy()
]


In [None]:
# Sanity check: print the deposition count reported by each decoded column.
for col in hstrat_columns:
    print(col.GetNumStrataDeposited())


## Reconstruct Tree


In [None]:
# Reconstruct a tree from the decoded columns.
# NOTE(review): version_pin is set to the installed hstrat version, and
# force_common_ancestry=True presumably makes all columns share a root —
# confirm against the hstrat build_tree documentation.
tree_df = hstrat.build_tree(
    population=hstrat_columns,
    version_pin=hstrat.__version__,
    force_common_ancestry=True,
)


## Surface simulation tree


In [None]:
# Convert the reconstructed tree to DendroPy and render it as ASCII art,
# plotting by branch length.
print(
    apc.RosettaTree(tree_df)
    .as_dendropy
    .as_ascii_plot(plot_metric="length")
)


## Random tree


In [None]:
# Comparison tree built from random bit patterns rather than real surfaces.
# Values are drawn with randrange, which excludes its upper bound, so each
# one fits in 80 bits (the surface width: 3 * 32 - 16). randint(0, 2**80)
# would also admit 2**80 itself, which needs 81 bits.
# NOTE(review): the sample count of 9 presumably mirrors the number of rows
# in the dataset — confirm.
dummy = [random.randrange(2**80) for _ in range(9)]
print(make_hamming_distance_matrix(dummy).upgma_tree().as_ascii_plot())
