## Set Up Dependencies and Data


In [None]:
import random

import alifedata_phyloinformatics_convert as apc
from Bio import Phylo as BioPhylo
from hstrat import hstrat
from hsurf import hsurf
import joblib
import more_itertools as mit
import pandas as pd
from teeplot import teeplot as tp

from pylib._draw_biopython_tree import draw_biopython_tree
from pylib._make_hamming_distance_matrix import make_hamming_distance_matrix
from pylib._val_to_color import val_to_color


In [None]:
# Read the CSV served at this OSF download URL into the working DataFrame.
df = pd.read_csv("https://osf.io/x3h9c/download")


## Reproducibility


In [None]:
# Load the `watermark` IPython extension and print its report for this session.
# NOTE(review): flag meanings are defined by watermark itself; `-iv` presumably
# lists versions of imported packages — confirm against the watermark docs.
%load_ext watermark
%watermark -iwbmuvg -iv


In [None]:
# Preview the first rows of the freshly loaded data.
df.head()


In [None]:
# Print the column names, dtypes, and non-null counts.
df.info()


In [None]:
# Show summary statistics for the loaded columns.
df.describe()


In [None]:
# Display a joblib hash of the loaded DataFrame (presumably as a fingerprint of
# the exact input data for reproducibility).
joblib.hash(df)


## Data Prep


In [None]:
# Number of leading (most-significant) bits that are not part of the surface.
exclude_leading = 16

# The "bitfield" column arrives as hexadecimal text; parse it into Python ints.
df["bitfield"] = df["bitfield"].apply(int, base=16)

# Bits needed to represent each value, rounded up to whole 32-bit words.
df["bitfield value bitlengths"] = df["bitfield"].apply(int.bit_length)
df["bitfield wordlengths"] = (
    df["bitfield value bitlengths"].add(31).floordiv(32)
)

# Every row is required to occupy exactly three 32-bit words.
assert mit.one(df["bitfield wordlengths"].unique()) == 3

# Derived sizes: full bitfield width, then the surface width without the
# excluded leading bits, in bits and in bytes.
df["bitfield bitlengths"] = df["bitfield wordlengths"].mul(32)
df["surface bitlengths"] = df["bitfield bitlengths"].sub(exclude_leading)
df["surface bytelengths"] = df["surface bitlengths"].floordiv(8)

df


In [None]:
# All rows must share one bitfield width; `mit.one` raises otherwise.
bitfield_bitlength = int(mit.one(df["bitfield bitlengths"].unique()))

# Width of the surface once the leading `exclude_leading` bits are dropped.
surface_bitlength = bitfield_bitlength - exclude_leading

# Mask with exactly the low `surface_bitlength` bits set.
surface_mask = (1 << surface_bitlength) - 1
assert surface_mask.bit_count() == surface_bitlength

# Keep only the surface bits of each bitfield.
df["bitfield surface"] = df["bitfield"].values & surface_mask

df


In [None]:
# Shifting right by the surface width leaves only the leading
# `exclude_leading` bits, which are stored as the tag.
tag_shift = bitfield_bitlength - exclude_leading
df["bitfield tag"] = df["bitfield"].values >> tag_shift

df


In [None]:
# Label every row "<position within its replicate>-<hex tag>".
#
# `cumcount` already numbers rows per replicate, so no outer
# `groupby(...).apply` (nor a second, nested groupby inside each group) is
# needed. This also avoids relying on the grouping column being visible inside
# `DataFrameGroupBy.apply`, which newer pandas releases deprecate and remove.
# NOTE(review): rows with a missing "replicate" are kept here; the previous
# groupby.apply form presumably dropped them — confirm none exist in the data.
df = df.assign(
    **{
        "taxon name": (
            df.groupby("replicate").cumcount().astype(str)
            + "-"
            + df["bitfield tag"].apply(hex)
        ),
    }
)
df


In [None]:
# Inspect the column dtypes after the derived columns have been added.
df.dtypes


## Deserialize Columns


In [None]:
# import hsurf
# hsurf.__version__
from hsurf import hsurf


In [None]:
# All rows must share one surface byte length; `mit.one` raises otherwise.
surface_bytelength = int(mit.one(df["surface bytelengths"].unique()))
print(f"{surface_bytelength=}")
site_selection_algo = hsurf.tilted_sticky_algo
differentia_bitwidth = 1

print(df["bitfield surface"])

# Serialization layout shared by every surface; only the integer value differs
# from row to row.
deserialize_kwargs = {
    "differentia_bit_width": differentia_bitwidth,
    "site_selection_algo": site_selection_algo,
    "differentiae_byte_bit_order": "little",
    "num_strata_deposited_byte_width": 2,  # u16
    "num_strata_deposited_byte_order": "little",
    "value_byte_width": surface_bytelength,
}

# Deserialize each surface integer into a column object, one per row.
df["hstrat_columns"] = [
    hsurf.col_from_surf_int(value=surface_int, **deserialize_kwargs)
    for surface_int in df["bitfield surface"].values
]


In [None]:
# Print each deserialized column's GetNumStrataDeposited() value, one per line.
deposition_counts = (
    col.GetNumStrataDeposited() for col in df["hstrat_columns"]
)
for deposition_count in deposition_counts:
    print(deposition_count)


## ASCII Reconstructed Tree


In [None]:
# Build one tree per replicate from its columns and print it as an ASCII plot.
for names, group in df.groupby("replicate"):
    replicate_columns = group["hstrat_columns"].values
    replicate_labels = group["taxon name"].values

    tree_df = hstrat.build_tree(
        replicate_columns,
        hstrat.__version__,
        taxon_labels=replicate_labels,
        force_common_ancestry=True,
    )

    # Convert through RosettaTree to dendropy, which provides the ASCII plot.
    dendropy_tree = apc.RosettaTree(tree_df).as_dendropy
    print(dendropy_tree.as_ascii_plot(plot_metric="length"))


In [None]:
# Copy the taxon labels into a "name" column.
# NOTE(review): this runs after the per-replicate loop has finished, so it only
# touches the `tree_df` left over from the final iteration; the plotting loop
# further down rebinds `tree_df` and sets "name" itself.
tree_df["name"] = tree_df["taxon_label"]


## Plotted Reconstructed Tree


In [None]:
salt = "7"  # manually chosen for nice generated color


# adapted from https://github.com/mmore500/hstrat-recomb-concept/blob/b71d36216f1d2990343b6435240d8c193a82690b/pylib/tree/color_biopython_tree.py
def color_biopython_tree(tree: BioPhylo.BaseTree.Clade) -> None:
    """Recursively color a clade and all of its descendants, in place.

    Each terminal's name is split on "-" and the second field is taken as its
    tag. A clade whose terminals all carry the same tag is colored with
    `val_to_color(tag + salt)`; a clade whose terminals carry differing tags is
    colored light gray.

    Parameters
    ----------
    tree : BioPhylo.BaseTree.Clade
        Clade to color. Must provide `get_terminals()`, `clades`, and a
        settable `color` attribute.

    Raises
    ------
    IndexError
        If a terminal's name contains no "-" separator.
    """
    # Collect the distinct tags once; the set is reused for both the
    # uniformity check and the color lookup.
    tags = {leaf.name.split("-")[1] for leaf in tree.get_terminals()}

    if len(tags) == 1:
        tree.color = val_to_color(mit.one(tags) + salt)
    else:
        tree.color = (220, 220, 220)  # mixed tags below this clade

    for child in tree.clades:
        color_biopython_tree(child)


In [None]:
# Build, color, and plot one tree per replicate, saving each figure via teeplot.
for names, group in df.groupby("replicate"):
    # Attributes teeplot uses to label the saved output for this replicate.
    outattrs = {
        "genome": "hsurftiltedsticky_tagged",
        "replicate": group["replicate"].values[0],
    }

    replicate_columns = group["hstrat_columns"].values
    replicate_labels = group["taxon name"].values
    tree_df = hstrat.build_tree(
        replicate_columns,
        hstrat.__version__,
        taxon_labels=replicate_labels,
        force_common_ancestry=True,
    )
    # Expose the taxon labels under "name" before converting to biopython.
    tree_df = tree_df.assign(name=tree_df["taxon_label"])

    biopy_tree = apc.RosettaTree(tree_df).as_biopython
    first_terminal = biopy_tree.get_terminals()[0]
    print(first_terminal.name)

    # Colors are assigned in place, starting from the root clade.
    color_biopython_tree(biopy_tree.root)

    tp.tee(
        draw_biopython_tree,
        biopy_tree,
        fig_size=(12, 1.5),
        teeplot_outattrs=outattrs,
    )
