## Set Up Dependencies and Data


In [None]:
import os
import random
import threading


import alifedata_phyloinformatics_convert as apc
from ete4 import Tree
from ete4.treeview import TreeStyle
from hstrat._auxiliary_lib import (
    alifestd_find_leaf_ids,
    alifestd_collapse_unifurcations,
    alifestd_prune_extinct_lineages_asexual,
    alifestd_sum_origin_time_deltas_asexual,
    alifestd_to_working_format,
)
import joblib
from keyname import keyname as kn
import more_itertools as mit
import numpy as np
import pandas as pd
from phylotrackpy import systematics as syst
import seaborn as sns
from teeplot import teeplot as tp
from tqdm.notebook import tqdm


In [None]:
# Download the dataset and parse it with the fastparquet engine.
data_url = "https://osf.io/r2a7t/download"
df = pd.read_parquet(data_url, engine="fastparquet")


## Reproducibility


In [None]:
# Load the watermark IPython extension and print its environment report so
# the conditions this notebook ran under are recorded alongside the results.
# NOTE(review): see the watermark documentation for what each flag emits.
%load_ext watermark
%watermark -iwbmuvg -iv


In [None]:
# Preview the first rows of the loaded dataframe.
df.head()


In [None]:
# Print the dataframe's column summary (names, non-null counts, dtypes).
df.info()


In [None]:
# Show descriptive statistics for the dataframe's columns.
df.describe()


In [None]:
# Fingerprint the loaded dataframe so the exact input data behind these
# results can be identified later.
joblib.hash(df)


In [None]:
# List the dtype of every column in the dataframe.
df.dtypes


## Helper Functions


In [None]:
def logify_phylo(phylo_df):
    """Return a copy of `phylo_df` with `origin_time` on a log scale.

    The result gains a "log time ago" column holding
    ``log(max(origin_time) - origin_time + 1)``, and its "origin_time"
    column is replaced by ``max(log time ago) - log time ago``, so the row
    with the earliest origin time maps to 0. The input dataframe is left
    unmodified.
    """
    result = phylo_df.copy()

    origin_times = result["origin_time"]
    time_ago = origin_times.max() - origin_times
    # The +1 keeps the log finite for rows at the latest origin time.
    log_time_ago = np.log(time_ago + 1)

    result["log time ago"] = log_time_ago
    result["origin_time"] = log_time_ago.max() - log_time_ago
    return result


## Calculate Phylometrics


In [None]:
import multiprocessing


# The helpers are defined once, ahead of the loop, rather than being
# re-created on every iteration.
def calc_mean_evolutionary_distinctiveness(tree_df):
    """Return phylotrack's mean evolutionary distinctiveness for `tree_df`.

    The phylogeny dataframe is converted to a phylotrack object and the
    metric is evaluated at the dataframe's maximum `origin_time`.
    """
    tree = apc.RosettaTree(tree_df).as_phylotrack
    return tree.get_mean_evolutionary_distinctiveness(
        tree_df["origin_time"].max()
    )


def get_mean_evolutionary_distinctiveness(tree_df):
    """Compute mean evolutionary distinctiveness in a one-worker pool.

    NOTE(review): presumably the separate process isolates phylotrack's
    native state from the notebook kernel -- confirm before simplifying
    this to a direct call.
    """
    with multiprocessing.Pool(1) as pool:
        result = pool.map(calc_mean_evolutionary_distinctiveness, [tree_df])
        return result[0]


# Metric label -> function computing it from a working-format phylogeny.
# Insertion order fixes the order of the emitted records.
metric_calculators = {
    "sum branch lengths": alifestd_sum_origin_time_deltas_asexual,
    "mean evolutionary distinctiveness": (
        get_mean_evolutionary_distinctiveness
    ),
}

records = []
for replicate, tree_df in tqdm(df.groupby("replicate")):
    tree_df = tree_df.reset_index(drop=True)
    tree_df = alifestd_to_working_format(tree_df)

    # Carry along every column that is constant within this replicate
    # (except "dataSource") as metadata on the metric records.
    attrs = {}
    for col in tree_df.columns:
        if col == "dataSource":
            continue
        distinct_values = tree_df[col].unique()  # computed once per column
        if len(distinct_values) == 1:
            attrs[col] = mit.one(distinct_values)

    for metric, calculate in metric_calculators.items():
        records.append(
            {
                **attrs,
                "replicate": replicate,
                "metric": metric,
                "value": calculate(tree_df),
            },
        )

# One row per (replicate, metric) pair.
dfmetrics = pd.DataFrame.from_records(records)


## Plot Phylometrics


In [None]:
# Select the rows to plot: nCycle of 1e6, popSize of at least 10000, and
# one of the two plotted metrics.
cycle_mask = dfmetrics["nCycle"] == 1e6
popsize_mask = dfmetrics["popSize"] >= 10000
metric_mask = dfmetrics["metric"].isin(
    ["sum branch lengths", "mean evolutionary distinctiveness"],
)
fil = dfmetrics[cycle_mask & popsize_mask & metric_mask].copy()

# Population size expressed in units of 10k, as an integer.
fil["num PEs (10k)"] = (fil["popSize"] / 1e4).astype(int)


def wrap_after_first_space(label):
    """Turn every space but the first into a newline."""
    return label.replace(" ", "\n").replace("\n", " ", 1)


# Break long metric names across lines so they fit as facet titles.
fil["metric"] = fil["metric"].apply(wrap_after_first_space)

# Display names for the genome flavors.
regime_names = {
    "genome_purifyingplus": "adaptive",
    "genome_purifyingonly": "purifying",
}
fil["regime"] = fil["genomeFlavor"].map(regime_names)

# One swarm-plot facet per metric; y axes are independent across facets.
catplot_kwargs = dict(
    y="value",
    x="num PEs (10k)",
    hue="regime",
    col="metric",
    sharex=True,
    sharey=False,
    kind="swarm",
    height=2,
    aspect=1.2,
    teeplot_postprocess="teed.set_titles('{col_name}')",
)
tp.tee(sns.catplot, fil, **catplot_kwargs)


## Plot Reconstructed Trees


In [None]:
# Render one circular, log-scaled tree per replicate into outplots/log and
# also show it inline.
os.makedirs("outplots/log", exist_ok=True)

n_downsample = 1000  # maximum number of tips kept per tree

for replicate, group_df in tqdm(df.groupby("replicate")):
    tree_df = group_df.reset_index(drop=True)
    tree_df = alifestd_to_working_format(tree_df)

    # Flag a random subsample of at most n_downsample tips as extant.
    # NOTE(review): random is not seeded here, so the sample differs
    # between runs.
    tips = alifestd_find_leaf_ids(tree_df)
    sample_size = min(n_downsample, len(tips))
    kept = random.sample(tips, sample_size)
    tree_df["extant"] = tree_df["id"].isin(kept)

    # Presumably drops lineages that do not lead to a flagged tip, then
    # removes single-child nodes -- confirm against the hstrat helpers.
    tree_df = alifestd_collapse_unifurcations(
        alifestd_prune_extinct_lineages_asexual(tree_df),
    )

    tree_df = logify_phylo(tree_df)  # log scale branches

    # Columns holding a single value (except "dataSource") become the
    # key/value attributes packed into the output filename.
    attrs = {}
    for col in tree_df.columns:
        distinct_values = tree_df[col].unique()
        if col != "dataSource" and len(distinct_values) == 1:
            attrs[col] = mit.one(distinct_values)
    attrs["ext"] = ".pdf"

    # Relabel taxa by their id.
    tree_df = tree_df.drop(columns=["taxon_label"]).copy()
    tree_df["taxon_label"] = tree_df["id"]

    newick = apc.RosettaTree(tree_df).to_newick()
    t = Tree(newick)
    t.to_ultrametric()

    # Full-circle layout without leaf names or scale bar.
    ts = TreeStyle()
    ts.mode = "c"
    ts.arc_start = -180  # 0 degrees = 3 o'clock
    ts.arc_span = 360
    ts.show_leaf_name = False
    ts.show_scale = False

    outname = kn.pack(attrs)
    t.render(f"outplots/log/{outname}", tree_style=ts, w=1, units="in")

    print(outname)
    inline_render = t.render("%%inline", tree_style=ts, w=4, units="in")
    display(inline_render)


In [None]:
# Render one circular tree per replicate, with branch lengths left on their
# original scale, into outplots/linear and also show it inline.
os.makedirs("outplots/linear", exist_ok=True)

n_downsample = 1000  # maximum number of tips kept per tree

for replicate, group_df in tqdm(df.groupby("replicate")):
    tree_df = group_df.reset_index(drop=True)
    tree_df = alifestd_to_working_format(tree_df)

    # Flag a random subsample of at most n_downsample tips as extant.
    # NOTE(review): random is not seeded here, so the sample differs
    # between runs.
    tips = alifestd_find_leaf_ids(tree_df)
    sample_size = min(n_downsample, len(tips))
    kept = random.sample(tips, sample_size)
    tree_df["extant"] = tree_df["id"].isin(kept)

    # Presumably drops lineages that do not lead to a flagged tip, then
    # removes single-child nodes -- confirm against the hstrat helpers.
    tree_df = alifestd_collapse_unifurcations(
        alifestd_prune_extinct_lineages_asexual(tree_df),
    )

    # Columns holding a single value (except "dataSource") become the
    # key/value attributes packed into the output filename.
    attrs = {}
    for col in tree_df.columns:
        distinct_values = tree_df[col].unique()
        if col != "dataSource" and len(distinct_values) == 1:
            attrs[col] = mit.one(distinct_values)
    attrs["ext"] = ".pdf"

    # Relabel taxa by their id.
    tree_df = tree_df.drop(columns=["taxon_label"]).copy()
    tree_df["taxon_label"] = tree_df["id"]

    newick = apc.RosettaTree(tree_df).to_newick()
    t = Tree(newick)

    # Full-circle layout without leaf names or scale bar.
    ts = TreeStyle()
    ts.mode = "c"
    ts.arc_start = -180  # 0 degrees = 3 o'clock
    ts.arc_span = 360
    ts.show_leaf_name = False
    ts.show_scale = False

    outname = kn.pack(attrs)
    t.render(f"outplots/linear/{outname}", tree_style=ts, w=1, units="in")

    print(outname)
    inline_render = t.render("%%inline", tree_style=ts, w=4, units="in")
    display(inline_render)
