In [None]:
%load_ext watermark


In [None]:
import itertools as it
import operator
import os

import alifedata_phyloinformatics_convert as apc
from hstrat._auxiliary_lib import (
    # alifestd_collapse_unifurcations,
    alifestd_downsample_tips_asexual,
    alifestd_downsample_tips_clade_asexual,
    alifestd_join_roots,
    alifestd_mark_leaves,
    alifestd_mark_origin_time_delta_asexual,
    alifestd_prune_extinct_lineages_asexual,
    alifestd_to_working_format,
    alifestd_try_add_ancestor_list_col,
    seed_random,
)
import iplotx as ipx
import matplotlib as mpl
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from pyfonts import load_google_font
from teeplot import teeplot as tp
from scipy import stats as scipy_stats
import seaborn as sns


In [None]:
%watermark -diwmuv -iv


In [None]:
# Output subdirectory handed to teeplot for figures saved by this notebook.
# The NOTEBOOK_NAME environment variable wins over the hard-coded fallback.
teeplot_subdir = os.getenv(
    "NOTEBOOK_NAME",
    default="2026-02-12-billion-tip-phylos",
)
teeplot_subdir


In [None]:
# Fix the random seed up front via hstrat's seed_random helper.
# NOTE(review): presumably this seeds the Python/numpy global RNGs — confirm
# against hstrat before relying on it for the downsampling steps below.
seed_random(1)


In [None]:
# Load the Merriweather face (weight 300) through pyfonts, register its file
# with matplotlib's font manager, and make it the default font family.
font = load_google_font("Merriweather", weight=300)
mpl.font_manager.fontManager.addfont(font.get_file())
mpl.rcParams["font.family"] = font.get_name()


## Prep Data


In [None]:
# Read the first phylogeny from OSF and run it through the alifestd helpers in
# order: join roots, convert to working format, mark origin_time_delta, then
# try to add an ancestor-list column.
phylo1_df = (
    pl.read_parquet("https://osf.io/download/485tx")
    .to_pandas()
    .pipe(alifestd_join_roots)
    .pipe(alifestd_to_working_format)
    .pipe(alifestd_mark_origin_time_delta_asexual)
    .pipe(alifestd_try_add_ancestor_list_col)
)
phylo1_df


In [None]:
# Read the second phylogeny from OSF and apply the same alifestd steps as for
# the first: join roots, working format, origin_time_delta, ancestor list.
phylo2_df = (
    pl.read_parquet("https://osf.io/download/vkzdw")
    .to_pandas()
    .pipe(alifestd_join_roots)
    .pipe(alifestd_to_working_format)
    .pipe(alifestd_mark_origin_time_delta_asexual)
    .pipe(alifestd_try_add_ancestor_list_col)
)
phylo2_df


In [None]:
# Add grid coordinates, string labels, and log branch lengths to both
# phylogeny frames (columns are written in place).
for phylo_df in (phylo1_df, phylo2_df):
    # "position" is split using nCol as the stride: quotient -> x,
    # remainder -> y. The underscore variants are scaled by nRow / nCol.
    phylo_df["x"] = phylo_df["position"] // phylo_df["nCol"]
    phylo_df["x_"] = phylo_df["x"] / phylo_df["nRow"]
    phylo_df["y"] = phylo_df["position"] % phylo_df["nCol"]
    phylo_df["y_"] = phylo_df["y"] / phylo_df["nCol"]

    # String form of the id; used later as the color-lookup key.
    phylo_df["taxon_label"] = phylo_df["id"].astype(str)

    # +1 keeps zero-length branches finite under the log.
    phylo_df["log_origin_time_delta"] = np.log(
        phylo_df["origin_time_delta"] + 1
    )


In [None]:
# Run alifestd_mark_leaves over both frames; the "is_leaf" column it provides
# is used by the cells below.
phylo1_df, phylo2_df = map(alifestd_mark_leaves, (phylo1_df, phylo2_df))


In [None]:
# Tag every row with its regime name; used as the hue in the density plots.
for phylo_df, regime_name in (
    (phylo1_df, "purifying"),
    (phylo2_df, "adaptive"),
):
    phylo_df["regime"] = regime_name


In [None]:
# Leaf-only subsets of each phylogeny (rows where "is_leaf" is true).
leaves1_df = phylo1_df.loc[phylo1_df["is_leaf"]]
leaves2_df = phylo2_df.loc[phylo2_df["is_leaf"]]


## Origin Time Stats


In [None]:
# Mean leaf origin time per regime, plus the purifying/adaptive ratio.
v1 = float(leaves1_df["origin_time"].mean())
v2 = float(leaves2_df["origin_time"].mean())
v1, v2, v1 / v2


In [None]:
# Median leaf origin time per regime, plus the purifying/adaptive ratio.
v1 = float(leaves1_df["origin_time"].median())
v2 = float(leaves2_df["origin_time"].median())
v1, v2, v1 / v2


In [None]:
# Maximum leaf origin time per regime, plus the purifying/adaptive ratio.
v1 = float(leaves1_df["origin_time"].max())
v2 = float(leaves2_df["origin_time"].max())
v1, v2, v1 / v2


In [None]:
# Minimum leaf origin time per regime, plus the purifying/adaptive ratio.
v1 = float(leaves1_df["origin_time"].min())
v2 = float(leaves2_df["origin_time"].min())
v1, v2, v1 / v2


In [None]:
# Mann-Whitney U test comparing leaf origin times between the two regimes;
# missing values are dropped from each sample first.
scipy_stats.mannwhitneyu(
    x=leaves1_df["origin_time"].dropna(),
    y=leaves2_df["origin_time"].dropna(),
)


In [None]:
# Kernel density of leaf origin times per regime, rendered twice: once with a
# legend and once without.
for legend in [True, False]:
    with tp.teed(
        sns.kdeplot,
        data=pd.concat([leaves1_df, leaves2_df], ignore_index=True),
        x="origin_time",
        alpha=0.5,
        common_norm=False,  # normalize each regime's density separately
        hue="regime",
        hue_order=["purifying", "adaptive"],
        fill=True,
        legend=legend,
        linewidth=0,
        palette=["chocolate", "blue"],
        # Record the legend toggle in the output attributes so the two
        # variants get distinct output files, and save into the notebook's
        # teeplot subdirectory like the other teed figures in this notebook.
        teeplot_outattrs={"legend": str(legend).lower()},
        teeplot_subdir=teeplot_subdir,
    ) as teed:
        if legend:
            # Place the legend above the axes, frameless and untitled.
            sns.move_legend(
                teed,
                "lower center",
                bbox_to_anchor=(.5, 1),
                columnspacing=0.8,
                frameon=False,
                handletextpad=0.3,
                ncol=3,
                title=None,
            )
        # Density magnitudes are not shown: drop the y axis decorations.
        sns.despine(ax=teed, left=True)
        teed.set_yticks([])
        teed.figure.set_size_inches(2.5, 0.5)
        teed.set_xlabel("Leaf Generation")
        teed.set_ylabel("")


## Origin Time Delta Stats


In [None]:
# Short alias column for origin_time_delta, used by the summary cells below.
for phylo_df in (phylo1_df, phylo2_df):
    phylo_df["ot_delta"] = phylo_df["origin_time_delta"]


In [None]:
# Mean branch length (ot_delta) per regime, plus the purifying/adaptive ratio.
v1 = float(phylo1_df["ot_delta"].mean())
v2 = float(phylo2_df["ot_delta"].mean())
v1, v2, v1 / v2


In [None]:
# Median branch length (ot_delta) per regime, plus the purifying/adaptive
# ratio.
v1 = float(phylo1_df["ot_delta"].median())
v2 = float(phylo2_df["ot_delta"].median())
v1, v2, v1 / v2


In [None]:
# Maximum branch length (ot_delta) per regime, plus the purifying/adaptive
# ratio.
v1 = float(phylo1_df["ot_delta"].max())
v2 = float(phylo2_df["ot_delta"].max())
v1, v2, v1 / v2


In [None]:
# Minimum branch length (ot_delta) per regime. Unlike the sibling summary
# cells, no ratio is displayed here.
v1 = float(phylo1_df["ot_delta"].min())
v2 = float(phylo2_df["ot_delta"].min())
v1, v2


In [None]:
# Mann-Whitney U test comparing branch lengths between the two regimes;
# missing values are dropped from each sample first.
scipy_stats.mannwhitneyu(
    x=phylo1_df["origin_time_delta"].dropna(),
    y=phylo2_df["origin_time_delta"].dropna(),
)


In [None]:
# Kernel density of branch lengths per regime, on raw and log scales, each
# rendered with and without a legend.
for x, legend in it.product(
    ["origin_time_delta", "log_origin_time_delta"],
    [True, False],
):
    with tp.teed(
        sns.kdeplot,
        data=pd.concat([phylo1_df, phylo2_df], ignore_index=True),
        x=x,
        alpha=0.5,
        bw_adjust=2.0,
        common_norm=False,  # normalize each regime's density separately
        hue="regime",
        hue_order=["purifying", "adaptive"],
        fill=True,
        legend=legend,
        linewidth=0,
        palette=["chocolate", "blue"],
        # Record the legend toggle in the output attributes so the legend and
        # no-legend variants get distinct output files, and save into the
        # notebook's teeplot subdirectory like the other teed figures here.
        teeplot_outattrs={"legend": str(legend).lower()},
        teeplot_subdir=teeplot_subdir,
    ) as teed:
        if legend:
            # Place the legend above the axes, frameless and untitled.
            sns.move_legend(
                teed,
                "lower center",
                bbox_to_anchor=(.5, 1),
                columnspacing=0.8,
                frameon=False,
                handletextpad=0.3,
                ncol=3,
                title=None,
            )
        # Density magnitudes are not shown: drop the y axis decorations.
        sns.despine(ax=teed, left=True)
        teed.set_yticks([])
        teed.figure.set_size_inches(2.5, 0.5)
        # Human-readable axis label for whichever column is being plotted.
        teed.set_xlabel(
            {
                "origin_time_delta": "Branch Length",
                "log_origin_time_delta": "Log Branch Length",
            }[x],
        )
        teed.set_ylabel("")


## Create Color Map


In [None]:
def get_bcyr_color(x: float, y: float) -> tuple[float, float, float]:
    """Map a point (x, y) to an RGB triple ``(y, x, 1 - y)``.

    On the unit square the corners come out as blue at (0, 0), cyan at
    (1, 0), red at (0, 1), and yellow at (1, 1).
    """
    red = y
    green = x
    blue = 1.0 - y
    return (red, green, blue)


def get_bcyr_hex(x: float, y: float) -> str:
    """Return the color from ``get_bcyr_color(x, y)`` as a hex string."""
    rgb = get_bcyr_color(x, y)
    return mpl.colors.to_hex(rgb)


In [None]:
# Attach a per-row RGB tuple computed from the scaled coordinates (x_, y_).
for phylo_df in (phylo1_df, phylo2_df):
    phylo_df["bcyr"] = phylo_df.apply(
        lambda row: get_bcyr_color(row["x_"], row["y_"]),
        axis="columns",
    )

phylo1_df["bcyr"], phylo2_df["bcyr"]


In [None]:
# Horizontal strips of the bcyr colormap: x sweeps 0..1 at a fixed y value.
for y_val in (0, 0.5, 1.0):
    fig, ax = plt.subplots(figsize=(5, 1))

    x_vals = np.linspace(0.0, 1.0, num=256)
    colors = [get_bcyr_color(x, y_val) for x in x_vals]

    # Single-row image; its values are mapped through the listed colors.
    cb_data = x_vals[np.newaxis, :]
    with tp.teed(
        plt.imshow,
        cb_data,
        aspect="auto",
        cmap=mpl.colors.ListedColormap(colors),
        extent=(0, 1, 0, 1),
        teeplot_outattrs=dict(cmap="bcyr", what="xstrip", val=y_val),
        teeplot_subdir=teeplot_subdir,
    ) as im:
        # Bare strip: fixed size, no ticks, labels, or spines.
        ax.figure.set_size_inches(4, 1)
        ax.set(yticks=[], xticks=[], xlabel="")
        sns.despine(ax=ax, left=True, bottom=True)

        plt.tight_layout()


In [None]:
# Strips of the bcyr colormap along y: y sweeps 0..1 at a fixed x value.
for x_val in (0, 0.5, 1.0):
    fig, ax = plt.subplots(figsize=(5, 1))

    y_vals = np.linspace(0.0, 1.0, num=256)
    colors = [get_bcyr_color(x_val, y) for y in y_vals]

    # Single-row image; its values are mapped through the listed colors.
    cb_data = y_vals[np.newaxis, :]
    with tp.teed(
        plt.imshow,
        cb_data,
        aspect="auto",
        cmap=mpl.colors.ListedColormap(colors),
        extent=(0, 1, 0, 1),
        teeplot_outattrs=dict(cmap="bcyr", what="ystrip", val=x_val),
        teeplot_subdir=teeplot_subdir,
    ) as im:
        # Bare strip: fixed size, no ticks, labels, or spines.
        ax.figure.set_size_inches(4, 1)
        ax.set(yticks=[], xticks=[], xlabel="")
        sns.despine(ax=ax, left=True, bottom=True)

        plt.tight_layout()


In [None]:
# Full 2D swatch of the bcyr colormap, saved once per aspect ratio.
for aspect in ((1, 1), (755, 1170)):
    # 256x256 grid of RGB triples sampling the colormap over the unit square.
    swatch = [
        [get_bcyr_color(x / 255, y / 255) for y in range(256)]
        for x in range(256)
    ]
    with tp.teed(
        plt.imshow,
        X=swatch,
        teeplot_outattrs={
            "aspect": "x".join(map(str, aspect)),
            "cmap": "bcyr",
            "what": "area",
        },
        teeplot_subdir=teeplot_subdir,
    ) as teed:
        area_ax = plt.gca()
        area_ax.xaxis.set_visible(False)
        area_ax.yaxis.set_visible(False)
        sns.despine(ax=area_ax, left=True, bottom=True)
        # The axes aspect is the first entry divided by the second.
        numerator, denominator = aspect
        area_ax.set_aspect(aspect=numerator / denominator)
        plt.gcf().tight_layout()


## Plotting Helpers


In [None]:
# taxon_label -> RGB tuple lookups, one per phylogeny.
bcyr_lookup1 = {
    label: color
    for label, color in zip(phylo1_df["taxon_label"], phylo1_df["bcyr"])
}
bcyr_lookup2 = {
    label: color
    for label, color in zip(phylo2_df["taxon_label"], phylo2_df["bcyr"])
}


In [None]:
def overlay_tip_dots(
    ax: plt.Axes,
    tree_result: ipx.plotting.TreeArtist,
    bcyr_lookup: dict,
    radial: bool = False,
    size: float = 4,
):
    """Scatter colored dots on ``ax`` over the nodes of a drawn tree.

    A node gets a dot only if it has a taxon, the string form of its taxon
    label is a key of ``bcyr_lookup``, and none of the first three channels
    of the looked-up color is NaN.

    Parameters
    ----------
    ax
        Axes that receives the scatter.
    tree_result
        Tree artist; must provide ``get_layout()`` and ``get_nodes()``.
    bcyr_lookup
        Mapping from taxon label (str) to a color with at least three
        channels.
    radial
        If true, dots are placed at the node artist's offsets; otherwise at
        the coordinates taken from the layout.
    size
        Marker size forwarded to ``ax.scatter`` as ``s``.
    """
    layout = tree_result.get_layout()
    layout_items = layout.T.items()
    node_offsets = tree_result.get_nodes().get_offsets()

    xs, ys, colors = [], [], []
    for (node, (layout_x, layout_y)), (offset_x, offset_y) in zip(
        layout_items,
        node_offsets,
    ):
        if node.taxon is None:
            continue

        label = str(node.taxon.label)
        if label not in bcyr_lookup:
            continue

        color = bcyr_lookup[label]
        # NaN is the only value unequal to itself; skip colors that have one.
        if any(color[i] != color[i] for i in range(3)):
            continue

        if radial:
            dot_x, dot_y = offset_x, offset_y
        else:
            dot_x, dot_y = layout_x, layout_y

        xs.append(dot_x)
        ys.append(dot_y)
        colors.append(color)

    # High zorder keeps the dots above the tree; no marker outlines.
    ax.scatter(xs, ys, c=colors, s=size, zorder=10, edgecolors="none")


## Whole-tree Sample


In [None]:
# Downsample each phylogeny with alifestd_downsample_tips_asexual (250,
# seed=1) and convert the result to a dendropy tree.
tree1, tree2 = (
    apc.RosettaTree(
        alifestd_downsample_tips_asexual(sampled_from, 250, seed=1),
    ).as_dendropy
    for sampled_from in (phylo1_df, phylo2_df)
)


In [None]:
# Side-by-side vertical renderings of the whole-tree downsamples, with the
# nodes that have a color lookup entry overlaid as colored dots.
tree1.ladderize()
tree2.ladderize()

with tp.teed(
    plt.subplots,
    1,
    2,
    figsize=(4, 2),
    teeplot_outattrs={"layout": "vertical", "sample": "down"},
    teeplot_subdir=teeplot_subdir,
) as teed:
    fig, (ax1, ax2) = teed
    tr1 = ipx.plotting.tree(
        tree1,
        ax=ax1,
        layout="vertical",
        edge_linewidth=1.5,
        margins=0.0,
    )
    tr2 = ipx.plotting.tree(
        tree2,
        ax=ax2,
        layout="vertical",
        edge_linewidth=1.5,
        margins=0.0,
    )
    overlay_tip_dots(ax1, tr1, bcyr_lookup1, size=6)
    overlay_tip_dots(ax2, tr2, bcyr_lookup2, size=6)
    # Do not call plt.show() here: teeplot writes the figure when this block
    # exits, and showing first can close the figure under the inline backend,
    # leaving nothing to save. Finish with tight_layout like the other
    # subplot cells in this notebook.
    fig.tight_layout()


In [None]:
# Side-by-side radial renderings of the whole-tree downsamples, with the
# nodes that have a color lookup entry overlaid as colored dots.
tree1.ladderize()
tree2.ladderize()

with tp.teed(
    plt.subplots,
    1,
    2,
    figsize=(6, 1.5),
    gridspec_kw={
        "wspace": 0.0,
        "hspace": 0.0,
    },
    # Tagged "radial" (not "vertical") so this figure does not reuse the
    # output attributes of the vertical downsample figure.
    teeplot_outattrs={"layout": "radial", "sample": "down"},
    teeplot_subdir=teeplot_subdir,
) as teed:
    fig, (ax1, ax2) = teed

    # iplotx style: the stock "tree" style plus a layout override with
    # start -180 and span 180.
    style = [
        "tree",
        {
            "layout": {
                "start": -180,
                "span": 180,
            },
        },
    ]
    tr1 = ipx.plotting.tree(
        tree1,
        ax=ax1,
        layout="radial",
        edge_linewidth=0.5,
        margins=-0.02,
        style=style,
    )
    tr2 = ipx.plotting.tree(
        tree2,
        ax=ax2,
        angle=270,
        layout="radial",
        edge_linewidth=0.5,
        margins=-0.02,
        style=style,
    )
    # radial=True places dots at the drawn node offsets instead of the raw
    # layout coordinates.
    overlay_tip_dots(ax1, tr1, bcyr_lookup1, radial=True, size=4)
    overlay_tip_dots(ax2, tr2, bcyr_lookup2, radial=True, size=4)
    fig.tight_layout()


## Clade Sample


In [None]:
# Clade-based downsample of each phylogeny (2_000, seed=1); each result is
# converted to a dendropy tree and ladderized.
df1 = alifestd_downsample_tips_clade_asexual(phylo1_df, 2_000, seed=1)
tree1 = apc.RosettaTree(df1).as_dendropy
tree1.ladderize()

df2 = alifestd_downsample_tips_clade_asexual(phylo2_df, 2_000, seed=1)
tree2 = apc.RosettaTree(df2).as_dendropy
tree2.ladderize()


In [None]:
# 2x2 figure for the clade samples: trees on the top row, the matching
# sampling sites (x, y) on the bottom row, colored with the same colormap.
with tp.teed(
    plt.subplots,
    2,
    2,
    figsize=(6, 4),
    gridspec_kw=dict(wspace=0.0, hspace=0.0),
    teeplot_outattrs={"layout": "vertical", "sample": "clade"},
    teeplot_subdir=teeplot_subdir,
) as teed:
    fig, ((ax1, ax2), (ax3, ax4)) = teed

    # Top row: one tree per regime, then the colored dots.
    tree_kwargs = dict(layout="vertical", edge_linewidth=0.5, margins=-0.02)
    tr1 = ipx.plotting.tree(tree1, ax=ax1, **tree_kwargs)
    tr2 = ipx.plotting.tree(tree2, ax=ax2, **tree_kwargs)
    overlay_tip_dots(ax1, tr1, bcyr_lookup1, size=5)
    overlay_tip_dots(ax2, tr2, bcyr_lookup2, size=5)

    # Bottom row: rows that have scaled coordinates, drawn at (x, y).
    for site_ax, sample_df in ((ax3, df1), (ax4, df2)):
        located = sample_df.dropna(subset=["x_", "y_"])
        sns.scatterplot(
            data=located,
            x="x",
            y="y",
            color=[
                get_bcyr_hex(scaled_x, scaled_y)
                for scaled_x, scaled_y in zip(located["x_"], located["y_"])
            ],
            ax=site_ax,
            legend=False,
            clip_on=False,
            s=2,
        )

    ax1.set_ylabel("Phylogeny")
    # Regime names go on the bottom axes; ticks are removed there.
    for site_ax, xlabel, ylabel in (
        (ax3, "Purifying Regime", "Sampling Site on WSE"),
        (ax4, "Adaptive Regime", ""),
    ):
        site_ax.set_xlabel(xlabel)
        site_ax.set_xticks([])
        site_ax.set_ylabel(ylabel)
        site_ax.set_yticks([])

    fig.tight_layout()


In [None]:
# One standalone vertical tree figure per regime for the clade samples.
for regime, tree, lookup in (
    ("purifying", tree1, bcyr_lookup1),
    ("adaptive", tree2, bcyr_lookup2),
):
    with tp.teed(
        ipx.plotting.tree,
        tree,
        layout="vertical",
        edge_linewidth=0.5,
        margins=-0.02,
        teeplot_outattrs=dict(layout="vertical", regime=regime, sample="clade"),
        teeplot_subdir=teeplot_subdir,
    ) as teed:
        overlay_tip_dots(plt.gca(), teed, lookup, size=5)
        # Figure size in inches: 1170 x 755, each divided by 400.
        plt.gcf().set_size_inches(1170 / 400, 755 / 400)


In [None]:
# One standalone sampling-site scatter per regime for the clade samples,
# drawn over a major/minor grid with all tick decorations removed.
for regime, data in (("purifying", df1), ("adaptive", df2)):
    located = data.dropna(subset=["x_", "y_"])
    with tp.teed(
        sns.scatterplot,
        data=located,
        x="x",
        y="y",
        color=[
            get_bcyr_hex(scaled_x, scaled_y)
            for scaled_x, scaled_y in zip(located["x_"], located["y_"])
        ],
        legend=False,
        linewidth=0,
        edgecolor=None,
        s=100,
        alpha=0.3,
        clip_on=False,
        zorder=-100,
        teeplot_outattrs={"regime": regime, "sample": "clade"},
        teeplot_subdir=teeplot_subdir,
    ) as teed:
        # Major grid lines every 100 units, minor every 25, on both axes.
        for axis in (teed.xaxis, teed.yaxis):
            axis.set_major_locator(plt.MultipleLocator(100))
            axis.set_minor_locator(plt.MultipleLocator(25))
        teed.grid(visible=True, which="major", color="#222222", lw=0.4)
        teed.grid(
            visible=True, which="minor", color="gray", ls="--", linewidth=0.2
        )

        # Keep the grid but hide tick marks, tick labels, and axis labels.
        teed.tick_params(axis="both", which="both", length=0)
        teed.set_xticklabels([])
        teed.set_yticklabels([])
        teed.set_xlabel("")
        teed.set_ylabel("")

        # Axis limits come from the nRow / nCol columns.
        teed.set_xlim(0, data["nRow"].max())
        teed.set_ylim(0, data["nCol"].max())
        teed.set_aspect("equal")
        sns.despine(ax=teed, left=True, bottom=True)


In [None]:
# Side-by-side radial renderings of the clade-sample trees.
with tp.teed(
    plt.subplots,
    1,
    2,
    figsize=(4, 1.5),
    gridspec_kw=dict(wspace=0.0, hspace=0.0),
    teeplot_outattrs={"layout": "radial", "sample": "clade"},
    teeplot_subdir=teeplot_subdir,
) as teed:
    fig, (ax1, ax2) = teed

    # iplotx style: the stock "tree" style plus a layout override with
    # start -180 and span 180.
    style = ["tree", {"layout": {"start": -180, "span": 180}}]
    radial_kwargs = dict(
        layout="radial",
        edge_linewidth=0.5,
        margins=-0.02,
        style=style,
    )
    tr1 = ipx.plotting.tree(tree1, ax=ax1, **radial_kwargs)
    # Only the second tree is given angle=270.
    tr2 = ipx.plotting.tree(tree2, ax=ax2, angle=270, **radial_kwargs)

    # radial=True places dots at the drawn node offsets.
    overlay_tip_dots(ax1, tr1, bcyr_lookup1, radial=True, size=4)
    overlay_tip_dots(ax2, tr2, bcyr_lookup2, radial=True, size=4)

    ax1.set_xlabel("Purifying Regime")
    ax2.xaxis.set_inverted(True)
    ax2.set_xlabel("Adaptive Regime")
    fig.tight_layout()


## Canopy Sample


In [None]:
# Flag the n leaves with the largest origin_time in each phylogeny as
# "extant"; every other row stays False. Ties are broken by row order
# (rank method "first").
n = 4_000
for phylo_df in (phylo1_df, phylo2_df):
    phylo_df["extant"] = False
    leaf_mask = phylo_df["is_leaf"]
    leaf_rank = phylo_df.loc[leaf_mask, "origin_time"].rank(
        ascending=False,
        method="first",
    )
    phylo_df.loc[leaf_mask, "extant"] = leaf_rank <= n

phylo1_df["extant"].sum(), phylo2_df["extant"].sum()


In [None]:
# Build dendropy trees from each phylogeny after running
# alifestd_prune_extinct_lineages_asexual on it, then ladderize both.
# Unifurcations are not collapsed here.
tree1, tree2 = (
    apc.RosettaTree(
        alifestd_prune_extinct_lineages_asexual(pruned_from),
    ).as_dendropy
    for pruned_from in (phylo1_df, phylo2_df)
)

tree1.ladderize()
tree2.ladderize()


In [None]:
# Side-by-side vertical renderings of the canopy-sample trees.
with tp.teed(
    plt.subplots,
    1,
    2,
    figsize=(6, 2),
    gridspec_kw=dict(wspace=0.0, hspace=0.0),
    teeplot_outattrs={"layout": "vertical", "sample": "canopy"},
    teeplot_subdir=teeplot_subdir,
) as teed:
    fig, (ax1, ax2) = teed

    tree_kwargs = dict(layout="vertical", edge_linewidth=0.5, margins=-0.02)
    tr1 = ipx.plotting.tree(tree1, ax=ax1, **tree_kwargs)
    tr2 = ipx.plotting.tree(tree2, ax=ax2, **tree_kwargs)

    overlay_tip_dots(ax1, tr1, bcyr_lookup1, size=2)
    overlay_tip_dots(ax2, tr2, bcyr_lookup2, size=2)
    fig.tight_layout()


In [None]:
# Side-by-side radial renderings of the canopy-sample trees, with the nodes
# that have a color lookup entry overlaid as colored dots.
with tp.teed(
    plt.subplots,
    1,
    2,
    figsize=(6, 1.5),
    gridspec_kw={
        "wspace": 0.0,
        "hspace": 0.0,
    },
    # Tagged "radial" (not "vertical") so this figure does not reuse the
    # output attributes of the vertical canopy figure.
    teeplot_outattrs={"layout": "radial", "sample": "canopy"},
    teeplot_subdir=teeplot_subdir,
) as teed:
    fig, (ax1, ax2) = teed

    # iplotx style: the stock "tree" style plus a layout override with
    # start -180 and span 180.
    style = [
        "tree",
        {
            "layout": {
                "start": -180,
                "span": 180,
            },
        },
    ]
    tr1 = ipx.plotting.tree(
        tree1,
        ax=ax1,
        layout="radial",
        edge_linewidth=0.5,
        margins=-0.02,
        style=style,
    )
    tr2 = ipx.plotting.tree(
        tree2,
        ax=ax2,
        angle=270,
        layout="radial",
        edge_linewidth=0.5,
        margins=-0.02,
        style=style,
    )
    # radial=True is required for radial layouts: it places dots at the
    # drawn node offsets rather than the raw layout coordinates, matching
    # the other radial figures in this notebook.
    overlay_tip_dots(ax1, tr1, bcyr_lookup1, radial=True, size=2)
    overlay_tip_dots(ax2, tr2, bcyr_lookup2, radial=True, size=2)
    fig.tight_layout()
