In [None]:
# Load the `watermark` IPython extension so the `%watermark` magic used later
# in this notebook is available.
%load_ext watermark


In [None]:
import numpy as np
import pandas as pd
from scipy import stats as scipy_stats
import seaborn as sns
from teeplot import teeplot as tp
from tqdm import tqdm

from pylib._seed_global_rngs import seed_global_rngs
from pylib._summarize_sequence_diffs import summarize_sequence_diffs


In [None]:
# Print a watermark report for provenance, after the imports so that `-iv`
# can see them.
# NOTE(review): presumably the flags request date/ISO timestamp, watermark and
# Python versions, machine info, and imported-package versions --- confirm
# against the watermark documentation.
%watermark -diwmuv -iv


In [None]:
# Subdirectory name handed to teeplot (as `teeplot_subdir=`) by every plotting
# cell in this notebook; the bare expression echoes it as the cell output.
teeplot_subdir = "2025-04-30-profile-mut-freqs"
teeplot_subdir


In [None]:
# Fix the seed so the random resampling (`np.random.choice`) later in the
# notebook is repeatable across Restart & Run All.
# NOTE(review): presumably this seeds numpy's global RNG among others ---
# confirm in pylib._seed_global_rngs.
seed_global_rngs(1)


## Get Data


In [None]:
# Load the dataset as parquet directly from OSF; requires network access.
# Later cells read the columns "replicate_uuid", "ancestral_sequence",
# "sequence_diff", and "trt_name" from this frame.
df = pd.read_parquet("https://osf.io/2guwm/download")


In [None]:
# Number of distinct replicates in the dataset (displayed as the cell output).
df["replicate_uuid"].nunique()


In [None]:
# Length of the ancestral sequence. `.item()` only succeeds when exactly one
# distinct length exists, so this doubles as a check that all rows share the
# same ancestral sequence length.
len_ancestor = df["ancestral_sequence"].str.len().unique().item()


In [None]:
# Build long-format records: for every replicate, one "focal" row holding the
# focal mutation's count (0 when absent) plus `num_nonfocal_draws` "non-focal"
# rows resampled (with replacement) from the per-mutation counts, zero-padded
# out to `possible_counts` entries.
# NOTE(review): assumes summarize_sequence_diffs returns parallel sequences of
# integer mutation codes and their counts, with a focal mutation (code >> 8
# == 0) sorted to index 0 when present --- confirm against
# pylib._summarize_sequence_diffs.
num_nonfocal_draws = 1_000
# Loop-invariant size of the zero-padded count vector.
# NOTE(review): presumably three alternative characters per site --- confirm.
possible_counts = len_ancestor * 3

records = []
for uuid, grp in tqdm(df.groupby("replicate_uuid", observed=True)):
    # reset_index already returns a new frame, so no extra copy is needed
    grp = grp.reset_index(drop=True)
    # third return value is not used in this analysis
    unique_mutations, counts, _columns = summarize_sequence_diffs(
        sequence_diffs=grp["sequence_diff"],
    )
    record = {
        "replicate_uuid": uuid,
        # .item() requires exactly one treatment name per replicate
        "trt_name": grp["trt_name"].unique().astype(str).item(),
    }
    # Guard the index-0 lookup: a replicate with no observed mutations has
    # nothing to inspect and therefore no focal mutation.
    has_focal_mutation = int(
        len(unique_mutations) > 0 and (unique_mutations[0] >> 8) == 0
    )
    records.append(
        {
            **record,
            "mutation": "focal",
            "count": counts[0] if has_focal_mutation else 0,
        }
    )
    # Non-focal counts (skipping the focal entry, if any), padded with zeros
    # for the possible mutations that were never observed.
    extended_counts = np.zeros(possible_counts, dtype=int)
    extended_counts[: len(counts) - has_focal_mutation] = counts[
        has_focal_mutation:
    ]
    # Uses numpy's global RNG (seeded earlier in the notebook), one draw batch
    # per replicate, in groupby order.
    records.extend(
        {
            **record,
            "mutation": "non-focal",
            "count": count,
        }
        for count in np.random.choice(extended_counts, num_nonfocal_draws)
    )


In [None]:
# Assemble the long-format table. `count1` is the count shifted up by one, so
# zero counts stay representable on the log-scaled axes used by the plots.
data = pd.DataFrame(records).assign(
    count1=lambda frame: frame["count"] + 1,
)


In [None]:
# Within each treatment, compare focal against non-focal counts with a
# Mann-Whitney U test and print the result.
for trt, trt_rows in data.groupby("trt_name"):
    is_focal = trt_rows["mutation"] == "focal"
    is_nonfocal = trt_rows["mutation"] == "non-focal"
    res = scipy_stats.mannwhitneyu(
        trt_rows.loc[is_focal, "count"],
        trt_rows.loc[is_nonfocal, "count"],
    )
    print(trt, res)


In [None]:
# One facet per treatment: density-normalized histograms (drawn as polygons)
# of focal vs. non-focal counts, on log-scaled axes. teeplot saves the figure
# under `teeplot_subdir`.
with tp.teed(
    sns.displot,
    data=data,
    # --- what is plotted ---
    x="count1",
    col="trt_name",
    hue="mutation",
    # --- histogram behavior ---
    stat="density",
    common_norm=False,  # normalize each hue level on its own
    element="poly",
    log_scale=True,
    # --- appearance ---
    palette=["#FF0000", "#0000FF"],
    fill=True,
    alpha=0.2,
    ls=":",
    height=2.5,
    facet_kws={"sharex": False, "sharey": False},
    # --- teeplot output ---
    teeplot_subdir=teeplot_subdir,
) as facet_grid:
    # Place the legend above the facets, laid out horizontally.
    sns.move_legend(
        facet_grid,
        "lower center",
        title=None,
        frameon=False,
        ncol=2,
        bbox_to_anchor=(0.4, 1),
    )


In [None]:
# For each mutation class (focal / non-focal), compare counts between the
# "Sben/Gneu" and "Sben/Gdel" treatments with a Mann-Whitney U test.
for mut, mut_rows in data.groupby("mutation"):
    in_gneu = mut_rows["trt_name"] == "Sben/Gneu"
    in_gdel = mut_rows["trt_name"] == "Sben/Gdel"
    res = scipy_stats.mannwhitneyu(
        mut_rows.loc[in_gneu, "count"],
        mut_rows.loc[in_gdel, "count"],
    )
    print(mut, res)


In [None]:
# One facet per mutation class: KDEs of counts for each treatment, on
# log-scaled axes. teeplot saves the figure under `teeplot_subdir`.
with tp.teed(
    sns.displot,
    data=data,
    kind="kde",
    # --- what is plotted ---
    x="count1",
    col="mutation",
    hue="trt_name",
    # --- density behavior ---
    common_norm=False,  # normalize each treatment on its own
    log_scale=True,
    # --- appearance ---
    fill=True,
    alpha=0.2,
    ls=":",
    height=2.5,
    facet_kws={"sharex": False, "sharey": False},
    # --- teeplot output ---
    teeplot_subdir=teeplot_subdir,
) as facet_grid:
    # Place the legend above the facets, laid out horizontally.
    sns.move_legend(
        facet_grid,
        "lower center",
        title=None,
        frameon=False,
        ncol=3,
        bbox_to_anchor=(0.4, 1),
    )


In [None]:
# Focal-mutation counts only, with the "Sneu/Gneu" treatment left out: KDE per
# remaining treatment on a linear axis. The exclusion is recorded in the
# output attributes passed to teeplot.
with tp.teed(
    sns.displot,
    data=data.loc[
        (data["trt_name"] != "Sneu/Gneu") & (data["mutation"] == "focal")
    ],
    kind="kde",
    # --- what is plotted ---
    x="count1",
    col="mutation",
    hue="trt_name",
    # --- density behavior ---
    common_norm=False,  # normalize each treatment on its own
    # --- appearance ---
    fill=True,
    alpha=0.2,
    ls=":",
    height=2.5,
    aspect=1.5,
    facet_kws={"sharex": False, "sharey": False},
    # --- teeplot output ---
    teeplot_outattrs={"excl": "Sneu-Gneu"},
    teeplot_subdir=teeplot_subdir,
) as facet_grid:
    # Place the legend above the facet, laid out horizontally.
    sns.move_legend(
        facet_grid,
        "lower center",
        title=None,
        frameon=False,
        ncol=2,
        bbox_to_anchor=(0.4, 1),
    )
