# SLAVseq model report

In [None]:
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score
from pyslavseq.plotting import germline_dist
from scipy.stats import ttest_ind

## Read Data

In [None]:
# Donor-level metadata table (tab-separated). donor_id is cast to str,
# presumably so the merge key has the same dtype as in `data` — confirm.
meta = pd.read_csv(snakemake.config["donors"], sep="\t").astype(  # type: ignore
    {"donor_id": str}
)

# Load the input table, attach the donor metadata to every row, and store
# the Cluster column as a categorical.
data = (
    pd.read_parquet(snakemake.input.data)  # type: ignore
    .merge(meta, on="donor_id")
    .astype({"Cluster": "category"})
)

## Distance to germline

In [None]:
# Call the germline_dist helper (imported from pyslavseq.plotting) on the
# merged table, passing the "test_proba" column as y.
# NOTE(review): the helper is defined outside this notebook; presumably it
# plots test_proba against the distance to the nearest germline insertion —
# confirm against pyslavseq.plotting.
g = germline_dist(data, y="test_proba")

## Inspect potential clonal insertions

In [None]:
# Find putative clonal insertions: for each probability threshold, keep the
# non-KNRGL rows scoring above it and annotate every row with per-Cluster
# counts computed from that filtered subset.
per_threshold = []
for p in [0.5, 0.6, 0.7, 0.8, 0.9]:
    # The existing n_cells / n_donors columns are dropped before the
    # recomputed ones are joined back on (presumably to avoid a name clash).
    df = (
        data.query("not KNRGL and test_proba > @p")
        .drop(columns=["n_cells", "n_donors"])
        .reset_index(drop=True)
    )

    by_cluster = df.groupby("Cluster", observed=True)
    cluster_counts = [
        # rows per cluster
        by_cluster.size().rename("n_cells"),
        # distinct donors per cluster
        by_cluster["donor_id"].nunique().rename("n_donors"),
        # distinct tissue_id values per cluster
        by_cluster["tissue_id"].nunique().rename("n_regions"),
    ]
    for counts in cluster_counts:
        df = df.join(counts, on="Cluster")

    df["threshold"] = p
    per_threshold.append(df)

calls = pd.concat(per_threshold)

# 2D histogram of n_cells vs n_donors, one panel per (threshold, n_regions).
g = sns.displot(
    calls,
    x="n_cells",
    y="n_donors",
    col="n_regions",
    row="threshold",
    kind="hist",
)

# set x to log
g.set(xscale="log")

## Putative calls


In [None]:
# Per-cell precision, recall, call count and read total at each threshold.
# The result `calls` has one row per (donor_id, cell_id, threshold).
cell_keys = ["donor_id", "cell_id"]

# None of the following depends on the threshold, so build it once instead
# of once per loop iteration.
by_cell = data.groupby(cell_keys)
non_knrgl_by_cell = data.query("not KNRGL").groupby(cell_keys)
reads_per_cell = by_cell["n_reads"].sum()

calls = []
for p in [0.5, 0.6, 0.7, 0.8, 0.9]:

    # KNRGL is the label and (test_proba > p) the prediction; cells where the
    # score is undefined get NaN rather than 0 (zero_division=np.nan).
    precision_per_cell = by_cell.apply(
        lambda x: precision_score(
            x["KNRGL"], x["test_proba"] > p, zero_division=np.nan
        ),
    )
    recall_per_cell = by_cell.apply(
        lambda x: recall_score(x["KNRGL"], x["test_proba"] > p, zero_division=np.nan),
    )
    # number of non-KNRGL rows above the threshold in each cell
    calls_per_cell = non_knrgl_by_cell.apply(lambda x: (x["test_proba"] > p).sum())

    # join on the (donor_id, cell_id) index; cells missing from one of the
    # series get NaN in that column
    df = pd.concat(
        [precision_per_cell, recall_per_cell, calls_per_cell, reads_per_cell], axis=1
    )
    df.columns = ["precision", "recall", "n_calls", "n_reads"]
    df = df.reset_index()
    df["threshold"] = p
    calls.append(df)

calls = pd.concat(calls).reset_index(drop=True)
# expected = n_calls * precision / recall, per cell and threshold
calls["expected"] = (calls["n_calls"] * calls["precision"]) / calls["recall"]

In [None]:
# insertions per cell
# Grid of boxplots of per-cell n_calls by donor: one column per threshold,
# one row per metadata category used for hue.
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
categories = ["diagnosis", "race", "age"]

fig, axes = plt.subplots(
    len(categories), len(thresholds), figsize=(30, 30), sharex="col", sharey="row"
)
fig.subplots_adjust(wspace=0, hspace=0)
for i, p in enumerate(thresholds):
    # precision / recall over the whole table at this threshold (title only)
    precision = precision_score(data["KNRGL"], data["test_proba"] > p)
    recall = recall_score(data["KNRGL"], data["test_proba"] > p)

    calls_per_cell = calls.query("threshold == @p")[["n_calls", "donor_id", "cell_id"]]
    calls_per_cell = calls_per_cell.merge(meta, on="donor_id")

    for j, cat in enumerate(categories):
        # re-sort by the hue category before drawing each row
        calls_per_cell = calls_per_cell.sort_values(cat)
        sns.boxplot(
            data=calls_per_cell, y="donor_id", x="n_calls", hue=cat, ax=axes[j, i]
        )

    # The title belongs to the top panel of the column and does not depend on
    # the category, so set it once per column rather than once per row.
    axes[0, i].set_title(f"{p}: precision = {precision:.2f}, recall = {recall:.2f}")

$$
E(\text{somatic insertions}) = \frac{\text{ncalls} \times (1 - \text{FDR})}{1 - \text{FNR}}
$$

Equivalently, since $\text{precision} = 1 - \text{FDR}$ and $\text{recall} = 1 - \text{FNR}$:

$$
E(\text{somatic insertions}) = \frac{\text{ncalls} \times \text{precision}}{\text{recall}}
$$

In [None]:
# compute expected insertions per cell
# Grid of boxplots of the per-cell `expected` column by donor: one column per
# threshold, one row per metadata category used for hue.
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
categories = ["diagnosis", "race", "age"]

fig, axes = plt.subplots(
    len(categories), len(thresholds), figsize=(30, 30), sharex=True, sharey="row"
)
fig.subplots_adjust(wspace=0, hspace=0)
for i, p in enumerate(thresholds):
    # precision / recall over the whole table at this threshold (title only)
    precision = precision_score(data["KNRGL"], data["test_proba"] > p)
    recall = recall_score(data["KNRGL"], data["test_proba"] > p)

    expected_per_cell = calls.query("threshold == @p")[
        ["expected", "donor_id", "cell_id"]
    ]
    expected_per_cell = expected_per_cell.merge(meta, on="donor_id")

    for j, cat in enumerate(categories):
        # re-sort by the hue category before drawing each row
        expected_per_cell = expected_per_cell.sort_values(cat)
        sns.boxplot(
            data=expected_per_cell, y="donor_id", x="expected", hue=cat, ax=axes[j, i]
        )

    # The title belongs to the top panel of the column and does not depend on
    # the category, so set it once per column rather than once per row.
    axes[0, i].set_title(f"{p}: precision = {precision:.2f}, recall = {recall:.2f}")

In [None]:
# Average n_calls and expected within each (donor, threshold) pair, then
# attach the donor metadata columns.
mean_per_donor = calls.groupby(["donor_id", "threshold"])[["n_calls", "expected"]].mean()
donor_calls = mean_per_donor.reset_index().merge(meta, on="donor_id")

In [None]:
# Strip plot of per-donor mean n_calls by diagnosis, coloured by race, with
# one panel per threshold.
strip_options = dict(
    x="diagnosis",
    hue="race",
    col="threshold",
    kind="strip",
    alpha=0.5,
    jitter=0.1,
    dodge=True,
)
sns.catplot(data=donor_calls, y="n_calls", **strip_options)

In [None]:
# Strip plot of per-donor mean `expected` by diagnosis, coloured by race,
# with one panel per threshold.
strip_options = dict(
    x="diagnosis",
    hue="race",
    col="threshold",
    kind="strip",
    alpha=0.5,
    jitter=0.1,
    dodge=True,
)
sns.catplot(data=donor_calls, y="expected", **strip_options)

In [None]:
# Two-sample t-tests on the per-donor means, run separately at each
# threshold: CONTROL vs SCZD (diagnosis) and CAUC vs AA (race), each for
# both n_calls and expected.
group_contrasts = [
    ("diagnosis", "CONTROL", "SCZD"),
    ("race", "CAUC", "AA"),
]
# (column, label used in the printed message)
measures = [
    ("n_calls", "ncalls"),
    ("expected", "expected calls"),
]

for t, df in donor_calls.groupby("threshold"):
    for factor, first_level, second_level in group_contrasts:
        in_first = df[factor] == first_level
        in_second = df[factor] == second_level
        for column, label in measures:
            res = ttest_ind(df.loc[in_first, column], df.loc[in_second, column])
            print(f"{label} {factor} test at threshold {t}: {res}")

## Save

In [None]:
import pyranges as pr
from pathlib import Path

# Index the output paths by the name of their parent directory, which is
# what gets compared against donor_id. If several outputs share a parent
# directory name, the first one listed is kept.
bed_path_by_donor = {}
for s in snakemake.output:
    bed_path_by_donor.setdefault(Path(s).parent.name, s)

bed_columns = ["Chromosome", "Start", "End", "Score", "Name"]

# Write one BED file per donor from the rows with test_proba > 0.9, using
# test_proba as Score and cell_id as Name.
# NOTE(review): a donor with no rows above 0.9 never reaches this loop, so
# its output file is not written — confirm that is acceptable downstream.
for d, df in data.query("test_proba > 0.9").groupby("donor_id"):
    target = bed_path_by_donor.get(d)
    if target is None:
        # no output path whose parent directory is named after this donor
        continue
    my_df = df.rename(columns={"test_proba": "Score", "cell_id": "Name"})
    pr.PyRanges(my_df[bed_columns]).to_bed(target)