In [None]:
from pathlib import Path
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np
import pandas as pd
import pyranges as pr
import seaborn as sns
import matplotlib.pyplot as plt
from pyslavseq.preprocessing import collate_labels

# Canonical ordering of peak labels: passed as `hue_order` to the seaborn plots
# in this notebook and used to order the per-label summary printout.
HUE_ORDER = ["KNRGL", "OTHER", "KRGL"]

In [None]:
# Donor metadata table (tab-separated). donor_id is forced to str so that it
# compares equal to the directory-derived donor_id used as the merge key later.
meta = pd.read_csv(
    snakemake.config["donors"],  # type: ignore
    sep="\t",
    dtype={"donor_id": str},
)

In [None]:
# Load per-donor bulk peaks (parquet) and WGS calls (BED).
# Both input lists are sorted and then paired positionally.
# NOTE(review): this presumes sorting puts both lists in the same donor order;
# confirm against the Snakemake rule that produces these inputs.
bulk = sorted(snakemake.input.bulk)  # type: ignore
megane = sorted(snakemake.input.megane)  # type: ignore
if len(bulk) != len(megane):
    # zip() would otherwise silently drop the unpaired files
    raise ValueError(
        f"Expected one megane file per bulk file, "
        f"got {len(bulk)} bulk and {len(megane)} megane files"
    )

bdata, mdata = [], []
for b, m in zip(bulk, megane):  # type: ignore
    # donor ID is taken from the bulk file's parent directory name
    donor_id = Path(b).parent.name
    bdf = pd.read_parquet(b)
    mdf = pr.read_bed(m)
    # annotate each WGS call with the number of bulk peaks overlapping it
    mdf = mdf.count_overlaps(pr.PyRanges(bdf), overlap_col="bulk peak").df
    bdf["donor_id"] = donor_id
    mdf["donor_id"] = donor_id
    bdata.append(bdf)
    mdata.append(mdf)

# inner merge: only donors present in `meta` are kept
bdata = pd.concat(bdata).merge(meta, on="donor_id")
print(f"Loaded {len(bdata)} peaks from {bdata['donor_id'].nunique()} donors")
# per-donor peak counts, computed once and reused for mean and SD
peaks_per_donor = bdata.groupby("donor_id").size()
avg_peaks = peaks_per_donor.mean()
sd_peaks = peaks_per_donor.std()
print(f"{int(avg_peaks)} ± {int(sd_peaks)} peaks per donor")

# Assign a label to every peak and report per-label peak counts per donor.
print("Labelling...")
# the column holding the positive (KNRGL) label is chosen by the workflow params
bdata["KNRGL"] = bdata[snakemake.params.pos_label]  # type: ignore
bdata["label"] = bdata.apply(collate_labels, axis=1)

# peak counts per (label, donor), then grouped by label to summarise across donors
counts_by_label = bdata.groupby(["label", "donor_id"]).size().groupby("label")
avg_peaks = counts_by_label.mean()
sd_peaks = counts_by_label.std()
for label_name in HUE_ORDER:
    mean_n = int(avg_peaks[label_name])
    sd_n = int(sd_peaks[label_name])
    print(f"{mean_n} ± {sd_n} {label_name} peaks per donor")

# Combine the per-donor WGS call tables, attach donor metadata, and turn the
# overlap count into a covered / not-covered flag.
mdata = (
    pd.concat(mdata)
    .merge(meta, on="donor_id")
    .assign(**{"bulk peak": lambda calls: calls["bulk peak"].astype(bool)})
)
print(f"Loaded {len(mdata)} WGS calls from {mdata['donor_id'].nunique()} donors")

# per-donor call counts, summarised as mean and SD across donors
calls_per_donor = mdata.groupby("donor_id").size()
avg_wgs = calls_per_donor.mean()
sd_wgs = calls_per_donor.std()
print(f"{int(avg_wgs)} ± {int(sd_wgs)} WGS calls per donor")

def _cluster_and_summarise(calls, group_col):
    """Cluster intervals across donors and build the two plotting summaries.

    Parameters
    ----------
    calls : pandas.DataFrame
        Interval table accepted by ``pr.PyRanges``; must also contain
        ``donor_id``, ``race`` and ``group_col`` columns.
    group_col : str
        Column by which both summaries are split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(clustered, ndonors_call, ncalls_donor)`` where ``clustered`` is
        ``calls`` with the ``Cluster`` column produced by ``cluster()``,
        ``ndonors_call`` has one row per (Cluster, group_col) with the number
        of distinct donors (``ndonors``), and ``ncalls_donor`` has one row per
        (donor_id, group_col, race) with the row count (``ncalls``), sorted
        by race.
    """
    clustered = pr.PyRanges(calls).cluster().df
    ndonors_call = (
        clustered.groupby(["Cluster", group_col], observed=True)["donor_id"]
        .nunique()
        .reset_index(name="ndonors")
    )
    ncalls_donor = (
        clustered.groupby(["donor_id", group_col, "race"])
        .size()
        .reset_index(name="ncalls")
        .sort_values("race")
    )
    return clustered, ndonors_call, ncalls_donor


# WGS calls, split by whether a bulk peak covers them
mdata, m_ndonors_call, m_ncalls_donor = _cluster_and_summarise(mdata, "bulk peak")

# bulk peaks, split by label
bdata, b_ndonors_call, b_ncalls_donor = _cluster_and_summarise(bdata, "label")

In [None]:
# TODO sort donors by race
# 2x2 overview: top row = WGS calls, bottom row = bulk peaks;
# left column = counts per donor, right column = ECDF of donors per cluster.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 14))
(ax1, ax2), (ax3, ax4) = axes

# --- WGS calls ---
sns.barplot(data=m_ncalls_donor, y="donor_id", x="ncalls", hue="bulk peak", ax=ax1)
ax1.set(xlabel="LINE1 insertions detected from WGS")
sns.ecdfplot(data=m_ndonors_call, x="ndonors", hue="bulk peak", ax=ax2)
ax2.set(ylabel="LINE1 insertions detected from WGS", xlabel="# donors")

# retitle legends
wgs_legend_title = "Covered by Bulk SLAVseq peak"
for wgs_ax in (ax1, ax2):
    wgs_ax.legend_.set_title(wgs_legend_title)

# --- bulk peaks ---
sns.barplot(
    data=b_ncalls_donor,
    y="donor_id",
    x="ncalls",
    hue="label",
    hue_order=HUE_ORDER,
    ax=ax3,
)
ax3.set(xlabel="Bulk SLAVseq peaks", xscale="log", xlim=(1, None))
sns.ecdfplot(
    data=b_ndonors_call,
    x="ndonors",
    hue="label",
    hue_order=HUE_ORDER,
    ax=ax4,
)
ax4.set(ylabel="Bulk SLAVseq peaks", xlabel="# donors")

plt.tight_layout()

In [None]:
# first filter: keep only peaks with max_mapq >= MIN_MAPQ and report how many
# peaks of each label were removed
MIN_MAPQ = 30
total_peaks = bdata.groupby("label").size()
bdata = bdata[bdata["max_mapq"] >= MIN_MAPQ].reset_index(drop=True)
# Reindex to the pre-filter labels so that a label that was removed entirely
# counts as 0 kept peaks instead of producing NaN in the subtraction.
kept_peaks = bdata.groupby("label").size().reindex(total_peaks.index, fill_value=0)
rm_peaks = total_peaks - kept_peaks
print(f"Filtered to {len(bdata)} peaks with MAPQ >= {MIN_MAPQ}")
# iterate over the pre-filter labels so fully removed labels are still reported
for label_name in total_peaks.index:
    print(f"Removed {rm_peaks[label_name]}/{total_peaks[label_name]} {label_name} peaks")

In [None]:
# look for clonal insertions labelled "OTHER"
# Cluster is made categorical so that, with observed=True, only clusters that
# actually contain OTHER peaks are summarised.
other = bdata.query("label == 'OTHER'").assign(
    Cluster=lambda peaks: peaks["Cluster"].astype("category")
)
by_cluster = other.groupby("Cluster", observed=True)
# one row per cluster: distinct donors, mean reads, mean unique 5' ends
plot_df = pd.DataFrame(
    {
        "n_donors": by_cluster["donor_id"].nunique(),
        "avg_reads": by_cluster["n_reads"].mean(),
        "avg_n_unique_5end": by_cluster["n_unique_5end"].mean(),
    }
)

fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(16, 5))
sns.ecdfplot(data=plot_df, x="n_donors", stat="count", ax=ax1)

# the two scatter panels share everything except the y variable
scatter_kws = dict(data=plot_df, x="n_donors", alpha=0.5)
sns.scatterplot(y="avg_reads", ax=ax2, **scatter_kws)
ax2.set_yscale("log")
# dashed reference line at 30 average reads
ax2.axhline(30, color="red", linestyle="--")
sns.scatterplot(y="avg_n_unique_5end", ax=ax3, **scatter_kws)
ax3.set_yscale("log")

In [None]:
# ECDF of reads per peak, coloured by label, one facet per donor (5 per row)
g = sns.displot(
    data=bdata,
    kind="ecdf",
    x="n_reads",
    hue="label",
    hue_order=HUE_ORDER,
    col="donor_id",
    col_wrap=5,
)
# log-scaled x axis, restricted to 1..10,000 reads
g.set(xscale="log", xlim=(1, 1e4))

## PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# Feature columns used for the PCA come from the workflow config.
features = snakemake.config["features"]  # type: ignore


# Standardise the features, then project onto the first two principal components.
scaler = StandardScaler()
pca = PCA(n_components=2)

X = bdata[features].values
X = scaler.fit_transform(X)
X = pca.fit_transform(X)

bdata["PC1"] = X[:, 0]
bdata["PC2"] = X[:, 1]

# log10-transformed columns; log_rpm is not plotted below but is stored on
# bdata alongside the PCs
bdata["log_n_reads"] = np.log10(bdata["n_reads"])
bdata["log_rpm"] = np.log10(bdata["rpm"])

# one scatter panel per variable used to colour the PC1/PC2 projection
hues = [
    "label",
    "log_n_reads",
    "max_mapq",
    "min_mapq",
    "frac_unique_3end",
    "frac_unique_5end",
]

fig, axes = plt.subplots(
    1, len(hues), figsize=(5 * len(hues), 5), sharey=True, sharex=True
)
plt.subplots_adjust(wspace=0)

for ax, hue in zip(axes, hues):
    sns.scatterplot(
        data=bdata,
        x="PC1",
        y="PC2",
        hue=hue,
        # keep label colours consistent with the other figures in this
        # notebook; the remaining hues are not label categories, so no order
        hue_order=HUE_ORDER if hue == "label" else None,
        s=3,
        alpha=0.7,
        ax=ax,
    )

In [None]:
# PCA loadings: each component's feature weights scaled by the square root of
# that component's explained variance, shown as a features x PCs table
component_sd = np.sqrt(pca.explained_variance_)
loadings = pca.components_.T * component_sd
pd.DataFrame(data=loadings, index=features, columns=["PC1", "PC2"])

In [None]:
# Write the peak table (bdata) to the first declared workflow output as parquet.
out_path = snakemake.output[0]  # type: ignore
bdata.to_parquet(out_path)