In [None]:
from pathlib import Path
import warnings, math

warnings.filterwarnings("ignore", category=FutureWarning)

from tqdm.notebook import tqdm
import pandas as pd
import numpy as np

tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns
import pyranges as pr
from pyslavseq.preprocessing import collate_labels
from pyslavseq.plotting import datashader_plot

# Order in which label classes are passed as `hue_order` to the plots below.
HUE_ORDER = ["KNRGL", "OTHER", "KRGL"]

# Annotation columns of the input tables; read_slavseq_bed casts all of
# LABELS to bool.
NRLABELS = "megane_gaussian megane_breakpoints graffite xtea".split()
RLABELS = ["primer_sites", "l1hs"] + [f"l1pa{age}" for age in range(2, 7)]
LABELS = RLABELS + ["polyA", "polyT"] + NRLABELS

# THRESHOLDS
# Regions below either value are dropped by filter_regions after loading.
MAX_MAPQ, MIN_READS = 60, 10

## Load data

1. metadata
1. bulk peaks
1. single-cell peaks

In [None]:
# meta data
# Sample sheet and donor sheet come from the snakemake config; the ID columns
# are read as strings.
meta = pd.read_csv(  # type: ignore
    snakemake.config["samples"],
    sep="\t",
    dtype=dict(sample_id=str, tissue_id=str, donor_id=str),
)
donors = pd.read_csv(  # type: ignore
    snakemake.config["donors"],
    sep="\t",
    dtype=dict(donor_id=str),
)

# Attach the donor-level columns to every sample, expose the sample ID as
# `cell_id`, then drop the join key.
meta = meta.merge(donors, on=["donor_id"])
meta = meta.rename(columns={"sample_id": "cell_id"})
meta = meta.drop(columns="donor_id")

# bulk
def read_slavseq_bed(bed):
    """
    Read a tab-separated SLAV-seq region table into a DataFrame.

    Parameters
    ----------
    bed : path or buffer accepted by ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The table with "#" removed from the column names, an added
        ``width`` column (``End - Start``) and every column listed in
        ``LABELS`` cast to bool.
    """
    table = pd.read_csv(bed, sep="\t")
    # The header line may carry a leading "#"; strip it from the names.
    table.columns = table.columns.str.replace("#", "")
    table["width"] = table["End"].sub(table["Start"])
    label_flags = table[LABELS].astype(bool)
    table[LABELS] = label_flags
    return table


# Region tables produced upstream: one for bulk samples, one for single cells.
bulk = read_slavseq_bed(snakemake.input.bulk[0])
data = read_slavseq_bed(snakemake.input.cells[0])

# Attach per-cell metadata and derive the tissue from the cell ID.
data = data.reset_index(drop=True).merge(meta, on="cell_id")
data["tissue"] = data["cell_id"].map(
    lambda cell_id: "HIP" if "usd" not in cell_id.lower() else "DLPFC"
)

# Read-level QC ratios; all except frac_duplicates are relative to n_reads.
data = data.assign(
    orientation_bias=lambda d: np.maximum(d["n_fwd"], d["n_rev"]) / d["n_reads"],
    frac_proper_pairs=lambda d: d["n_proper_pairs"] / d["n_reads"],
    # duplicates are counted separately from n_reads, hence the summed denominator
    frac_duplicates=lambda d: d["n_duplicates"] / (d["n_reads"] + d["n_duplicates"]),
    frac_unique_3end=lambda d: d["n_unique_3end"] / d["n_reads"],
    frac_unique_5end=lambda d: d["n_unique_5end"] / d["n_reads"],
    frac_mean_supp_alignments=lambda d: d["num_supp_alignments_mean"] / d["n_reads"],
)

# filter
def filter_regions(data, col, val):
    """
    Keep only the regions whose ``col`` value is at least ``val``.

    Parameters
    ----------
    data : pd.DataFrame
        Region table; it is not modified.
    col : str
        Name of the column to threshold.
    val : scalar
        Minimum value (inclusive) a region must have in ``col``.

    Returns
    -------
    pd.DataFrame
        The surviving rows with a fresh 0..n-1 index. The number of removed
        rows is printed.
    """
    n_peaks = len(data)
    # A boolean mask instead of a string-built query: this also works for
    # column names that are not valid Python identifiers and for thresholds
    # whose repr is not a valid query literal.
    data = data[data[col] >= val].reset_index(drop=True)
    print(f"Removed {n_peaks - len(data)} regions with {col} < {val}")
    return data


# Single-cell table: report its size, then apply both thresholds
# (mapping quality first, read count second).
n_peaks, ndonors, ncells = (
    len(data),
    data["donor_id"].nunique(),
    data["cell_id"].nunique(),
)
print(f"Loaded {n_peaks} scSLAVseq regions from {ncells} cells from {ndonors} donors.")
data = filter_regions(
    filter_regions(data, "max_mapq", MAX_MAPQ), "n_reads", MIN_READS
)

# Bulk table: same report and the same two thresholds.
n_peaks, ndonors = len(bulk), bulk["donor_id"].nunique()
print(f"Loaded {n_peaks} bulk SLAVseq regions from {ndonors} donors.")
bulk = filter_regions(
    filter_regions(bulk, "max_mapq", MAX_MAPQ), "n_reads", MIN_READS
)

In [None]:
# VISUALIZE
# 2x2 overview: peaks per cell (box plot) plus three per-donor ECDFs.
g, axs = plt.subplots(2, 2, figsize=(16, 16))
axs = axs.flatten()

# peaks per cell
df = (
    data.groupby(["cell_id", "libd_id", "donor_id", "race"])
    .size()
    .reset_index(name="n_peaks")
    .sort_values("race", ascending=False)
)
sns.boxplot(df, x="n_peaks", y="libd_id", hue="race", ax=axs[0])
axs[0].set(xlabel="# single-cell peaks", ylabel=None)


# Shared styling so every donor curve is coloured by race consistently.
colors = sns.color_palette("tab10", n_colors=2)
opts = dict(
    hue_order=["CAUC", "AA"],
    hue="race",
    palette=dict(CAUC=colors[0], AA=colors[1]),
    alpha=0.5,
)

# One ECDF curve per donor in each of the remaining three panels.
panels = (
    (axs[1], "bulk", {}),
    (axs[2], "n_reads", {"log_scale": True}),
    (axs[3], "width", {}),
)
for d, df in data.groupby("donor_id"):
    for panel_ax, column, extra in panels:
        sns.ecdfplot(df, x=column, stat="count", ax=panel_ax, **extra, **opts)

axs[1].set_xlabel("Number of bulk peak overlaps")
axs[1].set_title("Single-cell peaks vs bulk peaks overlaps")
for panel_ax in axs[1:]:
    panel_ax.set_ylabel("# single-cell peaks")

## Sharing of peaks across cells and donors

In [None]:
# Cluster peak across cells and see how big the clusters are
# TODO: inspect very large clusters (will they be gone with new mapq read filter?)
merged = (
    pr.PyRanges(data)
    .merge()
    .df.assign(width=lambda m: m["End"] - m["Start"])
)

# plot width of clusters
# Left: individual single-cell peaks. Right: merged intervals across cells.
g, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=False)
for panel_ax, frame, title in (
    (ax1, data, "Single-cell peaks"),
    (ax2, merged, "Peak clusters"),
):
    sns.ecdfplot(frame, x="width", stat="count", ax=panel_ax, log_scale=True)
    panel_ax.set_title(title)

In [None]:
def donor_cells(df):
    """
    Count how many cells each peak cluster is present in, for a single donor.

    Parameters
    ----------
    df : pd.DataFrame
        Peaks of exactly one donor. Must contain the columns ``Cluster``,
        ``tissue``, ``cell_id`` and ``donor_id``.

    Returns
    -------
    pd.DataFrame
        ``df`` with three added per-cluster columns: ``n_DLPFC_cells``,
        ``n_HIP_cells`` (distinct cells per tissue) and ``n_donor_cells``
        (their sum).

    Raises
    ------
    AssertionError
        If a required column is missing or more than one donor is present.
    """
    assert "Cluster" in df.columns, "Cluster column missing."
    assert "tissue" in df.columns, "Tissue column missing."
    assert df["donor_id"].nunique() == 1, "Multiple donors present."
    assert "cell_id" in df.columns, "Cell ID column missing."

    # Distinct cells per (cluster, tissue), reshaped to one column per tissue.
    cells_per_peak = (
        df.groupby(["Cluster", "tissue"], observed=True)["cell_id"]
        .nunique()
        .to_frame()
        .reset_index()
        .pivot(index="Cluster", columns="tissue", values="cell_id")
    )

    # A donor with cells from only one tissue produces a single pivot column;
    # make sure both tissue columns exist so the sum below cannot KeyError.
    tissues = list(dict.fromkeys(["DLPFC", "HIP", *cells_per_peak.columns]))
    cells_per_peak = cells_per_peak.reindex(columns=tissues).fillna(0).astype(int)

    cells_per_peak = cells_per_peak.rename(
        columns={"DLPFC": "n_DLPFC_cells", "HIP": "n_HIP_cells"}
    )
    cells_per_peak["n_donor_cells"] = (
        cells_per_peak["n_DLPFC_cells"] + cells_per_peak["n_HIP_cells"]
    )

    return df.merge(cells_per_peak, on="Cluster")


# Compute sharing of peaks across cells and donors
# Overlapping peaks get a shared Cluster ID; counts are then taken per cluster.
data = pr.PyRanges(data).cluster().df
data["Cluster"] = data["Cluster"].astype("category")
data = data.groupby("donor_id").apply(donor_cells).reset_index(drop=True)

by_cluster = data.groupby("Cluster", observed=True)
cells_per_peak = by_cluster["cell_id"].nunique()
donors_per_peak = by_cluster["donor_id"].nunique()
data["n_cells"] = data["Cluster"].map(cells_per_peak)
data["n_donors"] = data["Cluster"].map(donors_per_peak)
# Infinite ratios (division by zero) are reported as 0.
data["cells_per_donor"] = (data["n_cells"] / data["n_donors"]).replace(math.inf, 0)

# compute sharing across bulk data
# TODO: something is going wrong here, resulting in too many KNRGLs
merged = (
    pr.PyRanges(merged)
    .count_overlaps(pr.PyRanges(bulk), overlap_col="n_bulk_donors")
    .df
)
# Shift the row index by one so it can be looked up by Cluster ID.
# NOTE(review): this assumes merge() and cluster() enumerate intervals in the
# same order and that Cluster IDs start at 1 — confirm against pyranges.
merged.index += 1
data["n_bulk_donors"] = data["Cluster"].map(merged["n_bulk_donors"])
# Counts are capped at 37 — presumably the number of bulk donors; TODO confirm.
data["n_bulk_donors"] = data["n_bulk_donors"].clip(upper=37)
data["cells_per_bulk_donor"] = (data["n_cells"] / data["n_bulk_donors"]).replace(
    math.inf, 0
)

## Collate labels of the peaks

In [None]:
# Give every region one `label` by applying collate_labels row by row,
# first to the single-cell table and then to the bulk table.
for banner, frame in (
    ("Collating labels in scSLAVseq data", data),
    ("Collating labels in bulk SLAVseq data", bulk),
):
    print(banner)
    frame["label"] = frame.progress_apply(collate_labels, axis=1)

# Coordinate-sort the single-cell peaks, then drop everything labelled KRGL.
data = data.sort_values(["Chromosome", "Start"]).reset_index(drop=True)
n_peaks = len(data)
not_krgl = data["label"].ne("KRGL")
data = data[not_krgl].reset_index(drop=True)
print(f"Removed {n_peaks - len(data)} KRGL regions from scSLAVseq data")

In [None]:
def germline_distance(donor_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add, for one donor, the distance from each peak to the nearest
    non-overlapping bulk peak of the same donor.

    Parameters
    ----------
    donor_df : pd.DataFrame
        Single-cell peaks of exactly one donor (PyRanges-compatible columns
        plus ``donor_id``).

    Returns
    -------
    pd.DataFrame
        The peaks sorted by ``Chromosome`` and ``Start`` with a fresh
        0..n-1 index and a new ``germline_distance`` column (absolute
        distance; NaN where pyranges reports no nearest bulk interval).

    Raises
    ------
    AssertionError
        If more than one donor is present.
    """
    donor_ids = donor_df["donor_id"].unique()
    assert len(donor_ids) == 1, "Expected only one donor"
    d = donor_ids[0]

    # Merged bulk peaks of this donor (reads the notebook-level `bulk` table).
    gdf = pr.PyRanges(bulk.query("donor_id == @d")).merge()

    donor_df = (
        pr.PyRanges(donor_df)
        .df.sort_values(["Chromosome", "Start"])
        .reset_index(drop=True)
    )

    # The frame returned by nearest() has its own index and may drop or
    # reorder rows, so assigning its Distance column by index label can put
    # distances on the wrong peaks. Carry an explicit row key through the
    # call and join back on it instead.
    donor_df["_row"] = np.arange(len(donor_df))
    nearest = pr.PyRanges(donor_df).nearest(gdf, overlap=False).df
    if "Distance" in nearest.columns:
        distance = nearest["Distance"].abs().groupby(nearest["_row"]).min()
        donor_df["germline_distance"] = donor_df["_row"].map(distance)
    else:
        # nothing to measure against (e.g. no bulk peaks for this donor)
        donor_df["germline_distance"] = np.nan

    return donor_df.drop(columns="_row")


# Compute the germline distance donor by donor, then coordinate-sort.
# The index is reset AFTER sorting so that row labels match row positions;
# later cells assign pyranges results back onto `data` by index.
data = (
    data.groupby(["donor_id"])
    .progress_apply(germline_distance)
    .sort_values(["Chromosome", "Start"])
    .reset_index(drop=True)
)

sns.ecdfplot(data, x="germline_distance", stat="count", hue="label", log_scale=True)

In [None]:
# Re-merge the remaining single-cell peaks and compare widths again.
merged = pr.PyRanges(data).merge().df
# `data` carries its width in the lower-case `width` column (created in
# read_slavseq_bed), so the same name is used for the merged intervals;
# plotting `Width` against `data` fails because that column does not exist.
merged["width"] = merged["End"] - merged["Start"]

# plot width of clusters
g, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=False)
sns.ecdfplot(data, x="width", stat="count", ax=ax1, log_scale=True)
ax1.set_title("Single-cell peaks")
sns.ecdfplot(merged, x="width", stat="count", ax=ax2, log_scale=True)
ax2.set_title("Peak clusters")

In [None]:
# (x column, y column, third positional argument for datashader_plot).
# The distance column is named `germline_distance` where it is created, so
# that name is used here instead of `germline_dist`.
plots = [
    ("germline_distance", "n_bulk_donors", (True, False)),
    ("germline_distance", "n_donors", (True, False)),
    ("germline_distance", "n_cells", (True, False)),
    ("germline_distance", "cells_per_donor", (True, False)),
    ("germline_distance", "n_donor_cells", (True, False)),
    ("germline_distance", "n_HIP_cells", (True, False)),
    ("germline_distance", "n_DLPFC_cells", (True, False)),
    ("n_cells", "n_donors", (False, False)),
    ("n_cells", "n_bulk_donors", (False, False)),
    ("n_cells", "n_donor_cells", (False, False)),
    ("n_donors", "n_bulk_donors", (False, False)),
]

for x, y, s in plots:
    datashader_plot(data, x, y, s, plot_width=100, plot_height=100)

In [None]:
# get all peaks that are KRGL in bulk
brdf = pr.PyRanges(bulk.query("label == 'KRGL'"))

# find overlaps across cells
# The frame returned by pyranges has its own 0..n-1 index in pyranges' row
# order, which need not match `data`'s index. Tag every peak with its row
# position and map the counts back through that key instead of relying on
# index alignment.
coords = data[["Chromosome", "Start", "End"]].copy()
coords["_row"] = np.arange(len(coords))
overlaps = pr.PyRanges(coords).count_overlaps(brdf, overlap_col="bulk_ref").df
n_bulk_ref = (
    overlaps.set_index("_row")["bulk_ref"]
    .reindex(np.arange(len(data)), fill_value=0)
    .to_numpy()
)
# label as bulk_ref if overlap
data["bulk_ref"] = n_bulk_ref > 0

# report new KRGL peaks per donor
# (`germline_distance` is the column name created by germline_distance())
for d, df in data.groupby("donor_id"):
    new_krgls = df.query("label != 'KRGL' and bulk_ref")
    krgl_low_germ_dist = new_krgls.query("germline_distance < 20000")
    lost_knrgls = df.query("label == 'KNRGL' and bulk_ref")
    knrgl_low_germ_dist = lost_knrgls.query("germline_distance < 20000")
    print(
        f"""Donor {d}:
					{len(new_krgls)} ({len(krgl_low_germ_dist)} germ_dist < 20kb) new KRGLs 
					{len(lost_knrgls)} ({len(knrgl_low_germ_dist)} germ_dist < 20kb) lost KNRGLs
			"""
    )

# Every single-cell peak overlapping a bulk KRGL peak is relabelled KRGL.
data.loc[data["bulk_ref"], "label"] = "KRGL"

In [None]:
# KNRGL peaks lying within 20 kb of a bulk peak, before relabelling.
# The distance column is named `germline_distance` where it is created, so
# that name is queried here instead of `germline_dist`.
knrgl_low_germ_dist = data.query("label == 'KNRGL' and germline_distance < 20000")
print(f"Found {len(knrgl_low_germ_dist)} KNRGL peaks with germline distance < 20kb.")

# Peaks labelled OTHER that have a truthy `bulk` value are promoted to KNRGL.
new_krgls = data.query("label == 'OTHER' and bulk")
print(f"Added {len(new_krgls)} new KNRGLs.")

data.loc[(data["bulk"]) & (data["label"] == "OTHER"), "label"] = "KNRGL"

# Same count again, after relabelling.
knrgl_low_germ_dist = data.query("label == 'KNRGL' and germline_distance < 20000")
print(f"Found {len(knrgl_low_germ_dist)} KNRGL peaks with germline distance < 20kb.")

In [None]:
# Peaks per cell and label, with the cell metadata kept for plotting.
df = (
    data.groupby(["label", "race", "diagnosis", "cell_id", "age", "libd_id"])
    .size()
    .reset_index(name="n_peaks")
    .sort_values("n_peaks")
)

# Left: peaks per cell by individual. Middle/right: read-count and width
# ECDFs, split by label.
fig, ax = plt.subplots(1, 3, figsize=(21, 10))
sns.boxplot(
    data=df, y="libd_id", x="n_peaks", hue="label", ax=ax[0], hue_order=HUE_ORDER
)
sns.ecdfplot(
    data,
    x="n_reads",
    hue="label",
    alpha=0.5,
    ax=ax[1],
    log_scale=True,
    hue_order=HUE_ORDER,
)
sns.ecdfplot(data, x="width", hue="label", alpha=0.5, ax=ax[2], hue_order=HUE_ORDER)

# Mean number of peaks per cell for each label. Reindexing on HUE_ORDER makes
# a label with no peaks print as nan instead of raising KeyError.
means = (
    data.groupby(["cell_id", "label"])
    .size()
    .groupby("label")
    .mean()
    .reindex(HUE_ORDER)
)
print(
    f"""
       Mean KNRGL peaks per cell: {means["KNRGL"]:0.2f}
       Mean KRGL peaks per cell: {means["KRGL"]:0.2f}
       Mean OTHER peaks per cell: {means["OTHER"]:0.2f}
       """
)

## Save

In [None]:
# NOTE(review): df2tabix is called here but is not imported in the notebook's
# import cell, so a fresh kernel raises NameError. It is assumed to live next
# to collate_labels in pyslavseq.preprocessing — confirm and move this import
# to the top cell.
from pyslavseq.preprocessing import df2tabix

# Coordinate-sort before writing to the output path given by snakemake.
data = data.sort_values(["Chromosome", "Start"])
df2tabix(data, snakemake.output[0])