In [None]:
from pathlib import Path
import pyarrow.parquet as pq
import warnings, math

warnings.filterwarnings("ignore", category=FutureWarning)

from tqdm.notebook import tqdm
import pandas as pd

tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns
import pyranges as pr
from pyslavseq.preprocessing import collate_labels
from pyslavseq.plotting import datashader_plot

# Order in which peak labels are passed as `hue_order` to the seaborn plots
# that are coloured by `label` further down in this notebook.
HUE_ORDER = ["KNRGL", "OTHER", "KRGL"]

# TODO

- mappability

## Load data

1. meta data
1. bulk peaks
1. single-cell peaks

In [None]:
# meta data: per-cell sample sheet joined with the per-donor sheet
meta = pd.read_csv(snakemake.config["samples"], sep="\t", dtype={"sample_id": str, "tissue_id": str, "donor_id": str})  # type: ignore
donors = pd.read_csv(snakemake.config["donors"], sep="\t", dtype={"donor_id": str})  # type: ignore
meta = meta.merge(donors, on=["donor_id"]).rename(columns={"sample_id": "cell_id"})

# bulk peaks
bulk = pd.read_parquet(snakemake.input.bulk_peaks)  # type: ignore

# single-cell peaks: one parquet file per cell, named "<cell_id>_labelled.pqt"
PEAK_FILE_SUFFIX = "_labelled.pqt"
cell_frames = []
for f in tqdm(snakemake.input.cell_peaks):  # type: ignore
    df = pq.read_table(f).to_pandas()
    # Remove the suffix explicitly. str.rstrip() treats its argument as a *set
    # of characters*, so e.g. "sample_a_labelled.pqt".rstrip("_labelled.pqt")
    # yields "sam" instead of "sample_a", and the truncated id would then be
    # silently dropped by the inner merge with `meta` below.
    cell_id = Path(f).name
    if cell_id.endswith(PEAK_FILE_SUFFIX):
        cell_id = cell_id[: -len(PEAK_FILE_SUFFIX)]
    df["cell_id"] = cell_id
    # Tissue is derived from the cell id: ids containing "usd" (any case) are
    # treated as DLPFC, everything else as HIP.
    df["tissue"] = "DLPFC" if "usd" in cell_id.lower() else "HIP"
    cell_frames.append(df)

# Inner merge (pandas default): peaks of cells that are absent from `meta` are dropped.
data = pd.concat(cell_frames).reset_index(drop=True).merge(meta, on="cell_id")


ndonors = data["donor_id"].nunique()
ncells = data["cell_id"].nunique()
print(f"Loaded {len(data)} peaks from {ncells} cells from {ndonors} donors.")

## Preprocess data

1. Remove peaks without any reads with >= 60 MAPQ 
2. Compute sharing of peaks across cells and donors

In [None]:
# Remove peaks without any read that reaches the minimum MAPQ score.
# The threshold lives in one constant so that the filter and the report below
# cannot drift apart (the previous comment said 30 while the code used 60).
MIN_MAX_MAPQ = 60
n_peaks = len(data)
data = data.query("max_mapq >= @MIN_MAX_MAPQ").reset_index(drop=True)
print(f"Removed {n_peaks - len(data)}/{n_peaks} peaks with MAX MAPQ < {MIN_MAX_MAPQ}")

Cluster peaks across cells and see how big the clusters are.

TODO: inspect very large clusters (will they be gone with new mapq read filter?)

In [None]:
# Merge the single-cell peaks into one interval table and record the width of
# every merged interval.
merged = pr.PyRanges(data).merge().df
merged = merged.assign(width=merged["End"] - merged["Start"])

# plot width of clusters: cumulative peak counts, single-cell peaks on the
# left and merged clusters on the right (shared, log-scaled x axis)
g, cluster_axes = plt.subplots(1, 2, figsize=(12, 6), sharex=True, sharey=False)
panels = zip(cluster_axes, (data, merged), ("Single-cell peaks", "Peak clusters"))
for panel_ax, frame, title in panels:
    sns.ecdfplot(frame, x="width", stat="count", ax=panel_ax, log_scale=True)
    panel_ax.set_title(title)

In [None]:
def donor_cells(df):
    """
    Count, for a single donor, how many cells each peak cluster is present in.

    Parameters
    ----------
    df : pandas.DataFrame
        Peaks of exactly one donor. Must contain the columns ``Cluster``,
        ``tissue``, ``donor_id`` and ``cell_id``.

    Returns
    -------
    pandas.DataFrame
        ``df`` merged on ``Cluster`` with three added integer columns:
        ``n_DLPFC_cells`` and ``n_HIP_cells`` (number of distinct cells of that
        tissue carrying the cluster) and ``n_donor_cells`` (their sum).

    Raises
    ------
    AssertionError
        If a required column is missing or more than one donor is present.
    """
    assert "Cluster" in df.columns, "Cluster column missing."
    assert "tissue" in df.columns, "Tissue column missing."
    assert df["donor_id"].nunique() == 1, "Multiple donors present."
    assert "cell_id" in df.columns, "Cell ID column missing."

    # Distinct cells per (cluster, tissue), reshaped to one column per tissue.
    cells_per_peak = (
        df.groupby(["Cluster", "tissue"], observed=True)["cell_id"]
        .nunique()
        .to_frame()
        .reset_index()
        .pivot(index="Cluster", columns="tissue", values="cell_id")
        .fillna(0)
        .astype(int)
    )

    # A donor sampled in only one tissue yields a pivot without the other
    # tissue's column; add it as zeros so the sum below cannot raise KeyError.
    for tissue in ("DLPFC", "HIP"):
        if tissue not in cells_per_peak.columns:
            cells_per_peak[tissue] = 0

    cells_per_peak = cells_per_peak.rename(
        columns={"DLPFC": "n_DLPFC_cells", "HIP": "n_HIP_cells"}
    )
    cells_per_peak["n_donor_cells"] = (
        cells_per_peak["n_DLPFC_cells"] + cells_per_peak["n_HIP_cells"]
    )

    return df.merge(cells_per_peak, on="Cluster")

In [None]:
# Compute sharing of peaks across cells and donors.
# Every peak gets a `Cluster` id from pyranges; the per-donor cell counts are
# added by `donor_cells`, the cohort-wide counts by the group-bys below.
data = pr.PyRanges(data).cluster().df
data["Cluster"] = data["Cluster"].astype("category")
data = data.groupby("donor_id").apply(donor_cells).reset_index(drop=True)

by_cluster = data.groupby("Cluster", observed=True)
cells_per_peak = by_cluster["cell_id"].nunique()
donors_per_peak = by_cluster["donor_id"].nunique()
data["n_cells"] = data["Cluster"].map(cells_per_peak)
data["n_donors"] = data["Cluster"].map(donors_per_peak)

# Average number of cells per donor sharing the cluster; infinite ratios
# (division by zero) are reported as 0.
data["cells_per_donor"] = (data["n_cells"] / data["n_donors"]).apply(
    lambda ratio: 0 if ratio == math.inf else ratio
)

In [None]:
# compute sharing across bulk data
# TODO: something is going wrong here, resulting in too many KNRGLs
# For every merged interval in `merged`, count how many intervals of `bulk`
# overlap it and store the count in `n_bulk_donors`.
# NOTE(review): this counts overlapping bulk *intervals*; it equals a number of
# donors only if each donor contributes at most one overlapping bulk peak --
# TODO confirm.
merged = (
    pr.PyRanges(merged)
    .count_overlaps(pr.PyRanges(bulk), overlap_col="n_bulk_donors")
    .df
)
# Shift the row index to start at 1 so it can be used as a lookup key for
# `Cluster` below.
# NOTE(review): assumes Cluster ids are 1-based and numbered in the same order
# as the rows of `merged` -- TODO confirm against pyranges' cluster()/merge().
merged.index = merged.index + 1
data["n_bulk_donors"] = data["Cluster"].map(merged["n_bulk_donors"])
# Cap the count at 37 (presumably the number of bulk donors -- TODO confirm).
data["n_bulk_donors"] = data["n_bulk_donors"].apply(lambda x: 37 if x > 37 else x)
data["cells_per_bulk_donor"] = data["n_cells"] / data["n_bulk_donors"]
# Clusters without any bulk overlap divide by zero; report those as 0.
data["cells_per_bulk_donor"] = data["cells_per_bulk_donor"].apply(
    lambda x: 0 if x == math.inf else x
)

## Display statistics of unlabeled peaks

1. number of peaks per cell per donor, TODO: check for correlations with clinical features and number of reads
1. width
1. n_reads
1. rpm
1. n_ref_reads
1. n_unique_5end
1. frac_unique_5end
1. n_unique_3end
1. frac_unique_3end
1. n_duplicates
1. frac_duplicates
1. n_contigs
1. frac_contigs
1. n_proper_pairs
1. frac_proper_pairs
1. num_supp_alignments_mean
1. frac_mean_supp_alignments
1. mate_alignment_score_normed_mean
1. min_mapq
1. n_cells
1. n_donors
1. n_bulk_donors
1. cells_per_donor
1. cells_per_bulk_donor

In [None]:
# Number of peaks per cell, keeping the donor-level metadata as grouping keys
# and ordering the rows by diagnosis (descending) for the box plot.
df = (
    data.groupby(["libd_id", "race", "diagnosis", "donor_id", "cell_id", "age"])
    .size()
    .reset_index(name="n_peaks")
    .sort_values("diagnosis", ascending=False)
)

fig, ax = plt.subplots(1, 3, figsize=(20, 6))

# Left panel: peaks per cell for each donor, with the overall mean marked.
sns.boxplot(data=df, y="libd_id", x="n_peaks", ax=ax[0])
mean_peaks = data.groupby("cell_id", observed=True).size().mean()

# annotate ax with mean peaks
ax[0].axvline(mean_peaks, color="red", linestyle="--")

# Middle/right panels: one cumulative-count curve per cell for the number of
# reads (log x axis) and the peak width (linear x axis).
line_color = sns.color_palette()[0]
per_cell_panels = (
    (ax[1], "n_reads", {"log_scale": True}),
    (ax[2], "width", {}),
)
for _, df in data.groupby("cell_id"):
    for panel_ax, column, extra_kws in per_cell_panels:
        sns.ecdfplot(
            df,
            x=column,
            c=line_color,
            alpha=0.2,
            stat="count",
            ax=panel_ax,
            **extra_kws,
        )

for panel_ax in ax[1:]:
    panel_ax.set_ylabel("# peaks")

In [None]:
# Features to plot, mapped to whether their x axis should be log-scaled.
# Insertion order is the plotting order.
_log_scale_by_feature = {
    "width": False,
    "n_reads": True,
    "n_ref_reads": True,
    "n_unique_5end": True,
    "n_unique_3end": True,
    "n_duplicates": True,
    "n_contigs": True,
    "n_proper_pairs": True,
    "num_supp_alignments_mean": True,
    "mate_alignment_score_normed_mean": False,
    "alignment_score_normed_mean": False,
    "orientation_bias": False,
    "min_mapq": False,
    "n_cells": False,
    "n_donors": False,
    "n_bulk_donors": False,
    "n_HIP_cells": False,
    "n_DLPFC_cells": False,
    "n_donor_cells": False,
}

# List of (feature, log_scale) pairs; later cells append to it.
features_scale = list(_log_scale_by_feature.items())
features = tuple(name for name, _ in features_scale)
scales = tuple(use_log for _, use_log in features_scale)

# Near-square subplot grid with at least one axis per feature.
n_plots = len(features_scale)
nrows = math.isqrt(n_plots)
ncols = -(-n_plots // nrows)

In [None]:
# One ECDF panel per feature; within a panel one curve per donor, coloured by
# the donor's race.
g, axs = plt.subplots(nrows, ncols, figsize=(ncols * 6, nrows * 6))

# setup colors
colors = sns.color_palette("tab10", n_colors=2)
opts = {
    "hue_order": ["CAUC", "AA"],
    "hue": "race",
    "palette": {"CAUC": colors[0], "AA": colors[1]},
    "alpha": 0.5,
}

for ax, (f, s) in zip(axs.flatten(), features_scale):
    for d, df in data.groupby("libd_id"):
        sns.ecdfplot(data=df, x=f, log_scale=s, ax=ax, **opts)
    # Drop the per-axis legend seaborn adds; a single figure legend follows.
    axis_legend = ax.get_legend()
    if axis_legend is not None:
        axis_legend.remove()

# Build the legend from the palette itself. Passing only `labels=` makes
# matplotlib pair them with whichever lines it finds first (here: the first
# donors' curves), so a label could end up next to the other race's colour.
race_handles = [
    plt.Line2D([], [], color=opts["palette"][race], alpha=opts["alpha"])
    for race in opts["hue_order"]
]
g.legend(
    handles=race_handles,
    labels=opts["hue_order"],
    title="Race",
    loc="center right",
    bbox_to_anchor=(1, 0.5),
)

In [None]:
# by chromosome
df = (
    data.groupby(["Chromosome", "race", "diagnosis", "cell_id", "age"])
    .size()
    .reset_index(name="n_peaks")
)

fig, ax = plt.subplots(1, 3, figsize=(21, 6))
df.sort_values("n_peaks", inplace=True)
sns.boxplot(data=df, y="Chromosome", x="n_peaks", ax=ax[0])
sns.ecdfplot(data, x="n_reads", hue="Chromosome", alpha=0.5, ax=ax[1], log_scale=True)
sns.ecdfplot(data, x="width", hue="Chromosome", alpha=0.5, ax=ax[2])
ax[1].set_ylabel("# peaks")
ax[1].get_legend().remove()
ax[2].set_ylabel("# peaks")

# move legend to the right
ax[2].legend(
    title="Chromosome",
    labels=data["Chromosome"].unique(),
    loc="center left",
    bbox_to_anchor=(1, 0.5),
)

In [None]:
# One ECDF panel per feature, one curve per chromosome, shared y axis.
g, axs = plt.subplots(nrows, ncols, figsize=(ncols * 6, nrows * 6), sharey=True)

chrom_handles, chrom_labels = [], []
for ax, (f, s) in zip(axs.flatten(), features_scale):
    sns.ecdfplot(data=data, x=f, log_scale=s, ax=ax, hue="Chromosome", alpha=0.5)
    axis_legend = ax.get_legend()
    if axis_legend is None:
        continue
    # Keep seaborn's own handle/label pairing for the figure legend, then drop
    # the per-axis copy.
    chrom_handles = axis_legend.get_lines()
    chrom_labels = [text.get_text() for text in axis_legend.get_texts()]
    axis_legend.remove()

# Passing explicit handles avoids matplotlib pairing `labels=` with lines in
# draw order, which need not match the order of data["Chromosome"].unique().
g.legend(
    handles=chrom_handles,
    labels=chrom_labels,
    title="Chromosome",
    loc="center left",
    bbox_to_anchor=(1, 0.5),
)

## 4. Collate labels of the peaks

a. Find additional KRGLs by looking for overlaps with bulk SLAVseq
b. Find additional KRGLs by looking for overlaps with single-cell SLAVseq
c. Find additional KNRGLs by looking for overlaps with bulk SLAVseq

In [None]:
# Apply `collate_labels` (from pyslavseq.preprocessing) to every row, with a
# progress bar, and store the result in the `label` column.
# NOTE(review): later cells compare `label` against "KNRGL", "KRGL" and
# "OTHER"; presumably those are the only values returned -- TODO confirm.
data["label"] = data.progress_apply(collate_labels, axis=1)

In [None]:
# get germline labels for each donor
# TODO: use bulk here or single-cell data as reference?
def germline_distance(donor_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the distance from every peak of one donor to that donor's nearest
    non-"OTHER" bulk peak.

    Parameters
    ----------
    donor_df : pandas.DataFrame
        Peaks of exactly one donor, in a layout accepted by ``pr.PyRanges``,
        with a ``donor_id`` column.

    Returns
    -------
    pandas.DataFrame
        The donor's peaks after a round trip through ``pr.PyRanges`` (fresh
        index) with an added ``germline_dist`` column. Peaks for which pyranges
        reports no nearest interval get NaN.

    Raises
    ------
    AssertionError
        If ``donor_df`` contains more than one donor.
    """
    assert len(donor_df["donor_id"].unique()) == 1, "Expected only one donor"
    d = donor_df["donor_id"].unique()[0]
    # Reference intervals: this donor's bulk peaks whose label is not "OTHER",
    # merged into non-overlapping intervals.
    gdf = pr.PyRanges(bulk.query('donor_id == @d and label != "OTHER"')).merge()
    donor_df = pr.PyRanges(donor_df).df

    # Tag every peak with a row id and join the distances back through it.
    # The frame returned by nearest() may be ordered differently from the input
    # and may omit peaks without a neighbour, so assigning its "Distance"
    # column by position would attach distances to the wrong peaks.
    row_id = "__germline_row_id__"
    donor_df[row_id] = range(len(donor_df))
    nearest = pr.PyRanges(donor_df).nearest(gdf, overlap=False).df
    if "Distance" in nearest.columns:
        distance_by_row = (
            nearest.drop_duplicates(row_id).set_index(row_id)["Distance"].abs()
        )
        donor_df["germline_dist"] = donor_df[row_id].map(distance_by_row)
    else:
        # No reference interval for this donor: nearest() returned nothing.
        donor_df["germline_dist"] = float("nan")
    return donor_df.drop(columns=row_id)


# Compute the germline distance donor by donor, then restore genomic order.
data = (
    data.groupby(["donor_id"])
    .progress_apply(germline_distance)
    .reset_index(drop=True)
    .sort_values(["Chromosome", "Start"])
)

# Register the new feature (log-scaled) for the feature plots. Guarded so that
# re-running this cell does not append it a second time.
if ("germline_dist", True) not in features_scale:
    features_scale.append(("germline_dist", True))
# Refresh both derived tuples so `scales` stays in sync with `features`.
features, scales = zip(*features_scale)
scale = scales  # name assigned by the previous version of this cell

In [None]:
# Boolean annotation columns; "other" marks peaks without any annotation.
labels = [
    "primer_sites",
    "l1hs",
    "l1pa2",
    "l1pa3",
    "l1pa4",
    "l1pa5",
    "l1pa6",
    "megane_gaussian",
    "megane_breakpoints",
    "graffite",
    "xtea",
    "bulk",
]
data["other"] = data[labels].sum(axis=1) == 0
labels.append("other")

# One fixed colour per annotation. There are more annotations than colours in
# matplotlib's default 10-colour cycle, so relying on the cycle would give two
# annotations the same colour.
annotation_palette = dict(zip(labels, sns.color_palette("tab20", n_colors=len(labels))))

g, axs = plt.subplots(nrows, ncols, figsize=(ncols * 6, nrows * 6), sharey=True)
axs = axs.flatten()

for ax, (f, s) in zip(axs[0 : len(features)], features_scale):
    for annotation in labels:
        sns.ecdfplot(
            data=data[data[annotation]],
            x=f,
            log_scale=s,
            ax=ax,
            color=annotation_palette[annotation],
        )

# Explicit handles keep every legend entry tied to its colour, instead of
# letting matplotlib pair `labels` with lines in draw order.
g.legend(
    handles=[plt.Line2D([], [], color=annotation_palette[a]) for a in labels],
    labels=labels,
    title="annotation",
    loc="center left",
    bbox_to_anchor=(1, 0.5),
)

In [None]:
# Number of peaks per cell and label, with donor-level metadata as keys.
df = (
    data.groupby(["label", "race", "diagnosis", "cell_id", "age", "libd_id"])
    .size()
    .reset_index(name="n_peaks")
)

fig, ax = plt.subplots(1, 3, figsize=(21, 10))
df.sort_values("n_peaks", inplace=True)
sns.boxplot(
    data=df, y="libd_id", x="n_peaks", hue="label", ax=ax[0], hue_order=HUE_ORDER
)
sns.ecdfplot(
    data,
    x="n_reads",
    hue="label",
    alpha=0.5,
    ax=ax[1],
    log_scale=True,
    hue_order=HUE_ORDER,
)
sns.ecdfplot(data, x="width", hue="label", alpha=0.5, ax=ax[2], hue_order=HUE_ORDER)

# Mean peaks per cell for each label, averaged over the cells that have at
# least one peak with that label. Reindexing makes a label that is absent from
# the data print as "nan" instead of raising KeyError.
means = (
    data.groupby(["cell_id", "label"])
    .size()
    .groupby("label")
    .mean()
    .reindex(HUE_ORDER)
)
print(
    f"""
       Mean KNRGL peaks per cell: {means["KNRGL"]:0.2f}
       Mean KRGL peaks per cell: {means["KRGL"]:0.2f}
       Mean OTHER peaks per cell: {means["OTHER"]:0.2f}
       """
)

In [None]:
# Long format: one row per (peak, feature) so seaborn can facet by feature.
df = pd.melt(
    data,
    id_vars=["label", "cell_id"],
    value_vars=features,
    var_name="feature",
    value_name="value",
)

g = sns.displot(
    data=df,
    x="value",
    hue="label",
    col="feature",
    kind="ecdf",
    col_wrap=6,
    facet_kws={"sharex": False},
)

# Give every facet the x scale configured for its feature. The feature name is
# taken as the text after the last " = " in the facet title.
log_scale_by_feature = dict(features_scale)
for ax in g.axes:
    facet_feature = ax.title.get_text().split(" = ")[-1]
    if facet_feature in log_scale_by_feature:
        ax.set_xscale("log" if log_scale_by_feature[facet_feature] else "linear")

In [None]:
# (x column, y column, (log-scale x, log-scale y)) for every 2-D density plot.
# First the sharing statistics against the germline distance (log x axis) ...
plots = [
    ("germline_dist", sharing_column, (True, False))
    for sharing_column in (
        "n_bulk_donors",
        "n_donors",
        "n_cells",
        "cells_per_donor",
        "n_donor_cells",
        "n_HIP_cells",
        "n_DLPFC_cells",
    )
]
# ... then the sharing statistics against each other (both axes linear).
plots += [
    (x_column, y_column, (False, False))
    for x_column, y_column in (
        ("n_cells", "n_donors"),
        ("n_cells", "n_bulk_donors"),
        ("n_cells", "n_donor_cells"),
        ("n_donors", "n_bulk_donors"),
    )
]

# Draw one datashader plot per configured (x, y, scales) triple.
for x_col, y_col, log_axes in plots:
    datashader_plot(data, x_col, y_col, log_axes, plot_width=100, plot_height=100)

In [None]:
# get all peaks that are KRGL in bulk
# NOTE(review): this uses the bulk KRGL peaks of *all* donors, not only those
# of the peak's own donor -- TODO confirm this is intended.
brdf = pr.PyRanges(bulk.query("label == 'KRGL'"))
# find overlaps across cells
data = pr.PyRanges(data).count_overlaps(brdf, overlap_col="bulk_ref").df
# label as bulk_ref if overlap
data["bulk_ref"] = data["bulk_ref"] > 0

# report new KRGL peaks per donor
# "new KRGLs" are peaks not yet labelled KRGL that overlap a bulk KRGL peak;
# "lost KNRGLs" is the subset of those currently labelled KNRGL. For both, the
# number with a germline distance below 20 kb is given in parentheses.
for d, df in data.groupby("donor_id"):
    new_krgls = df.query("label != 'KRGL' and bulk_ref")
    krgl_low_germ_dist = new_krgls.query("germline_dist < 20000")
    lost_knrgls = df.query("label == 'KNRGL' and bulk_ref")
    knrgl_low_germ_dist = lost_knrgls.query("germline_dist < 20000")
    print(
        f"""Donor {d}:
					{len(new_krgls)} ({len(krgl_low_germ_dist)} germ_dist < 20kb) new KRGLs 
					{len(lost_knrgls)} ({len(knrgl_low_germ_dist)} germ_dist < 20kb) lost KNRGLs
			"""
    )

# Every peak overlapping a bulk KRGL peak becomes KRGL, whatever its label was.
data.loc[data["bulk_ref"], "label"] = "KRGL"

In [None]:
# Promote peaks labelled OTHER that carry the `bulk` annotation to KNRGL, and
# report how many KNRGL peaks lie within 20 kb of a germline peak before and
# after the promotion.
near_germline_knrgl = "label == 'KNRGL' and germline_dist < 20000"

knrgl_low_germ_dist = data.query(near_germline_knrgl)
print(f"Found {len(knrgl_low_germ_dist)} KNRGL peaks with germline distance < 20kb.")

is_new_knrgl = (data["bulk"]) & (data["label"] == "OTHER")
new_knrgls = data[is_new_knrgl]
print(f"Added {len(new_knrgls)} new KNRGLs.")
data.loc[is_new_knrgl, "label"] = "KNRGL"

knrgl_low_germ_dist = data.query(near_germline_knrgl)
print(f"Found {len(knrgl_low_germ_dist)} KNRGL peaks with germline distance < 20kb.")

In [None]:
# Number of peaks per cell and label, with donor-level metadata as keys.
df = (
    data.groupby(["label", "race", "diagnosis", "cell_id", "age", "libd_id"])
    .size()
    .reset_index(name="n_peaks")
)

fig, ax = plt.subplots(1, 3, figsize=(21, 10))
df.sort_values("n_peaks", inplace=True)
sns.boxplot(
    data=df, y="libd_id", x="n_peaks", hue="label", ax=ax[0], hue_order=HUE_ORDER
)
sns.ecdfplot(
    data,
    x="n_reads",
    hue="label",
    alpha=0.5,
    ax=ax[1],
    log_scale=True,
    hue_order=HUE_ORDER,
)
sns.ecdfplot(data, x="width", hue="label", alpha=0.5, ax=ax[2], hue_order=HUE_ORDER)

# Mean peaks per cell for each label, averaged over the cells that have at
# least one peak with that label. Reindexing makes a label that is absent from
# the data print as "nan" instead of raising KeyError.
means = (
    data.groupby(["cell_id", "label"])
    .size()
    .groupby("label")
    .mean()
    .reindex(HUE_ORDER)
)
print(
    f"""
       Mean KNRGL peaks per cell: {means["KNRGL"]:0.2f}
       Mean KRGL peaks per cell: {means["KRGL"]:0.2f}
       Mean OTHER peaks per cell: {means["OTHER"]:0.2f}
       """
)

## Remove peaks at reference L1 insertions

In [None]:
# Report how many peaks carry each reference-related annotation, then drop
# every peak labelled KRGL. Only the label is used for filtering; the counts
# printed before it are informational.
print("Filtering peaks")

n_with_ref_reads = len(data[data["n_ref_reads"] > 0])
print(f"Found {n_with_ref_reads} peaks with reference reads.")
# print("Found {} peaks with reference clusters.".format(data["ref"].sum()))
n_bulk_ref = data["bulk_ref"].sum()
print(f"Found {n_bulk_ref} peaks with bulk SLAVseq reference clusters.")
n_primer_sites = data["primer_sites"].sum()
print(f"Found {n_primer_sites} peaks at primer sites.")
for family in ("l1hs", "l1pa2", "l1pa3", "l1pa4", "l1pa5", "l1pa6"):
    n_family = data[family].sum()
    print(f"Found {n_family} peaks at {family} sites.")

data = data[data["label"] != "KRGL"].reset_index(drop=True)
print(f"{len(data)} peaks remain after filtering")

## Visualize

In [None]:
# Redraw the configured datashader plots on the filtered peaks.
for x_col, y_col, log_axes in plots:
    datashader_plot(data, x_col, y_col, log_axes, plot_width=100, plot_height=100)

## Save

In [None]:
# Write the filtered, labelled peaks to the first Snakemake output file as
# parquet, without the DataFrame index.
data.to_parquet(snakemake.output[0], index=False)  # type: ignore