In [None]:
from pathlib import Path
import warnings, math

warnings.filterwarnings("ignore", category=FutureWarning)

from tqdm.notebook import tqdm
import pandas as pd
import numpy as np

tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns
import pyranges as pr
from pyslavseq.preprocessing import collate_labels, df2tabix
from pyslavseq.plotting import datashader_plot

# Peak label categories; the same three strings are the values that
# `collate_labels_with_bulk` returns further down.
# NOTE(review): presumably meant as a fixed `hue_order` for the label-coloured
# plots, but no visible cell references it — confirm it is still needed.
HUE_ORDER = ["KNRGL", "OTHER", "KRGL"]

## Load data

1. metadata
1. bulk peaks
1. single-cell peaks

In [None]:
# Sample and donor sheets: identifier columns are read as strings so that
# purely numeric ids are not parsed as integers.
sample_id_dtypes = dict.fromkeys(["sample_id", "tissue_id", "donor_id"], str)
samples = pd.read_csv(snakemake.config["samples"], sep="\t", dtype=sample_id_dtypes)  # type: ignore
donors = pd.read_csv(snakemake.config["donors"], sep="\t", dtype={"donor_id": str})  # type: ignore

# One row per sample with its donor's columns attached; `sample_id` is renamed
# to `cell_id`, the key used for the per-cell peak tables.
meta = pd.merge(samples, donors, on=["donor_id"])
meta = meta.rename(columns={"sample_id": "cell_id"})

# Bulk peaks: drop any "#" from the header names and keep donor ids as strings
# so they compare equal to the string donor ids in `meta`.
bulk = (
    pd.read_csv(snakemake.input.bulk, sep="\t")  # type: ignore
    .rename(columns=lambda name: name.replace("#", ""))
    .astype({"donor_id": str})
)

# single-cell peaks
# Thresholds used in the query below. Note that `max_mapq` here is the minimum
# value required of each peak's `max_mapq` column.
min_reads, max_mapq = 10, 60
cell_file_suffix = ".labelled.bed.gz"

cell_frames = []
for f in tqdm(snakemake.input.cells):  # type: ignore
    df = pd.read_csv(f, sep="\t").query(
        "max_mapq >= @max_mapq and n_reads >= @min_reads"
    )
    df.columns = df.columns.str.replace("#", "")

    # Derive the cell id by removing the file suffix as a whole.
    # `str.rstrip(".labelled.bed.gz")` strips any trailing run of the
    # characters ".labedgz", so ids ending in one of those letters were
    # truncated and then silently lost in the inner merge with `meta`.
    file_name = Path(f).name
    if file_name.endswith(cell_file_suffix):
        cell_id = file_name[: -len(cell_file_suffix)]
    else:
        cell_id = file_name

    df["cell_id"] = cell_id
    df["tissue"] = "DLPFC" if "usd" in cell_id.lower() else "HIP"
    cell_frames.append(df)

# Stack all cells and attach sample/donor metadata (inner join on `cell_id`).
data = pd.concat(cell_frames).reset_index(drop=True).merge(meta, on="cell_id")

# Derived per-peak features. Peaks narrower than 400 bp are discarded first;
# the remaining columns are read-count ratios computed from the raw counts.
data = (
    data.assign(Width=lambda peaks: peaks["End"] - peaks["Start"])
    .query("Width >= 400")
    .assign(
        # share of reads on the more frequent of the two orientations
        orientation_bias=lambda peaks: (
            np.maximum(peaks["n_fwd"], peaks["n_rev"]) / peaks["n_reads"]
        ),
        frac_proper_pairs=lambda peaks: peaks["n_proper_pairs"] / peaks["n_reads"],
        # duplicates are expressed relative to reads + duplicates
        frac_duplicates=lambda peaks: (
            peaks["n_duplicates"] / (peaks["n_reads"] + peaks["n_duplicates"])
        ),
        frac_unique_3end=lambda peaks: peaks["n_unique_3end"] / peaks["n_reads"],
        frac_unique_5end=lambda peaks: peaks["n_unique_5end"] / peaks["n_reads"],
        frac_mean_supp_alignments=lambda peaks: (
            peaks["num_supp_alignments_mean"] / peaks["n_reads"]
        ),
    )
)

# Report how much data survived loading and filtering.
ncells, ndonors = (data[col].nunique() for col in ("cell_id", "donor_id"))
print(f"Loaded {len(data)} peaks from {ncells} cells from {ndonors} donors.")

# Replace the `bulk` / `bulk_id` columns that came with the input (the drop
# raises KeyError if either is missing) by a fresh `bulk` column holding the
# number of bulk peaks each single-cell peak overlaps.
# NOTE(review): `bulk` is not filtered by donor here, so overlaps with any
# donor's bulk peaks are counted — confirm this is intended.
bulk_ranges = pr.PyRanges(bulk)
data = (
    pr.PyRanges(data.drop(columns=["bulk", "bulk_id"]))
    .count_overlaps(bulk_ranges, overlap_col="bulk")
    .df
)

## Compute sharing of peaks across cells and donors

In [None]:
# Collapse overlapping single-cell peaks into clusters and record their width.
merged = (
    pr.PyRanges(data)
    .merge()
    .df.assign(Width=lambda clusters: clusters["End"] - clusters["Start"])
)

# plot width of clusters: cumulative counts on a log-scaled, shared x axis
g, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6), sharex=True, sharey=False)
for ax, frame in ((ax1, data), (ax2, merged)):
    sns.ecdfplot(data=frame, x="Width", stat="count", log_scale=True, ax=ax)
ax1.set_title("Single-cell peaks")
ax2.set_title("Peak clusters")

## Look at peaks per cell per donor

In [None]:
g, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 8))

# Left panel: number of peaks in every cell, one box per `libd_id`,
# coloured by race (rows ordered by race, descending).
peaks_per_cell = (
    data.groupby(["cell_id", "libd_id", "donor_id", "race"])
    .size()
    .reset_index(name="n_peaks")
    .sort_values("race", ascending=False)
)
sns.boxplot(data=peaks_per_cell, x="n_peaks", y="libd_id", hue="race", ax=ax1)
ax1.set(xlabel="# single-cell peaks", ylabel=None)

# Right panel: one cumulative curve per donor of the bulk-overlap count,
# with a fixed colour for each race.
race_levels = ["CAUC", "AA"]
colors = sns.color_palette("tab10", n_colors=2)
opts = {
    "hue_order": race_levels,
    "hue": "race",
    "palette": dict(zip(race_levels, colors)),
}
for _, donor_peaks in data.groupby("donor_id"):
    sns.ecdfplot(data=donor_peaks, x="bulk", stat="count", ax=ax2, **opts)

ax2.set_xlabel("Number of bulk peak overlaps")
ax2.set_ylabel("# single-cell peaks")
ax2.set_title("Single-cell peaks vs bulk peaks overlaps")

## Remove reference insertions

In [None]:
# Row-wise label from pyslavseq's `collate_labels`, computed with a progress
# bar and stored as the `label` column.
row_labels = data.progress_apply(collate_labels, axis=1)
data["label"] = row_labels


def collate_labels_with_bulk(row):
    """Keep a germline label only for peaks that are supported by bulk data.

    Parameters
    ----------
    row
        Object exposing `label` and `bulk` attributes (e.g. a DataFrame row).

    Returns
    -------
    str
        "KRGL" or "KNRGL" when `row.bulk` is truthy and `row.label` equals that
        value; "OTHER" in every other case.

    Raises
    ------
    AssertionError
        If `row` has no `label` attribute.
    """
    assert hasattr(row, "label"), "No label column found."

    # Without bulk support the original label is irrelevant.
    if not row.bulk:
        return "OTHER"

    for germline_label in ("KRGL", "KNRGL"):
        if row.label == germline_label:
            return germline_label
    return "OTHER"


# Re-label every peak taking bulk support into account.
data["label"] = data.progress_apply(collate_labels_with_bulk, axis=1)

# Order by genomic position, discard the peaks labelled KRGL and renumber
# the rows.
data = (
    data.sort_values(["Chromosome", "Start"])
    .query("label != 'KRGL'")
    .reset_index(drop=True)
)

In [None]:
def germline_distance(donor_df: pd.DataFrame) -> pd.DataFrame:
    """Annotate one donor's peaks with the distance to that donor's nearest bulk peak.

    Parameters
    ----------
    donor_df : pd.DataFrame
        Peaks of a single donor. Needs a `donor_id` column holding exactly one
        unique value and whatever columns `pr.PyRanges` requires.

    Returns
    -------
    pd.DataFrame
        The donor's peaks sorted by Chromosome and Start, with an added
        `germline_distance` column: the absolute `Distance` reported by
        `PyRanges.nearest(..., overlap=False)` against the donor's merged bulk
        peaks. Peaks for which `nearest` reports no row get NaN.

    Raises
    ------
    AssertionError
        If `donor_df` does not contain exactly one donor.
    """
    donor_ids = donor_df["donor_id"].unique()
    assert len(donor_ids) == 1, "Expected only one donor"
    d = donor_ids[0]  # referenced as @d in the query below

    # This donor's bulk peaks, with overlapping intervals merged.
    gdf = pr.PyRanges(bulk.query("donor_id == @d")).merge()
    donor_df = pr.PyRanges(donor_df).df.sort_values(["Chromosome", "Start"])

    # Tag every peak with an explicit key before calling `nearest`. The frame
    # returned by `nearest(...).df` has its own fresh index and may contain
    # fewer rows or a different row order than `donor_df`, so assigning its
    # `Distance` column by index alignment can attach distances to the wrong
    # peaks. The key makes the mapping back unambiguous.
    row_key = "_germline_row_id"
    donor_df[row_key] = range(len(donor_df))
    nearest = pr.PyRanges(donor_df).nearest(gdf, overlap=False).df

    if nearest.empty:
        # No peak had a nearest bulk interval at all.
        donor_df["germline_distance"] = float("nan")
    else:
        # groupby(...).min() keeps the lookup index unique even if a peak were
        # reported more than once.
        distance_by_row = nearest["Distance"].abs().groupby(nearest[row_key]).min()
        donor_df["germline_distance"] = donor_df[row_key].map(distance_by_row)

    return donor_df.drop(columns=row_key)


# Compute the germline distance donor by donor (with a progress bar), then
# stitch the per-donor frames back together ordered by genomic position.
peaks_by_donor = data.groupby(["donor_id"])
with_distance = peaks_by_donor.progress_apply(germline_distance)
data = with_distance.reset_index(drop=True).sort_values(["Chromosome", "Start"])

In [None]:
# Cumulative number of peaks by distance to the nearest germline (bulk) peak,
# one curve per label, on a log-scaled x axis.
sns.ecdfplot(
    data=data,
    x="germline_distance",
    hue="label",
    stat="count",
    log_scale=True,
)

In [None]:
g, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 8))

# Left panel: number of peaks per cell and label, one box per `libd_id`,
# coloured by label (rows ordered by race, descending).
peaks_per_cell = (
    data.groupby(["cell_id", "libd_id", "donor_id", "race", "label"])
    .size()
    .reset_index(name="n_peaks")
    .sort_values("race", ascending=False)
)
sns.boxplot(data=peaks_per_cell, x="n_peaks", y="libd_id", hue="label", ax=ax1)
ax1.set(xlabel="# single-cell peaks", ylabel=None)

# Right panel: one cumulative curve per donor of the bulk-overlap count,
# with a fixed colour for each race.
race_levels = ["CAUC", "AA"]
colors = sns.color_palette("tab10", n_colors=2)
opts = {
    "hue_order": race_levels,
    "hue": "race",
    "palette": dict(zip(race_levels, colors)),
}
for _, donor_peaks in data.groupby("donor_id"):
    sns.ecdfplot(data=donor_peaks, x="bulk", stat="count", ax=ax2, **opts)

ax2.set_xlabel("Number of bulk peak overlaps")
ax2.set_ylabel("# single-cell peaks")
ax2.set_title("Single-cell peaks vs bulk peaks overlaps")

In [None]:
# Rebuild the peak clusters from the current `data` and record their width.
merged = (
    pr.PyRanges(data)
    .merge()
    .df.assign(Width=lambda clusters: clusters["End"] - clusters["Start"])
)

# plot width of clusters: cumulative counts on a log-scaled, shared x axis
g, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6), sharex=True, sharey=False)
for ax, frame in ((ax1, data), (ax2, merged)):
    sns.ecdfplot(data=frame, x="Width", stat="count", log_scale=True, ax=ax)
ax1.set_title("Single-cell peaks")
ax2.set_title("Peak clusters")

In [None]:
" ".join([c for c in data.columns])

In [None]:
# Every panel plots one peak feature (y) against the distance to the nearest
# germline peak (x). The third tuple element is handed to `datashader_plot`
# unchanged — presumably per-axis log-scale flags; confirm against its
# signature.
x_feature = "germline_distance"
y_features = [
    "three_end_clippedA_mean",
    "three_end_clippedA_q0",
    "three_end_clippedA_q1",
    "alignment_score_mean",
    "alignment_score_normed_mean",
    "n_proper_pairs",
    "n_unique_5end",
    "n_unique_clipped_3end",
    "5end_gini",
]

# `n_reads` is the only feature that uses (True, True).
plots = [(x_feature, "n_reads", (True, True))]
plots += [(x_feature, y_feature, (True, False)) for y_feature in y_features]

for x, y, s in plots:
    datashader_plot(data, x, y, s, plot_width=100, plot_height=100)

## Save data

In [None]:
# Write the peaks in genomic order to the workflow's first output file.
data = data.sort_values(["Chromosome", "Start"])
df2tabix(data, snakemake.output[0])