Compute sensitivity of bulk and single-cell SLAVseq signals

In [None]:
from pathlib import Path
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import pyranges as pr
from upsetplot import UpSet

## Bulk sensitivity

In [None]:
# Donor metadata table; donor_id is read as a string so it compares equal to
# the directory names that are used as donor ids below.
meta = pd.read_csv(snakemake.config["donors"], sep="\t", dtype={"donor_id": str})  # type: ignore

# Reference L1HS intervals; the same annotation is joined against every donor.
l1hs = pr.read_bed(snakemake.input.l1hs_rmsk)  # type: ignore

# Peak columns carried through each interval join.
_PEAK_COLUMNS = ["Chromosome", "Start", "End", "n_reads", "n_proper_pairs"]


def _bulk_coverage(intervals, peaks, donor):
    """Left-join `intervals` against `peaks` and label the rows with `donor`.

    Negative read counts in the joined frame are raised to 0 (presumably the
    placeholder pyranges writes for intervals that overlap no peak -- confirm
    against the pyranges join documentation).
    """
    joined = intervals.join(pr.PyRanges(peaks[_PEAK_COLUMNS]), how="left").df
    for column in ("n_reads", "n_proper_pairs"):
        joined[column] = joined[column].clip(lower=0)
    joined["donor_id"] = donor
    return joined


bdata, ldata, mdata = [], [], []
for bulk_path, megane_path in zip(snakemake.input.bulk, snakemake.input.megane):  # type: ignore
    bulk_donor = Path(bulk_path).parent.name
    megane_donor = Path(megane_path).parent.name
    # Both input lists must be in the same donor order for the zip to pair them.
    assert megane_donor == bulk_donor, "filenames are not sorted properly"

    # Bulk peaks: keep those with at least 5 reads, drop "#" from the header.
    peaks = pd.read_csv(bulk_path, sep="\t").query("n_reads >= 5")
    peaks.columns = peaks.columns.str.replace("#", "")
    peaks["donor_id"] = bulk_donor
    bdata.append(peaks)

    # megane calls: the BED "Strand" column is reused as the integer AC value,
    # and only rows whose "Score" column mentions L1HS are kept.
    calls = pr.read_bed(megane_path).df
    calls["AC"] = calls["Strand"].astype(int)
    calls = pr.PyRanges(calls[calls["Score"].str.contains("L1HS")])
    mdata.append(_bulk_coverage(calls, peaks, megane_donor))

    # Reference L1HS intervals covered by this donor's peaks.
    ldata.append(_bulk_coverage(l1hs, peaks, bulk_donor))

# All bulk peaks, annotated with donor metadata and peak width.
bdata = pd.concat(bdata).merge(meta, on="donor_id")
bdata["Width"] = bdata["End"] - bdata["Start"]

# Reference L1HS: one row per (donor, locus) holding the largest read count.
ldata = pd.concat(ldata).merge(meta, on="donor_id")
ldata["locus"] = tuple(zip(ldata["Chromosome"], ldata["Start"], ldata["End"]))
ldata = ldata.groupby(["donor_id", "race", "locus"])["n_reads"].max()
ldata = ldata.reset_index()
ldata["locus"] = ldata["locus"].astype(str)

# megane L1HS calls: same reduction, additionally keyed by AC.
mdata = pd.concat(mdata).merge(meta, on="donor_id")
mdata["locus"] = tuple(zip(mdata["Chromosome"], mdata["Start"], mdata["End"]))
mdata = mdata.groupby(["donor_id", "race", "locus", "AC"])["n_reads"].max()
mdata = mdata.reset_index()
mdata["locus"] = mdata["locus"].astype(str)

print(f"Loaded {len(bdata)} peaks from {bdata['donor_id'].nunique()} donors")

In [None]:
# cdf plots
# One ECDF curve per donor, coloured by race: reference L1HS on the left axis,
# megane (WGS-derived) L1HS calls on the right axis.
g, axs = plt.subplots(1, 2, figsize=(16, 8))
cols = sns.color_palette("tab10", n_colors=2)

# Keyword arguments shared by every ecdfplot call.
opts = dict(
    hue="race",
    hue_order=["CAUC", "AA"],
    palette={"CAUC": cols[0], "AA": cols[1]},
    alpha=0.5,
    stat="count",
    log_scale=True,
)

for donor in bdata["donor_id"].unique():
    for ax, data in zip(axs, (ldata, mdata)):
        per_donor = data[data["donor_id"] == donor]
        # Keep, for each locus, the row with the highest read count.
        top_rows = per_donor.groupby("locus")["n_reads"].idxmax()
        # Shift counts by one so zero-read loci can sit on the log-scaled axis.
        df = per_donor.loc[top_rows].assign(n_reads=lambda t: t["n_reads"] + 1)
        sns.ecdfplot(df, x="n_reads", ax=ax, **opts)

for ax, title in zip(
    axs, ("# Reference L1HS", "# Non-Reference L1HS (detected from WGS)")
):
    ax.set_title(title)

In [None]:
# boxplots
# One box per locus (distribution of per-donor read counts), with loci ordered
# by their mean read count. Top panel: reference L1HS; bottom: megane calls.
g, axs = plt.subplots(2, 1, figsize=(16, 16))

for ax, data in zip(axs, (ldata, mdata)):
    # Loci sorted from lowest to highest mean read count.
    locus_order = data.groupby(["locus"])["n_reads"].mean().sort_values().index

    # Largest read count per (locus, donor), rearranged to follow locus_order.
    per_donor_max = data.groupby(["locus", "donor_id"])["n_reads"].max()
    df = per_donor_max.reset_index().set_index("locus")
    df = df.loc[locus_order].reset_index()
    df["locus"] = df["locus"].astype(str)

    sns.boxplot(
        data=df,
        x="locus",
        y="n_reads",
        showfliers=False,
        ax=ax,
        log_scale=True,
    )

    # One tick per locus is unreadable, so the x tick labels are blanked.
    ax.set_xticklabels([])

for ax, title in zip(
    axs, ("Reference L1HS", "Non-Reference L1HS (detected from WGS)")
):
    ax.set_title(title)

In [None]:
# KRGL heatmap
# locus x donor matrix of the largest read count seen at each reference L1HS
# locus; locus/donor combinations with no row are filled with 0.
per_donor_max = ldata.groupby(["locus", "donor_id"])["n_reads"].max().reset_index()
df = per_donor_max.pivot_table(index="locus", columns="donor_id", values="n_reads")
df = df.fillna(0)

# Log colour scale; the lower bound is the matrix minimum plus one.
# NOTE(review): cells equal to 0 sit below vmin on this scale -- confirm that
# is the intended rendering.
lowest = df.min().min() + 1
highest = df.max().max()
log_norm = colors.LogNorm(vmin=lowest, vmax=highest)
sns.clustermap(df, cmap="viridis", norm=log_norm, yticklabels=False, method="ward")

In [None]:
# KNRGL heatmap
# Each (locus, donor) pair must occur at most once in mdata for the matrix
# below to be meaningful.
locus_donor_pairs = mdata[["locus", "donor_id"]]
assert not locus_donor_pairs.duplicated().any(), "duplicate locus-donor pairs found!"

# Rows are ordered by the summed AC of each locus, ascending.
locus_order = mdata.groupby(["locus"])["AC"].sum().sort_values().index

# locus x donor matrix of read counts; missing combinations become 0.
per_donor_max = mdata.groupby(["locus", "donor_id"])["n_reads"].max().reset_index()
df = per_donor_max.pivot_table(index="locus", columns="donor_id", values="n_reads")
df = df.fillna(0)

# Log colour scale; the lower bound is the matrix minimum plus one.
lowest = df.min().min() + 1
highest = df.max().max()
log_norm = colors.LogNorm(vmin=lowest, vmax=highest)

# Clustering is switched off on both axes so the AC-based row order is kept.
sns.clustermap(
    df.loc[locus_order],
    cmap="viridis",
    norm=log_norm,
    yticklabels=False,
    row_cluster=False,
    col_cluster=False,
)

## Single cells

In [None]:
# File-name suffix that separates a cell id from the rest of the peak file name.
_CELL_FILE_SUFFIX = ".labelled.bed.gz"

# Peak columns carried through each interval join.
_CELL_PEAK_COLUMNS = ["Chromosome", "Start", "End", "n_reads", "n_proper_pairs"]


def _cell_id_from_path(path):
    """Return the file name of `path` without the ".labelled.bed.gz" suffix.

    `str.rstrip` must not be used here: it strips any trailing run of the
    characters in its argument rather than the suffix itself, which truncates
    cell ids that happen to end in one of those characters. Names that do not
    end with the suffix are returned unchanged.
    """
    name = Path(path).name
    if name.endswith(_CELL_FILE_SUFFIX):
        return name[: -len(_CELL_FILE_SUFFIX)]
    return name


def _cell_coverage(intervals, peaks, donor, cell):
    """Left-join `intervals` against one cell's `peaks` and label the rows.

    Negative read counts in the joined frame are raised to 0 (presumably the
    placeholder pyranges writes for intervals that overlap no peak -- confirm
    against the pyranges join documentation).
    """
    joined = intervals.join(pr.PyRanges(peaks[_CELL_PEAK_COLUMNS]), how="left").df
    for column in ("n_reads", "n_proper_pairs"):
        joined[column] = joined[column].clip(lower=0)
    joined["donor_id"] = donor
    joined["cell_id"] = cell
    return joined


data, ldata, mdata = [], [], []

# NOTE(review): `tqdm` is not imported in the import cell at the top of this
# notebook; confirm it is brought into scope before this cell runs.
# iterate over donors; the progress bar counts cells across all donors
with tqdm(total=len(snakemake.input.cells)) as pbar:  # type: ignore
    for m in snakemake.input.megane:  # type: ignore
        # megane calls for this donor, restricted to rows mentioning L1HS
        meg = pr.read_bed(m).df
        meg_l1hs = meg["Score"].str.contains("L1HS")
        meg = pr.PyRanges(meg[meg_l1hs])

        # the donor id is the name of the directory holding the megane file
        donor_id = Path(m).parent.name

        # cells belonging to this donor share that directory name
        donor_cells = [
            c for c in snakemake.input.cells if Path(c).parent.name == donor_id  # type: ignore
        ]
        print(f"Found {len(donor_cells)} cells for donor {donor_id}")

        # iterate over cells
        for c in donor_cells:
            pbar.update()
            cell_id = _cell_id_from_path(c)

            # peaks of this cell with at least 5 reads; "#" dropped from header
            df = pd.read_csv(c, sep="\t").query("n_reads >= 5")
            df.columns = df.columns.str.replace("#", "")
            df["donor_id"] = Path(c).parent.name
            df["cell_id"] = cell_id
            data.append(df)

            # megane
            mdf = _cell_coverage(meg, df, donor_id, cell_id)
            mdata.append(mdf)

            # l1hs (reference annotation loaded in the bulk cell above)
            ldf = _cell_coverage(l1hs, df, donor_id, cell_id)
            ldata.append(ldf)

# Attach donor metadata and a (chromosome, start, end) locus key.
ldata = pd.concat(ldata).merge(meta, on="donor_id")
ldata["locus"] = tuple(zip(ldata["Chromosome"], ldata["Start"], ldata["End"]))
mdata = pd.concat(mdata).merge(meta, on="donor_id")
mdata["locus"] = tuple(zip(mdata["Chromosome"], mdata["Start"], mdata["End"]))
data = pd.concat(data).merge(meta, on="donor_id")
data["Width"] = data["End"] - data["Start"]
print(
    f"Loaded {len(data)} peaks from {data['cell_id'].nunique()} cells from {data['donor_id'].nunique()} donors"
)

In [None]:
def _per_cell_sensitivity(frame):
    """Return, per cell, the fraction of rows in `frame` with n_reads > 0.

    Rows are grouped by cell_id, donor_id, race and diagnosis; the result has
    those four columns plus a "sensitivity" column.
    """
    keys = ["cell_id", "donor_id", "race", "diagnosis"]
    detected = frame["n_reads"] > 0
    # The mean of a boolean column is the share of True values in each group.
    rate = detected.groupby([frame[key] for key in keys]).mean()
    return rate.reset_index(name="sensitivity")


# Reference L1HS loci.
ldf = _per_cell_sensitivity(ldata)

# Non-reference L1HS calls from megane.
mdf = _per_cell_sensitivity(mdata)

In [None]:
# TODO: make this 2d?
# Histograms of per-cell sensitivity: reference L1HS on the left, megane
# (WGS-derived) L1HS calls on the right, sharing one y axis.
g, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8), sharey=True)
plt.subplots_adjust(wspace=0.1)  # Adjust to your needs

# NOTE(review): sensitivity is computed above as a fraction (count / len), so
# the "%" in the axis labels may be misleading -- confirm the intended unit.
sns.histplot(ldf, x="sensitivity", bins=100, ax=ax1)
ax1.set_title("Reference L1HS")
ax1.set_xlabel("% covered (sensitivity)")
ax1.set_ylabel("# cells")

sns.histplot(data=mdf, x="sensitivity", bins=100, ax=ax2)
ax2.set_title("Non-Reference L1HS (detected from WGS)")
ax2.set_xlabel("% covered (sensitivity)")

In [None]:
# Per-donor box plots of per-cell sensitivity, coloured by race: reference
# L1HS on the left, megane (WGS-derived) L1HS calls on the right.
g, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8), sharey=True)
plt.subplots_adjust(wspace=0.1)  # Adjust to your needs

sns.boxplot(data=ldf, x="sensitivity", y="donor_id", hue="race", ax=ax1)
sns.boxplot(data=mdf, x="sensitivity", y="donor_id", hue="race", ax=ax2)
ax1.set_title("Reference L1HS")
ax1.set_xlabel("% covered (sensitivity)")
ax2.set_title("Non-Reference L1HS (detected from WGS)")
ax2.set_xlabel("% covered (sensitivity)")