In [None]:
# Notebook setup: imports, warning filter, tqdm/pandas integration and plot constants.
from pathlib import Path
import warnings, math

# Suppress every FutureWarning raised from this point on (installed before the
# third-party imports below run).
warnings.filterwarnings("ignore", category=FutureWarning)

from tqdm.notebook import tqdm
import pandas as pd

# Hook tqdm into pandas (tqdm's pandas integration, e.g. `progress_apply`).
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns
import pyranges as pr
from pyslavseq.preprocessing import collate_labels, df2tabix
from pyslavseq.plotting import datashader_plot

# Fixed ordering of three label categories. Not referenced in the cells visible
# here -- presumably passed as a seaborn `hue_order` elsewhere (TODO confirm).
HUE_ORDER = ["KNRGL", "OTHER", "KRGL"]

In [None]:
# Load donor metadata, per-donor bulk peaks, and the bulk read counts that
# overlap (a) MEGAnE calls and (b) reference L1HS annotations.
meta = pd.read_csv(snakemake.config["donors"], sep="\t", dtype={"donor_id": str})  # type: ignore

# Count columns carried over from the bulk peaks into the joined tables.
COUNT_COLS = ["n_reads", "n_proper_pairs"]


def _bulk_counts_over(regions, peaks, donor_id):
    """Left-join ``regions`` with bulk ``peaks`` and tag the rows with a donor.

    Parameters
    ----------
    regions : pyranges.PyRanges
        Intervals to annotate; every interval is kept (left join).
    peaks : pyranges.PyRanges
        Bulk peaks carrying the ``COUNT_COLS`` columns.
    donor_id : str
        Value written to the ``donor_id`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per (region, overlapping peak) pair, or a single row for a
        region without any overlapping peak, with negative counts set to 0.
    """
    joined = regions.join(peaks, how="left").df
    for col in COUNT_COLS:
        # Negative counts (the filler used for regions with no overlapping
        # peak) are reported as zero coverage; NaN values pass through.
        joined[col] = joined[col].clip(lower=0)
    joined["donor_id"] = donor_id
    return joined


bulk_files = snakemake.input.bulk  # type: ignore
megane_files = snakemake.input.megane  # type: ignore
# zip() would silently drop trailing files if the two lists differed in length.
assert len(bulk_files) == len(
    megane_files
), "bulk and megane inputs differ in number"

l1hs = pr.read_bed(snakemake.input.l1hs)  # type: ignore

bulk_frames, l1hs_frames, megane_frames = [], [], []
for bulk_file, megane_file in zip(bulk_files, megane_files):
    # Files are paired by position; the parent directory name is the donor id.
    donor_id = Path(bulk_file).parent.name
    assert (
        Path(megane_file).parent.name == donor_id
    ), "filenames are not sorted properly"

    bdf = pd.read_csv(bulk_file, sep="\t")
    bdf.columns = bdf.columns.str.replace("#", "")
    bdf["donor_id"] = donor_id
    bulk_frames.append(bdf)

    # Build the peak ranges once and reuse them for both joins.
    peaks = pr.PyRanges(bdf[["Chromosome", "Start", "End", *COUNT_COLS]])
    megane_frames.append(_bulk_counts_over(pr.read_bed(megane_file), peaks, donor_id))
    l1hs_frames.append(_bulk_counts_over(l1hs, peaks, donor_id))

# Combine donors and attach metadata; `locus` identifies an interval by its
# (Chromosome, Start, End) triple.
ldata = pd.concat(l1hs_frames).merge(meta, on="donor_id")
ldata["locus"] = list(zip(ldata["Chromosome"], ldata["Start"], ldata["End"]))
mdata = pd.concat(megane_frames).merge(meta, on="donor_id")
mdata["locus"] = list(zip(mdata["Chromosome"], mdata["Start"], mdata["End"]))
bdata = pd.concat(bulk_frames).merge(meta, on="donor_id")
bdata["Width"] = bdata["End"] - bdata["Start"]

In [None]:
# Summarise the loaded bulk peaks: totals, per-donor mean/SD, and column names.
print(f"Loaded {len(bdata)} peaks from {bdata['donor_id'].nunique()} donors")

# Number of peaks per donor, computed once and reused for both statistics.
peaks_per_donor = bdata.groupby("donor_id").size()
avg_peaks = peaks_per_donor.mean()
sd_peaks = peaks_per_donor.std()

# The sample standard deviation is NaN when only one donor is loaded
# (e.g. small test runs), so fall back to reporting the mean alone.
if pd.isna(sd_peaks):
    print(f"{int(avg_peaks)} peaks per donor")
else:
    print(f"{int(avg_peaks)} ± {int(sd_peaks)} peaks per donor")

# print all columns
print("Columns:")
print(*bdata.columns)

In [None]:
# 2x2 grid of per-donor ECDFs: top row uses `ldata`, bottom row uses `mdata`;
# left column is n_reads, right column is n_proper_pairs.
g, axs = plt.subplots(2, 2, figsize=(16, 16), sharey="row", sharex="col")
colors = sns.color_palette("tab10", n_colors=2)

# Keyword arguments shared by every ecdfplot call.
opts = dict(
    hue="race",
    hue_order=["CAUC", "AA"],
    palette={"CAUC": colors[0], "AA": colors[1]},
    alpha=0.5,
    stat="count",
    log_scale=True,
)

for donor in bdata["donor_id"].unique():
    donor_tables = (
        ldata[ldata["donor_id"] == donor],
        mdata[mdata["donor_id"] == donor],
    )
    for row_axes, table in zip(axs, donor_tables):
        # Keep, for every locus, the single row with the highest n_reads.
        top_rows = table.loc[table.groupby("locus")["n_reads"].idxmax()]
        # Shift counts by one so zeros remain drawable on the log-scaled axis.
        shifted = top_rows.assign(
            n_reads=top_rows["n_reads"] + 1,
            n_proper_pairs=top_rows["n_proper_pairs"] + 1,
        )
        for metric, panel in zip(("n_reads", "n_proper_pairs"), row_axes):
            sns.ecdfplot(shifted, x=metric, ax=panel, **opts)

axs[0, 0].set_ylabel("# Reference L1HS")
axs[1, 0].set_ylabel("# Non-Reference L1HS (detected from WGS)")

In [None]:
# save
bdata.sort_values(["Chromosome", "Start"], inplace=True)
df2tabix(bdata, snakemake.output[0])  # type: ignore