# Peak calling testing

In [1]:
from pathlib import Path
import logging
from tqdm import tqdm
from collections import defaultdict

logging.basicConfig(level=logging.INFO)

import pysam
import pandas as pd
import pyranges as pr
import seaborn as sns
import matplotlib.pyplot as plt
from scripts.pyslavseq.sliding_window import SlidingWindow

INFO:numexpr.utils:Note: NumExpr detected 48 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


### Load donor data

In [2]:
# Find the knrgl bed, the bulk (gDNA) BAM and the single-cell BAMs of each individual.
# Result: indv_data[donor_id] -> {"bulk": str, "knrgl": str, "cells": list[str]}
individuals = pd.read_csv(
    "/iblm/logglun02/mcuoco/workflows/sz_slavseq/config/7donor_donors.tsv", sep="\t"
)["donor_id"].values.astype(str)
indv_data = defaultdict(dict)
outdir = "/iblm/netapp/data4/mcuoco/sz_slavseq"

for i in individuals:
    align_dir = Path(f"{outdir}/results/align/{i}")

    # glob order depends on the filesystem; sort so that the selected bulk BAM
    # (and later `cells[0]`) is the same on every run
    bulk_bams = sorted(align_dir.glob("gDNA*tagged.sorted.bam"))
    if len(bulk_bams) != 1:
        logging.warning(
            "expected exactly one bulk BAM for donor %s in %s, found %d",
            i,
            align_dir,
            len(bulk_bams),
        )
    if bulk_bams:
        # as before, the last match wins when several files match the pattern;
        # when nothing matches, the "bulk" key is left unset (as before)
        indv_data[i]["bulk"] = str(bulk_bams[-1])

    # stored as a string; the file's existence is not checked here
    indv_data[i]["knrgl"] = str(
        Path(outdir) / "resources" / f"{i}_insertions_1kb_3end.bed"
    )

    # every tagged+sorted BAM below the donor directory that is not a gDNA (bulk) file
    indv_data[i]["cells"] = sorted(
        str(f) for f in align_dir.rglob("*tagged.sorted.bam") if "gDNA" not in f.name
    )

In [3]:
# GIAB v3.0 GRCh38 "all difficult regions" stratification, fetched over the network.
# The first line of the file is skipped and the three columns are named explicitly.
blacklist = pd.read_csv(
    "https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.0/GRCh38/union/GRCh38_alldifficultregions.bed.gz",
    names=["Chromosome", "Start", "End"],
    header=None,
    skiprows=1,
    sep="\t",
)

# rmsk BED loaded as a plain DataFrame, then ordered by genomic coordinate
rmsk = pr.read_bed(
    "/iblm/netapp/data4/mcuoco/sz_slavseq/resources/rmsk_1kb_3end.bed", as_df=True
)
rmsk = rmsk.sort_values(by=["Chromosome", "Start", "End"])

## Bulk peaks

In [4]:
# open the bulk BAM of donor "1"
bam = pysam.AlignmentFile(indv_data["1"]["bulk"])

# donor "1" knrgl intervals, all labelled "KNRGL"
knrgl = pd.read_csv(indv_data["1"]["knrgl"], sep="\t")
knrgl["Name"] = "KNRGL"

# stack knrgl and rmsk intervals into one coordinate-sorted table
line1 = pd.concat(
    [
        intervals[["Chromosome", "Start", "End", "Name"]]
        for intervals in (knrgl, rmsk)
    ]
).sort_values(by=["Chromosome", "Start", "End"])

### Test Peak Callers

In [5]:
# speed test: profile SlidingWindow construction plus window making on chr22
# NOTE(review): these imports live in this cell rather than the notebook's import cell
import cProfile, pstats, io

# everything inside the `with` block runs under the profiler
with cProfile.Profile() as prof:
    # in the recorded run, ~30 s of the ~76 s total was spent in
    # SlidingWindow.__init__ -> AlignmentFile.count
    spc = SlidingWindow(bam, contigs=["chr22"], min_mapq=5)
    peaks = []
    for p in spc.make_windows(
        size=200,
        step=1,
        min_rpm=2,
        strand_split=True,
        merge=True,
        features=True,
    ):
        peaks.append(p)

    # render the profile into an in-memory buffer, sorted by cumulative time,
    # and print the whole report
    # NOTE(review): the report is built while still inside the `with` block and
    # print_stats() is unrestricted, so the cell output is very long
    s = io.StringIO()
    ps = pstats.Stats(prof, stream=s).sort_stats(pstats.SortKey.CUMULATIVE)
    ps.print_stats()
    print(s.getvalue())

INFO:root:4380613 filtered reads in the bam file
INFO:root:Making windows on chr22


         205056476 function calls (205050943 primitive calls) in 76.057 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      127    0.007    0.000   46.004    0.362 /iblm/logglun02/mcuoco/workflows/sz_slavseq/workflow/scripts/pyslavseq/sliding_window.py:263(make_windows)
      128    0.250    0.002   45.456    0.355 /iblm/logglun02/mcuoco/workflows/sz_slavseq/workflow/scripts/pyslavseq/sliding_window.py:145(merge)
    59252   36.409    0.001   44.880    0.001 /iblm/logglun02/mcuoco/workflows/sz_slavseq/workflow/scripts/pyslavseq/sliding_window.py:109(windows)
        1    0.000    0.000   30.070   30.070 /iblm/logglun02/mcuoco/workflows/sz_slavseq/workflow/scripts/pyslavseq/sliding_window.py:81(__init__)
        1    0.000    0.000   30.066   30.066 {method 'count' of 'pysam.libcalignmentfile.AlignmentFile' objects}
        1    4.345    4.345   30.066   30.066 pysam/libcalignmentfile.pyx:1353(count)
  8761226    5.656  

In [None]:
# sliding window: size-200 windows, step 1, on chr1 of the currently open `bam`
spc = SlidingWindow(bam, contigs=["chr1"], min_mapq=5)

# collect every item yielded by make_windows, then build one DataFrame from them
peaks = list(
    spc.make_windows(
        size=200,
        step=1,
        min_rpm=2,
        strand_split=True,
        merge=True,
        features=True,
    )
)
peaks = pd.DataFrame(peaks)

In [None]:
# only peaks whose max_mapq equals 60 are compared against the line1 intervals
_, peak_df, line_df = spc.coverage(
    peaks.loc[peaks["max_mapq"] == 60],
    line1,
)

In [None]:
# ECDF of the "diff" column, one curve per Name; log x axis with its lower limit at 1
sns.ecdfplot(
    data=peak_df,
    x="diff",
    hue="Name",
).set(xscale="log", xlim=(1, None))

In [None]:
# box plot of alignment_score_mean, one box per Name
sns.boxplot(
    data=peak_df,
    x="alignment_score_mean",
    y="Name",
)

In [None]:
# box plot of num_supp_alignments_mean, one box per Name
sns.boxplot(
    data=peak_df,
    x="num_supp_alignments_mean",
    y="Name",
)

In [None]:
# rows of peak_df labelled "NoneNR"
# NOTE(review): where this label comes from is not visible in the notebook - confirm
peak_df.loc[peak_df["Name"] == "NoneNR"]

In [None]:
# rows of peak_df labelled "KNRGL"
peak_df.loc[peak_df["Name"] == "KNRGL"]

#### MACS2 (poor coverage of KNRGL and RMSK)

In [None]:
from subprocess import Popen, PIPE, DEVNULL, CalledProcessError
from tempfile import NamedTemporaryFile, TemporaryDirectory

In [None]:
def macs2(bam: str, extra: str = ""):
    """
    Run MACS2 on bam file

    Runs ``macs2 callpeak -t <bam> -g hs`` with a temporary output directory
    and loads the resulting ``test_peaks.narrowPeak`` table.

    Parameters
    ----------
    bam : str
        Path of the BAM file handed to ``macs2 callpeak -t``.
    extra : str
        Additional command-line options appended to the command. The string
        is split with shell-style quoting rules; no shell is involved, so
        redirections, globs and variable expansion are not interpreted.

    Returns
    -------
    pd.DataFrame
        One row per line of the narrowPeak file with the columns listed in
        ``names`` plus ``width`` (``End - Start``).

    Raises
    ------
    CalledProcessError
        If ``macs2`` exits with a non-zero status.
    """
    import shlex  # local import: shlex is not imported in the notebook's import cells

    names = [
        "Chromosome",
        "Start",
        "End",
        "Name",
        "Score",
        "Strand",
        "signalValue",
        "pValue",
        "qValue",
        "peak",
    ]

    with TemporaryDirectory() as tmpdir:
        # argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through untouched
        cmd = [
            "macs2",
            "callpeak",
            "-t",
            str(bam),
            "-g",
            "hs",
            "--outdir",
            tmpdir,
            "--name",
            "test",
            *shlex.split(extra),
        ]
        proc = Popen(cmd)
        proc.communicate()
        # Popen never raises CalledProcessError by itself; check the exit
        # status explicitly so a failed run is reported as such instead of
        # surfacing later as a missing narrowPeak file
        if proc.returncode != 0:
            raise CalledProcessError(proc.returncode, cmd)

        # must be read before the temporary directory is removed
        df = pd.read_csv(
            f"{tmpdir}/test_peaks.narrowPeak", sep="\t", header=None, names=names
        )

        df["width"] = df["End"] - df["Start"]

    return df

In [None]:
# Filter the bulk BAM with samtools (-F 2052 drops reads flagged unmapped or
# supplementary) into a temporary BAM, then call peaks on it with MACS2.
with NamedTemporaryFile(suffix=".bam") as tmpfile:
    # `bam` is a pysam.AlignmentFile handle, not a path, so the path it was
    # opened from is passed to samtools instead
    samtools_cmd = ["samtools", "view", "-bh", "-F", "2052", indv_data["1"]["bulk"]]
    samtools_proc = Popen(samtools_cmd, stdout=tmpfile, stderr=PIPE)
    _, samtools_err = samtools_proc.communicate()
    # fail here, with samtools' own stderr attached, rather than letting MACS2
    # run on an empty or truncated file
    if samtools_proc.returncode != 0:
        raise CalledProcessError(
            samtools_proc.returncode, samtools_cmd, stderr=samtools_err
        )

    # macs2 opens the file by name, so it has to run while tmpfile still exists
    peaks = macs2(tmpfile.name, extra="-f BAMPE --nolambda --nomodel --extsize 50")

## Single-cell Peaks

In [None]:
# open the first single-cell BAM listed for donor "1"
bam = pysam.AlignmentFile(indv_data["1"]["cells"][0])

# donor "1" knrgl intervals, all labelled "KNRGL"
knrgl = pd.read_csv(indv_data["1"]["knrgl"], sep="\t")
knrgl["Name"] = "KNRGL"

# stack knrgl and rmsk intervals into one coordinate-sorted table
line1 = pd.concat(
    [
        intervals[["Chromosome", "Start", "End", "Name"]]
        for intervals in (knrgl, rmsk)
    ]
).sort_values(by=["Chromosome", "Start", "End"])

In [None]:
# sliding window: size-200 windows, step 1, on chr1 of the currently open `bam`
# (features=False in this run)
spc = SlidingWindow(bam, contigs=["chr1"], min_mapq=5)

# collect every item yielded by make_windows, then build one DataFrame from them
peaks = list(
    spc.make_windows(
        size=200,
        step=1,
        min_rpm=2,
        strand_split=True,
        merge=True,
        features=False,
    )
)
peaks = pd.DataFrame(peaks)

In [None]:
# only peaks whose max_mapq equals 60 are compared against the line1 intervals
_, peak_df, line_df = spc.coverage(
    peaks.loc[peaks["max_mapq"] == 60],
    line1,
)

In [None]:
# ECDF of the "diff" column, one curve per Name; log x axis with its lower limit at 1
sns.ecdfplot(
    data=peak_df,
    x="diff",
    hue="Name",
).set(xscale="log", xlim=(1, None))