In [1]:
import pysam, sys, os
import cProfile, pstats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pyranges as pr
from time import time

sys.path.append((os.path.abspath("..")))
from src.peaks import SlidingPeakCaller, OverlapPeakCaller
from src.util.read_l1 import read_rmsk, read_knrgl

## Read in reference and non-reference insertion annotations

In [2]:
# Load the two insertion annotation sets used throughout the notebook:
# `rmsk` comes from the hs38DH ".fa.out" file and `knrgl` from the LIBD74
# insertions BED file (converted to a plain DataFrame via `.df`).
rmsk_path = "/raidixshare_logg01/mcuoco/internal/BSMN/resources/hs38DH/hs38DH.fa.out"
knrgl_path = "/raidixshare_logg01/mcuoco/internal/BSMN/resources/LIBD74/hs38DH_insertions.bed"

rmsk = read_rmsk(rmsk_path)
knrgl = read_knrgl(knrgl_path).df

## Keep only knrgl insertions that have read coverage in the bulk sample

In [3]:
bulk_fn = "/home/mcuoco/workflows/sz_slavseq/results/align/tags/hs38DH/LIBD74/bulk/gDNA_usd2.bam"

# Count the bulk reads that fall in each knrgl interval (one count per row, in row order).
with pysam.AlignmentFile(bulk_fn) as bam:
	res = [bam.count(row.Chromosome, row.Start, row.End) for row in knrgl.itertuples()]

# Build a new frame instead of assigning a column in place, and take an
# explicit copy of the filtered rows. Otherwise `knrgl` is a slice of the
# previous frame, and re-running this cell (or adding a column later)
# assigns onto that slice and triggers pandas' SettingWithCopyWarning.
knrgl = knrgl.assign(coverage=res)
knrgl = knrgl.loc[knrgl["coverage"] > 0, :].copy()

## Define a function that labels peaks by overlap with knrgl annotations

In [4]:
def label_peaks(df, knrgl, contigs):
	"""Label each peak by whether it overlaps a knrgl annotation.

	Parameters
	----------
	df : pandas.DataFrame
		Peaks with at least "Chromosome", "Start" and "End" columns. A
		"width" column is added (End - Start) when it is missing. The
		caller's frame is not modified.
	knrgl : pandas.DataFrame
		Annotations with "Chromosome", "Start" and "End" columns.
	contigs : list of str or None
		Chromosomes to keep in both frames. None keeps every chromosome.

	Returns
	-------
	pandas.DataFrame
		The retained peaks indexed by (Chromosome, Start, End), with a boolean
		"knrgl" column and a "label" column that is "knrgl" or "other".

	Also prints a one-line summary: labelled peaks / total peaks, and
	annotations overlapped by a peak / total annotations.
	"""
	keys = ["Chromosome", "Start", "End"]

	if "width" not in df.columns:
		# Width is End - Start (Start - End would give negative widths).
		# assign() returns a new frame, so the caller's frame is left untouched.
		df = df.assign(width=df["End"] - df["Start"])

	# `.isin(None)` raises, so only filter when a contig list was supplied.
	if contigs is not None:
		df = df.loc[df["Chromosome"].isin(contigs), :]
		knrgl = knrgl.loc[knrgl["Chromosome"].isin(contigs), :]

	df = pr.PyRanges(df)
	knrgl = pr.PyRanges(knrgl)

	# peaks that overlap at least one knrgl annotation
	df_knrgl = df.overlap(knrgl)

	labeled = df.df.set_index(keys)

	if len(df_knrgl) == 0:
		# No overlaps: skip converting the empty result to a DataFrame,
		# since it cannot be indexed by the key columns.
		labeled["knrgl"] = False
		knrgl_covered = 0
	else:
		labeled["knrgl"] = labeled.index.isin(df_knrgl.df.set_index(keys).index)
		# annotations that are overlapped by at least one peak
		knrgl_covered = len(knrgl.overlap(df))

	# Vectorised replacement for a row-wise apply; also safe on an empty frame.
	labeled["label"] = labeled["knrgl"].map({True: "knrgl", False: "other"})

	knrgl_peaks = int(labeled["knrgl"].sum())

	print(f"{knrgl_peaks}/{len(labeled)} peaks covering {knrgl_covered}/{len(knrgl)} knrgl L1 annotations")

	return labeled

## Test different peak calling methods

In [5]:
def test_peaks(bam_fn, fxn, kwargs, contigs=None):
	"""Run one peak caller on one BAM file and label the resulting peaks.

	Parameters
	----------
	bam_fn : str
		Path of the BAM file to open with pysam.
	fxn : callable
		Peak-caller factory; called as ``fxn(bam, **kwargs)`` and expected to
		return an object with a ``run_peak_caller(contigs)`` method.
	kwargs : dict
		Keyword arguments forwarded to ``fxn``.
	contigs : list of str, optional
		Passed to both ``run_peak_caller`` and ``label_peaks``.

	Returns
	-------
	The labelled peaks produced by ``label_peaks``.

	Note: the annotations are read from the notebook-level ``knrgl`` variable,
	so the cell that defines it must have been run first.
	"""
	column_map = {"chr":"Chromosome", "start":"Start", "end":"End"}
	with pysam.AlignmentFile(bam_fn) as bam:
		caller = fxn(bam, **kwargs)
		peaks = caller.run_peak_caller(contigs)
		# rename to the column names label_peaks works with
		peaks.rename(columns=column_map, inplace=True)
		labeled = label_peaks(peaks, knrgl, contigs)
	return labeled

In [8]:
# BAM files to evaluate, keyed by a short sample label.
fn = {
	"bulk": "/home/mcuoco/workflows/sz_slavseq/results/align/tags/hs38DH/LIBD74/bulk/gDNA_usd2.bam",
	"mda": "/home/mcuoco/workflows/sz_slavseq/results/align/tags/hs38DH/LIBD74/mda/usd02_A2_S128.bam"
}

# `bg_sizes` value shared by every sliding-window test that sets one.
BG_SIZES = (5000, 10000, 20000)


def _sliding(window_size, step_size, read_filter=True, merge=True, bg_sizes=None):
	"""Build one SlidingPeakCaller entry ({"fxn": ..., "kwargs": ...}) for `tests`."""
	return {
		"fxn": SlidingPeakCaller,
		"kwargs": {
			"read_filter": read_filter,
			"merge": merge,
			"window_size": window_size,
			"step_size": step_size,
			# fresh list per entry so no two configurations share a mutable object
			"bg_sizes": None if bg_sizes is None else list(bg_sizes),
		},
	}


# Peak-calling configurations to compare. Each value is unpacked into
# test_peaks(..., fxn=..., kwargs=...).
tests = {
	"Sliding 750x250 NoFilter NoMerge NoBG": _sliding(750, 250, read_filter=False, merge=False),
	"Sliding 750x250 NoMerge NoBG": _sliding(750, 250, merge=False),
	"Sliding 750x250 NoBG": _sliding(750, 250),
	"OverlappingReads": {"fxn": OverlapPeakCaller, "kwargs": {"read_filter": True}},
	"Sliding 750x250": _sliding(750, 250, bg_sizes=BG_SIZES),
	"Sliding 750x1": _sliding(750, 1, bg_sizes=BG_SIZES),
	"Sliding 500x1": _sliding(500, 1, bg_sizes=BG_SIZES),
	"Sliding 400x1": _sliding(400, 1, bg_sizes=BG_SIZES),
	"Sliding 300x1": _sliding(300, 1, bg_sizes=BG_SIZES),
}

In [9]:
# Run every configuration in `tests` on every BAM in `fn` (chr5 only), tag each
# result with the sample and configuration it came from, then stack them.
df_list = []
for sample, bam_path in fn.items():
	for test_name, spec in tests.items():
		print(f"Running {test_name} on {sample}...")
		labeled = test_peaks(bam_path, contigs = ["chr5"], **spec)
		# assign() returns a new frame, so no separate copy is needed
		df_list.append(labeled.assign(file=sample, name=test_name))

df = pd.concat(df_list)

Running Sliding 750x250 NoFilter NoMerge NoBG on bulk...
72/25436 peaks covering 17/18 knrgl L1 annotations
Running Sliding 750x250 NoMerge NoBG on bulk...
68/864 peaks covering 16/18 knrgl L1 annotations
Running Sliding 750x250 NoBG on bulk...
16/304 peaks covering 16/18 knrgl L1 annotations
Running OverlappingReads on bulk...
17/4401 peaks covering 17/18 knrgl L1 annotations
Running Sliding 750x250 on bulk...
9/216 peaks covering 9/18 knrgl L1 annotations
Running Sliding 750x1 on bulk...
15/100 peaks covering 15/18 knrgl L1 annotations
Running Sliding 500x1 on bulk...
15/213 peaks covering 15/18 knrgl L1 annotations
Running Sliding 400x1 on bulk...
15/237 peaks covering 15/18 knrgl L1 annotations
Running Sliding 300x1 on bulk...
14/222 peaks covering 14/18 knrgl L1 annotations
Running Sliding 750x250 NoFilter NoMerge NoBG on mda...
49/25786 peaks covering 13/18 knrgl L1 annotations
Running Sliding 750x250 NoMerge NoBG on mda...
39/3681 peaks covering 11/18 knrgl L1 annotations
Runnin

AssertionError: peak chr5:100640750-100646250 width must be less than background window chr5:100641000-100646000 width

In [None]:
# Two stacked boxplots per peak caller, split by label:
# top = reads per peak (log x-axis), bottom = peak width.
plt.clf()
fig, axes = plt.subplots(2, 1)
ax_reads, ax_width = axes

# NOTE(review): this cell reads the "count" column while the joint plot in the
# next cell reads "nreads" -- confirm which column the peak callers emit.
sns.boxplot(data = df, x = "count", y = "name", hue="label", dodge=True, ax = ax_reads)
ax_reads.set(xscale = "log", xlabel = "# reads", ylabel = "Peak caller")
# keep a single legend for the figure (the one on the bottom panel)
ax_reads.legend_.remove()

sns.boxplot(data=df, x = "width", y = "name", hue="label", dodge=True, ax = ax_width)
ax_width.set(xlabel = "width (bp)", ylabel = "Peak caller")

plt.tight_layout()
sns.move_legend(ax_width, loc="center right", bbox_to_anchor=(1.3, 1.2), title=None, frameon=False)
plt.show()

In [None]:
# Joint plot of peak width (x) against read count (y), coloured by label,
# with step histograms on both margins.
# NOTE(review): this cell reads the "nreads" column while the boxplot cell
# above reads "count" -- confirm which column the peak callers emit.
fig = sns.JointGrid(data=df.reset_index(), x="width", y="nreads", hue="label", marginal_ticks=True)
fig.plot_joint(sns.scatterplot, alpha = 0.5)
# log-scale the read-count axis before the marginals are drawn
# (presumably so the y-marginal histogram is binned on the log axis -- TODO confirm)
fig.ax_joint.set(yscale = "log")

fig.plot_marginals(sns.histplot, bins=100, element="step", fill=False)
# log-scale the count axes of the two marginal histograms
fig.ax_marg_x.set(yscale = "log")
fig.ax_marg_y.set(xscale = "log")
fig.ax_joint.set(xlabel = "Peak width (bp)", ylabel = "Number of reads")
plt.show()