In [None]:
import pandas as pd
import pyranges as pr
import seaborn as sns

# Read germline insertions

In [None]:
# Read the rmsk annotation: skip the first three lines, split on runs of
# whitespace, and keep only the five columns needed to build genomic intervals
# (presumably a RepeatMasker .out table -- TODO confirm against the workflow).
# `sep=r"\s+"` is the documented replacement for the deprecated
# `delim_whitespace=True`.
rmsk_raw = pd.read_csv(
    snakemake.input.rl1[0],
    skiprows=3,
    sep=r"\s+",
    names=["Chromosome", "Start", "End", "Strand", "repeat"],
    usecols=[4, 5, 6, 8, 9],
)

# Repeat names retained for the comparison against called peaks.
rep_names = [
    "L1HS_3end",
    "L1PA2_3end",
    "L1PA3_3end",
    "L1PA4_3end",
    "L1PA5_3end",
    "L1PA6_3end",
]

# Filter to the repeats of interest and normalise the strand column: anything
# that is not "+" becomes "-". `.assign` builds a new frame, so no value is
# written into a filtered slice (avoids SettingWithCopyWarning), and
# `Series.where` replaces the row-wise `apply`.
rmsk_df = rmsk_raw[rmsk_raw["repeat"].isin(rep_names)].assign(
    Strand=lambda df: df["Strand"].where(df["Strand"] == "+", "-")
)

rmsk = pr.PyRanges(rmsk_df)
# `head` is a method; it must be called to show a preview of the intervals.
rmsk.head()

In [None]:
# Read the knrgl intervals from a headerless, tab-separated three-column file
# (chromosome, start, end) with explicit dtypes so coordinates are integers.
knrgl_df = pd.read_csv(
    snakemake.input.knrgl[0],
    sep="\t",
    header=None,
    names=["Chromosome", "Start", "End"],
    dtype={"Chromosome": str, "Start": int, "End": int},
)
knrgl = pr.PyRanges(knrgl_df)
# `head` is a method; it must be called to show a preview of the intervals.
knrgl.head()


# Compare called peaks using multiple parameters

In [None]:
# Build one summary record per (peak file, t) pair, then assemble the table.
# The list and the final DataFrame use separate names so `summary` only ever
# refers to the DataFrame.
records = []
for fn, t in zip(snakemake.input.peaks, snakemake.params.t):
    print(f"Reading {fn}...")
    peaks_df = pd.read_csv(
        fn,
        sep="\t",
        header=None,
        names=["Chromosome", "Start", "End", "num_reads"],
        dtype={"Chromosome": str, "Start": int, "End": int, "num_reads": int},
    )
    peaks_df["width"] = peaks_df["End"] - peaks_df["Start"]

    # Per-peak statistics are taken from the DataFrame directly; the values
    # are the same as reading the columns back from the PyRanges object.
    reads = peaks_df["num_reads"]
    widths = peaks_df["width"]

    peaks = pr.PyRanges(peaks_df)
    data = {
        "t": t,
        "total_peaks": len(peaks),
        "min_reads / peak": reads.min(),
        "mean_reads / peak": int(reads.mean()),
        "max_reads / peak": reads.max(),
        "min_width / peak": widths.min(),
        "mean_width / peak": int(widths.mean()),
        "max_width / peak": widths.max(),
        # `overlap` reports each interval of the left operand at most once, so
        # every count below is a number of distinct intervals. The peak-side
        # counts previously used `intersect`, which yields one row per
        # overlapping pair and could therefore exceed `total_peaks`.
        "knrgl_covered": len(knrgl.overlap(peaks)),
        "peaks_in_knrgl": len(peaks.overlap(knrgl)),
        "rmsk_covered": len(rmsk.overlap(peaks)),
        "peaks_in_rmsk": len(peaks.overlap(rmsk)),
    }
    records.append(data)

summary = pd.DataFrame.from_records(records)
print(summary)