# Investigate custom peak calling method

In [None]:
import pandas as pd
import pyranges as pr
import seaborn as sns

## Distribution of peaks

In [None]:
# Load the peak calls, keeping only the first four tab-separated columns
# (chromosome, start, end and the read count attached to each call).
peaks = pd.read_csv(
    snakemake.input.peaks[0],
    sep="\t",
    header=None,
    names=["chr", "start", "end", "num_reads"],
    usecols=[0, 1, 2, 3],
    dtype=dict(chr=str, start=int, end=int, num_reads=int),
)

# Histogram of reads per call with a log-scaled count axis; the title
# reports how many calls were loaded.
fig = peaks["num_reads"].plot.hist(
    bins=100,
    log=True,
    title=f"Total calls: {len(peaks)}",
)
fig.set_xlabel("Number of reads / call")

## Rmsk L1 insertions covering peaks

In [None]:
# read the rmsk file
# The table is whitespace-delimited with three leading lines to skip
# (presumably the RepeatMasker .out header -- confirm against the input).
# `sep=r"\s+"` replaces the deprecated `delim_whitespace=True`.
rmsk = pd.read_csv(
    snakemake.input.rl1[0],
    skiprows=3,
    sep=r"\s+",
    names=["chrom", "start", "end", "strand", "repeat"],
    usecols=[4, 5, 6, 8, 9],
)

# filter for rep_names
rep_names = [
    "L1HS_3end",
    "L1PA2_3end",
    "L1PA3_3end",
    "L1PA4_3end",
    "L1PA5_3end",
    "L1PA6_3end",
]
# .copy() so the strand assignment below writes to an independent frame
# instead of a slice of the unfiltered table (avoids SettingWithCopyWarning).
rmsk = rmsk[rmsk["repeat"].isin(rep_names)].copy()

# Normalise strand to "+"/"-": anything that is not "+" becomes "-".
# Vectorised, and also well-defined when the filter leaves no rows.
rmsk["strand"] = rmsk["strand"].where(rmsk["strand"] == "+", "-")

In [None]:
# convert to pyranges (columns renamed to Chromosome/Start/End first)
peaks = peaks.rename(columns={"chr": "Chromosome", "start": "Start", "end": "End"})
rmsk = rmsk.rename(columns={"chrom": "Chromosome", "start": "Start", "end": "End"})
rmsk_df = pr.PyRanges(rmsk)
peaks_df = pr.PyRanges(peaks)

# find overlaps: rmsk insertions hit by a peak, de-duplicated by locus
overlaps = rmsk_df.overlap(peaks_df)
overlaps = overlaps.df.drop_duplicates(subset=["Chromosome", "Start", "End"])

# count overlaps per (chromosome, repeat family)
# observed=True keeps only groups that actually occur, should the grouping
# column be categorical; it is a no-op for plain object columns.
o_counts = (
    overlaps.groupby(["Chromosome", "repeat"], observed=True)
    .size()
    .reset_index(name="num_called")
)
rmsk_counts = (
    rmsk_df.df.groupby(["Chromosome", "repeat"], observed=True)
    .size()
    .reset_index(name="num_total")
)

# Right join keeps every rmsk group, including those with no called
# insertion; an inner join would drop them and under-report num_total.sum().
plot_df = o_counts.merge(rmsk_counts, on=["Chromosome", "repeat"], how="right")
plot_df["num_called"] = plot_df["num_called"].fillna(0).astype(int)

In [None]:
# Summary line: peaks hitting an rmsk L1 insertion, and insertions hit.
# `overlap` reports each peak at most once, whereas `intersect` emits one
# row per peak/insertion pair and would over-count peaks spanning several
# insertions.
print(
    f"{len(peaks_df.overlap(rmsk_df))} / {len(peaks)} peaks overlapping "
    f"{plot_df.num_called.sum()} / {plot_df.num_total.sum()} rmsk L1 insertions"
)

# One point per row of plot_df: total insertions vs. insertions called.
fig = sns.relplot(
    data=plot_df,
    x="num_total",
    y="num_called",
    kind="scatter",
    facet_kws=dict(sharex=False, sharey=False),
)
fig.set(
    xlabel="# rmsk insertions / chr", ylabel="# calls overlapping rmsk insertions / chr"
)

# Give both axes the same range and draw the y = x reference line.
for ax in fig.axes.flat:
    ax.set_ylim(ax.get_xlim())
    ax.axline((0, 0), slope=1, color="gray", linestyle="--")

sns.despine()

In [None]:
# Peaks that overlap an rmsk L1 insertion. `overlap` keeps each peak once
# with its original coordinates; `intersect` would repeat a peak for every
# insertion it touches, inflating both the histogram and "Total calls".
plot_df = peaks_df.overlap(rmsk_df)
fig = plot_df.num_reads.plot(
    kind="hist", bins=100, log=True, title=f"Total calls: {len(plot_df)}"
)
fig.set_xlabel("Number of reads / call")

In [None]:
# Show the ten rows with the highest read counts.
plot_df.df.sort_values(by="num_reads", ascending=False).iloc[:10]

## KNRGL (non-reference L1 insertions) covering peaks

In [None]:
# Load the KNRGL intervals as a headerless, tab-separated table.
# usecols pins the first three columns, matching the peaks loader; without
# it, a file carrying extra columns would have its leading columns consumed
# as the index and the coordinates mis-assigned.
knrgl = pd.read_csv(
    snakemake.input.knrgl[0],
    sep="\t",
    header=None,
    usecols=[0, 1, 2],
    names=["Chromosome", "Start", "End"],
    dtype={"Chromosome": str, "Start": int, "End": int},
)
knrgl_df = pr.PyRanges(knrgl)

In [None]:
# find overlaps: KNRGL insertions hit by a peak, de-duplicated by locus
overlaps = knrgl_df.overlap(peaks_df)
overlaps = overlaps.df.drop_duplicates(subset=["Chromosome", "Start", "End"])

# count overlaps per chromosome
# observed=True keeps only chromosomes that actually occur, should the
# grouping column be categorical; it is a no-op for plain object columns.
o_counts = (
    overlaps.groupby("Chromosome", observed=True)
    .size()
    .reset_index(name="num_called")
)
knrgl_counts = (
    knrgl_df.df.groupby("Chromosome", observed=True)
    .size()
    .reset_index(name="num_total")
)

# Right join keeps chromosomes whose insertions were never called; an inner
# join would drop them and under-report num_total.sum().
plot_df = o_counts.merge(knrgl_counts, on="Chromosome", how="right")
plot_df["num_called"] = plot_df["num_called"].fillna(0).astype(int)

In [None]:
# Summary line: peaks hitting a KNRGL insertion, and insertions hit.
# `overlap` reports each peak at most once, whereas `intersect` emits one
# row per peak/insertion pair and would over-count peaks spanning several
# insertions.
print(
    f"{len(peaks_df.overlap(knrgl_df))} / {len(peaks)} peaks overlapping "
    f"{plot_df.num_called.sum()} / {plot_df.num_total.sum()} non-reference L1 insertions"
)

# One point per chromosome: total insertions vs. insertions called.
fig = sns.scatterplot(data=plot_df, x="num_total", y="num_called")
# Anchor both axes at 0 and share the x upper bound so the y = x line is
# the diagonal of the plot.
fig.set_ylim(0, fig.get_xlim()[1])
fig.set_xlim(0, fig.get_xlim()[1])
fig.axline((0, 0), slope=1, color="gray", linestyle="--")
fig.set(
    xlabel="# KNRGL insertions / chr",
    ylabel="# calls overlapping KNRGL insertions / chr",
)
sns.despine()

In [None]:
# Peaks that overlap a KNRGL insertion. `overlap` keeps each peak once with
# its original coordinates; `intersect` would repeat a peak for every
# insertion it touches, inflating both the histogram and "Total calls".
plot_df = peaks_df.overlap(knrgl_df)
fig = plot_df.num_reads.plot(
    kind="hist", bins=100, log=True, title=f"Total calls: {len(plot_df)}"
)
fig.set_xlabel("Number of reads / call")

In [None]:
# Show the ten rows with the lowest read counts.
# NOTE(review): the rmsk section lists the highest counts instead
# (ascending=False) -- confirm the ascending order here is intended.
plot_df.df.sort_values(by="num_reads", ascending=True).iloc[:10]