# Investigate custom peak calling method

In [1]:
import pandas as pd
import pyranges as pr
import seaborn as sns
import pysam
import numpy as np

## Read in reference and non-reference insertions

In [None]:
# Read the rmsk file, keeping only the five columns needed to build genomic
# intervals (column positions 4, 5, 6, 8 and 9 of the whitespace-separated table).
rmsk = pd.read_csv(
    snakemake.input.rl1[0],
    skiprows=3,  # skip the first three lines (presumably the table header -- confirm)
    sep=r"\s+",  # equivalent to the deprecated delim_whitespace=True
    names=["Chromosome", "Start", "End", "Strand", "repeat"],
    usecols=[4, 5, 6, 8, 9],
)
# Keep only the listed L1 3'-end repeat names.
rep_names = [
    "L1HS_3end",
    "L1PA2_3end",
    "L1PA3_3end",
    "L1PA4_3end",
    "L1PA5_3end",
    "L1PA6_3end",
]
# .copy() makes the filtered table independent of the full one, so the Strand
# assignment below does not write into a slice (SettingWithCopyWarning).
rmsk = rmsk[rmsk["repeat"].isin(rep_names)].copy()
# Normalize the strand column: "+" stays "+", every other value becomes "-".
# Vectorized, so it also works when the filter leaves no rows.
rmsk["Strand"] = np.where(rmsk["Strand"] == "+", "+", "-")
rmsk = pr.PyRanges(rmsk)

In [None]:
# Read the known non-reference insertions (tab-separated, no header row).
knrgl = pd.read_csv(
    snakemake.input.knrgl[0],
    sep="\t",
    header=None,
    names=["Chromosome", "Start", "End", "Strand", "SVLEN", "SVTYPE"],
    dtype={"Chromosome": str, "Start": int, "End": int},
)
# Extend each interval by 750 bp on one side, chosen by strand:
#   "-" entries are extended at Start, "+" entries at End;
#   any other strand value is left unchanged.
# Done column-wise instead of with a row-wise apply, which is slow and does not
# return a Series when the table is empty.
flank_bp = 750
is_minus = knrgl["Strand"] == "-"
is_plus = knrgl["Strand"] == "+"
# Clip at 0 so an insertion near the start of a sequence cannot get a negative
# coordinate (negative starts are rejected by pysam's fetch later on).
knrgl["Start"] = knrgl["Start"].mask(is_minus, knrgl["Start"] - flank_bp).clip(lower=0)
knrgl["End"] = knrgl["End"].mask(is_plus, knrgl["End"] + flank_bp)
knrgl = pr.PyRanges(knrgl)

## Read in the peaks and report metrics

In [None]:
# Sample sheet: one row per sample with its peak file ("pfn"), its BAM file
# ("bfn") and the DNA type it belongs to ("mda" for cells, "bulk" for bulk).
cells = pd.DataFrame(
    {"pfn": snakemake.input.cell_peaks, "bfn": snakemake.input.cell_bam}
).assign(dna_type="mda")
bulks = pd.DataFrame(
    {"pfn": snakemake.input.bulk_peaks, "bfn": snakemake.input.bulk_bam}
).assign(dna_type="bulk")
samples = pd.concat([cells, bulks])

# Per-sample accumulators, turned into single DataFrames after the loop:
#   metrics     - one summary record per sample
#   all_peaks   - every peak of every sample
#   knrgl_peaks - peaks overlapping a KNRGL interval
#   rmsk_peaks  - peaks overlapping a selected rmsk interval
metrics = []
all_peaks = []
knrgl_peaks = []
rmsk_peaks = []
for row in samples.itertuples():
    # Sample ID is the peak file's basename up to its first dot.
    sample_id = row.pfn.split("/")[-1].split(".")[0]

    # Read the first five columns of the peak file directly under their final names.
    peak_df = pd.read_csv(
        row.pfn,
        sep="\t",
        header=None,
        usecols=[0, 1, 2, 3, 4],
        names=["Chromosome", "Start", "End", "width", "num_reads"],
    )
    peak_df["sample_id"] = sample_id
    peak_df["dna_type"] = row.dna_type
    all_peaks.append(peak_df)

    # Interval view of the same peaks, used for the overlap queries.
    peak_ranges = pr.PyRanges(peak_df)
    knrgl_hits = peak_ranges.overlap(knrgl).df
    rmsk_hits = peak_ranges.overlap(rmsk).df
    knrgl_peaks.append(knrgl_hits)
    rmsk_peaks.append(rmsk_hits)

    reads_per_peak = peak_ranges.num_reads
    width_per_peak = peak_ranges.width
    metrics.append(
        {
            "sample_id": sample_id,
            "dna_type": row.dna_type,
            "total_peaks": len(peak_ranges),
            # `samtools view -c` through pysam: number of records in the BAM.
            "total_reads": int(pysam.view("-c", row.bfn).rstrip("\n")),
            "min_reads / peak": reads_per_peak.min(),
            "mean_reads / peak": int(reads_per_peak.mean()),
            "max_reads / peak": reads_per_peak.max(),
            "min_width / peak": width_per_peak.min(),
            "mean_width / peak": int(width_per_peak.mean()),
            "max_width / peak": width_per_peak.max(),
            # "*_covered": annotation intervals hit by at least one peak;
            # "peaks_in_*": peaks hitting at least one annotation interval.
            "knrgl_covered": len(knrgl.overlap(peak_ranges)),
            "peaks_in_knrgl": len(knrgl_hits),
            "total_knrgl": len(knrgl),
            "rmsk_covered": len(rmsk.overlap(peak_ranges)),
            "peaks_in_rmsk": len(rmsk_hits),
            "total_rmsk": len(rmsk),
        }
    )


# Collapse the per-sample lists into one table each.
metrics = pd.DataFrame.from_records(metrics)
all_peaks = pd.concat(all_peaks)
knrgl_peaks = pd.concat(knrgl_peaks)
rmsk_peaks = pd.concat(rmsk_peaks)

# Label every peak as "KNRGL", "RMSK" or "OTHER" (KNRGL wins when both apply).
# Columns that together identify one peak of one sample.
peak_keys = ["Chromosome", "Start", "End", "num_reads", "width", "dna_type", "sample_id"]

for flag, hits in (("KNRGL", knrgl_peaks), ("RMSK", rmsk_peaks)):
    if hits.empty:
        # No peak overlapped this annotation. The overlap table may then lack
        # the key columns entirely, so skip the merge and flag nothing.
        all_peaks[flag] = False
        continue
    # Left-merge with an indicator column: "both" means the peak is present in
    # the overlap table. De-duplicate the right side so a repeated row can
    # never multiply rows of all_peaks.
    all_peaks = pd.merge(
        all_peaks,
        hits.drop_duplicates(subset=peak_keys),
        how="left",
        on=peak_keys,
        indicator=flag,
    )
    all_peaks[flag] = all_peaks[flag] == "both"

# First matching condition wins, so KNRGL takes precedence over RMSK.
all_peaks["label"] = np.select(
    [all_peaks["KNRGL"].to_numpy(dtype=bool), all_peaks["RMSK"].to_numpy(dtype=bool)],
    ["KNRGL", "RMSK"],
    default="OTHER",
)
# Drop the helper flags from the DataFrame itself. The original chained
# .drop(columns=...) onto the Series returned by apply, where it has no effect.
all_peaks = all_peaks.drop(columns=["KNRGL", "RMSK"])

In [1]:
# Print the headline counts for the bulk sample.
# .item() returns the single value of a one-element Series and raises a
# ValueError if there is not exactly one bulk row; calling int() directly on a
# Series is deprecated in pandas.
bulk_metrics = metrics.loc[metrics["dna_type"] == "bulk"]
for text, column in (
    ("Total bulk peaks: ", "total_peaks"),
    ("Total KNRGL from WGS: ", "total_knrgl"),
    ("Total bulk peaks overlapping KNRGL: ", "peaks_in_knrgl"),
    ("Total KNRGL overlapped by bulk peaks: ", "knrgl_covered"),
):
    print(text + str(int(bulk_metrics[column].item())))

SyntaxError: unterminated string literal (detected at line 2) (3702127098.py, line 2)

In [None]:
# Split the KNRGL intervals into those overlapped by a bulk peak ("detected")
# and those that are not, then look up read support for each in the bulk BAM.
bulk_peaks = pr.PyRanges(all_peaks.loc[all_peaks["dna_type"] == "bulk"])
missed = knrgl.overlap(bulk_peaks, invert=True).df
missed["detected"] = False
covered = knrgl.overlap(bulk_peaks).df
covered["detected"] = True
df = pd.concat([covered, missed]).sort_values(by=["Chromosome", "Start", "End"])

coverage = []
max_mapq = []
max_ya = []
max_yg = []
with pysam.AlignmentFile(snakemake.input.bulk_bam[0], "rb") as f:
    for x in df.itertuples():
        c = 0
        mq = 0
        ya = 0
        yg = 0
        # max(..., 0): fetch rejects negative start coordinates.
        for r in f.fetch(x.Chromosome, max(x.Start, 0), x.End):
            c += 1  # every fetched read counts towards coverage
            # The maxima only consider first-in-pair reads carrying both the
            # YA and YG tags.
            if r.is_read1 and r.has_tag("YA") and r.has_tag("YG"):
                mq = max(mq, r.mapping_quality)
                ya = max(ya, r.get_tag("YA"))
                yg = max(yg, r.get_tag("YG"))
        coverage.append(c)
        max_mapq.append(mq)
        # Previously computed but never recorded.
        max_ya.append(ya)
        max_yg.append(yg)
df["coverage"] = coverage
df["max_mapq"] = max_mapq
df["max_ya"] = max_ya
df["max_yg"] = max_yg

# KNRGL intervals with reads in the bulk BAM but no overlapping bulk peak.
# astype(bool) keeps the negation correct even if the concat produced an
# object-dtype column.
undetected_with_reads = df.loc[
    (~df["detected"].astype(bool)) & (df["coverage"] > 0), :
]
print(undetected_with_reads)
undetected_with_reads.to_csv(snakemake.output.bulk_insertions, sep="\t", index=False)

In [None]:
# Preview the first KNRGL-overlapping peaks of the bulk sample.
knrgl_peaks[knrgl_peaks["dna_type"] == "bulk"].head()

In [None]:
# KNRGL-overlapping peaks, widest first.
knrgl_peaks.sort_values(by="width", ascending=False)

In [None]:
# Number of peaks per sample, grouped and coloured by DNA type.
sns.stripplot(
    data=metrics,
    x="dna_type",
    y="total_peaks",
    hue="dna_type",
)

In [None]:
# Total reads versus total peaks per sample, coloured by DNA type.
fig = sns.jointplot(
    data=metrics,
    x="total_reads",
    y="total_peaks",
    hue="dna_type",
    alpha=0.5,
)

In [None]:
# KNRGL intervals covered by peaks versus peaks falling in KNRGL intervals.
fig = sns.jointplot(
    data=metrics,
    x="knrgl_covered",
    y="peaks_in_knrgl",
    hue="dna_type",
    alpha=0.5,
)
joint_ax = fig.ax_joint
x0, x1 = joint_ax.get_xlim()
y0, y1 = joint_ax.get_ylim()
# Dashed y = x reference line across the range shared by both axes.
lims = [max(x0, y0), min(x1, y1)]
joint_ax.plot(lims, lims, linestyle="dashed", color="black", alpha=0.5)

In [None]:
# Samples shown in the per-sample plots: the first 11 MDA sample IDs followed
# by every bulk sample ID.
mda_ids = all_peaks.loc[all_peaks.dna_type == "mda", "sample_id"].drop_duplicates()
bulk_ids = all_peaks.loc[all_peaks.dna_type == "bulk", "sample_id"].drop_duplicates()
# Concatenate the lists rather than append(*ids): append accepts exactly one
# argument, so the unpacking form fails unless there is exactly one bulk sample.
my_samples = mda_ids.tolist()[:11] + bulk_ids.tolist()

In [None]:
# Distribution of reads per peak for each selected sample (log-scaled x axis).
selected_peaks = all_peaks[all_peaks["sample_id"].isin(my_samples)]
fig = sns.boxplot(
    data=selected_peaks,
    x="num_reads",
    y="sample_id",
    hue="dna_type",
    dodge=False,
)
fig.set_xscale("log")
fig.set_xlabel("Number of reads / peak")

In [None]:
# Distribution of peak widths for each selected sample (log-scaled x axis).
selected_peaks = all_peaks[all_peaks["sample_id"].isin(my_samples)]
fig = sns.boxplot(
    data=selected_peaks,
    x="width",
    y="sample_id",
    hue="dna_type",
    dodge=False,
)
fig.set_xscale("log")
fig.set_xlabel("Peak width (bp)")

In [None]:
# Peak width versus reads per peak, one panel per selected sample, coloured by
# the peak label; both axes are log-scaled.
selected_peaks = all_peaks[all_peaks["sample_id"].isin(my_samples)]
fig = sns.relplot(
    data=selected_peaks,
    kind="scatter",
    x="width",
    y="num_reads",
    hue="label",
    col="sample_id",
    col_wrap=4,
    alpha=0.5,
)
axis_settings = {
    "yscale": "log",
    "xscale": "log",
    "ylabel": "Number of reads / peak",
    "xlabel": "Peak width (bp)",
}
fig.set(**axis_settings)

## Investigate the number of bulk peaks found in cells

In [None]:
def _read_peak_ranges(pfn):
    """Load one peak file as a PyRanges.

    Reads the first five tab-separated columns (no header row) as
    Chromosome, Start, End, width and num_reads, then overwrites ``width``
    with ``End - Start``.
    """
    peaks = pd.read_csv(
        pfn,
        sep="\t",
        header=None,
        usecols=[0, 1, 2, 3, 4],
        names=["Chromosome", "Start", "End", "width", "num_reads"],
    )
    peaks["width"] = peaks.End - peaks.Start
    return pr.PyRanges(peaks)


# The original loop over the bulk peak files kept only the last one; read that
# file directly instead of parsing and discarding the others.
bulk_peaks = _read_peak_ranges(snakemake.input.bulk_peaks[-1])

# For every cell, collect the bulk peaks overlapped by at least one of that
# cell's peaks. A bulk peak therefore appears once per cell in which it is
# found. (The overlap used to be computed twice per cell, with the first
# result thrown away.)
bulk_in_cells = pd.concat(
    [
        bulk_peaks.overlap(_read_peak_ranges(pfn)).df
        for pfn in snakemake.input.cell_peaks
    ]
)

In [None]:
# Count in how many cells each bulk peak was found: every row of
# bulk_in_cells is one (bulk peak, cell) hit, so identical rows are tallied.
peak_id_cols = ["Chromosome", "Start", "End", "num_reads", "width"]
plot_df = bulk_in_cells.value_counts(peak_id_cols).reset_index(name="num_cells")
print(f"{len(plot_df)} bulk peaks found in cells")

# Cells per bulk peak versus reads in that peak (log-scaled y axis).
fig = sns.jointplot(
    data=plot_df,
    x="num_cells",
    y="num_reads",
    alpha=0.5,
    ratio=2,
    marginal_ticks=True,
    marginal_kws={"bins": 100},
)
fig.ax_joint.set_yscale("log")

In [None]:
# Bulk peak width versus read count, coloured by the number of cells sharing
# the peak; both axes are log-scaled.
fig = sns.scatterplot(
    data=plot_df,
    x="width",
    y="num_reads",
    hue="num_cells",
    alpha=0.5,
)
axis_settings = {
    "yscale": "log",
    "xscale": "log",
    "ylabel": "Number of reads / peak",
    "xlabel": "Peak width (bp)",
}
fig.set(**axis_settings)