In [1]:
import pandas as pd
import seaborn as sns
import pyranges as pr
import pysam

ModuleNotFoundError: No module named 'pandas'

In [6]:
# Load the headerless, tab-separated peak table; the ten names below label its
# columns in file order.
peaks = pd.read_table(
    snakemake.input.peaks[0],
    header=None,
    names="chrom start end name score strand signalValue pValue qValue peak".split(),
)
# Width of each peak: end coordinate minus start coordinate.
peaks["size"] = peaks["end"].sub(peaks["start"])

Unnamed: 0,chrom,start,end,name,score,strand,signalValue,pValue,qValue,peak
0,chr21,6510757,6511507,B_peak_1,58,.,5.97015,8.53252,5.87856,233
1,chr21,8606391,8607245,B_peak_2,434,.,11.1211,46.5078,43.4759,568
2,chr21,8666242,8667082,B_peak_3,227,.,9.27152,25.6704,22.794,561
3,chr21,8676252,8677256,B_peak_4,846,.,12.0812,87.8702,84.6404,375
4,chr21,8767232,8768071,B_peak_5,160,.,8.80734,18.828,16.0201,390


In [None]:
# Distribution of peak widths (the "size" column of `peaks`).
sns.histplot(x="size", data=peaks)

In [None]:
# Open the alignment file once; `samfile` stays open as a notebook-level variable.
samfile = pysam.AlignmentFile(snakemake.input.bam[0], "rb")

# Get number of reads in each peak.
# The whole column is assigned in one step instead of writing element by element
# through `peaks["num_reads"][ind] = ...`: that chained assignment raises a
# warning on older pandas and silently writes nothing under Copy-on-Write,
# which would leave the column filled with None. Building the list first also
# gives the column an integer dtype and avoids shadowing the builtin `chr`.
peaks["num_reads"] = [
    samfile.count(chrom, start, end, read_callback='all')
    for chrom, start, end in zip(peaks["chrom"], peaks["start"], peaks["end"])
]

In [None]:
# Peak width against the number of reads counted inside each peak.
sns.scatterplot(x="size", y="num_reads", data=peaks)

In [None]:
# FRiP score: reads counted inside peaks divided by the reads counted on
# chr21 plus chr22.
peak_reads = peaks["num_reads"].sum()
total_reads = sum(
    samfile.count(contig, read_callback='all') for contig in ("chr21", "chr22")
)

frip = peak_reads / total_reads

# TODO:

## Evaluate peak calls by themselves

1. Generate histogram of peak size
2. Generate scatter plot of peak size vs reads in peak
3. Compute fraction of reads in peaks

## Evaluate peak calls' ability to capture germline insertions

1. Read in the germline insertion calls
2. Compute the fraction of germline insertions that are captured by the peak calls (use pyranges for intersection)
3. Compute the fraction of germline insertions that are captured by the windows (use pyranges for intersection)

## Try different parameters for the peak calls and repeat the above

In [None]:
# functions to read in germline insertions
def read_non_ref_db():
    """
    Load the table at ``snakemake.input.non_ref_l1``.

    The file is parsed as headerless, tab-separated text. Its three columns are
    named ``chrom``, ``start`` and ``end`` and read as str, int and int.

    Returns
    -------
    pandas.DataFrame
        One row per line of the input file.
    """
    column_types = {"chrom": str, "start": int, "end": int}
    return pd.read_csv(
        snakemake.input.non_ref_l1,
        sep="\t",
        header=None,
        names=list(column_types),
        dtype=column_types,
    )

def read_rmsk():
    """
    Read the repeatmasker output table and return locations of L1HS and L1PA2-6

    The table at ``snakemake.input.ref_l1`` is parsed as whitespace-delimited
    text, skipping its first three lines. Fields 4, 5, 6, 8 and 9 (0-based) are
    loaded as ``chrom``, ``start``, ``end``, ``strand`` and ``repeat``, and only
    rows whose ``repeat`` is one of the ``*_3end`` names listed below are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom`` (str), ``start`` and ``end``; one row per retained
        annotation, carrying the index labels of the parsed table. For each
        row ``end`` is the annotation's start coordinate when the strand is
        "+" and its end coordinate otherwise, and ``start`` is ``end - 1``.
        When no row matches, an empty frame with the same three columns is
        returned.
    """
    # read the rmsk file
    # sep=r"\s+" replaces delim_whitespace=True, which pandas has deprecated.
    df0 = pd.read_csv(
        snakemake.input.ref_l1,
        skiprows=3,
        sep=r"\s+",
        names=["chrom", "start", "end", "strand", "repeat"],
        usecols=[4, 5, 6, 8, 9],
    )

    # filter for rep_names
    rep_names = [
        "L1HS_3end",
        "L1PA2_3end",
        "L1PA3_3end",
        "L1PA4_3end",
        "L1PA5_3end",
        "L1PA6_3end",
    ]
    df0 = df0[df0["repeat"].isin(rep_names)]

    # Pick the anchor coordinate depending on strand: keep "end" wherever the
    # strand is not "+", otherwise fall back to "start". This is vectorised on
    # purpose: a row-wise DataFrame.apply returns a DataFrame instead of a
    # Series when no rows survive the filter, which breaks column assignment.
    anchor = df0["end"].where(df0["strand"] != "+", df0["start"])

    # save to new dataframe
    return pd.DataFrame(
        {
            "chrom": df0["chrom"].astype(str),
            "start": anchor - 1,  # make zero-based
            "end": anchor,
        }
    )

def read_germline():
    """
    Stack the two insertion tables into a single frame.

    Rows from ``read_non_ref_db()`` come first, followed by rows from
    ``read_rmsk()``; each part keeps its own index labels.
    """
    return pd.concat([read_non_ref_db(), read_rmsk()])

In [None]:
# Load every germline insertion and rename the coordinate columns to the
# Chromosome / Start / End names used when building PyRanges objects.
germline_df = read_germline().rename(
    columns={"chrom": "Chromosome", "start": "Start", "end": "End"}
)
print(len(germline_df.index))

In [None]:
# Keep only the peak coordinates, renamed to the Chromosome / Start / End
# columns that PyRanges is built from.
peak_regions = peaks[["chrom", "start", "end"]]
peak_regions = peak_regions.rename(columns={"chrom": "Chromosome", "start": "Start", "end": "End"})

# compute the fraction of germline insertions that are captured by the peak calls
gr = pr.PyRanges(germline_df)
# The class is spelled `PyRanges`; `pr.pyRanges` raises AttributeError.
gr_peak = pr.PyRanges(peak_regions)

gl_peak = gr.intersect(gr_peak)
print(gl_peak) # number of rows = number of insertions captured by peak calls

In [None]:
# Load the window table and keep only its coordinates, renamed to the
# Chromosome / Start / End columns that PyRanges is built from.
# NOTE(review): read_pickle executes arbitrary code embedded in the file; only
# use it on pickles produced by this pipeline.
windows = pd.read_pickle(snakemake.input.labels[0]).reset_index()
windows = windows[["chrom", "start", "end"]]
windows = windows.rename(columns={"chrom": "Chromosome", "start": "Start", "end": "End"})

# The class is spelled `PyRanges`; `pr.pyRanges` raises AttributeError.
gr_window = pr.PyRanges(windows)
# `gr` holds the germline insertions as a PyRanges object.
gl_window = gr.intersect(gr_window)
print(gl_window)


In [None]:
# How many insertions are shared between windows and peaks?
# Intersect the peak-derived intervals (gl_peak) with the window-derived
# intervals (gl_window) and print the resulting ranges.
shared = gl_peak.intersect(gl_window)
print(shared)