In [4]:
from scripts.get_features import BamWindows
from scripts.get_labels import read_knrgl, read_rmsk
import pandas as pd
import pyranges as pr
import seaborn as sns

In [5]:
# Load the two annotation tables used later for labelling windows.
# NOTE(review): both paths are absolute locations on a specific filesystem;
# presumably the .bed holds known insertions and the .fa.out is RepeatMasker
# output -- confirm against read_knrgl / read_rmsk.
KNRGL_BED = "/iblm/netapp/data4/mcuoco/sz_slavseq/resources/CommonBrain_insertions.bed"
RMSK_OUT = "/iblm/netapp/data4/mcuoco/sz_slavseq/resources/hs38d1.fa.out"

knrgl = read_knrgl(KNRGL_BED)
rmsk = read_rmsk(RMSK_OUT)

In [6]:
# Build the per-window feature table for a single tagged, sorted BAM file.
# NOTE(review): 750 and 250 look like a window size and a step size -- confirm
# against the BamWindows signature before relying on that reading.
f = "/iblm/logglun02/mcuoco/workflows/sz_slavseq/results/align/CommonBrain/plate1_A4_S22.tagged.sorted.bam"
windower = BamWindows(f, 750, 250)
res = windower.bam_windows()

In [7]:
def covered(windows, annotation):
    """Measure the overlap between two interval tables in both directions.

    Both arguments are wrapped in ``pr.PyRanges`` before being compared.

    Returns:
        A 2-tuple ``(windows_hit, annotation_hit)``: the length of the result
        of overlapping ``windows`` with ``annotation``, followed by the length
        of the result of overlapping ``annotation`` with ``windows``.
    """
    window_ranges = pr.PyRanges(windows)
    annotation_ranges = pr.PyRanges(annotation)

    windows_hit = len(window_ranges.overlap(annotation_ranges))
    annotation_hit = len(annotation_ranges.overlap(window_ranges))
    return windows_hit, annotation_hit


def label_windows(df: pd.DataFrame, other_df: pd.DataFrame, label: str) -> pd.DataFrame:
    """Flag the rows of ``df`` that overlap any interval in ``other_df``.

    A boolean column named ``label`` is added to ``df`` (the frame is modified
    in place and also returned). A row is ``True`` when its
    ``(Chromosome, Start, End)`` triple appears among the intervals that
    pyranges reports as overlapping ``other_df`` on the opposite strand.

    Unlike a set_index/reset_index round trip, the caller's index and column
    order are left untouched, and nothing is mutated until the overlap has
    been computed successfully.

    Args:
        df: Windows to label; must contain Chromosome, Start and End columns.
            NOTE(review): ``strandedness="opposite"`` presumably also requires
            a Strand column on both frames -- confirm against pyranges.
        other_df: Annotation intervals to test against.
        label: Name of the new boolean column; must not already exist in ``df``.

    Returns:
        ``df`` with the additional ``label`` column.

    Raises:
        AssertionError: if either input is not a DataFrame, or ``label`` is
            already a column of ``df``.
    """
    # isinstance (rather than an exact type comparison) also admits subclasses
    assert isinstance(df, pd.DataFrame), "df must be a pandas DataFrame"
    assert isinstance(other_df, pd.DataFrame), "other_df must be a pandas DataFrame"
    assert label not in df.columns, f"{label} already in df.columns"

    coords = ["Chromosome", "Start", "End"]

    # get the windows that overlap with the other_df
    # TODO: check if reads are in same orientation as repeats
    overlapping = (
        pr.PyRanges(df).overlap(pr.PyRanges(other_df), strandedness="opposite").df
    )

    # An empty overlap result may come back as a frame without any columns,
    # so the coordinate columns cannot be selected from it; no window is hit.
    if overlapping.empty:
        df[label] = False
        return df

    # Compare coordinate triples without touching the caller's index.
    window_keys = pd.MultiIndex.from_frame(df[coords])
    hit_keys = pd.MultiIndex.from_frame(overlapping[coords])

    # label the windows that overlap (positional boolean array, one per row)
    df[label] = window_keys.isin(hit_keys)

    return df

In [11]:
# Derive a strand per window from frac_fwd: "+" when it is strictly greater
# than 0.5, "-" otherwise (a missing frac_fwd compares False and so maps to "-").
# Vectorised instead of a row-wise apply: it avoids calling a Python lambda per
# row and also works when res has no rows.
res["Strand"] = res["frac_fwd"].gt(0.5).map({True: "+", False: "-"})

In [10]:
# Display the column labels currently present on the windows table.
# NOTE(review): the saved output lists a lowercase 'strand' column while the
# notebook assigns res["Strand"], and this cell's execution count is lower than
# the preceding cell's -- restart the kernel and run all cells to refresh it.
res.columns

Index(['fwd', 'rev', 'nreads', 'frac_fwd', 'orientation_bias', 'starts_gini',
       'mean_template_length', 'sd_template_length', 'ML_mean', 'MG_mean',
       'MS_mean', 'MA_mean', 'Chromosome', 'Start', 'End', 'fwd_bg', 'rev_bg',
       'nreads_bg', 'frac_fwd_bg', 'orientation_bias_bg', 'starts_gini_bg',
       'mean_template_length_bg', 'sd_template_length_bg', 'ML_mean_bg',
       'MG_mean_bg', 'MS_mean_bg', 'MA_mean_bg', 'strand'],
      dtype='object')