# Try different strategies to reduce false positives (FPs)

1. Different window sizes

In [1]:
from pathlib import Path
import pandas as pd
import pyranges as pr
import numpy as np
from joblib import Parallel, delayed
from pysam import AlignmentFile
from scripts.pyslavseq.sliding_window import SlidingWindow
from scripts.pyslavseq.preprocessing import label

In [2]:
# get bam files for feature generation
# bad_cells.txt is read as one cell ID per line (surrounding whitespace stripped).
with open("../resources/bad_cells.txt", "r") as fh:
    bad_cells = [line.strip() for line in fh]

# NOTE: str.rstrip(".tagged.sorted") strips any trailing run of the
# *characters* ". t a g e d s o r", not the literal suffix, so IDs ending in
# one of those characters would be truncated. Slice the suffix off explicitly.
bam_stem_suffix = ".tagged.sorted"

files = []
for bam_path in Path("../results/align/CommonBrain/").rglob("*.tagged.sorted.bam"):
    stem = bam_path.stem  # file name without the trailing ".bam"
    if stem.endswith(bam_stem_suffix):
        cell_id = stem[: -len(bam_stem_suffix)]
    else:
        cell_id = stem
    if cell_id not in bad_cells:
        files.append(bam_path)

In [3]:
# helper function to calculate features for each cell
def get_features(bamfile: str, size: int, step: int) -> pd.DataFrame:
    """Calculate sliding-window features for a single cell.

    Parameters
    ----------
    bamfile : str
        Path to a ``*.tagged.sorted.bam`` file. The cell ID is the file name
        with the ``.tagged.sorted.bam`` suffix removed.
    size : int
        Window size, forwarded to ``SlidingWindow.make_windows``.
    step : int
        Window step, forwarded to ``SlidingWindow.make_windows``.

    Returns
    -------
    pd.DataFrame
        One row per retained window plus a ``cell_id`` column. A window is
        retained when its ``n_ref_reads`` is not greater than 0 and its
        ``Chromosome`` is one of chr1..chr22. If no window is retained the
        frame is empty (only the ``cell_id`` column).
    """
    # Build the autosome lookup once instead of once per window.
    autosomes = {f"chr{i}" for i in range(1, 23)}

    # str.rstrip() strips a set of characters, not a suffix, and would eat
    # trailing letters of the cell ID itself; remove the suffix explicitly.
    suffix = ".tagged.sorted"
    stem = Path(bamfile).stem
    cell_id = stem[: -len(suffix)] if stem.endswith(suffix) else stem

    kept = []
    with AlignmentFile(bamfile, "rb") as bam:
        sw = SlidingWindow(bam, min_mapq=5).make_windows(
            size=size,
            step=step,
            strand_split=False,
            merge=False,
            features=True,
        )
        for w in sw:
            if w["n_ref_reads"] > 0:
                continue
            # keep autosomes only
            if w["Chromosome"] not in autosomes:
                continue
            kept.append(w)

    # The autosome filter already ran in the loop, so no second pass over the
    # frame is needed; that second pass also raised KeyError on an empty frame.
    windows = pd.DataFrame(kept)
    windows["cell_id"] = cell_id

    return windows

In [4]:
# blacklist regions
# Each region set is a gzipped, tab-separated BED fetched over HTTPS from the
# GIAB genome-stratifications v3.0 (GRCh38) release and read as 3 columns.
def _read_bed3(url):
    """Read a 3-column BED into a DataFrame with Chromosome/Start/End columns.

    The first line of the file is skipped (presumably a header line;
    confirm against the files themselves).
    """
    return pd.read_csv(
        url,
        sep="\t",
        header=None,
        skiprows=1,
        names=["Chromosome", "Start", "End"],
    )


mhc = _read_bed3(
    "https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.0/GRCh38/OtherDifficult/GRCh38_MHC.bed.gz"
)
kir = _read_bed3(
    "https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.0/GRCh38/OtherDifficult/GRCh38_KIR.bed.gz"
)
trs = _read_bed3(
    "https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.0/GRCh38/LowComplexity/GRCh38_AllTandemRepeats_201to10000bp_slop5.bed.gz"
)
segdups = _read_bed3(
    "https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.0/GRCh38/SegmentalDuplications/GRCh38_segdups.bed.gz"
)
gaps = _read_bed3(
    "https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.0/GRCh38/OtherDifficult/GRCh38_gaps_slop15kb.bed.gz"
)
false_dup = _read_bed3(
    "https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/release/genome-stratifications/v3.0/GRCh38/OtherDifficult/GRCh38_false_duplications_correct_copy.bed.gz"
)

# Stack every region set into one table (row indices are not reset).
blacklist = pd.concat([mhc, trs, segdups, gaps, false_dup, kir])

In [6]:
# Annotation tables used for labeling, keyed by the label name.
# Insertion order matters: labels are applied in dict order downstream.
anno = {}
for anno_key, bed_path in [
    (
        "xtea",
        "/iblm/netapp/data4/mcuoco/sz_slavseq/resources/CommonBrain_insertions.bed",
    ),
    (
        "xtea_1kb_3end",
        "/iblm/netapp/data4/mcuoco/sz_slavseq/resources/CommonBrain_insertions_1kb_3end.bed",
    ),
    (
        "xtea_20kb",
        "/iblm/netapp/data4/mcuoco/sz_slavseq/resources/CommonBrain_insertions_20kb.bed",
    ),
]:
    anno[anno_key] = pr.read_bed(bed_path).df

# rmsk tables in three variants, matching the three key suffixes below.
rmsk = pr.read_bed("/iblm/netapp/data4/mcuoco/sz_slavseq/resources/rmsk.bed").df
rmsk_1kb_3end = pr.read_bed(
    "/iblm/netapp/data4/mcuoco/sz_slavseq/resources/rmsk_1kb_3end.bed"
).df
rmsk_20kb = pr.read_bed(
    "/iblm/netapp/data4/mcuoco/sz_slavseq/resources/rmsk_20kb.bed"
).df

rmsk_variants = [
    ("", rmsk),
    ("_1kb_3end", rmsk_1kb_3end),
    ("_20kb", rmsk_20kb),
]

# For each name, keep the rows whose "Name" column contains it
# (str.contains treats the name as a pattern and matches substrings).
for l1 in ["L1HS", "L1PA2", "L1PA3", "L1PA4", "L1PA5", "L1PA6"]:
    for key_suffix, rmsk_table in rmsk_variants:
        name_hits = rmsk_table["Name"].str.contains(l1)
        anno[l1 + key_suffix] = rmsk_table.loc[name_hits, :]

In [8]:
# Sweep window sizes (fixed step of 250): compute per-cell features, drop
# blacklisted windows, attach one label column per annotation, and pickle
# one table per window size.

# Create the output directory up front so a missing folder cannot make
# to_pickle fail after the expensive parallel step has already run.
out_dir = Path("../results/model/experiments")
out_dir.mkdir(parents=True, exist_ok=True)

for s in [750, 1500, 3000, 6000, 12000, 24000]:
    # one DataFrame per cell, computed in parallel worker processes
    per_cell = Parallel(n_jobs=32, verbose=2)(
        delayed(get_features)(str(f), s, 250) for f in files
    )
    data = pd.concat(per_cell)

    # remove windows flagged as blacklisted, then drop the helper column
    data = label(data, blacklist, "blacklist")
    # kept as an explicit comparison: the dtype returned by label() is not
    # visible here, so `~data["blacklist"]` might not be equivalent
    data = data.loc[data["blacklist"] == False]  # noqa: E712
    data = data.drop(columns=["blacklist"])

    # `anno_name` rather than `id`, which would shadow the builtin for the
    # rest of the notebook session
    for anno_name, anno_df in anno.items():
        data = label(data, anno_df, anno_name)

    assert (
        data.shape[0]
        == data[["Chromosome", "Start", "End", "cell_id"]].drop_duplicates().shape[0]
    ), "some rows have been duplicated during labeling!"

    # save to pickle
    data.to_pickle(out_dir / f"CommonBrain_{s}.pkl")

[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
