# Evaluate SLAV-seq coverage of RMSK and KNRGL


In [None]:
from subprocess import Popen, PIPE, DEVNULL
from tempfile import NamedTemporaryFile
from io import StringIO
from collections import defaultdict
from pathlib import Path

import pandas as pd
import pyranges as pr
import pysam

from tqdm import tqdm

from scripts.get_labels import read_knrgl
from myutils.rmsk import read_rmsk

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Build a per-donor lookup of the KNRGL insertion BED and the bulk BAM.
donor_sheet = pd.read_csv(
    "/iblm/logglun02/mcuoco/workflows/sz_slavseq/config/bulk_donors.tsv", sep="\t"
)
individuals = donor_sheet["donor_id"].values.astype(str)
indv_data = {donor_id: {"knrgl": None, "bulk": None} for donor_id in individuals}

# Bulk BAMs: the name of the BAM's parent directory is taken as the donor ID.
align_dir = Path("/iblm/netapp/data4/mcuoco/sz_slavseq/results/align/")
for bulk in align_dir.rglob("*/gDNA_usd*.sorted.bam"):
    donor = bulk.parts[-2]
    if donor not in individuals:
        continue
    indv_data[donor]["bulk"] = str(bulk)

# KNRGL BEDs: the file-name prefix before the first underscore is the donor ID.
resources_dir = Path("/iblm/netapp/data4/mcuoco/sz_slavseq/resources/")
for knrgl in resources_dir.rglob("*_insertions.bed"):
    donor = knrgl.name.split("_")[0]
    if donor not in individuals:
        continue
    indv_data[donor]["knrgl"] = str(knrgl)

## Calculate coverage of RMSK and KNRGL per individual

In [None]:
# define helper functions
# Names given to the 12 tab-separated fields parsed for each alignment.
# "x", "y" and "z" are placeholders for fields that are discarded after parsing.
# NOTE(review): "Flag" is the 11th field; in BED12 that position holds block
# sizes rather than the SAM flag — confirm what this column really contains.
_ALIGNMENT_COLUMNS = [
    "Chromosome",
    "Start",
    "End",
    "read_id",
    "mapq",
    "Strand",
    "intersect_start",
    "intersect_end",
    "x",
    "y",
    "Flag",
    "z",
]

# Names given to the 6 extra fields describing the overlapping `-b` feature.
_FEATURE_COLUMNS = [
    "ChromosomeB",
    "StartB",
    "EndB",
    "NameB",
    "ScoreB",
    "StrandB",
]

_DISCARDED_COLUMNS = ["x", "y", "z"]


def _run_and_parse(cmd, names, **read_csv_kwargs):
    """Run the shell pipeline *cmd* and parse its tab-separated stdout.

    Args:
        cmd: shell command line; its stdout is read, its stderr is discarded.
        names: column names assigned to the header-less output.
        **read_csv_kwargs: extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        DataFrame with the placeholder columns removed and ``Flag`` converted
        to int. An empty DataFrame with the same columns is returned when the
        pipeline succeeds but prints nothing.

    Raises:
        subprocess.CalledProcessError: the pipeline exited with a non-zero
            status (for a plain ``sh`` pipeline this is the status of the
            last command).
    """
    # local import: the file-level import only brings in Popen, PIPE, DEVNULL
    from subprocess import CalledProcessError

    # the context manager closes the pipe and reaps the child process
    with Popen(cmd, shell=True, stdout=PIPE, stderr=DEVNULL) as proc:
        raw, _ = proc.communicate()
    if proc.returncode != 0:
        # stderr is discarded, so the exit status is the only failure signal
        raise CalledProcessError(proc.returncode, cmd)

    if not raw.strip():
        # no records at all: pd.read_csv would raise EmptyDataError here
        kept = [name for name in names if name not in _DISCARDED_COLUMNS]
        return pd.DataFrame(columns=kept)

    df = pd.read_csv(
        StringIO(raw.decode()), sep="\t", header=None, names=names, **read_csv_kwargs
    )
    df = df.drop(columns=_DISCARDED_COLUMNS)
    # values carry a trailing comma; astype(str) keeps this working even if
    # pandas already parsed the column as numeric
    df["Flag"] = df["Flag"].astype(str).str.rstrip(",").astype(int)
    return df


def bedtools_intersect(bam, bed, view_args=""):
    """Return one row per (alignment, feature) overlap between *bam* and *bed*.

    Runs ``samtools view -b`` on *bam* with *view_args* and pipes the result
    into ``bedtools intersect -bed -wa -wb`` against *bed*.

    Args:
        bam: path to the BAM file.
        bed: path to the BED file of features.
        view_args: extra command-line options for ``samtools view``
            (inserted verbatim into the shell command).

    Returns:
        DataFrame with the alignment columns followed by the feature columns
        (``ChromosomeB`` ... ``StrandB``); empty if nothing overlaps.

    Raises:
        subprocess.CalledProcessError: the pipeline exited with non-zero status.
    """
    cmd = f"samtools view -b {bam} {view_args} | bedtools intersect -abam stdin -b {bed} -bed -wa -wb"
    return _run_and_parse(cmd, _ALIGNMENT_COLUMNS + _FEATURE_COLUMNS)


def bedtools_inverse_intersect(bam, bed, view_args=""):
    """Return the alignments of *bam* that overlap none of the given BED files.

    Runs ``samtools view -b`` on *bam* with *view_args* and pipes the result
    into ``bedtools intersect -bed -v -wa``.

    Args:
        bam: path to the BAM file.
        bed: a single BED path, or a list/tuple of BED paths passed to ``-b``.
        view_args: extra command-line options for ``samtools view``
            (inserted verbatim into the shell command).

    Returns:
        DataFrame with the alignment columns only; malformed lines are skipped.

    Raises:
        subprocess.CalledProcessError: the pipeline exited with non-zero status.
    """
    beds = bed if isinstance(bed, (list, tuple)) else [bed]
    cmd = f"samtools view -b {bam} {view_args} | bedtools intersect -abam stdin -bed -v -wa -b"
    cmd += "".join(f" {path}" for path in beds)
    return _run_and_parse(cmd, _ALIGNMENT_COLUMNS, on_bad_lines="skip")

In [None]:
# Read the RepeatMasker output and reduce it to the reference L1 intervals
# that reads are later intersected with.
rmsk = read_rmsk("/iblm/netapp/data4/mcuoco/sz_slavseq/resources/hs38d1.fa.out")

rep_names = [
    "L1HS_3end",
    "L1PA2_3end",
    "L1PA3_3end",
    "L1PA4_3end",
    "L1PA5_3end",
    "L1PA6_3end",
]

# Row filter as boolean masks (previously a row-wise apply returning None for
# rejected rows, followed by dropna). Keep the listed repeat models whose match
# ends past repEnd 860 and starts before 765: repStart on "+", repLeft on "-".
# NOTE(review): unlike the old dropna(), rows with a missing value in an
# unrelated column are no longer dropped — confirm read_rmsk yields none.
on_plus = rmsk["strand"] == "+"
on_minus = rmsk["strand"] == "-"
keep = (
    rmsk["repName"].isin(rep_names)
    & (rmsk["repEnd"] > 860)
    & ((on_plus & (rmsk["repStart"] < 765)) | (on_minus & (rmsk["repLeft"] < 765)))
)
# copy so the column assignments below never write into a view of the full table
rmsk = rmsk.loc[keep, :].copy()

# Extend each interval by 1000 bp on one side only:
# before genoStart for "-" strand hits, past genoEnd for "+" strand hits.
rmsk["genoStart"] = rmsk["genoStart"].mask(
    rmsk["strand"] == "-", rmsk["genoStart"] - 1000
)
rmsk["genoEnd"] = rmsk["genoEnd"].mask(rmsk["strand"] == "+", rmsk["genoEnd"] + 1000)

rmsk = rmsk.rename(
    columns={
        "genoName": "Chromosome",
        "genoStart": "Start",
        "genoEnd": "End",
        "strand": "Strand",
    }
).loc[:, ["Chromosome", "Start", "End", "Strand"]]

# intervals pushed below coordinate 0 by the extension are dropped, not clipped
rmsk = rmsk.loc[(rmsk.Start >= 0) & (rmsk.End >= 0), :]

In [None]:
# Read groups to search: label -> `samtools view` filter options.
# SAM flag bits used: 2 = proper pair, 64 = first in pair (read1),
# 128 = second in pair (read2), 256 = secondary alignment.
# -f requires the bits, -F excludes them; 258 = 256 + 2.
read_groups = {
    "all": "",
    "read1": "-f 64",
    "read2": "-f 128",
    "primary read1": "-F 256 -f 64",
    "primary read2": "-F 256 -f 128",
    "proper pair": "-f 2",
    "primary proper pair": "-F 256 -f 2",
    # read1/read2 were swapped here: -f 128 was labelled read1 and -f 64 read2
    "not proper pair primary read1": "-F 258 -f 64",
    "not proper pair primary read2": "-F 258 -f 128",
}

In [None]:
# collect knrgl and rmsk coverage info for each donor that has 90x WGS (repeat once 30x WGS is available)

res = []

# the temporary BED files are context-managed so they are removed even if an
# iteration raises part-way through this long-running loop
with NamedTemporaryFile() as rmsk_tmp:
    pr.PyRanges(rmsk).to_bed(rmsk_tmp.name)

    # ~30 min per iteration
    # for each donor
    for indv, data in tqdm(indv_data.items(), total=len(indv_data)):
        bam = data["bulk"]
        if bam is None or data["knrgl"] is None:
            # no BAM/BED was found for this donor; skip instead of crashing
            tqdm.write(f"skipping {indv}: missing bulk BAM or KNRGL BED")
            continue

        knrgl = read_knrgl(data["knrgl"])

        with NamedTemporaryFile() as knrgl_tmp:
            pr.PyRanges(knrgl).to_bed(knrgl_tmp.name)

            # for each read group
            for k, v in read_groups.items():
                # total reads: every option must be its own argument, otherwise
                # a string like "-F 256 -f 64" reaches samtools as one token and
                # the options after the first are not parsed as options
                total_reads = int(pysam.view("-c", *v.split(), bam).rstrip("\n"))

                # reads outside both rmsk and knrgl regions
                outside_reads = bedtools_inverse_intersect(
                    bam, [rmsk_tmp.name, knrgl_tmp.name], view_args=v
                )

                # reads overlapping knrgl and rmsk regions
                knrgl_reads = bedtools_intersect(bam, knrgl_tmp.name, view_args=v)
                knrgl_reads["label"] = "KNRGL"
                rmsk_reads = bedtools_intersect(bam, rmsk_tmp.name, view_args=v)
                rmsk_reads["label"] = "RMSK"

                # number of reads per region (regions with no reads do not appear)
                df = (
                    pd.concat([knrgl_reads, rmsk_reads])
                    .groupby(["ChromosomeB", "StartB", "EndB", "label"])
                    .size()
                    .reset_index(name="reads")
                )

                df["read_group"] = k
                df["indv"] = indv
                df["total_reads"] = total_reads
                df["outside_reads"] = len(outside_reads)

                res.append(df)

res = pd.concat(res)

res.to_csv("indv_coverage.tsv", sep="\t", index=False)

## Analyze results

In [None]:
# Reload the per-region coverage table from disk. Donor IDs are read as strings
# so they keep the dtype they had when the table was built (numeric-looking IDs
# would otherwise come back as integers).
res = pd.read_csv("indv_coverage.tsv", sep="\t", dtype={"indv": str})

In [None]:
# ECDF of reads per region using all alignments: one panel per label,
# one curve per donor, log-scaled x axis.
all_alignments = res[res["read_group"] == "all"]
g = sns.FacetGrid(
    all_alignments,
    col="label",
    hue="indv",
    height=4.5,
    aspect=1,
    sharex=False,
    sharey=True,
    gridspec_kws={"wspace": 0.5},
)
g.map_dataframe(sns.ecdfplot, x="reads", stat="proportion")
g.set(xscale="log")
g.add_legend()

In [None]:
# ECDF of reads per KNRGL region: one panel per read group (wrapped at 3
# columns), one curve per donor, log-scaled x axis.
# `gridspec_kws` is intentionally not passed: FacetGrid ignores it, with a
# warning, whenever `col_wrap` is set.
g = sns.FacetGrid(
    res[res["label"] == "KNRGL"],
    col="read_group",
    col_wrap=3,
    hue="indv",
    sharex=False,
    sharey=True,
    height=4.5,
    aspect=1,
)
g.map_dataframe(sns.ecdfplot, x="reads", stat="proportion")
g.set(xscale="log")
g.add_legend()

In [None]:
# ECDF of reads per RMSK region: one panel per read group (wrapped at 3
# columns), one curve per donor, log-scaled x axis.
# `gridspec_kws` is intentionally not passed: FacetGrid ignores it, with a
# warning, whenever `col_wrap` is set.
g = sns.FacetGrid(
    res[res["label"] == "RMSK"],
    col="read_group",
    col_wrap=3,
    hue="indv",
    sharex=False,
    sharey=True,
    height=4.5,
    aspect=1,
)
g.map_dataframe(sns.ecdfplot, x="reads", stat="proportion")
g.set(xscale="log")
g.add_legend()

In [None]:
# Summarise coverage per donor and read group, in long format for plotting.
group_keys = ["read_group", "indv", "total_reads", "outside_reads"]

# Every row of `res` is one region with its read count, so counting rows per
# label gives the number of covered regions; one column per label afterwards.
plot_df = (
    res.groupby(group_keys + ["label"])["reads"]
    .count()
    .unstack("label")
    .reset_index()
    .rename(columns={"KNRGL": "knrgl_covered", "RMSK": "rmsk_covered"})
)

# donor IDs may have been re-read from disk as integers; indv_data keys are str
plot_df["indv"] = plot_df["indv"].astype(str)

# Total number of KNRGL regions per donor. A donor without a KNRGL file or
# absent from indv_data gets NaN, so its fraction is visibly missing rather
# than being computed from a placeholder total.
knrgl_totals = {
    indv: len(read_knrgl(data["knrgl"]))
    for indv, data in indv_data.items()
    if data["knrgl"] is not None
}
plot_df["total_knrgl"] = plot_df["indv"].map(knrgl_totals)
# the rmsk interval set is shared by all donors
plot_df["total_rmsk"] = len(rmsk)

plot_df["frac knrgl covered"] = plot_df.knrgl_covered / plot_df.total_knrgl
plot_df["frac rmsk covered"] = plot_df.rmsk_covered / plot_df.total_rmsk

plot_df["inside_reads"] = plot_df.total_reads - plot_df.outside_reads
plot_df["frac alignments covering L1"] = plot_df.inside_reads / plot_df.total_reads

# Long format: one row per (read_group, indv, metric). The metric column is
# named "label" explicitly because the plots facet on it; previously that name
# was only inherited from the leftover columns.name of the pivot.
plot_df = plot_df.rename(columns={"total_reads": "total alignments"}).melt(
    id_vars=["read_group", "indv"],
    value_vars=[
        "total alignments",
        "frac alignments covering L1",
        "frac knrgl covered",
        "frac rmsk covered",
    ],
    var_name="label",
)

In [None]:
# Horizontal bars per donor for the "all" read group, one panel per metric.
all_group = plot_df[plot_df["read_group"] == "all"]
sns.catplot(
    data=all_group,
    x="value",
    y="indv",
    col="label",
    col_wrap=2,
    kind="bar",
    sharex=False,
    sharey=False,
    height=4,
    aspect=1,
    legend=True,
)

In [None]:
# Bars per donor for every metric (rows) and read group (columns);
# the y axis is shared within each metric row.
sns.catplot(
    data=plot_df,
    x="indv",
    y="value",
    row="label",
    col="read_group",
    kind="bar",
    sharex=False,
    sharey="row",
    height=4,
    aspect=1,
)