# Edits analysis

In [None]:
from matplotlib_venn import venn3, venn3_circles
from typing import List, Tuple
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import pybedtools
import pysam
import os

In [None]:
# constant data variables
# NOTE: the sub-paths below must stay relative. os.path.join() throws away
# every earlier component as soon as it meets an absolute one, so a leading
# "/" here would silently drop BASEDIR from the result.
BASEDIR = "/data/"
# folder with one sub-folder per variant caller, each holding the
# "<guide>_<cell_type>.txt" tab-separated reports read by the cells below
REPORTSDIR = os.path.join(
    BASEDIR,
    "path/to/reports/folder",
)
# folder holding the tabix-indexed NA12878 (HG001) VCF opened by the cells below
NA12878DIR = os.path.join(
    BASEDIR,
    "path/to/NA12878",
)

## Number of edits called

In [None]:
# guide and cell type names; every report file read below is named
# "<guide>_<cell_type>.txt"
GUIDES = [
    "EMX1",
    "HEKSite4",
    "RNF2",
    "VEGFASite3",
]
CELLTYPES = [
    "GM12878",
    "K562",
]

In [None]:
# plot the number of edits called by each tool on each dataset
TOOLS = ["mutect2", "strelka", "varscan"]
for cell_type in CELLTYPES:
    for guide in GUIDES:
        # one row per tool: [TOOL, #snv, #insertion, #deletion, #polyploid]
        data = []
        for tool in TOOLS:
            report = os.path.join(REPORTSDIR, tool, f"{guide}_{cell_type}.txt")
            edits = pd.read_csv(report, sep="\t")
            counts = [
                len(edits[edits.EDITTYPE == edit_type])
                for edit_type in ("snv", "insertion", "deletion")
            ]
            # edit types containing "-" are reported in the POLYPLOID column
            counts.append(len(edits[edits.EDITTYPE.str.contains("-")]))
            data.append([tool.upper(), *counts])
        data = pd.DataFrame(
            data, columns=["TOOL", "SNV", "INSERTION", "DELETION", "POLYPLOID"]
        )
        # grouped (non-stacked) bars: one group per tool, one bar per edit type
        f, ax = plt.subplots(1, 1, figsize=(8, 8))
        data.plot(x="TOOL", kind="bar", stacked=False, ax=ax)
        ax.set_title(f"Edit type ({guide}-{cell_type})", size=16)
        ax.set_xlabel("Variant Calling Tool", size=14)
        ax.set_ylabel("Counts", size=14)
        ax.tick_params(axis="both", labelsize=12)
        plt.show()

## Number of edits by mismatches

In [None]:

# plot, for each guide / cell type / tool, the number of edits of each type
# found at 0..4 mismatches
for guide in GUIDES:
    for cell_type in CELLTYPES:
        for tool in TOOLS:
            report = os.path.join(REPORTSDIR, tool, f"{guide}_{cell_type}.txt")
            edits = pd.read_csv(report, sep="\t")
            # row selectors per edit type, in the order of the plotted columns;
            # edit types containing "-" go to the POLYPLOID column
            selectors = [
                edits.EDITTYPE == "snv",
                edits.EDITTYPE == "insertion",
                edits.EDITTYPE == "deletion",
                edits.EDITTYPE.str.contains("-"),
            ]
            data = []
            for mm in range(5):
                row = [mm]
                for selector in selectors:
                    row.append(len(edits[(selector) & (edits.MM == mm)]))
                data.append(row)
            data = pd.DataFrame(
                data, columns=["MM", "SNV", "INSERTION", "DELETION", "POLYPLOID"]
            )
            # grouped (non-stacked) bars: one group per mismatch count
            f, ax = plt.subplots(1, 1, figsize=(8, 8))
            data.plot(x="MM", kind="bar", stacked=False, ax=ax)
            ax.set_title(f"Edit type ({guide}-{cell_type}-{tool})", size=16)
            ax.set_xlabel("Mismatches", size=14)
            ax.set_ylabel("Counts", size=14)
            ax.tick_params(axis="both", labelsize=12, rotation=0)
            plt.show()

## Agreement of edit calls between tools

In [None]:
# Venn diagram of the edits shared by the three variant callers, drawn for
# every cell type / guide combination
for cell_type in CELLTYPES:
    for guide in GUIDES:
        # one set of "<column 1>:<column 7>" keys per tool, in TOOLS order
        variants = []
        for tool in TOOLS:
            edits = pd.read_csv(
                os.path.join(REPORTSDIR, tool, f"{guide}_{cell_type}.txt"), sep="\t"
            )
            # Columns are addressed by position through iloc: integer keys on
            # a label-indexed row (x[1], x[7]) rely on a deprecated positional
            # fallback, and DataFrame.apply() on an empty report hands back
            # the frame itself, which would turn column names into "variants".
            variants.append(
                {
                    f"{chrom}:{pos}"
                    for chrom, pos in zip(edits.iloc[:, 1], edits.iloc[:, 7])
                }
            )
        f, ax = plt.subplots(1, 1, figsize=(8, 8))
        # labels are derived from TOOLS so they cannot drift from the set order
        venn3(variants, set_labels=[name.upper() for name in TOOLS])
        venn3_circles(variants)
        ax.set_title(f"Edits called ({guide}-{cell_type})", size=16)
        plt.show()

## High-confidence SNPs called as edits

In [None]:
# chromosomes queried in the HC SNPs VCF: chr1 ... chr22
CHROMS = [f"chr{number}" for number in range(1, 23)]


# find number of edits that are high confidence variants
def fetch_snps(vcf: pysam.TabixFile, chrom: str, pos: int) -> List[str]:
    """Return the records of ``vcf`` found at position ``pos`` of ``chrom``.

    Args:
        vcf: object queried through ``vcf.fetch(chrom, start, end)``.
        chrom: chromosome name; anything not listed in ``CHROMS``
            (e.g. sex chromosomes) is skipped.
        pos: position of interest; the interval ``(pos - 1, pos)`` is fetched.

    Returns:
        The fetched records as a list, or an empty list when ``chrom`` is not
        in ``CHROMS``.
    """
    if chrom in CHROMS:
        start, stop = pos - 1, pos
        records = vcf.fetch(chrom, start, stop)
        return list(records)
    return []  # skip sex chroms

def high_confidence_snps(
    edits: pd.DataFrame, vcf: pysam.TabixFile
) -> Tuple[List[List[str]], pd.DataFrame]:
    """Look up, for every edit, the records of ``vcf`` at the edit position.

    Columns 1 and 7 of ``edits`` (by position) are passed to ``fetch_snps``
    as chromosome and position.

    Args:
        edits: edits report, one edit per row.
        vcf: tabix-indexed VCF to query.

    Returns:
        A tuple ``(hc_snps, hc_edits)``: ``hc_snps`` holds one list of fetched
        records per row of ``edits`` (empty when nothing was found), and
        ``hc_edits`` holds the rows of ``edits`` with at least one record.
    """
    # Columns are read by position through iloc. Integer keys on a
    # label-indexed row (x[1], x[7]) rely on a deprecated positional fallback,
    # and DataFrame.apply() on an empty frame returns the frame itself, whose
    # iteration yields column names instead of per-row results.
    hc_snps = [
        fetch_snps(vcf, chrom, pos)
        for chrom, pos in zip(edits.iloc[:, 1], edits.iloc[:, 7])
    ]
    overlapping = [i for i, snp in enumerate(hc_snps) if snp]
    return hc_snps, edits.iloc[overlapping, :]

def plot_hc_snps(
    hcsnps: List[List[str]],
    hcsnpsdf: pd.DataFrame,
    guide: str,
    cell_type: str,
    tool: str = "varscan",
) -> None:
    """Plot how many edits overlap a high-confidence SNP and save the figure.

    Two bars are drawn: "HC SNPs" (rows of ``hcsnpsdf``) and "Edits" (the
    remaining entries of ``hcsnps``). The figure is written to
    ``figures/hc_<tool>_<guide>_<cell_type>.png`` (lower-cased) and shown.

    Args:
        hcsnps: one entry per edit, as returned by ``high_confidence_snps``.
        hcsnpsdf: the edits that overlap a high-confidence SNP.
        guide: guide name, used in the title and the file name.
        cell_type: cell type name, used in the title and the file name.
        tool: variant caller name used in the file name. Defaults to
            "varscan", the prefix that used to be hard-coded; pass the actual
            tool so plots of different callers do not overwrite each other.
    """
    counts = [len(hcsnps) - hcsnpsdf.shape[0], hcsnpsdf.shape[0]]
    labels = ["Edits", "HC SNPs"]
    f, ax = plt.subplots(1, 1, figsize=(8, 8))
    ax.bar(labels, counts, width=.2, color="#0093D3")
    ax.set_ylabel("Counts", size=14)
    ax.tick_params(axis="both", labelsize=12)
    ax.set_title(
        f"Number of HC SNP called as edits ({guide}-{cell_type})", size=16
    )
    # savefig() does not create missing folders
    os.makedirs("figures", exist_ok=True)
    plt.savefig(
        os.path.join(
            "figures",
            f"hc_{tool.lower()}_{guide.lower()}_{cell_type.lower()}.png",
        )
    )
    plt.show()

In [None]:
# MUTECT2
tool = "mutect2"
na12878vcf = pysam.TabixFile(
    os.path.join(NA12878DIR, "HG001_GRCh38_1_22_v4.2.1_annotated.vcf.gz")
)  # HC SNPs VCF
for cell_type in CELLTYPES:
    for guide in GUIDES:
        edits = pd.read_csv(
            os.path.join(REPORTSDIR, tool, f"{guide}_{cell_type}.txt"), sep="\t"
        )
        hcsnps, hcsnpsdf = high_confidence_snps(edits, na12878vcf)
        # store HC SNPs called as edits
        outdir = os.path.join(REPORTSDIR, tool)
        # `if not os.path.exists:` tested the function object (always truthy),
        # so the folder was never created; makedirs(exist_ok=True) creates it
        # when missing and is a no-op otherwise
        os.makedirs(outdir, exist_ok=True)
        hcsnpsdf.to_csv(
            os.path.join(outdir, f"{guide}_{cell_type}_hc.txt"), sep="\t", index=False
        )
        plot_hc_snps(hcsnps, hcsnpsdf, guide, cell_type)  # plot histogram

In [None]:
# STRELKA
tool = "strelka"
na12878vcf = pysam.TabixFile(
    os.path.join(NA12878DIR, "HG001_GRCh38_1_22_v4.2.1_annotated.vcf.gz")
)  # HC SNPs VCF
for cell_type in CELLTYPES:
    for guide in GUIDES:
        edits = pd.read_csv(
            os.path.join(REPORTSDIR, tool, f"{guide}_{cell_type}.txt"), sep="\t"
        )
        hcsnps, hcsnpsdf = high_confidence_snps(edits, na12878vcf)
        # store HC SNPs called as edits
        outdir = os.path.join(REPORTSDIR, tool)
        # `if not os.path.exists:` tested the function object (always truthy),
        # so the folder was never created; makedirs(exist_ok=True) creates it
        # when missing and is a no-op otherwise
        os.makedirs(outdir, exist_ok=True)
        hcsnpsdf.to_csv(
            os.path.join(outdir, f"{guide}_{cell_type}_hc.txt"), sep="\t", index=False
        )
        plot_hc_snps(hcsnps, hcsnpsdf, guide, cell_type)  # plot histogram

In [None]:
# VARSCAN
tool = "varscan"
na12878vcf = pysam.TabixFile(
    os.path.join(NA12878DIR, "HG001_GRCh38_1_22_v4.2.1_annotated.vcf.gz")
)  # HC SNPs VCF
for cell_type in CELLTYPES:
    for guide in GUIDES:
        edits = pd.read_csv(
            os.path.join(REPORTSDIR, tool, f"{guide}_{cell_type}.txt"), sep="\t"
        )
        hcsnps, hcsnpsdf = high_confidence_snps(edits, na12878vcf)
        # store HC SNPs called as edits
        outdir = os.path.join(REPORTSDIR, tool)
        # `if not os.path.exists:` tested the function object (always truthy),
        # so the folder was never created; makedirs(exist_ok=True) creates it
        # when missing and is a no-op otherwise
        os.makedirs(outdir, exist_ok=True)
        hcsnpsdf.to_csv(
            os.path.join(outdir, f"{guide}_{cell_type}_hc.txt"), sep="\t", index=False
        )
        plot_hc_snps(hcsnps, hcsnpsdf, guide, cell_type)  # plot histogram

## HC SNPs and edits agreement

In [None]:
# compute agreement between edits and high-confidence SNPs
CHROMS = [f"chr{i}" for i in range(1, 23)]


def compare_alleles(alte: str, alts: str) -> int:
    """Tell whether an edit and a SNP share at least one alternative allele.

    Both arguments may list several alleles separated by commas. The SNP
    alleles are split too: comparing ``alts`` as a single string made a
    multi-allelic SNP such as "A,T" never match, not even an edit carrying
    exactly the same alleles.

    Args:
        alte: alternative allele(s) of the edit, e.g. "A" or "A,T".
        alts: alternative allele(s) of the SNP, e.g. "T" or "A,T".

    Returns:
        1 if any allele of ``alte`` equals any allele of ``alts``, 0 otherwise.
    """
    edit_alleles = set(alte.split(","))  # handle polyploid alleles
    return int(any(allele in edit_alleles for allele in alts.split(",")))

def hcedits_jaccard(edits: pd.DataFrame, vcf: pysam.TabixFile) -> float:
    """Return the fraction of SNP-overlapping edits whose allele matches the SNP.

    For every row of ``edits``, columns 1 and 7 (by position) are passed to
    ``fetch_snps`` as chromosome and position. Among the edits with at least
    one fetched record, the value in column 9 is compared through
    ``compare_alleles`` with the fifth whitespace-separated field of the
    FIRST fetched record (the ALT column of a VCF line).

    Args:
        edits: edits report, one edit per row.
        vcf: tabix-indexed VCF to query.

    Returns:
        Matching edits divided by overlapping edits, or ``np.nan`` when no
        edit overlaps a record of ``vcf``.
    """
    # Columns are read by position through iloc. Integer keys on a
    # label-indexed row (x[1], x[7]) rely on a deprecated positional fallback,
    # and DataFrame.apply() on an empty frame returns the frame itself, whose
    # iteration yields column names instead of per-row results.
    snps = [
        fetch_snps(vcf, chrom, pos)
        for chrom, pos in zip(edits.iloc[:, 1], edits.iloc[:, 7])
    ]
    hceditsidxs = [i for i, snp in enumerate(snps) if snp]
    if not hceditsidxs:
        return np.nan
    matches = sum(
        compare_alleles(edits.iloc[i, 9], snps[i][0].split()[4])
        for i in hceditsidxs
    )
    return matches / len(hceditsidxs)

def plot_heatmap(hcedits: pd.DataFrame, tool: str) -> None:
    """Show ``hcedits`` as an annotated heatmap with a fixed 0-1 colour scale.

    Args:
        hcedits: values to draw; the x axis is labelled "Cell type" and the
            y axis "Guide".
        tool: variant caller name appended to the figure title.
    """
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 8))
    # drawn on the current axes, i.e. the ones created just above
    sns.heatmap(data=hcedits, vmin=0, vmax=1, annot=True, cmap="coolwarm")
    axes.set_xlabel("Cell type", size=14)
    axes.set_ylabel("Guide", size=14)
    axes.set_title(f"Jaccard distance HC edits - {tool}", size=16)
    axes.tick_params(axis="both", labelsize=12)
    plt.show()

In [None]:
# MUTECT2
tool = "mutect2"
# HC SNPs VCF
na12878vcf = pysam.Tabixfile(
    os.path.join(NA12878DIR, "HG001_GRCh38_1_22_v4.2.1_annotated.vcf.gz")
)
# compute Jaccard distances: outer keys are cell types, inner keys are guides
jaccard_distances = {}
for cell_type in CELLTYPES:
    jaccard_distances[cell_type] = {}
    for guide in GUIDES:
        edits = pd.read_csv(
            os.path.join(REPORTSDIR, tool, f"{guide}_{cell_type}.txt"), sep="\t"
        )
        jaccard_distances[cell_type][guide] = hcedits_jaccard(edits, na12878vcf)
# the nested dict becomes a frame with guides as rows and cell types as columns
plot_heatmap(pd.DataFrame(jaccard_distances), tool)

In [None]:
# STRELKA
tool = "strelka"
# HC SNPs VCF
na12878vcf = pysam.Tabixfile(
    os.path.join(NA12878DIR, "HG001_GRCh38_1_22_v4.2.1_annotated.vcf.gz")
)
# compute Jaccard distances: outer keys are cell types, inner keys are guides
jaccard_distances = {}
for cell_type in CELLTYPES:
    jaccard_distances[cell_type] = {}
    for guide in GUIDES:
        edits = pd.read_csv(
            os.path.join(REPORTSDIR, tool, f"{guide}_{cell_type}.txt"), sep="\t"
        )
        jaccard_distances[cell_type][guide] = hcedits_jaccard(edits, na12878vcf)
# the nested dict becomes a frame with guides as rows and cell types as columns
plot_heatmap(pd.DataFrame(jaccard_distances), tool)

In [None]:
# VARSCAN
tool = "varscan"
# HC SNPs VCF
na12878vcf = pysam.Tabixfile(
    os.path.join(NA12878DIR, "HG001_GRCh38_1_22_v4.2.1_annotated.vcf.gz")
)
# compute Jaccard distances: outer keys are cell types, inner keys are guides
jaccard_distances = {}
for cell_type in CELLTYPES:
    jaccard_distances[cell_type] = {}
    for guide in GUIDES:
        edits = pd.read_csv(
            os.path.join(REPORTSDIR, tool, f"{guide}_{cell_type}.txt"), sep="\t"
        )
        jaccard_distances[cell_type][guide] = hcedits_jaccard(edits, na12878vcf)
# the nested dict becomes a frame with guides as rows and cell types as columns
plot_heatmap(pd.DataFrame(jaccard_distances), tool)