# Construct edits datasets

In [1]:
from typing import Any, List, Optional
from tqdm import tqdm

import pandas as pd

import sys
import os

In [2]:
# constant data variables
# NOTE: os.path.join() throws away every earlier component as soon as a later
# one is absolute, so the sub-paths below must stay relative (no leading "/")
# for BASEDIR to be part of the resulting paths.
BASEDIR = "/data/"
CASOFFINDER = os.path.join(BASEDIR, "path/to/casoffinder")
EDITSDIR = os.path.join(
    BASEDIR, 
    "path/to/edits/folder/",
)
REPORTSDIR = os.path.join(
    BASEDIR, 
    "path/to/reports/folder/",
)

In [16]:
# functions to create the reports
def casoffinder_report(guide: str) -> str:
    """Build the file name of the Cas-OFFinder report for ``guide``.

    Before the name is assembled, the substrings "Site3" and "Site4" in the
    guide name are shortened to "3" and "4" respectively.

    Args:
        guide: guide name (e.g. "VEGFASite3").

    Returns:
        The report file name, ``casoffinder.<short guide name>.txt.out``.
    """
    short_name = guide
    for long_tag, short_tag in (("Site3", "3"), ("Site4", "4")):
        short_name = short_name.replace(long_tag, short_tag)
    return "casoffinder." + short_name + ".txt.out"

def recover_site(
    chrom: str, pos: int, strand: str, upstream: bool, downstream: bool
) -> str:
    """Return the ``chrom:start-stop`` label of the window tied to a site.

    The window centre is ``pos + 17`` on the "+" strand and ``pos + 6`` for
    any other strand value; the site window spans 10 positions on each side
    of that centre.
    NOTE(review): the 17/6 offsets presumably locate the expected cut site
    within the reported target -- confirm.

    Args:
        chrom: chromosome name.
        pos: target position as reported for the site.
        strand: strand of the target ("+" selects the 17 offset).
        upstream: if true, return the 20-position window ending right before
            the site window instead of the site window itself.
        downstream: if true (and ``upstream`` is false), return the
            20-position window starting right after the site window.

    Returns:
        The window formatted as ``"<chrom>:<start>-<stop>"``.
    """
    offset = 17 if strand == "+" else 6
    midpoint = pos + offset
    left, right = midpoint - 10, midpoint + 10
    if upstream:
        # flanking window immediately before the site window
        right = left - 1
        left = right - 20
    elif downstream:
        # flanking window immediately after the site window
        left = right + 1
        right = left + 20
    return f"{chrom}:{left}-{right}"

def _vcf_paths(vcf: str, tool: str) -> List[str]:
    """Return the VCF file(s) that ``tool`` is expected to provide for ``vcf``."""
    # None: the tool writes a single file, read as given.
    # Otherwise: suffixes replacing ".vcf", listed SNVs first, then indels.
    suffixes = {
        "mutect2": None,
        "strelka": ("_somatic.snvs.vcf", "_somatic.indels.vcf"),
        "varscan": (".snp.vcf", ".indel.vcf"),
    }
    if tool not in suffixes:
        raise ValueError(
            f"Unknown variant caller {tool!r}; expected one of {sorted(suffixes)}"
        )
    tool_suffixes = suffixes[tool]
    if tool_suffixes is None:
        return [vcf]
    if vcf.endswith(".vcf"):
        # swap only the trailing extension, so that a ".vcf" occurring in a
        # directory name is left untouched
        stem = vcf[: -len(".vcf")]
        return [stem + suffix for suffix in tool_suffixes]
    # no trailing ".vcf": keep the substring replacement as a fallback
    return [vcf.replace(".vcf", suffix) for suffix in tool_suffixes]


def read_vcf(vcf: str, tool: str) -> List[List[Any]]:
    """Read the variant records called by ``tool`` for one site.

    For "mutect2" the file ``vcf`` is read as is. For "strelka" and "varscan"
    the SNV and indel calls live in two separate files whose names are derived
    from ``vcf`` by swapping its ".vcf" extension; SNV records come first in
    the returned list, followed by indel records.

    Args:
        vcf: path of the VCF file for the site.
        tool: variant caller name: "mutect2", "strelka" or "varscan".

    Returns:
        One list of whitespace-split fields per record. Lines starting with
        "#" and blank lines are skipped.

    Raises:
        ValueError: if ``tool`` is not one of the supported callers.
        OSError: if an expected file cannot be opened.
    """
    variants: List[List[Any]] = []
    for fname in _vcf_paths(vcf, tool):
        with open(fname, mode="r") as infile:
            variants.extend(
                line.strip().split()
                for line in infile
                if line.strip() and not line.startswith("#")
            )
    return variants

def read_edits(edits: pd.Series, site: str) -> pd.DataFrame:
    """Turn the records called on one site into a tidy table.

    Args:
        edits: the records of a single site, one sequence of fields per
            record (in this notebook: the list of rows produced by
            ``read_vcf``).
        site: label of the site, stored in every row so the table can later
            be joined with the off-targets table.

    Returns:
        A DataFrame with columns CHROM, EDITPOS, REF, ALT (fields 0, 1, 3 and
        4 of each record) and SITE; a column-less empty DataFrame when there
        are no records.
    """
    records = pd.DataFrame(edits)
    if records.empty:
        return pd.DataFrame()
    # chain of non-mutating steps: select fields, name them, tag with the site
    return (
        records.iloc[:, [0, 1, 3, 4]]
        .set_axis(["CHROM", "EDITPOS", "REF", "ALT"], axis=1)
        .assign(SITE=site)
    )

def edits_df(edits: pd.Series, offtargets: pd.DataFrame) -> pd.DataFrame: 
    """Join the edits called on every site with the off-targets table.

    Args:
        edits: one entry per off-target row holding the records called on
            that row's site; entries are looked up by the labels
            ``0 .. len(edits) - 1``.
        offtargets: off-targets table; must provide a ``SITE`` column and
            share its index labels with ``edits``.

    Returns:
        The off-target rows merged (inner join on ``SITE``) with their edits,
        without the redundant ``CHROM`` column and with a fresh index. When
        no edit was called at all, an empty DataFrame carrying the expected
        report columns.
    """
    per_site = [
        read_edits(edits.loc[i], offtargets.SITE[i]) for i in range(edits.shape[0])
    ]
    # Sites without edits yield column-less empty frames: drop them so that
    # pd.concat() is never fed an empty list (ValueError) nor empty frames
    # (deprecated in recent pandas releases).
    per_site = [frame for frame in per_site if not frame.empty]
    if not per_site:
        return pd.DataFrame(
            columns=[
                "GUIDE", "CHR", "POS", "TARGET", "STRAND", "MM", "SITE", "EDITPOS", "REF", "ALT", "EDITTYPE"
            ]
        )
    called = pd.concat(per_site)
    # NOTE(review): if several off-target rows share the same SITE, this merge
    # pairs each of them with every edit row read for that site -- confirm
    # that the resulting duplication is intended.
    merged = offtargets.merge(called, on="SITE")  # merge offtargets and edits
    merged.drop(["CHROM"], axis=1, inplace=True)  # remove redundant chrom column
    merged.reset_index(drop=True, inplace=True)
    return merged

def etype(allele_ref: str, allele_alt: str) -> str:
    """Classify an edit by comparing the lengths of its two alleles.

    Args:
        allele_ref: reference allele.
        allele_alt: a single alternative allele.

    Returns:
        "insertion" when the alternative allele is longer than the reference,
        "deletion" when it is shorter, "snv" when both have the same length.
    """
    length_delta = len(allele_alt) - len(allele_ref)
    if length_delta > 0:
        return "insertion"
    if length_delta < 0:
        return "deletion"
    return "snv"  # same length on both sides

def assign_type(ref: str, alt: str) -> str:
    """Return the edit type label for a REF/ALT pair.

    Args:
        ref: reference allele.
        alt: alternative allele; several alleles may be given as a
            comma-separated list.

    Returns:
        The type of the edit as computed by ``etype``. With several
        alternative alleles, the distinct types are joined with "-" in
        alphabetical order (e.g. "deletion-insertion"), so the label is the
        same on every run.
    """
    if "," in alt:  # multiple alternative alleles
        # sorted(): iterating a set of strings has no stable order across
        # interpreter runs, which would make the joined label vary
        return "-".join(sorted({etype(ref, aa) for aa in alt.split(",")}))
    return etype(ref, alt)  # single alternative allele

def construct_report(
    guide: str, 
    tool: str, 
    cell_type: str,
    upstream: Optional[bool] = False, 
    downstream: Optional[bool] = False,
) -> pd.DataFrame:
    """Build the edits report for one guide, variant caller and cell type.

    The Cas-OFFinder report of ``guide`` is read from ``CASOFFINDER``, a site
    label is computed for every off-target, the calls stored under
    ``EDITSDIR/<tool>/<cell_type>/<guide>/<site>.vcf`` are read for each site,
    and everything is joined into one table with an ``EDITTYPE`` column.
    A progress message is written to stderr.

    Args:
        guide: guide name.
        tool: variant caller name, forwarded to ``read_vcf``.
        cell_type: cell type name.
        upstream: report edits on the region upstream of each site.
        downstream: report edits on the region downstream of each site.
            ``upstream`` and ``downstream`` are mutually exclusive; ``None``
            is treated as ``False``.

    Returns:
        The edits dataset (possibly empty).

    Raises:
        ValueError: if both ``upstream`` and ``downstream`` are set.
    """
    # explicit check instead of assert: it survives `python -O` and accepts
    # the None allowed by the Optional[bool] annotation
    upstream, downstream = bool(upstream), bool(downstream)
    if upstream and downstream:
        raise ValueError("upstream and downstream are mutually exclusive")
    sys.stderr.write(
        f"Constructing report for edits called by {tool} on cell type "
        f"{cell_type} and guide {guide}...\n"
    )
    offtargets = pd.read_csv(
        os.path.join(CASOFFINDER, casoffinder_report(guide)), sep="\t", header=None
    )
    # rename casoffinder report columns and sort sites by mismatches number
    offtargets.columns = ["GUIDE", "CHR", "POS", "TARGET", "STRAND", "MM"]
    offtargets.sort_values("MM", ascending=True, inplace=True)
    # Rows are addressed by column label: integer keys on a string-labelled
    # Series rely on a positional fallback that pandas has deprecated.
    offtargets["SITE"] = offtargets.apply(
        lambda row: recover_site(
            row["CHR"], row["POS"], row["STRAND"], upstream, downstream
        ),
        axis=1,
    )  # recover edits sites (used for later join)
    # recover edits called on each off-target site
    tqdm.pandas()  # track apply() progress
    edits = offtargets.progress_apply(
        lambda row: read_vcf(
            os.path.join(EDITSDIR, tool, cell_type, guide, f"{row['SITE']}.vcf"), tool
        ), 
        axis=1
    )
    edits_dataset = edits_df(edits, offtargets)  # construct the edits dataset
    if not edits_dataset.empty:
        edits_dataset["EDITTYPE"] = edits_dataset.apply(
            lambda row: assign_type(row["REF"], row["ALT"]), axis=1
        )  # assign edits type
    return edits_dataset

## Datasets construction

In [10]:
# guides for which the reports are constructed
GUIDES = [
    "EMX1",
    "HEKSite4",
    "RNF2",
    "VEGFASite3",
]
# cell types for which the reports are constructed
CELLTYPES = [
    "GM12878",
    "K562",
]

In [18]:
# GATK MUTECT2
tool = "mutect2"
# The output folder does not depend on the loop variables, so it is created
# once up front; makedirs(exist_ok=True) also creates missing parent folders
# and is safe when the cell is run again.
outdir = os.path.join(REPORTSDIR, tool)
os.makedirs(outdir, exist_ok=True)
for cell_type in CELLTYPES:
    for guide in GUIDES:
        # upstream=True reports edits on the region upstream of each site;
        # pass downstream=True for the downstream region, or neither flag for
        # the sites themselves (see construct_report())
        edits = construct_report(guide, tool, cell_type, upstream=True)  
        # store the edits dataset
        edits.to_csv(
            os.path.join(outdir, f"{guide}_{cell_type}.txt"), index=False, sep="\t"
        )

Constructing report for edits called by mutect2 on cell type GM12878 and guide EMX1...
100%|██████████| 293/293 [00:00<00:00, 1255.49it/s]
Constructing report for edits called by mutect2 on cell type GM12878 and guide HEKSite4...
100%|██████████| 832/832 [00:00<00:00, 1265.66it/s]
Constructing report for edits called by mutect2 on cell type GM12878 and guide RNF2...
100%|██████████| 7/7 [00:00<00:00, 1112.13it/s]
Constructing report for edits called by mutect2 on cell type GM12878 and guide VEGFASite3...
100%|██████████| 6509/6509 [00:05<00:00, 1184.81it/s]
Constructing report for edits called by mutect2 on cell type K562 and guide EMX1...
100%|██████████| 293/293 [00:00<00:00, 1319.75it/s]
Constructing report for edits called by mutect2 on cell type K562 and guide HEKSite4...
100%|██████████| 832/832 [00:00<00:00, 1277.82it/s]
Constructing report for edits called by mutect2 on cell type K562 and guide RNF2...
100%|██████████| 7/7 [00:00<00:00, 1026.47it/s]
Constructing report for edit

In [19]:
# STRELKA
tool = "strelka"
# The output folder does not depend on the loop variables, so it is created
# once up front; makedirs(exist_ok=True) also creates missing parent folders
# and is safe when the cell is run again.
outdir = os.path.join(REPORTSDIR, tool)
os.makedirs(outdir, exist_ok=True)
for cell_type in CELLTYPES:
    for guide in GUIDES:
        # upstream=True reports edits on the region upstream of each site;
        # pass downstream=True for the downstream region, or neither flag for
        # the sites themselves (see construct_report())
        edits = construct_report(guide, tool, cell_type, upstream=True)
        # store the edits dataset
        edits.to_csv(
            os.path.join(outdir, f"{guide}_{cell_type}.txt"), index=False, sep="\t"
        )

Constructing report for edits called by strelka on cell type GM12878 and guide EMX1...
100%|██████████| 293/293 [00:05<00:00, 54.38it/s]
Constructing report for edits called by strelka on cell type GM12878 and guide HEKSite4...
100%|██████████| 832/832 [00:13<00:00, 61.74it/s]
Constructing report for edits called by strelka on cell type GM12878 and guide RNF2...
100%|██████████| 7/7 [00:00<00:00, 57.18it/s]
Constructing report for edits called by strelka on cell type GM12878 and guide VEGFASite3...
100%|██████████| 6509/6509 [01:46<00:00, 61.04it/s]
Constructing report for edits called by strelka on cell type K562 and guide EMX1...
100%|██████████| 293/293 [00:04<00:00, 60.04it/s]
Constructing report for edits called by strelka on cell type K562 and guide HEKSite4...
100%|██████████| 832/832 [00:14<00:00, 58.77it/s]
Constructing report for edits called by strelka on cell type K562 and guide RNF2...
100%|██████████| 7/7 [00:00<00:00, 35.54it/s]
Constructing report for edits called by st

In [20]:
# VARSCAN
tool = "varscan"
# The output folder does not depend on the loop variables, so it is created
# once up front; makedirs(exist_ok=True) also creates missing parent folders
# and is safe when the cell is run again.
outdir = os.path.join(REPORTSDIR, tool)
os.makedirs(outdir, exist_ok=True)
for cell_type in CELLTYPES:
    for guide in GUIDES:
        # upstream=True reports edits on the region upstream of each site;
        # pass downstream=True for the downstream region, or neither flag for
        # the sites themselves (see construct_report())
        edits = construct_report(guide, tool, cell_type, upstream=True)
        # store the edits dataset
        edits.to_csv(
            os.path.join(outdir, f"{guide}_{cell_type}.txt"), index=False, sep="\t"
        )

Constructing report for edits called by varscan on cell type GM12878 and guide EMX1...
100%|██████████| 293/293 [00:00<00:00, 309.36it/s]
Constructing report for edits called by varscan on cell type GM12878 and guide HEKSite4...
100%|██████████| 832/832 [00:01<00:00, 436.05it/s]
Constructing report for edits called by varscan on cell type GM12878 and guide RNF2...
100%|██████████| 7/7 [00:00<00:00, 415.23it/s]
Constructing report for edits called by varscan on cell type GM12878 and guide VEGFASite3...
100%|██████████| 6509/6509 [00:13<00:00, 465.25it/s]
Constructing report for edits called by varscan on cell type K562 and guide EMX1...
100%|██████████| 293/293 [00:00<00:00, 393.99it/s]
Constructing report for edits called by varscan on cell type K562 and guide HEKSite4...
100%|██████████| 832/832 [00:02<00:00, 384.21it/s]
Constructing report for edits called by varscan on cell type K562 and guide RNF2...
100%|██████████| 7/7 [00:00<00:00, 378.74it/s]
Constructing report for edits calle