# Build edits datasets

In [26]:
from typing import List, Optional
from tqdm import tqdm

import pandas as pd
import numpy as np

import sys
import os

### Constant variables

In [76]:
# root directory of the project data; every other path below is built from it
BASEDIR = "/data/pinello/PROJECTS/2017_07_DARPA_SIMULATIONS"
# directory holding the per-guide GUIDE-seq target files ("<guide>.guideseq")
GUIDESEQ = os.path.join(
    BASEDIR, "offtargetDetection/casoffinder/offby6/CRISPRessoWGS/guideseq_anno"
)
# directory holding the per-guide CIRCLE-seq target files
# ("<guide>.circleseq.hg19.hg38.targetname")
CIRCLESEQ = os.path.join(BASEDIR, "offtargetDetection/circleseq/")
# root of the VCF tree read by the dataset builders
# (<tool>/<experiment>/<cell type>/<onregion|offregion>/<guide>/...)
EDITS = os.path.join(
    BASEDIR, "wgs/GM12878-Cas9/WGS1000/detectWithOtherTools/manuel_experiments/VCFs"
)
# root of the tree where the TSV reports are written (same layout as EDITS)
REPORTS = os.path.join(
    BASEDIR, "wgs/GM12878-Cas9/WGS1000/detectWithOtherTools/manuel_experiments/reports"
)
# guides, experiment types and cell types iterated by the dataset builders
GUIDES = ["EMX1", "HEKSite4", "RNF2", "VEGFASite3"]
EXPERIMENTS = ["circleseq", "guideseq"]
CELLTYPES = ["GM12878", "K562"]
# padding added around (on-region) or next to (off-region) each target when
# composing the VCF file names -- presumably expressed in bp, TODO confirm
PADSIZE = 10000
# labels returned by assign_vtype() and inspected by assign_flag()
INSERTION = "insertion"
DELETION = "deletion"
SNV = "snv"

## Mutect2

In [66]:
def read_mutect2(vcf: str) -> List[List]:
    """ (PRIVATE)

    Parse a VCF file obtained running Mutect2.

    Header/meta lines (those starting with "#") and blank lines are skipped;
    every remaining line is split on whitespace.

    Args:
        vcf: path to the VCF file.

    Returns:
        One list of string fields per variant record, in file order.

    Raises:
        OSError: if the file cannot be opened or read. The underlying error
            is chained as the cause.
    """
    try:
        # the context manager closes the handle on every path; the previous
        # try/finally raised UnboundLocalError when open() itself failed,
        # hiding the real I/O error
        with open(vcf, mode="r") as handle:
            return [
                line.split()
                for line in handle
                # blank lines would yield empty records that break the
                # positional field access done downstream
                if line.strip() and not line.startswith("#")
            ]
    except OSError as err:
        raise OSError(f"An error occurred while parsing {vcf}") from err


def parse_variants(variants: pd.Series, target_sites: List[str]) -> pd.DataFrame:
    """Collect the variants called at each target site in a pandas DataFrame.

    Args:
        variants: one entry per target site; each entry is the list of VCF
            records (lists of string fields) read for that site.
        target_sites: target site names, aligned with `variants`.

    Returns:
        A DataFrame with one row per variant and columns SITE, CHROM, POS,
        ID, REF, ALT and FILTER. Sites without variants contribute no rows.
    """
    assert len(variants) == len(target_sites)
    # VCF field index feeding each output column (field 5, QUAL, is not kept)
    field_index = {"CHROM": 0, "POS": 1, "ID": 2, "REF": 3, "ALT": 4, "FILTER": 6}
    table = {"SITE": []}
    for column in field_index:
        table[column] = []
    for site, records in zip(target_sites, variants):
        if not records:  # nothing was called on this target site
            continue
        for record in records:
            table["SITE"].append(site)
            for column, idx in field_index.items():
                table[column].append(record[idx])
    return pd.DataFrame(table)


def assign_vtype(ref: str, alt: str) -> str:
    """Assign a type (insertion, deletion, or SNV) to the input variant.

    The type is decided by comparing allele lengths: an alternative allele
    longer than the reference is an insertion, a shorter one is a deletion,
    and an equal-length one is reported as SNV.

    Args:
        ref: reference allele.
        alt: alternative allele; several alleles may be given separated by
            commas (multi-allelic record).

    Returns:
        The variant type label. For multi-allelic records, the distinct types
        of the alternative alleles joined by "-" in alphabetical order.
    """

    def classify(allele: str) -> str:
        # label a single alternative allele against the reference
        if len(ref) < len(allele):
            return INSERTION
        if len(allele) < len(ref):
            return DELETION
        return SNV

    if "," in alt:  # multi-allelic record
        # sorting gives one canonical label per combination of types; plain
        # set iteration order changes with string hash randomization, so the
        # same variant could be labelled differently from run to run
        return "-".join(sorted({classify(aa) for aa in alt.split(",")}))
    return classify(alt)  # regular alternative allele


def compute_distance(vpos: int, target_coord: int) -> int:
    """Return the signed offset of a variant from a target coordinate.

    The result is negative when the variant position precedes the target
    start/stop coordinate and positive when it follows it.
    """
    offset = vpos - target_coord
    return offset


def assign_flag(
    vpos: int, target_start: int, target_stop: int, ref: str, alt: str, vtype: str
) -> str:
    """Assess if the edit occurred inside or outside the expected target region.

    A variant whose position lies within [target_start, target_stop] is
    flagged "TP". Insertions and deletions are also flagged "TP" when the
    positions they span (starting at `vpos`, as long as the longer of the two
    alleles) overlap that range. Everything else is flagged "FP".

    Args:
        vpos: variant position.
        target_start: first position of the target range (inclusive).
        target_stop: last position of the target range (inclusive).
        ref: reference allele.
        alt: alternative allele(s), comma-separated when multi-allelic.
        vtype: variant type label, as produced by assign_vtype().

    Returns:
        "TP" if the edit falls in or overlaps the target range, "FP" otherwise.
    """

    def overlaps_target(span: int) -> bool:
        # closed-interval intersection of [vpos, vpos + span - 1] with the
        # target range; same outcome as testing every spanned position
        return max(vpos, target_start) <= min(vpos + span - 1, target_stop)

    if target_start <= vpos <= target_stop:
        return "TP"
    if "," in alt:  # multi-allelic record: test every indel allele
        if INSERTION in vtype or DELETION in vtype:
            for aa in alt.split(","):
                # deletions span the reference allele, insertions the
                # alternative one, i.e. always the longer of the two
                if len(aa) != len(ref) and overlaps_target(max(len(aa), len(ref))):
                    return "TP"
    elif DELETION in vtype:
        if overlaps_target(len(ref)):
            return "TP"
    elif INSERTION in vtype:
        if overlaps_target(len(alt)):
            return "TP"
    return "FP"
        

#### On-regions

In [73]:
def build_dataset_mutect2(exp_type: str, guide: str, cell_type: str) -> None:
    """Build a TSV file listing the on-region edits called by Mutect2.

    For every target site of `guide` the function reads the Mutect2 VCF
    computed on the target padded by PADSIZE on both sides, annotates each
    variant with its type, its distance from the target start/stop and a
    TP/FP flag, and writes the resulting table under REPORTS.

    NOTE: a later definition in this file reuses this function name for the
    off-region variant and replaces this one once it is executed.

    Args:
        exp_type: experiment type; "circleseq" selects the CIRCLE-seq target
            files, any other value is treated as "guideseq".
        guide: guide name, used to locate the target and VCF files.
        cell_type: cell type, used to locate the VCF files.

    Raises:
        OSError: if a target file or VCF cannot be read, or the report cannot
            be written.
        KeyError: if the expected target columns are missing.
    """
    # parse target files; the column holding the site name is renamed to
    # SITE for the later join
    if exp_type == "circleseq":
        targets = pd.read_csv(
            os.path.join(CIRCLESEQ, f"{guide}.circleseq.hg19.hg38.targetname"), sep="\t"
        )
        columns = targets.columns.tolist()
        targets.columns = columns[:-1] + ["SITE"]
        site_idx = -1
        target_cols = ["Start", "End", "Strand", "Distance"]
    else:  # exp_type == "guideseq"
        targets = pd.read_csv(os.path.join(GUIDESEQ, f"{guide}.guideseq"), sep="\t")
        columns = targets.columns.tolist()
        targets.columns = columns[:6] + ["SITE"] + columns[7:]
        site_idx = 6
        target_cols = ["start", "end", "Strand", "Mismatch Total"]

    vcf_dir = os.path.join(EDITS, "mutect2", exp_type, cell_type, "onregion", guide)

    def read_target_calls(target):
        # .iloc keeps the access explicitly positional: integer keys on a
        # label-indexed Series are deprecated in recent pandas releases
        site, chrom = target.iloc[site_idx], target.iloc[0]
        start = int(target.iloc[1]) - PADSIZE
        stop = int(target.iloc[2]) + PADSIZE
        return read_mutect2(
            os.path.join(vcf_dir, f"{site}.{chrom}:{start}-{stop}.vcf.filtered.vcf")
        )

    tqdm.pandas()
    edits = targets.progress_apply(read_target_calls, axis=1)
    # build variants dataset
    edits = parse_variants(edits, targets.SITE.tolist())
    # assign variant type (SNP/indel); column-wise iteration also behaves
    # sensibly when no variant was called at all
    edits["TYPE"] = [
        assign_vtype(ref, alt) for ref, alt in zip(edits["REF"], edits["ALT"])
    ]
    # join targets and edits datasets
    edits = edits.merge(targets, on="SITE")
    # keep columns of interest; copy so the assignments below do not act on
    # a slice of the merged frame
    edits = edits[
        ["SITE", "CHROM", "POS", "REF", "ALT", "FILTER", "TYPE"] + target_cols
    ].copy()
    edits.columns = [
        "SITE", "CHROM", "VAR-POS", "REF", "ALT", "FILTER", "TYPE", "TARGET-START", "TARGET-STOP", "STRAND", "MISMATCHES"
    ]  # rename columns
    positions = [int(pos) for pos in edits["VAR-POS"]]
    starts = [int(start) for start in edits["TARGET-START"]]
    stops = [int(stop) for stop in edits["TARGET-STOP"]]
    # computes distance between edits and expected edit sites
    edits["START-DISTANCE"] = [
        compute_distance(pos, start) for pos, start in zip(positions, starts)
    ]
    edits["STOP-DISTANCE"] = [
        compute_distance(pos, stop) for pos, stop in zip(positions, stops)
    ]
    # assess if edits occurred inside or outside the expected target site
    edits["FLAG"] = [
        assign_flag(pos, start, stop, ref, alt, vtype)
        for pos, start, stop, ref, alt, vtype in zip(
            positions, starts, stops, edits["REF"], edits["ALT"], edits["TYPE"]
        )
    ]
    # write the report
    outfile = f"mutect2_{exp_type}_{cell_type}_{guide}_onregion.tsv"
    edits.to_csv(
        os.path.join(REPORTS, "mutect2", exp_type, cell_type, "onregion", guide, outfile),
        sep="\t",
        index=False
    )

In [77]:
# construct one on-region dataset per (experiment, cell type, guide) combination
for experiment, cell_type, guide in (
    (exp, cell, gd) for exp in EXPERIMENTS for cell in CELLTYPES for gd in GUIDES
):
    build_dataset_mutect2(experiment, guide, cell_type)

100%|██████████| 158/158 [00:01<00:00, 142.66it/s]
100%|██████████| 1043/1043 [00:06<00:00, 170.66it/s]
100%|██████████| 38/38 [00:00<00:00, 169.11it/s]
100%|██████████| 1242/1242 [00:07<00:00, 170.44it/s]
100%|██████████| 158/158 [00:00<00:00, 945.31it/s]
100%|██████████| 1043/1043 [00:05<00:00, 173.87it/s]
100%|██████████| 38/38 [00:00<00:00, 163.96it/s]
100%|██████████| 1242/1242 [00:07<00:00, 174.70it/s]
100%|██████████| 16/16 [00:00<00:00, 146.27it/s]
100%|██████████| 134/134 [00:00<00:00, 162.92it/s]
100%|██████████| 1/1 [00:00<00:00, 128.27it/s]
100%|██████████| 60/60 [00:00<00:00, 164.75it/s]
100%|██████████| 16/16 [00:00<00:00, 1227.93it/s]
100%|██████████| 134/134 [00:00<00:00, 177.33it/s]
100%|██████████| 1/1 [00:00<00:00, 159.68it/s]
100%|██████████| 60/60 [00:00<00:00, 184.90it/s]


#### Off-regions

In [95]:
def build_dataset_mutect2(exp_type: str, guide: str, cell_type: str) -> None:
    """Build a TSV file listing the off-region edits called by Mutect2.

    For every target site of `guide` the function reads two Mutect2 VCFs: one
    for the PADSIZE-wide window ending 100 positions before the target start
    and one for the PADSIZE-wide window starting 100 positions after the
    target stop. Each variant is annotated with its type, its distance from
    the target start/stop and a TP/FP flag, and the resulting table is
    written under REPORTS.

    NOTE: this definition reuses the name of the on-region builder defined
    earlier in this file and replaces it once executed.

    Args:
        exp_type: experiment type; "circleseq" selects the CIRCLE-seq target
            files, any other value is treated as "guideseq".
        guide: guide name, used to locate the target and VCF files.
        cell_type: cell type, used to locate the VCF files.

    Raises:
        OSError: if a target file or VCF cannot be read, or the report cannot
            be written.
        KeyError: if the expected target columns are missing.
        AssertionError: if any off-region edit is flagged "TP".
    """
    # parse target files; the column holding the site name is renamed to
    # SITE for the later join
    if exp_type == "circleseq":
        targets = pd.read_csv(
            os.path.join(CIRCLESEQ, f"{guide}.circleseq.hg19.hg38.targetname"), sep="\t"
        )
        columns = targets.columns.tolist()
        targets.columns = columns[:-1] + ["SITE"]
        site_idx = -1
        target_cols = ["Start", "End", "Strand", "Distance"]
    else:  # exp_type == "guideseq"
        targets = pd.read_csv(os.path.join(GUIDESEQ, f"{guide}.guideseq"), sep="\t")
        columns = targets.columns.tolist()
        targets.columns = columns[:6] + ["SITE"] + columns[7:]
        site_idx = 6
        target_cols = ["start", "end", "Strand", "Mismatch Total"]

    vcf_dir = os.path.join(EDITS, "mutect2", exp_type, cell_type, "offregion", guide)
    gap = 100  # positions left between the target and each flanking window

    def read_window(target, start, stop):
        # .iloc keeps the access explicitly positional: integer keys on a
        # label-indexed Series are deprecated in recent pandas releases
        site, chrom = target.iloc[site_idx], target.iloc[0]
        return read_mutect2(
            os.path.join(vcf_dir, f"{site}.{chrom}:{start}-{stop}.vcf.filtered.vcf")
        )

    def read_upstream(target):
        # window ending `gap` positions before the target start
        stop = int(target.iloc[1]) - gap
        return read_window(target, stop - PADSIZE, stop)

    def read_downstream(target):
        # window starting `gap` positions after the target stop
        start = int(target.iloc[2]) + gap
        return read_window(target, start, start + PADSIZE)

    tqdm.pandas()
    # adding the two Series concatenates, target by target, the records read
    # upstream and downstream of the site
    edits = targets.progress_apply(read_upstream, axis=1) + targets.progress_apply(
        read_downstream, axis=1
    )
    # build variants dataset
    edits = parse_variants(edits, targets.SITE.tolist())
    # assign variant type (SNP/indel); column-wise iteration also behaves
    # sensibly when no variant was called at all
    edits["TYPE"] = [
        assign_vtype(ref, alt) for ref, alt in zip(edits["REF"], edits["ALT"])
    ]
    # join targets and edits datasets
    edits = edits.merge(targets, on="SITE")
    # keep columns of interest; copy so the assignments below do not act on
    # a slice of the merged frame
    edits = edits[
        ["SITE", "CHROM", "POS", "REF", "ALT", "FILTER", "TYPE"] + target_cols
    ].copy()
    edits.columns = [
        "SITE", "CHROM", "VAR-POS", "REF", "ALT", "FILTER", "TYPE", "TARGET-START", "TARGET-STOP", "STRAND", "MISMATCHES"
    ]  # rename columns
    positions = [int(pos) for pos in edits["VAR-POS"]]
    starts = [int(start) for start in edits["TARGET-START"]]
    stops = [int(stop) for stop in edits["TARGET-STOP"]]
    # computes distance between edits and expected edit sites
    edits["START-DISTANCE"] = [
        compute_distance(pos, start) for pos, start in zip(positions, starts)
    ]
    edits["STOP-DISTANCE"] = [
        compute_distance(pos, stop) for pos, stop in zip(positions, stops)
    ]
    # assess if edits occurred inside or outside the expected target site
    edits["FLAG"] = [
        assign_flag(pos, start, stop, ref, alt, vtype)
        for pos, start, stop, ref, alt, vtype in zip(
            positions, starts, stops, edits["REF"], edits["ALT"], edits["TYPE"]
        )
    ]
    # sanity check: no off-region edit is expected to reach the target site
    assert all(flag == "FP" for flag in edits["FLAG"]), (
        "off-region edits overlapping the target site were found"
    )
    # write the report
    outfile = f"mutect2_{exp_type}_{cell_type}_{guide}_offregion.tsv"
    edits.to_csv(
        os.path.join(REPORTS, "mutect2", exp_type, cell_type, "offregion", guide, outfile),
        sep="\t",
        index=False
    )

In [96]:
# construct one off-region dataset per (experiment, cell type, guide) combination
for experiment, cell_type, guide in (
    (exp, cell, gd) for exp in EXPERIMENTS for cell in CELLTYPES for gd in GUIDES
):
    build_dataset_mutect2(experiment, guide, cell_type)

100%|██████████| 158/158 [00:00<00:00, 178.17it/s]
100%|██████████| 158/158 [00:00<00:00, 170.62it/s]
100%|██████████| 1043/1043 [00:05<00:00, 178.96it/s]
100%|██████████| 1043/1043 [00:05<00:00, 184.08it/s]
100%|██████████| 38/38 [00:00<00:00, 176.70it/s]
100%|██████████| 38/38 [00:00<00:00, 176.44it/s]
100%|██████████| 1242/1242 [00:06<00:00, 183.89it/s]
100%|██████████| 1242/1242 [00:06<00:00, 179.39it/s]
100%|██████████| 158/158 [00:00<00:00, 167.41it/s]
100%|██████████| 158/158 [00:00<00:00, 185.53it/s]
100%|██████████| 1043/1043 [00:05<00:00, 177.55it/s]
100%|██████████| 1043/1043 [00:05<00:00, 181.39it/s]
100%|██████████| 38/38 [00:00<00:00, 174.70it/s]
100%|██████████| 38/38 [00:00<00:00, 188.20it/s]
100%|██████████| 1242/1242 [00:07<00:00, 169.11it/s]
100%|██████████| 1242/1242 [00:06<00:00, 180.97it/s]
100%|██████████| 16/16 [00:00<00:00, 159.55it/s]
100%|██████████| 16/16 [00:00<00:00, 184.98it/s]
100%|██████████| 134/134 [00:00<00:00, 172.45it/s]
100%|██████████| 134/134 [0