From 92fceb0a539342b9cf9d55b378a7a8d8152101c1 Mon Sep 17 00:00:00 2001 From: Maximilian Haeussler Date: Thu, 11 Oct 2018 15:23:34 +0200 Subject: [PATCH] adding cctop eff score --- README.md | 2 +- bin/src/cctop_standalone/CCTop.py | 712 ++++++++++++++++++++++++ bin/src/cctop_standalone/LICENSE | 22 + bin/src/cctop_standalone/README | 77 +++ bin/src/cctop_standalone/bedInterval.py | 86 +++ crispor.py | 3 +- crisporEffScores.py | 17 +- todo.txt | 24 +- 8 files changed, 937 insertions(+), 6 deletions(-) create mode 100644 bin/src/cctop_standalone/CCTop.py create mode 100644 bin/src/cctop_standalone/LICENSE create mode 100644 bin/src/cctop_standalone/README create mode 100644 bin/src/cctop_standalone/bedInterval.py diff --git a/README.md b/README.md index 8c950a2..18f8875 100755 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ For the Cpf1 scoring model: sudo pip install keras tensorflow h5py -Install required R libraries: +Install required R libraries for the WangSVM efficiency score: sudo Rscript -e 'install.packages(c("e1071"), repos="http://cran.rstudio.com/")' sudo Rscript -e 'source("https://bioconductor.org/biocLite.R"); biocLite(c("limma"));' diff --git a/bin/src/cctop_standalone/CCTop.py b/bin/src/cctop_standalone/CCTop.py new file mode 100644 index 0000000..2568310 --- /dev/null +++ b/bin/src/cctop_standalone/CCTop.py @@ -0,0 +1,712 @@ +# -*- coding: UTF-8 -*- + +''' +Created on 16 Jun 2015 + +@author: juanlmateo +''' + +import sys +import argparse +import textwrap +import os +# from subprocess import Popen, PIPE +import re +import math + +from bedInterval import BedInterval + +iupac_code = {'R':'[AG]', 'Y':'[CT]', 'S':'[GC]', 'W':'[AT]', 'K':'[GT]', 'M':'[AC]', 'B':'[CGT]', 'D':'[AGT]', 'H':'[ACT]', 'V':'[ACG]', 'N':'[ACGT]'} + +def build_expression(seq): + result = '' + for c in seq: + if c in iupac_code: + result = result + iupac_code[c] + else: + result = result + c + + return result + + +def reverse_complement(sequence): + rev_comp = [] + + for idx in range(len(sequence) - 1, -1, -1): + if sequence[idx] == 'A': + rev_comp = rev_comp + ['T'] + elif sequence[idx] == 'C': + rev_comp = rev_comp + ['G'] + elif sequence[idx] == 'G': + rev_comp = rev_comp + ['C'] + elif sequence[idx] == 'T': + rev_comp = rev_comp + ['A'] + else: + rev_comp = rev_comp + ['N'] + return "".join(rev_comp) + +def reverse_complementPAM(sequence): + # wilcard characters are replaced by '*', treated as mismatches + rev_comp = [] + + for idx in range(len(sequence) - 1, -1, -1): + if sequence[idx] == 'A': + rev_comp = rev_comp + ['T'] + elif sequence[idx] == 'C': + rev_comp = rev_comp + ['G'] + elif sequence[idx] == 'G': + rev_comp = rev_comp + ['C'] + elif sequence[idx] == 'T': + rev_comp = rev_comp + ['A'] + else: + rev_comp = rev_comp + ['*'] + return "".join(rev_comp) + +def getOffTargetsNGG(sequence, bowtiePath, indexPath, exons, genes, coreMismatches, coreRange, totalMismatches, revPAM): + + if (coreMismatches == "NA" or coreRange == "NA"): + command = bowtiePath + os.path.sep + "bowtie " + " -a " + indexPath + " -n " + str(2 + 1) # plus 1 mismatch in the PAM, maximum 3! + command = command + " -l " + str(2 + len(revPAM)) # minimum 5, additional 3 bases from the PAM + command = command + " -e " + str(totalMismatches * 30 + 30) + command = command + " -y " # try hard to find all mismatched seeds + command = command + " --quiet " # just print alignments, no info + command = command + " -c " + revPAM + reverse_complement(sequence[:-len(revPAM)]) + else: + command = bowtiePath + os.path.sep + "bowtie " + " -a " + indexPath + " -n " + str(coreMismatches + 1) # plus 1 mismatch in the PAM, maximum 3! + command = command + " -l " + str(coreRange + len(revPAM)) # minimum 5, additional 3 bases from the PAM + command = command + " -e " + str(totalMismatches * 30 + 30) + command = command + " -y " # try hard to find all mismatched seeds + command = command + " --quiet " # just print alignments, no info + command = command + " -c " + revPAM + reverse_complement(sequence[:-len(revPAM)]) + bowtieOutput = os.popen(command, "r") + offtargets = [] + + while True: + line = bowtieOutput.readline() + if not line: break + + columns = line.rstrip('\n').split('\t') + mismatches = columns[7].split(':') + # mismatches in the PAM are not allowed + # bowtie shows the mismatches wrt. the input sequence, independently of the orientation of the alignment + if int(mismatches[0]) < (len(revPAM) - 1): + continue + + newOfftarget = Offtarget(False, columns[2], columns[1], int(columns[3]), columns[7], columns[4], len(sequence), 3, coreRange) + newOfftarget.setGeneInfo(exons, genes) + offtargets.append(newOfftarget) + + return offtargets + +# Cpf1 +def getOffTargetsTTTN(sequence, bowtiePath, indexPath, exons, genes, totalMismatches, PAM): + + + command = bowtiePath + os.path.sep + "bowtie " + " -a " + indexPath + " -n " + str(2 + 1) # plus 1 mismatch in the PAM, maximum 3! + command = command + " -l 6" # minimum 5, additional 4 bases from the PAM + command = command + " -e " + str(totalMismatches * 30 + 30) + command = command + " -y " # try hard to find all mismatched seeds + command = command + " --quiet " # just print alignments, no info + command = command + " -c TTTN" + sequence[len(PAM):] + bowtieOutput = os.popen(command, "r") + offtargets = [] + + while True: + line = bowtieOutput.readline() + if not line: break + + columns = line.rstrip('\n').split('\t') + mismatches = columns[7].split(':') + # mismatches in the PAM are not allowed + # bowtie shows the mismatches wrt. the input sequence, independently of the orientation of the alignment + if int(mismatches[0]) < (len(PAM) - 1): + # if mismatches[0] == '0' or mismatches[0] == '1': + continue + + newOfftarget = Offtarget(True, columns[2], columns[1], int(columns[3]), columns[7], columns[4], len(sequence), len(PAM), "NA") + newOfftarget.setGeneInfo(exons, genes) + offtargets.append(newOfftarget) + + return offtargets + +def getOffTargetsOtherPAMs(sequence, bowtiePath, indexPath, exons, genes, totalMismatches, PAM): + # NNGRRT, NNNNGATT, NNAGAAW, NAAAAC + # A**C**, AATC****, *TTCT**, GTTTT* + revPAM = reverse_complement(PAM) + countWildcard = revPAM.count('N') + countWildcardInSeed = revPAM[0:5].count('N') + bowtieOutput = os.popen(bowtiePath + os.path.sep + "bowtie " + " -a " + indexPath + " -n " + str(countWildcardInSeed) + + " -l 5" + + " -e " + str(totalMismatches * 30 + countWildcard * 30) + + " -y " + # try hard to find all mismatched seeds + " --quiet " + # just print alignments, no info + " -c " + revPAM + reverse_complement(sequence[:-len(PAM)]), "r") + offtargets = [] + + while True: + line = bowtieOutput.readline() + if not line: break + columns = line.rstrip('\n').split('\t') + mismatches = columns[7].split(',') + # we need to check the the matched sequence has a valid PAM + if PAM == "NNGRRT": # A**C** + # mismatches substituting the 'R' wildcard (first two) must be 'A' or 'G' + if (columns[1] == '-' and not (mismatches[0][2] in 'AG' and mismatches[1][2] in 'AG')) or (columns[1] == '+' and not (mismatches[0][2] in 'TC' and mismatches[1][2] in 'TC')): + continue + if PAM == "NNNNGATT": #AATC**** + # all mismatches in the seed (l 5) allowed + pass + if PAM == "NNAGAAW": # *TTCT** + # mismatches substituting the 'W' wildcard (first one) must be 'A' or 'T' + if not (mismatches[0][2] in 'AT'): + continue + if PAM == "NAAAAC": # GTTTT* + # all mismatches in the seed (l 5) allowed + pass + + newOfftarget = Offtarget(False, columns[2], columns[1], int(columns[3]), columns[7], columns[4], len(sequence), len(PAM), "NA") + newOfftarget.setGeneInfo(exons, genes) + offtargets.append(newOfftarget) + + return offtargets + +def getOffTargets(sequence, bowtiePath, indexPath, exons, genes, coreMismatches, coreRange, totalMismatches, pamType) : + + if pamType == 'NRG' or pamType == 'NGG': + offtargets = getOffTargetsNGG(sequence, bowtiePath, indexPath, exons, genes, coreMismatches, coreRange, totalMismatches, "CCN") + if pamType == 'NRG': + offtargets = offtargets + getOffTargetsNGG(sequence, bowtiePath, indexPath, exons, genes, coreMismatches, coreRange, totalMismatches, "CTN") + elif pamType == 'TTTN': + offtargets = getOffTargetsTTTN(sequence, bowtiePath, indexPath, exons, genes, totalMismatches, pamType) + else: + offtargets = getOffTargetsOtherPAMs(sequence, bowtiePath, indexPath, exons, genes, totalMismatches, pamType) + + offtargets.sort(key=lambda offtarget: (offtarget.score, offtarget.distance)) + + return offtargets + +class Offtarget: + # New offtarget site coming for forward search, TTTN + def __newFwd(self, chromosome, strand, start, substitutions, sequence, lengthSeq, lengthPAM, coreRange): + self.chromosome = chromosome + self.strand = strand + if strand == "+": # the search is done with the forward sequence! + self.sequence = list(sequence) # to make the string modifiable + else: + self.sequence = list(reverse_complement(sequence)) # to make the string modifiable + self.start = start # assuming bed coordinates + self.end = start + lengthSeq + + tmp = substitutions.split(",") + + self.mismatches = len(tmp) - 1 + self.alignment = ['|'] * (lengthSeq - lengthPAM) + self.score = 0 + for substitution in tmp: + [idx, nt] = substitution.split(':') + idx = int(idx) + if strand == "+": + self.sequence[idx] = nt[0] + else: + self.sequence[idx] = reverse_complement(nt[0]) + if idx < lengthPAM: # The mismatch in the PAM is not considered for score calculation of alignment + continue + self.score = self.score + pow(1.2, idx - lengthPAM + 1) + self.alignment[idx - lengthPAM] = '-' + if coreRange > 0 and coreRange != "NA": + self.alignment = "PAM[" + "".join(self.alignment[:coreRange]) + "]" + "".join(self.alignment[coreRange:]) + else: + self.alignment = "PAM" + "".join(self.alignment) + self.sequence = "".join(self.sequence) + + # New offtarget site coming for reverse search, NGG and other PAMs + def __newRev(self, chromosome, strand, start, substitutions, sequence, lengthSeq, lengthPAM, coreRange): + self.chromosome = chromosome + if strand == "+": # the search is done with the reverse complemented sequence! + self.strand = "-" + self.sequence = list(sequence) # to make the string modifiable + else: + self.strand = "+" + self.sequence = list(reverse_complement(sequence)) # to make the string modifiable + self.start = start # assuming bed coordinates + self.end = start + lengthSeq + + tmp = substitutions.split(",") + + self.mismatches = len(tmp) - 1 + self.alignment = ['|'] * (lengthSeq - lengthPAM) + self.score = 0 + for substitution in tmp: + [idx, nt] = substitution.split(':') + idx = int(idx) + if strand == "+": + self.sequence[idx] = nt[0] + else: + self.sequence[idx] = reverse_complement(nt[0]) + if idx < lengthPAM: # The mismatch in the PAM is not considered for score calculation of alignment + continue + self.score = self.score + pow(1.2, lengthSeq - idx) + self.alignment[lengthSeq - 1 - idx] = '-' + if coreRange > 0 and coreRange != "NA": + self.alignment = "".join(self.alignment[:-coreRange]) + "[" + "".join(self.alignment[-coreRange:]) + "]PAM" + else: + self.alignment = "".join(self.alignment) + "PAM" + self.sequence = reverse_complement("".join(self.sequence)) + + + def __init__(self, forward, chromosome, strand, start, substitutions, sequence, lengthSeq, lengthPAM, coreRange): + if(forward): + self.__newFwd(chromosome, strand, start, substitutions, sequence, lengthSeq, lengthPAM, coreRange) + else: + self.__newRev(chromosome, strand, start, substitutions, sequence, lengthSeq, lengthPAM, coreRange) + + def setGeneInfo(self, exons, genes): + closest = exons.closest(self.chromosome, self.start, self.end) + + self.geneID = closest[0] + self.geneName = closest[1] + self.distance = closest[2] + self.intragenic = genes.overlaps(self.chromosome, self.start, self.end) + + def getGenomicCoordinates(self): + return [self.chromosome, str(self.start + 1), str(self.end)] + def getBedCoordinates(self): + return [self.chromosome, str(self.start), str(self.end)] + +######################################################################## +# CRISPRater score calculation +######################################################################## +model_weight = [0.14177385, 0.06966514, 0.04216254, 0.03303432, 0.02355430, -0.04746424, -0.04878001, -0.06981921, -0.07087756, -0.08160700] +model_offset = 0.6505037 + +patternCG = re.compile("[CG]") +def getGCFreq(seq): + cg = len(patternCG.findall(seq)) + return(float(cg)/len(seq)) + +def calcFeatures(seq): + feat = [0]*10 + feat[0] = getGCFreq(seq[3:13]) + if(seq[19]=="G"): + feat[1] = 1 + if(seq[2]=="T" or seq[2]=="A"): + feat[2] = 1 + if(seq[11]=="G" or seq[11]=="A"): + feat[3] = 1 + if(seq[5]=="G"): + feat[4] = 1 + if(seq[3]=="T" or seq[3]=="A"): + feat[5] = 1 + if(seq[17]=="G" or seq[17]=="A"): + feat[6] = 1 + if(seq[4]=="C" or seq[4]=="A"): + feat[7] = 1 + if(seq[13]=="G"): + feat[8] = 1 + if(seq[14]=="A"): + feat[9] = 1 + return(feat) + +def getScore(seq): + features = calcFeatures(seq) + + score = 0 + for idx in range(0,len(features)): + score = score + features[idx]*model_weight[idx] + score = score + model_offset + return(score) + +def getScoreText(score): + text = "{0:.2f}".format(score) + if score < 0.56: + return text + ' (LOW)' #low score -> red + elif score > 0.74: + return text + ' (HIGH)' #high score -> blue + else: + return text + ' (MEDIUM)' #medium score -> grey + +######################################################################## + +class sgRNAbindingSite: + def __init__(self, targetSeq, sequence, position, strand, fwdPrimer, revPrimer): + self.sequence = sequence + if(len(targetSeq)==20): + self.effi_score = getScore(targetSeq) + else: + self.effi_score = None + self.position = position # leftmost coordinates, including the PAM if 5' + self.strand = strand + self.score = None + self.label = None + self.oligo1 = "" # leading GG, forward + self.oligo2 = "" # leading GG, reverse + self.oligoAfwd = "" # adding, forward + self.oligoArev = "" # adding, reverse + self.oligoSfwd = "" # substituting, forward + self.oligoSrev = "" # substituting, reverse + # oligos + if fwdPrimer == "TAGG": # T7 + if targetSeq[0] == 'G' and targetSeq[1] == 'G': + self.oligo1 = 'TA' + targetSeq + self.oligo2 = 'AAAC' + reverse_complement(targetSeq[2:]) + elif targetSeq[0] == 'G' and not targetSeq[1] == 'G': + self.oligoAfwd = 'TAg' + targetSeq + self.oligoArev = 'AAAC' + reverse_complement(self.oligoAfwd[4:]) + self.oligoSfwd = 'TAGg' + targetSeq[2:] + self.oligoSrev = 'AAAC' + reverse_complement(self.oligoSfwd[4:]) + else: + self.oligoAfwd = 'TAgg' + targetSeq + self.oligoArev = 'AAAC' + reverse_complement(self.oligoAfwd[4:]) + self.oligoSfwd = 'TAgg' + targetSeq[2:] + self.oligoSrev = 'AAAC' + reverse_complement(self.oligoSfwd[4:]) + elif fwdPrimer == "CACCG": + if targetSeq[0] == 'G': + self.oligo1 = 'CACC' + targetSeq + self.oligo2 = 'AAAC' + reverse_complement(targetSeq) + else: + self.oligoAfwd = 'CACCg' + targetSeq + self.oligoArev = 'AAAC' + reverse_complement('G' + self.oligoAfwd[5:]) + self.oligoSfwd = 'CACCg' + targetSeq[1:] + self.oligoSrev = 'AAAC' + reverse_complement('G' + self.oligoSfwd[5:]) + else: + self.oligo1 = fwdPrimer + targetSeq + self.oligo2 = revPrimer + reverse_complement(targetSeq) + + def addOffTargets(self, offTargets, coordinates, isThereGeneInfo): + self.offTargets = offTargets + # score computation + # if the query is plasmid not score computation + # the real target is not considered as an off-target for score computation + # more off-targets => lower score + # less mismatches => lower score + # closer to exon => lower score + averMismatches = 0 + totalMismatches = 0.0 + averDistance = 0 + for idx in range(len(offTargets)): + offTarget = offTargets[idx] + + if not coordinates is None: + # checking if the "off-target" site is in fact the on-target + if offTarget.chromosome == coordinates[0] and offTarget.start >= coordinates[1] and offTarget.start <= coordinates[2]: + continue + if offTarget.distance == 'NA' and isThereGeneInfo: + continue + totalMismatches = totalMismatches + 1 + averMismatches = averMismatches + offTarget.score + + if isThereGeneInfo: + if offTarget.distance != 0: + try: + averDistance = averDistance + math.log10(offTarget.distance) + except ValueError: + print(offTarget.distance) + else: + averDistance = averDistance + 0 + + if totalMismatches > 0: + self.score = averMismatches / totalMismatches + averDistance / totalMismatches - totalMismatches + else: + self.score = 100 + + + +class sgRNAbindingSites: + def __init__(self): + self.sites = [] + def add(self, targetSeq, sequence, position, strand, fwdPrimer, revPrimer): + newSite = sgRNAbindingSite(targetSeq, sequence, position, strand, fwdPrimer, revPrimer) + self.sites.append(newSite) + +def getSeqCoords(seq, bowtiePath, indexPath): + # We use bowtie to determine if the query sequence was extracted from the target genome + if len(seq) < 500: + # returns 0-index coordinates, bowtie uses 0-index + bowtieOutput = os.popen(bowtiePath + os.path.sep + "bowtie " + indexPath + " --quiet -c " + seq, "r") + + line = bowtieOutput.readline() + if not line: return None + columns = line.split('\t') + # [chromosome, start, end, strand] + return [columns[2], int(columns[3]), int(columns[3]) + len(seq), columns[1]] + else: + # 5 prime + bowtieOutput = os.popen(bowtiePath + os.path.sep + "bowtie " + indexPath + " --quiet -c " + seq[0:100], "r") + line = bowtieOutput.readline() + if not line: return None + columns5 = line.split('\t') + + # 3 prime + bowtieOutput = os.popen(bowtiePath + os.path.sep + "bowtie " + indexPath + " --quiet -c " + seq[-100:], "r") + line = bowtieOutput.readline() + if not line: return None + columns3 = line.split('\t') + + if columns5[2] != columns3[2]: + return None + if columns5[1] != columns3[1]: + return None + if (int(columns3[3]) + 100 - int(columns5[3])) != len(seq): + return None + + # [chromosome, start, end, strand] + return [columns5[2], int(columns5[3]), int(columns3[3]) + 100, columns5[1]] + +def getFormattedCoords(coords): + return coords[0] + ":" + coords[1] + "-" + coords[2] + +def getPlainOTPosition(distance, intragenic): + if (distance == 0): + return "Exonic" + elif(intragenic): + return "Intronic" + else: + return "Intergenic" + +def addCandidateTargets(pam, target_size, sgRNA5, sgRNA3, query, strand, candidates, fwdPrimer, revPrimer): + reg_exp = build_expression(pam) + sgRNA5_re = '^' + build_expression(sgRNA5) + sgRNA3_re = build_expression(sgRNA3) + '$' + if pam == 'TTTN': + indices = [m.start() for m in re.finditer('(?=' + reg_exp + ')', query, re.I)] + for index in indices: + if (index + target_size + len(pam)) > len(query): + continue + candidate_sequence = query[index + len(pam):index + len(pam) + target_size] + pam_sequence = query[index:index + len(pam)] + if (not re.search(sgRNA5_re, candidate_sequence) is None) and (not re.search(sgRNA3_re, candidate_sequence) is None): + # we need to transform the index from the reversed sequence to the forward sequence + if strand == '+': + candidates.add(candidate_sequence, pam_sequence + candidate_sequence, index, strand, fwdPrimer, revPrimer) + else: + candidates.add(candidate_sequence, pam_sequence + candidate_sequence, len(query) - (index + target_size + len(pam)), strand, fwdPrimer, revPrimer) + else: + indices = [m.start() for m in re.finditer('(?=' + reg_exp + ')', query, re.I)] + for index in indices: + if (index - target_size) < 0: + continue + candidate_sequence = query[index - target_size:index] + pam_sequence = query[index:index + len(pam)] + if (not re.search(sgRNA5_re, candidate_sequence) is None) and (not re.search(sgRNA3_re, candidate_sequence) is None): + # we need to transform the index from the reversed sequence to the forward sequence + if strand == '+': + candidates.add(candidate_sequence, candidate_sequence + pam_sequence, index - target_size, strand, fwdPrimer, revPrimer) + else: + candidates.add(candidate_sequence, candidate_sequence + pam_sequence, len(query) - (index + len(pam)), strand, fwdPrimer, revPrimer) + +def valid_dinucleotideIUPAC(string): + validChars = ['A', 'C', 'G', 'T', 'N'] + list(iupac_code.keys()) + string = string.upper() + if string != ''.join(c for c in string if c in validChars) or len(string) != 2: + msg = "%r is not a valid dinucleotide sequence" % string + raise argparse.ArgumentTypeError(msg) + return string + +def valid_overhang(string): + validChars = ['A', 'C', 'G', 'T', 'N'] + string = string.upper() + if string != ''.join(c for c in string if c in validChars) or len(string) > 5: + msg = "%r is not a valid overhang sequence (up to 5 nt)" % string + raise argparse.ArgumentTypeError(msg) + return string + +def doSearch(name, query, pamType, targetSize, totalMismatches, coreLength, coreMismatches, sgRNA5, sgRNA3, fwdPrimer, revPrimer, outputFolder, bowtiePath, indexPath, exonsFile, genesFile, maxOT): + + if pamType not in ["NGG", "NRG"]: + coreLength = "NA" + coreMismatches = "NA" + totalSeqSize = args.targetSize + len(pamType) + + # exons and genes + exons = BedInterval() + genes = BedInterval() + if exonsFile is not None and genesFile is not None: + try: + from bx.intervals.intersection import Interval + exons.loadFile(exonsFile) + genes.loadFile(genesFile) + except ImportError: + sys.stderr.write('The bx-python module is not available. Ignoring exon and gene files!\n') + + coordinates = getSeqCoords(query, bowtiePath, indexPath) + if not coordinates is None: + # What is the input sequence is in the reverse strand??? + # so we use the reverse complement + if coordinates[3] == "-": + query = reverse_complement(query) + + candidates = sgRNAbindingSites() + addCandidateTargets(pamType, targetSize, sgRNA5, sgRNA3, query, '+', candidates, fwdPrimer, revPrimer) + addCandidateTargets(pamType, targetSize, sgRNA5, sgRNA3, reverse_complement(query), '-', candidates, fwdPrimer, revPrimer) + + + if(len(candidates.sites) < 1): + sys.stderr.write('No candidates found in the query sequence named %s.' % name) + return + + maxScore = -1e10 + minScore = 1e10 + for idx in range(len(candidates.sites)): + ot = getOffTargets(candidates.sites[idx].sequence, bowtiePath, indexPath, exons, genes, coreMismatches, coreLength, totalMismatches, pamType) + if maxOT < float("inf"): + candidates.sites[idx].addOffTargets(ot[:maxOT], coordinates, len(exons.chroms) > 0) + else: + candidates.sites[idx].addOffTargets(ot, coordinates, len(exons.chroms) > 0) + + if candidates.sites[idx].score > maxScore: + maxScore = candidates.sites[idx].score + if candidates.sites[idx].score < minScore: + minScore = candidates.sites[idx].score + sys.stderr.write("Sequence %s: Done with candidate %i out of %i.\n" % (name, idx + 1, len(candidates.sites))) + + # scaling scores to the range [0, 1000] + if len(candidates.sites) > 1: + for idx in range(len(candidates.sites)): + if maxScore > minScore: + candidates.sites[idx].score = (candidates.sites[idx].score - minScore) / (maxScore - minScore) * 1000 + else: + candidates.sites[idx].score = 1000 + elif len(candidates.sites) == 1: + candidates.sites[0].score = 1000 + else: + return + + # sorting candidates by score + candidates.sites.sort(key=lambda site: (site.score), reverse=True) + for idx in range(len(candidates.sites)): + candidates.sites[idx].label = 'T' + str(idx + 1) + + # reporting + bedFile = open(outputFolder + os.path.sep + name + '.bed', 'w') + if coordinates is not None: + for idx in range(len(candidates.sites)): + bedFile.write(coordinates[0] + '\t' + str(coordinates[1] + candidates.sites[idx].position) + '\t' + str(coordinates[1] + candidates.sites[idx].position + totalSeqSize) + '\t' + candidates.sites[idx].label + '\t' + str(int(candidates.sites[idx].score)) + '\t' + candidates.sites[idx].strand + '\n') + else: + for idx in range(len(candidates.sites)): + bedFile.write(name + '\t' + str(candidates.sites[idx].position) + '\t' + str(candidates.sites[idx].position + totalSeqSize) + '\t' + candidates.sites[idx].label + '\t' + str(int(candidates.sites[idx].score)) + '\t' + candidates.sites[idx].strand + '\n') + bedFile.close() + + output = open(outputFolder + os.path.sep + name + '.xls', 'w') + fasta = open(outputFolder + os.path.sep + name + '.fasta', 'w') + + output.write("Input:\t" + query + "\n") + output.write("PAM:\t" + pamType + "\n") + output.write("Target site length:\t" + str(targetSize) + "\n") + output.write("Target site 5' limitation:\t" + sgRNA5 + "\n") + output.write("Target site 3' limitation:\t" + sgRNA3 + "\n") + output.write("Core length:\t" + str(coreLength) + "\n") + output.write("Core MM:\t" + str(coreMismatches) + "\n") + output.write("Total MM:\t" + str(totalMismatches) + "\n\n") + + for idx in range(0, len(candidates.sites)): + fasta.write('>' + candidates.sites[idx].label + '\n') + fasta.write(candidates.sites[idx].sequence + '\n') + + if candidates.sites[idx].effi_score is None: + output.write(candidates.sites[idx].label + '\t' + candidates.sites[idx].sequence + '\t' + str(int(candidates.sites[idx].score)) + '\n') + else: + output.write(candidates.sites[idx].label + '\t' + candidates.sites[idx].sequence + '\t' + str(int(candidates.sites[idx].score)) + '\tCRISPRater score\t' + getScoreText(candidates.sites[idx].effi_score) + '\n') + + if candidates.sites[idx].oligo1 != '': + output.write('Oligo fwd\t' + str(candidates.sites[idx].oligo1) + '\n') + output.write('Oligo rev\t' + str(candidates.sites[idx].oligo2) + '\n') + else: + output.write('Oligo adding fwd\t' + str(candidates.sites[idx].oligoAfwd) + '\n') + output.write('Oligo adding rev\t' + str(candidates.sites[idx].oligoArev) + '\n') + if candidates.sites[idx].oligoSfwd != "" and candidates.sites[idx].oligoSrev != "": + output.write('Oligo substituting fwd\t' + str(candidates.sites[idx].oligoSfwd) + '\n') + output.write('Oligo substituting rev\t' + str(candidates.sites[idx].oligoSrev) + '\n') + + if(pamType == 'TTTN'): + output.write('Chromosome\tstart\tend\tstrand\tMM\tPAM\ttarget_seq\talignment\tdistance\tposition\tgene name\tgene id\n') + else: + output.write('Chromosome\tstart\tend\tstrand\tMM\ttarget_seq\tPAM\talignment\tdistance\tposition\tgene name\tgene id\n') + for idx2 in range(0, len(candidates.sites[idx].offTargets)): + # in html output only top 20 offtarget sites + offTarget = candidates.sites[idx].offTargets[idx2] + + output.write("\t".join(offTarget.getGenomicCoordinates())) + output.write("\t" + offTarget.strand) + output.write("\t" + str(offTarget.mismatches)) + if(pamType == 'TTTN'): + output.write("\t" + offTarget.sequence[:len(pamType)] + "\t" + offTarget.sequence[len(pamType):]) + else: + output.write("\t" + offTarget.sequence[:-len(pamType)] + "\t" + offTarget.sequence[-len(pamType):]) + output.write("\t" + offTarget.alignment + "\t" + str(offTarget.distance) + "\t" + getPlainOTPosition(offTarget.distance, offTarget.intragenic)) + output.write("\t" + offTarget.geneName + "\t" + offTarget.geneID + "\n") + output.write("\n") + + + output.close() + fasta.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description="CCTop is the CRISPR/Cas9 Target online predictor.", epilog=textwrap.dedent('''\ + If you use this tool please cite it as: + + Stemmer, M., Thumberger, T., del Sol Keyer, M., Wittbrodt, J. and Mateo, J.L. + CCTop: an intuitive, flexible and reliable CRISPR/Cas9 target prediction tool. + PLOS ONE (2015). doi:10.1371/journal.pone.0124633 + + Have fun using CCTop! + ''')) + parser.add_argument("--input", metavar="", type=argparse.FileType('r'), help="Fasta file containing the sequence(s) to be scanned for sgRNA candidates.", required=True) + parser.add_argument("--index", metavar="" , help="Path to the bowtie index files including the name of the index.", required=True) + parser.add_argument("--bowtie", metavar="", help="Path to the folder where the executable bowtie is.", default="." + os.path.sep) + parser.add_argument("--targetSize", metavar="", help="Target site length. (default: %(default)s)", default=20, type=int) + parser.add_argument("--pam", help="PAM type. (default: %(default)s)", default="NGG", choices=['NGG', 'NRG', 'TTTN', 'NNGRRT', 'NNNNGATT', 'NNAGAAW', 'NAAAAC']) + parser.add_argument("--sgRNA5", metavar="", type=valid_dinucleotideIUPAC, help="Filter candidates target sites with the most 5 prime nucleotides defined by this sequence. IUPAC code allowed. (default: %(default)s)", default="NN") + parser.add_argument("--sgRNA3", metavar="", type=valid_dinucleotideIUPAC, help="Filter candidates target sites with the most 5 prime nucleotides defined by this sequence. IUPAC code allowed. (default: %(default)s)", default="NN") + parser.add_argument("--fwdOverhang", metavar="", type=valid_overhang, help="Sequence of the 5 prime forward cloning oligo. (default: %(default)s)", default="TAGG") + parser.add_argument("--revOverhang", metavar="", type=valid_overhang, help="Sequence of the 5 prime reverse cloning oligo. (default: %(default)s)", default="AAAC") + parser.add_argument("--totalMM", metavar="", help="Number of total maximum mismatches allowed in the off-target sites. (default: %(default)s)", default=4, type=int) + parser.add_argument("--coreLength", metavar="", help="Number of bases that enclose the core of the target site. (default: %(default)s)", default=12, type=int) + parser.add_argument("--coreMM", metavar="", help="Number of maximum mismatches allowed in the core of the off-target sites. (default: %(default)s)", default=2, type=int) + parser.add_argument("--maxOT", metavar="", help="Maximum number of off-target sites to be reported. (default: %(default)s)", default=float("inf"), type=int) + parser.add_argument("--output", metavar="", help="Output folder. (default: %(default)s)", default="." + os.path.sep) + parser.add_argument("--exonsFile", metavar="", help="Path to the pseudo-bed file containing the coordinate of exons in the target genome. (default: NotUsed)", default=None) + parser.add_argument("--genesFile", metavar="", help="Path to the pseudo-bed file containing the coordinate of genes in the target genome. (default: NotUsed)", default=None) + args = parser.parse_args() + + + # check that the file is a proper multifasta file + inputFile = args.input.read() + args.input.close() + inputFile = inputFile.replace("\r", "\n").replace("\n\n", "\n") + lines = inputFile.split("\n") + + validChars = '-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' + + fileContent = [] + seqLine = "" + for line in lines: + line = line.strip() + if len(line) > 1 and line[0] == ">": # header + if seqLine != "": + fileContent[len(fileContent) - 1].append(seqLine) + name = ''.join(c for c in line if c in validChars) + fileContent.append([name]) + seqLine = "" + elif len(fileContent) == 0: # first header + name = ''.join(c for c in line if c in validChars) + fileContent.append([name]) + else: # wrong format + sys.stderr.write("It looks that your input file is not (multi)fasta format. Please check it and try again.") + sys.exit(1) + elif re.match('[ACGTNacgtn]+', line) is not None: + if fileContent == "": # we find sequence without header + sys.stderr.write("It looks that your input file is not (multi)fasta format. Please check it and try again.") + sys.exit(1) + else: + seqLine = seqLine + line + + if seqLine != "": + fileContent[len(fileContent) - 1].append(seqLine) + else: + sys.stderr.write("It looks that you file is not (multi)fasta format. Please check it and try again.") + sys.exit(1) + + for sequence in fileContent: + doSearch(sequence[0], sequence[1], args.pam, args.targetSize, args.totalMM, args.coreLength, args.coreMM, args.sgRNA5, args.sgRNA3, args.fwdOverhang, args.revOverhang, args.output, args.bowtie, args.index, args.exonsFile, args.genesFile, args.maxOT) + diff --git a/bin/src/cctop_standalone/LICENSE b/bin/src/cctop_standalone/LICENSE new file mode 100644 index 0000000..762c8be --- /dev/null +++ b/bin/src/cctop_standalone/LICENSE @@ -0,0 +1,22 @@ + +The MIT License (MIT) + +Copyright (c) 2015 Juan L. Mateo + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/bin/src/cctop_standalone/README b/bin/src/cctop_standalone/README new file mode 100644 index 0000000..a9dba47 --- /dev/null +++ b/bin/src/cctop_standalone/README @@ -0,0 +1,77 @@ +CCTop is a tool to determine suitable CRISPR/Cas9 target sites in a given query +sequence(s) and predict its potential off-target sites. The online version of +CCTop is available at http://crispr.cos.uni-heidelberg.de/ + +This is a command line version of CCTop that is designed mainly to allow search +of large volume of sequences and higher flexibility. + +If you use this tool for your scientific work, please cite it as: + Stemmer, M., Thumberger, T., del Sol Keyer, M., Wittbrodt, J. and Mateo, J.L. + CCTop: an intuitive, flexible and reliable CRISPR/Cas9 target prediction tool. + PLOS ONE (2015). doi:10.1371/journal.pone.0124633 + + +REQUIREMENTS + +CCTop is implemented in Python and it requires a version 2.7 or above. + +In addition we relay on the short read aligner Bowtie 1 to identify the +off-target sites. Bowtie can be downloaded from this site +http://bowtie-bio.sourceforge.net/index.shtml in binary format for the main +platforms. +You need to create an indexed version of the genome sequence of your +target species. This can be done with the tool bowtie-build included in the +Bowtie installation. For that you simply need a fasta file containing the genome +sequence. To get the index you can do something like: + $ /bowtie-build -r -f + +The previous line will create the index files in the current folder. + +To handle gene and exon annotations we use the python library bx-python +(https://bitbucket.org/james_taylor/bx-python/). This library is only required +if you want to associate off-target sites with the closest exon/gene, otherwise +you don't need to install it. Notice, however, that in this case all candidate +target sites will be given the same score, because in the current version the +score of the candidate target sites considers only off-target sites with +associated exon/gene. + +The exon and gene files contain basically the coordinates of those elements in +bed format (http://genome.ucsc.edu/FAQ/FAQformat#format1), which are the first +three columns of the file. There are two more columns with the ID and name of +the corresponding gene and a sixth empty column to comply with the format +accepted by the library. You can generate easily such kind of files for you +target organism using Ensembl Biomart (http://www.ensembl.org/biomart). + +In case of difficulties with these files contact us and we can provide you the +ones you need or help you to generate you own. + + +INSTALLATION + +Simply download the two .py files (CCTop.py and bedInterval.py) to a folder of +your choice and follow the instructions that you can find in the respective web +sites to install Bowtie 1 and bx-python. + + +USAGE + +You can run CCTop with the -h flag to get a detailed list of the available +parameters. For instance: + $ python /CCTop.py -h + +At minimum it is necessary to specify the input (multi)fasta file (--input) and +the index file (--index). In this case CCTop assumes that the Bowtie executable +can be found in the current folder, there are not gene and exon file to use and +the rest of parameters will take default values. Notice that the index file to +specify refers to the name of the index you specified for bowtie-build together +with the path, if necessary. A command for a typical run will look something +like this: + $ python /CCTop.py --input --index --bowtie --output + +The result of the run will be three files for each sequence in the input query +file. These files will have extension .fasta, .bed and .xls, containing, +respectively, the sequence of the target sites, their coordinates and their +detailed information as in the online version of CCTop. The name of the output +file(s) will be taken from the name of the sequences in the input fasta file. + + diff --git a/bin/src/cctop_standalone/bedInterval.py b/bin/src/cctop_standalone/bedInterval.py new file mode 100644 index 0000000..03c41b1 --- /dev/null +++ b/bin/src/cctop_standalone/bedInterval.py @@ -0,0 +1,86 @@ +''' +Created on Mar 20, 2014 + +@author: juan +''' + + +class MyInterval(object): + def __init__(self,start,end,value): + self.start = start + self.end = end + self.value = value + +class BedInterval( object ): + ''' + classdocs + ''' + + + def __init__(self): + ''' + Constructor + ''' + self.chroms = {} + + def insert( self, chrom, start, end, gene_id, gene_name ): + from bx.intervals.intersection import Interval + from bx.intervals.intersection import IntervalNode + from bx.intervals.intersection import IntervalTree + if chrom in self.chroms: + self.chroms[chrom].insert( start, end, MyInterval(start,end,[gene_id, gene_name]) ) + else: + self.chroms[chrom] = IntervalTree() + self.chroms[chrom].insert( start, end, MyInterval(start,end,[gene_id, gene_name]) ) + + def loadFile(self, file): + bedFile = open(file,'r') + for line in bedFile: + line = line.rstrip('\n').split('\t') + self.insert(line[0], int(line[1]), int(line[2]), line[3], line[4]) + + def overlaps(self, chrom, start, end): + if not chrom in self.chroms: + return False + overlapping = self.chroms[chrom].find(start, end) + if len(overlapping)>0: + return True + else: + return False + + def closest(self,chrom, start, end): + if not chrom in self.chroms: + return ['NA', 'NA', 'NA'] + + #first checking if this site overlaps any from the loaded file + overlapping = self.chroms[chrom].find(start, end) + if len(overlapping)>0: + return [overlapping[0].value[0],overlapping[0].value[1],0] + + #to the left + #it finds features with a start > than 'position' + left = self.chroms[chrom].before(start, max_dist=1e5) + + #to the right + #it finds features with a end < than 'position' + right = self.chroms[chrom].after(end, max_dist=1e5) + + if len(left)>0: + if len(right)>0: + distLeft = max(0,1 + start - left[0].end) + distRight = max(0,1 + right[0].start - end) + if distLeft < distRight: + return [left[0].value[0],left[0].value[1],distLeft] + else: + return [right[0].value[0],right[0].value[1],distRight] + else: + distLeft = max(0,1 + start - left[0].end) + return [left[0].value[0],left[0].value[1],distLeft] + else: + if len(right)>0: + distRight = max(0,1 + right[0].start - end) + return [right[0].value[0],right[0].value[1],distRight] + else: + return ['NA', 'NA', 'NA'] + + diff --git a/crispor.py b/crispor.py index b7b9a30..385d04a 100755 --- a/crispor.py +++ b/crispor.py @@ -345,6 +345,7 @@ "fusi" : ("Doench '16", "Aka the 'Fusi-Score', since V4.4 using the version 'Azimuth', scores are slightly different than before April 2018 but very similar (click 'show all' to see the old scores). Range: 0-100. Boosted Regression Tree model, trained on data produced by Doench et al (881 guides, MOLM13/NB4/TF1 cells + unpublished additional data). Delivery: lentivirus. See Fusi et al. 2015 and Doench et al. 2016 and crispr.ml. Recommended for guides expressed in cells (U6 promoter). Click to sort the table by this score."), "fusiOld" : ("Doench '16-Old", "The original implementation of the Doench 2016 score, as received from John Doench. The scores are similar, but not exactly identical to the 'Azimuth' version of the Doench 2016 model that is currently the default on this site, since Apr 2018."), "najm" : ("Najm 2018", "A modified version of the Doench 2016 score ('Azimuth'), by Mudra Hegde for S. aureus Cas9. Range 0-100. See Najm et al 2018."), + "ccTop" : ("CCTop", "The efficiency score used by CCTop, called 'crisprRank'."), "aziInVitro" : ("Azimuth in-vitro", "The Doench 2016 model trained on the Moreno-Mateos zebrafish data. Unpublished model, gratefully provided by J. Listgarden"), "housden" : ("Housden", "Range: ~ 1-10. Weight matrix model trained on data from Drosophila mRNA injections. See Housden et al."), "proxGc" : ("ProxGCCount", "Number of GCs in the last 4pb before the PAM"), @@ -1391,7 +1392,7 @@ def makePosList(org, countDict, guideSeq, pam, inputPos): guideNoPam = "A"+guideNoPam if pamIsCpf1(pam): - # Cpf1 has no scores yet + # Cpf1 has no off-target scores yet mitScore=0.0 cfdScore=0.0 else: diff --git a/crisporEffScores.py b/crisporEffScores.py index 775c9fb..f6e2172 100755 --- a/crisporEffScores.py +++ b/crisporEffScores.py @@ -8,7 +8,9 @@ # - Fusi: Fusi et al, prepublication manuscript on bioarxiv, http://dx.doi.org/10.1101/021568 http://research.microsoft.com/en-us/projects/azimuth/, only available as a web API # - Housden: Housden et al, PMID 26350902, http://www.flyrnai.org/evaluateCrispr/ # - OOF: Microhomology and out-of-frame score from Bae et al, Nat Biotech 2014 , PMID24972169 http://www.rgenome.net/mich-calculator/ -# - Wu-Crispr: Wong et al, PMID, http://www.genomebiology.com/2015/16/1/218 +# - Wu-Crispr: Wong et al, http://www.genomebiology.com/2015/16/1/218 +# - DeepCpf1, Kim et al, PMID 29431740, https://www.ncbi.nlm.nih.gov/pubmed/29431740 +# - SaCas9 efficiency score (no name), Najm et al, https://www.ncbi.nlm.nih.gov/pubmed/29251726 # the input are 100bp sequences that flank the basepair just 5' of the PAM +/-50bp. # so 50bp 5' of the PAM, and 47bp 3' of the PAM -> 100bp @@ -36,6 +38,9 @@ najm2018Dir = join(dirname(__file__), "bin/najm2018/") sys.path.insert(0, najm2018Dir) +cctopDir = join(dirname(__file__), "bin/src/cctop_standalone") +sys.path.insert(0, cctopDir) + # import numpy as np # global that points to the crispor 'bin' directory with the external executables @@ -822,6 +827,8 @@ def calcAllScores(seqs, addOpt=[], doAll=False, skipScores=[], enzyme=None): logging.debug("Azimuth in-vitro") scores["aziInVitro"] = calcAziInVitro(trimSeqs(seqs, -24, 6)) + scores["ccTop"] = calcCctopScore(trimSeqs(seqs, -20, 0)) + scores["finalGc6"] = [int(s.count("G")+s.count("C") >= 4) for s in trimSeqs(seqs, -6, 0)] scores["finalGg"] = [int(s=="GG") for s in trimSeqs(seqs, -2, 0)] @@ -1110,6 +1117,14 @@ def calcDeepCpf1Scores(seqs): import DeepCpf1 return DeepCpf1.scoreSeqs(seqs) +def calcCctopScore(seqs): + import CCTop + scores = [] + for seq in seqs: + score = CCTop.getScore(seq) + scores.append(score) + return scores + # ----------- MAIN -------------- if __name__=="__main__": args, options = parseArgs() diff --git a/todo.txt b/todo.txt index 04bc209..d49af98 100644 --- a/todo.txt +++ b/todo.txt @@ -21,8 +21,6 @@ Add these new criteria: - Jennifer's zebrafish model - CRISPROff and CRISPRSpec? http://rth.dk/resources/crispr/ -Show distance of off-targets from the on-target! - Show a histogram of the mismatches instead of 0-0-10-15-40 What is site-seq and circle-seq? @@ -55,9 +53,25 @@ off-target searchers: - align against ALL human genomes: https://github.com/emptyewer/MAPster - Church lab (CRISPR-GA): http://54.80.152.219/index.php -- more scores: +- on-target scores: - https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-017-1697-6 + Says that dinucleotides improve the AUC, code at: + http://www.ams.sunysb.edu/~pfkuan/softwares.html#predictsgrna + Good R code, easy to run - CrisprPred http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0181943 + code at: https://github.com/khaled-rahman/CRISPRpred + Weird R code, nothing easy to calculate the score, looks like CS people + No updates ever + - CCTop crisprRater https://academic.oup.com/nar/article/46/3/1375/4754467 + + off-targets: + - OfftargetPredict https://academic.oup.com/bioinformatics/article/34/17/i757/5093213 + code at https://github.com/penn-hui/OfftargetPredict + - crisproff, unpublished but on github + + + induced deletions: + - see JP email, CSHL 2018 --- @@ -164,3 +178,7 @@ of good guides in the first and second exon of a gene, for creating knockouts. Thanks again, this will be super helpful for finding guides for our next few projects Zach + +Done: + +Show distance of off-targets from the on-target!