In [1]:
from distutils.spawn import find_executable
from multiprocessing import cpu_count, Manager, Pool
from subprocess import CalledProcessError, check_output, check_call
from sys import stderr, exit

import numpy as np
import pandas as pd
from regex import compile

In [2]:
# Spacers with more Bowtie alignments than this are given an off-target score of 0
# by the scoring loop further down.
count_threshold = 10

Retrieve the dataframe of spacers from `find_spacers()`

In [3]:
%store -r spacer_df

In [4]:
spacer_df.head()

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score
0,PML,ENSE00001363480,73994686,73994716,+,GCCCTGAGCCGGCACCTCCCCTTTCGGACA,0.316063
0,PML,ENSE00001363480,73994697,73994727,+,GCACCTCCCCTTTCGGACAGCTCAAGGGAC,0.47208
0,PML,ENSE00001363480,73994698,73994728,+,CACCTCCCCTTTCGGACAGCTCAAGGGACT,0.586884
0,PML,ENSE00001363480,73994712,73994742,+,GACAGCTCAAGGGACTCAGCCAACTGGCTC,0.493562
0,PML,ENSE00001363480,73994780,73994810,+,CTAAACCGAGAATCGAAACTAAGCTGGGGT,0.442002


In [5]:
# Keep only the first row for each distinct spacer sequence.
# (Filtering with `isin(unique())` is always True for every row, so it never removed anything.)
# `.copy()` gives an independent frame so the columns added later are not written to a slice.
spacer_df = spacer_df.drop_duplicates(subset="spacer").copy()

In [16]:
# Run configuration for the Bowtie off-target search.
spacerlist=spacer_df  # alias of spacer_df (not a copy); used when writing temp.fa
cpus=0  # 0 is replaced by cpu_count() in a later cell
# Bowtie index passed on the command line.
# NOTE(review): machine-specific absolute path; make this configurable before sharing.
refgenome="/Users/milessmith/workspace/bowtie/GCA_000001405.15_GRCh38_no_alt_analysis_set"
large_index_size=False  # when True, --large-index is added to the Bowtie command
reject=False  # when truthy, its value is passed to Bowtie as -m

In [7]:
# Locate the bowtie binary on PATH; find_executable returns None when nothing is found.
# NOTE(review): distutils was removed from the standard library in Python 3.12 (PEP 632);
# shutil.which is the usual replacement.
program = find_executable("bowtie")

In [8]:
program

'/Users/milessmith/miniconda3/envs/mc/bin/bowtie'

In [17]:
# 0 is the sentinel for "use every available core".
# Compare by value: `is` tests object identity and emits a SyntaxWarning with a literal.
if cpus == 0:
    cpus = cpu_count()

Adding a hash value makes matching up spacers with their off-target results **much** easier.

In [10]:
def _hash_row(row):
    """Return Python's built-in hash of one row's values, taken in column order."""
    return hash(tuple(row))


# One identifier per spacer row, later used as the FASTA record name.
# Note: hash() of str values is salted per interpreter process unless PYTHONHASHSEED is set,
# so these identifiers are only stable within a single kernel session.
spacer_df['hash'] = spacer_df.apply(_hash_row, axis=1)

In [11]:
# Write each spacer as a two-line FASTA record: the row hash as the header,
# the spacer sequence as the body.
with open('temp.fa', 'w') as fasta_out:
    for _, row in spacerlist.iterrows():
        fasta_out.write(f">{row['hash']}\n{row['spacer']}\n")

In [19]:
# Run Bowtie on temp.fa, reporting all alignments (-a) on `cpus` threads, into offtargets.fa.
bowtie_results_file = 'offtargets.fa'

if not program:
    # find_executable returned None: fail with a clear message instead of trying to run "None"
    stderr.write("Bowtie executable not found on PATH.\n")
    exit(-1)

# Build the argument vector as a list so paths containing spaces are passed through intact
# (splitting a formatted string on whitespace would break them apart).
bowtie_argv = [program, "-a", "-p", str(cpus)]
if reject:
    bowtie_argv += ["-m", str(reject)]
if large_index_size:
    bowtie_argv.append("--large-index")
bowtie_argv += [refgenome, "-f", "temp.fa", bowtie_results_file]

# Printable form of the command, kept under the original name for inspection.
command = " ".join(bowtie_argv)

try:
    check_call(bowtie_argv)
except (OSError, CalledProcessError) as err:
    # OSError: the binary could not be started; CalledProcessError: non-zero exit status.
    # Anything else (e.g. KeyboardInterrupt) is deliberately left to propagate.
    stderr.write(f"Bowtie encountered an error. Please check the log file.\n{err}\n")
    exit(-1)

print("Bowtie finished.")

Bowtie finished.


Bowtie returns a tab-delimited file we can use.

In [20]:
# Names given to the eight tab-separated fields of each Bowtie alignment line.
BOWTIE_COLUMNS = ["hash",
                  "strand",
                  "refseq",
                  "position",
                  "seq",
                  "readquality",
                  "aligncount",
                  "mismatches"]

# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk; chunked inference is what triggers mixed-type warnings on columns such as
# "mismatches", which is empty for perfect matches and text otherwise.
bowtie_results = pd.read_csv(bowtie_results_file,
                             sep="\t",
                             header=None,
                             names=BOWTIE_COLUMNS,
                             low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


If an alignment has no mismatches, Bowtie leaves the last column empty. pandas reads that as `NaN`, which breaks the downstream steps, so replace it with 0.

In [21]:
# fillna(0) is applied to every column, not only "mismatches".
# The scoring loop later tests `mismatches == 0` to recognise perfect matches.
bowtie_results = bowtie_results.fillna(0)

In [22]:
bowtie_results.head()

Unnamed: 0,hash,strand,refseq,position,seq,readquality,aligncount,mismatches
0,1062615640028147440,-,chr15,73994885,CCCCCGAGACCCCCTCTGAAGGCCGCCAGC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
1,3514993471625539887,-,chr15,73994884,CCCCCCGAGACCCCCTCTGAAGGCCGCCAG,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
2,-5966576438363230504,-,chr15,73994883,TCCCCCCGAGACCCCCTCTGAAGGCCGCCA,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
3,-1658647160762133790,-,chr15,73994882,CTCCCCCCGAGACCCCCTCTGAAGGCCGCC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
4,-2482697046115976855,-,chr15,73994881,CCTCCCCCCGAGACCCCCTCTGAAGGCCGC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0


In [24]:
spacer_df[spacer_df['hash'] == 1062615640028147440]

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score,hash
0,PML,ENSE00001363480,73994672,73994702,+,GCTGGCGGCCTTCAGAGGGGGTCTCGGGGG,0.302692,1062615640028147440


In [25]:
bowtie_results[bowtie_results['hash'] == 1062615640028147440].head()

Unnamed: 0,hash,strand,refseq,position,seq,readquality,aligncount,mismatches
0,1062615640028147440,-,chr15,73994885,CCCCCGAGACCCCCTCTGAAGGCCGCCAGC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0


In [26]:
%store -r nuclease_info

In [27]:
# TODO: adjust this to account for other PAMs
# (this was meant to apply only when nuclease_info['nuclease'].values[0] is 'SpCas9';
#  the condition is disabled, so the body must sit at top level to be valid Python)
# Matches each run of digits, i.e. the numeric offsets in a Bowtie mismatch
# descriptor such as "15:G>C,27:A>C".
mmpos = '[0-9]{1,}'
mmpos_re = compile(mmpos)
# we only need to score the spacer portion of the sequence

In [28]:
# NOTE(review): the same value is already assigned near the top of the notebook;
# this re-assignment is redundant.
count_threshold = 10

Calculate the likelihood a Cas9 protospacer will cut at a particular off-target site
Equation from http://crispr.mit.edu/about

The mismatch scoring algorithm from the Zhang group has three terms:
* the effect of a mismatch at a particular position
* the product of the mismatch scores, weighted by the mean distance between each mismatch
* a penalty for the number of mismatches

Score is from 0 to 1, with higher scores indicating a higher likelihood the off-target will be cut.
e.g. the score for mismatches at [15,16,17,18,19] is infinitesimally small, indicating that those
mismatches are highly destabilizing


In [29]:
def scoreCas9offtarget(mismatched_positions: list,
                       start: int,
                       end: int) -> float:
    """Score how likely a Cas9 spacer is to cut an off-target site, given its mismatches.

    Implements the three-term mismatch score described at http://crispr.mit.edu/about:
    a per-position mismatch weight, a term based on the mean distance between
    mismatches, and a penalty for the number of mismatches.

    Args:
        mismatched_positions: mismatch positions within the aligned sequence; each
            item may be an int or a digit string.
        start: first position (inclusive) that counts as part of the search region.
        end: last position (inclusive) that counts as part of the search region.

    Returns:
        A float; 1.0 when no mismatch falls inside the scored spacer positions.
        Higher values mean the off-target is more likely to be cut.
    """
    # experimentally determined weighting penalty for a mismatch at each spacer position
    M = (0, 0, 0.014, 0, 0, 0.395, 0.317, 0, 0.389, 0.079, 0.445, 0.508, 0.613,
         0.851, 0.731, 0.828, 0.615, 0.804, 0.685, 0.583)

    search_region = range(start, end + 1)
    # For on-target scoring the sequence is 4N-spacer-NGG-3N; for Cas9 off-target
    # scoring only the spacer portion matters, so shift by the 4 leading bases and
    # by 1 more to get a 0-based index into M.
    # NOTE(review): Bowtie's manual describes mismatch offsets as already 0-based from
    # the 5' end of the read; confirm whether the extra -1 is intended.
    spacer_indices = [int(pos) - 1 - 4
                      for pos in mismatched_positions
                      if int(pos) in search_region]
    # Keep only indices that address a real entry of M. Without this, an index of -1
    # (position == start) or >= len(M) would either wrap around to the wrong weight
    # or raise IndexError, depending on the branch below.
    spacer_indices = [idx for idx in spacer_indices if 0 <= idx < len(M)]

    if not spacer_indices:
        return 1.0

    if len(spacer_indices) == 1:
        # with a single mismatch the distance and count terms do not apply
        return 1.0 - M[spacer_indices[0]]

    nmm = len(spacer_indices)
    mean_distance = (max(spacer_indices) - min(spacer_indices)) * 1.0 / (nmm - 1)
    term_2 = (1 / ((((19 - mean_distance) / 19) * 4) + 1))
    term_3 = 1.0 / (nmm ** 2)
    term_1 = 1.0
    for idx in spacer_indices:
        term_1 *= 1 - M[idx]
    return term_1 * term_2 * term_3

Add all of the potential off-target scores together so that the higher the offtarget score, the more desirable the spacer.  I know this is the reverse of above, but eventually less confusing - you want spacers with a high on-target and high off-target score.

In [30]:
def sumofftargets(offtargetlist: list,
                  start: int,
                  end: int) -> float:
    """Combine the cut likelihoods of all aligned sites into one score.

    Each item of ``offtargetlist`` is a list of mismatch positions for one aligned
    site; it is scored with ``scoreCas9offtarget`` using the same ``start``/``end``.
    The result is 100 divided by the sum of those scores, so a larger sum of
    off-target likelihoods gives a smaller (less desirable) value. A sum of exactly
    zero returns 100.
    """
    site_scores = [scoreCas9offtarget(positions, start, end) for positions in offtargetlist]
    total = sum(site_scores)
    # guard the division: nothing scored above zero
    return 100 if total == 0 else ((1.0 / total) * 100)

Create two new columns to hold the mismatch information

In [31]:
# offtarget_score later receives fractional values (e.g. 33.33), so create it as float
# up front rather than relying on an int column being upcast on assignment.
spacer_df['offtarget_score'] = 0.0
# number_matching holds a row count, so it stays an integer column.
spacer_df['number_matching'] = 0

In [32]:
spacer_df.head()

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score,hash,offtarget_score,number_matching
0,PML,ENSE00001363480,73994686,73994716,+,GCCCTGAGCCGGCACCTCCCCTTTCGGACA,0.316063,7644388913300317257,0,0
0,PML,ENSE00001363480,73994697,73994727,+,GCACCTCCCCTTTCGGACAGCTCAAGGGAC,0.47208,-4710941110124975420,0,0
0,PML,ENSE00001363480,73994698,73994728,+,CACCTCCCCTTTCGGACAGCTCAAGGGACT,0.586884,-842285887032466582,0,0
0,PML,ENSE00001363480,73994712,73994742,+,GACAGCTCAAGGGACTCAGCCAACTGGCTC,0.493562,-2508762551303370522,0,0
0,PML,ENSE00001363480,73994780,73994810,+,CTAAACCGAGAATCGAAACTAAGCTGGGGT,0.442002,5908152694256895239,0,0


Loop through each spacer identified by `find_guides()`, match it up with a result from Bowtie by using the hash.  Skip potentials that have a number of offtargets above a certain threshold, give a perfect score for potentials with only one match, score the rest.

In [63]:
int(nuclease_info["end"][0])

24

In [55]:
# NOTE(review): scratch check. `i` is not defined anywhere above this cell; it is the
# loop variable of the scoring loop below, so this only works after that loop has run.
74045040 in range(int(spacer_df.loc[spacer_df["hash"] == i, "start"]),int(spacer_df.loc[spacer_df["hash"] == i, "stop"]))

True

In [64]:
# The nuclease's scoring bounds are the same for every spacer, so look them up once.
scoring_start = int(nuclease_info["start"][0])
scoring_end = int(nuclease_info["end"][0])

# for each spacer
for i in spacer_df["hash"].unique():
    is_current = spacer_df["hash"] == i
    # every Bowtie alignment reported for this spacer's hash
    matching_locations = bowtie_results[bowtie_results["hash"] == i]
    n_perfect = (matching_locations["mismatches"] == 0).sum()

    # too many alignments overall, or more than one perfect match: reject the spacer
    if matching_locations.shape[0] > count_threshold or n_perfect > 1:
        score = 0
    # exactly one alignment means no off-targets: assign the best score of 100
    elif matching_locations.shape[0] == 1:
        score = 100
    # otherwise collect the mismatch positions of each remaining alignment and score them
    else:
        # .iloc[0] takes the scalar explicitly; calling int() on a Series is deprecated
        bounds = range(int(spacer_df.loc[is_current, "start"].iloc[0]),
                       int(spacer_df.loc[is_current, "stop"].iloc[0]))  # ideally, this would take refseq into consideration
        # discard alignments whose position lies inside the spacer's own start/stop interval
        matching_locations = matching_locations[~matching_locations["position"].isin(bounds)]
        mismatch_positions = [mmpos_re.findall(str(descriptor))
                              for descriptor in matching_locations["mismatches"]]
        score = sumofftargets(mismatch_positions,
                              start=scoring_start,
                              end=scoring_end)
    spacer_df.loc[is_current, "offtarget_score"] = score
    # In the last branch this is the count left after the filtering above;
    # in the first two branches it is the unfiltered alignment count.
    spacer_df.loc[is_current, "number_matching"] = matching_locations.shape[0]


An offtarget_score of 100 means one perfect match - the ideal situation.  A score of 50 is indicative of two perfect matches, 33 is 3 perfect matches and so on.  Multiple imperfect matches will lower the score, but if it is a mismatch near the protospacer, it is probably highly destabilizing and this only slightly lowers the score

In [65]:
# Show spacers whose off-target score is strictly between 0 (rejected) and 100 (single alignment).
below_perfect = spacer_df["offtarget_score"] < 100
alpha = spacer_df[below_perfect]
alpha[alpha["offtarget_score"] > 0]

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score,hash,offtarget_score,number_matching
0,PML,ENSE00003521581,74035171,74035201,+,GGACTGGCGAGGAGTGGGCTGGCGAGGAGT,0.472008,-6487921199981873697,33.333333,3
0,PML,ENSE00003521581,74035171,74035201,+,GGGCTGGCGAGGAGTGGGCTGGCGAGGAGT,0.487037,4019695212341434821,33.333333,3
0,PML,ENSE00003521581,74035171,74035201,+,GGGCTGGCGAGGAGTGGGCTGGCGAGGACT,0.470899,1938315546189992296,33.333333,3
0,PML,ENSE00001837919,74044220,74044250,+,GAGTAGGGATGGGAAGGAGGGAGGAGGAGG,0.400415,-8093125150612556507,70.57163,2
0,PML,ENSE00001837919,74044220,74044250,+,GATGGGAAGGAGGGAGGAGGAGGAAGGGGC,0.457239,7436211538453088697,50.0,2
0,PML,ENSE00001837919,74044220,74044250,+,ATGGGAAGGAGGGAGGAGGAGGAAGGGGCT,0.451201,-5950465374056825043,50.0,2
0,PML,ENSE00001837919,74044220,74044250,+,AGAACCTGTTAAAACACAGACTGCTGGGCC,0.519846,7864086333900343017,25.0,4
0,PML,ENSE00001837919,74044220,74044250,+,GAACCTGTTAAAACACAGACTGCTGGGCCC,0.555765,174422731505712675,54.525627,3
0,PML,ENSE00001837919,74044220,74044250,+,GGTCTGCATTTCTAAGAAGCTCCCAGGGGA,0.507777,1013928385937934387,47.393365,3
0,PML,ENSE00001837919,74044220,74044250,+,GCTCCCAGGGGATGCTGATGCCGCAGGTCC,0.457297,4993952616146620801,86.635722,7


Spot check the results for one entry with multiple potential off-targets:

In [69]:
bowtie_results[bowtie_results["hash"] == 1013928385937934387]

Unnamed: 0,hash,strand,refseq,position,seq,readquality,aligncount,mismatches
1074,1013928385937934387,-,chr15,74045824,TCCCCTGGGAGCTTCTTAGAAATGCAGACC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
1075,1013928385937934387,-,chr13,52775628,TCCCCTGGGAGCTTCTTAGAAATGCAGACC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,1,"15:G>C,27:A>C"
1076,1013928385937934387,-,chr1,115083583,TCCCCTGGGAGCTTCTTAGAAATGCAGACC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,1,"15:G>C,27:A>C"


In [70]:
# Recompute by hand the score for the three alignments displayed above.
# a: the row whose mismatches value is 0; position 0 lies outside 4..24, so it scores as no mismatch
a = scoreCas9offtarget([0],4,24)
# b, c: the two rows with mismatches at positions 15 and 27 (27 lies outside 4..24 and is ignored)
b = scoreCas9offtarget([15,27],4,24)
c = scoreCas9offtarget([15,27],4,24)
# d, e: same arithmetic as sumofftargets (100 divided by the summed scores)
d = a+b+c
e = (1/d)*100
print(f"a: {a}\nb: {b}\nc: {c}\nd: {d}\ne: {e}")

a: 1
b: 0.5549999999999999
c: 0.5549999999999999
d: 2.11
e: 47.39336492890995
