In [68]:
import pandas as pd
import numpy as np
from regex import compile
from distutils.spawn import find_executable
from subprocess import check_output, check_call
from multiprocessing import cpu_count, Manager, Pool

In [206]:
count_threshold = 10

Retrieve the dataframe of spacers from `find_spacers()`

In [3]:
%store -r spacer_df

In [4]:
spacer_df.head()

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score
0,PML,ENSE00001363480,73994686,73994716,+,GCCCTGAGCCGGCACCTCCCCTTTCGGACA,0.316063
0,PML,ENSE00001363480,73994697,73994727,+,GCACCTCCCCTTTCGGACAGCTCAAGGGAC,0.47208
0,PML,ENSE00001363480,73994698,73994728,+,CACCTCCCCTTTCGGACAGCTCAAGGGACT,0.586884
0,PML,ENSE00001363480,73994712,73994742,+,GACAGCTCAAGGGACTCAGCCAACTGGCTC,0.493562
0,PML,ENSE00001363480,73994780,73994810,+,CTAAACCGAGAATCGAAACTAAGCTGGGGT,0.442002


In [5]:
spacer_df=spacer_df[spacer_df["spacer"].isin(spacer_df["spacer"].unique())] # in case there are duplicates

In [6]:
spacerlist=spacer_df
cpus=0
refgenome="/Users/milessmith/workspace/merrycrispr/merrycrispr/data/bowtie/GCA_000001405.15_GRCh38_no_alt_analysis_set"
large_index_size=False
reject=False

In [7]:
program = find_executable("bowtie")

In [8]:
if cpus is 0:
    cpus = cpu_count()

Adding a hash value makes matching up spacers with their off-target results **much** easier.

In [9]:
spacer_df['hash']=spacer_df.apply(lambda x: hash(tuple(x)), axis = 1)

In [11]:
with open('temp.fa', 'w') as tempfile:
    for entry in spacerlist.iterrows():
        tempfile.writelines(f">{entry[1]['hash']}\n{entry[1]['spacer']}\n")

In [12]:
command = f"{program} -a -p {cpus}"
if reject:
    command = command + f" -m {reject}"

if large_index_size:
    command = command + f" --large-index {refgenome}"
else:
    command = command + f" {refgenome}"

command = command + " -f temp.fa offtargets.fa"

try:
    check_call(command.split())
except:
    sys.stderr.write("Bowtie encountered an error. Please check the log file.")
    sys.exit(-1)
    
print("Bowtie finished.")
bowtie_results_file = 'offtargets.fa'

Bowtie finished.


Bowtie returns a tab-delimited file we can use.

In [26]:
bowtie_results = pd.read_csv(bowtie_results_file, 
                             header=None, 
                             names = ["hash", 
                                      "strand", 
                                      "refseq", 
                                      "position", 
                                      "seq", 
                                      "readquality",
                                      "aligncount",
                                      "mismatches"],
                             sep="\t")

If there are no mismatches, Bowtie puts no value in the last column resulting in `NaN` and disrupting stuff downstream

In [93]:
bowtie_results = bowtie_results.fillna(0)

In [94]:
bowtie_results.head()

Unnamed: 0,hash,strand,refseq,position,seq,readquality,aligncount,mismatches
0,3034248628717091829,+,chr15,73994686,GCCCTGAGCCGGCACCTCCCCTTTCGGACA,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
1,8372441828540166283,+,chr15,73994697,GCACCTCCCCTTTCGGACAGCTCAAGGGAC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
2,3797309489026528158,+,chr15,73994698,CACCTCCCCTTTCGGACAGCTCAAGGGACT,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
3,5568632362213989656,+,chr15,73994712,GACAGCTCAAGGGACTCAGCCAACTGGCTC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
4,8310665659683779518,+,chr15,73994780,CTAAACCGAGAATCGAAACTAAGCTGGGGT,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0


In [32]:
spacer_df[spacer_df['hash'] == 1247966027698955524]

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score,hash
0,PML,ENSE00002582133,74026214,74026244,+,GGCTGGAGTGTAGTGGCGTGATCTCGGCTC,0.457464,1247966027698955524


In [96]:
bowtie_results[bowtie_results['hash'] == 1247966027698955524].head()

Unnamed: 0,hash,strand,refseq,position,seq,readquality,aligncount,mismatches
4746536,1247966027698955524,+,chr17,74766240,GGCTGGAGTGTAGTGGCGTGATCTCGGCTC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,108,0
4746537,1247966027698955524,+,chr9,19237395,GGCTGGAGTGTAGTGGCGTGATCTCGGCTC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,108,0
4746538,1247966027698955524,+,chr16,28705361,GGCTGGAGTGTAGTGGCGTGATCTCGGCTC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,108,0
4746539,1247966027698955524,+,chr22,19358043,GGCTGGAGTGTAGTGGCGTGATCTCGGCTC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,108,0
4746540,1247966027698955524,+,chr8,42359398,GGCTGGAGTGTAGTGGCGTGATCTCGGCTC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,108,0


In [293]:
%store -r nuclease_info

In [113]:
#if nuclease_info['nuclease'].values[0] is 'SpCas9':
    # TODO: adjust this to account for other PAMs
    mmpos = '[0-9]{1,}'
    mmpos_re = compile(mmpos)
    # we only need to score the spacer portion of the sequence

In [435]:
count_threshold = 10

Calculate the likelihood a Cas9 protospacer will cut at a particular off-target site
Equation from http://crispr.mit.edu/about

The mismatch scoring algorithm from the Zhang group has three terms:
* the effect of a mismatch at a particular position
* the product of the mismatch scores, weighted by the mean distance between each mismatch
* a penalty for the number of mismatches

Score is from 0 to 1, with higher scores indicating a higher likelihood the off-target will be cut.
e.g. the score for mismatches at [15,16,17,18,19] is infinitesimally small, indicating that those
mismatches are highly destablizing


In [436]:
def scoreCas9offtarget(mismatched_positions: list,
                       start: int,
                       end: int) -> float:
    search_region = range(start,end+1)
    mismatched_positions = [int(_)-1-4 for _ in mismatched_positions if int(_) in search_region]
    # remember that the for on target scoring, we have 4N-spacer-NGG-3N
    # for Cas9 off-target scoring, we only care about the spacer portion
    # also, Python uses 0-indexed arrays, so we also have to subtract 1 
    # from the position reported by Bowtie
    if not mismatched_positions:
        score = 1
    else:
    # experimentally determined weighting penality for mismatch at each position
        M = [0, 0, 0.014, 0, 0, 0.395, 0.317, 0, 0.389, 0.079, 0.445, 0.508, 0.613, 
             0.851, 0.731, 0.828, 0.615, 0.804, 0.685, 0.583]
        if len(mismatched_positions) == 1:  # if there is only one mismatch, we should ignore the second two terms.
            score = 1 - M[mismatched_positions[0]]
        else:
            nmm = len(mismatched_positions)
            mean_distance = (max(mismatched_positions) - min(mismatched_positions)) * 1.0 / (nmm - 1)
            term_2 = (1 / ((((19 - mean_distance) / 19) * 4) + 1))
            term_3 = 1.0 / (nmm ** 2)
            term_1 = 1
            for n in mismatched_positions:
                term_1 *= 1 - M[n - 20]
            score = term_1 * term_2 * term_3
    return score

Add all of the potential off-target scores together so that the higher the offtarget score, the more desirable the spacer.  I know this is the reverse of above, but eventually less confusing - you want spacers with a high on-target and high off-target score.

In [437]:
def sumofftargets(offtargetlist: list,
                  start: int,
                  end: int) -> float:

    sum_score = sum(scoreCas9offtarget(x, start, end) for x in offtargetlist)
    if sum_score == 0:
        return 100
    else:
        return ((1.0 / sum_score) * 100)

Create two new columns to hold the mismatch information

In [402]:
spacer_df['offtarget_score'] = np.repeat(0,spacer_df.shape[0])
spacer_df['number_matching'] = np.repeat(0,spacer_df.shape[0])

In [459]:
spacer_df.head()

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score,hash,offtarget_score,number_matching
0,PML,ENSE00001363480,73994686,73994716,+,GCCCTGAGCCGGCACCTCCCCTTTCGGACA,0.316063,3034248628717091829,100.0,1
0,PML,ENSE00001363480,73994697,73994727,+,GCACCTCCCCTTTCGGACAGCTCAAGGGAC,0.47208,8372441828540166283,100.0,1
0,PML,ENSE00001363480,73994698,73994728,+,CACCTCCCCTTTCGGACAGCTCAAGGGACT,0.586884,3797309489026528158,100.0,1
0,PML,ENSE00001363480,73994712,73994742,+,GACAGCTCAAGGGACTCAGCCAACTGGCTC,0.493562,5568632362213989656,100.0,1
0,PML,ENSE00001363480,73994780,73994810,+,CTAAACCGAGAATCGAAACTAAGCTGGGGT,0.442002,8310665659683779518,100.0,1


Loop through each spacer identified by `find_guides()`, match it up with a result from Bowtie by using the hash.  Skip potentials that have a number of offtargets above a certain threshold, give a perfect score for potentials with only one match, score the rest.

In [438]:
# for each spacer
for i in spacer_df["hash"].unique():
    # get everything for that ["hash"]
    matching_locations = bowtie_results[bowtie_results["hash"] == i]
    
    # if the number of mismatches is above a threshold, remove the spacer
        # if there are more than one perfect matches
    if matching_locations.shape[0] > count_threshold or \
    len(matching_locations[matching_locations["mismatches"] == 0].index) > 1:
        score = 0
    # if there is only one entry - no offtargets, assign a score of 0
    elif matching_locations.shape[0] == 1:
        score = 100
    # elif there are mismatch positions, get the positions and make a list holding lists of those positions and score
    else:
        mmpos = list()
        matching_locations = matching_locations.drop(matching_locations[matching_locations["position"].isin(boundary)].index).head()
        mmpos = [mmpos_re.findall(str(_[1]['mismatches'])) for _ in matching_locations.iterrows()]
        score = sumofftargets(mmpos, 
                              start = nuclease_info["start"][0], 
                              end = nuclease_info["end"][0])
    spacer_df.loc[spacer_df["hash"] == i,"offtarget_score"] = score
    spacer_df.loc[spacer_df["hash"] == i,"number_matching"] = matching_locations.shape[0]


An offtarget_score of 100 means one perfect match - the ideal situation.  A score of 50 is indicative of two perfect matches, 33 is 3 perfect matches and so on.  Multiple imperfect matches will lower the score, but if it is a mismatch near the protospacer, it is probably highly destabilizing and this only slightly lowers the score

In [439]:
alpha = spacer_df[spacer_df.loc[:,"offtarget_score"] < 100]
alpha[alpha.loc[:,"offtarget_score"] > 0]

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score,hash,offtarget_score,number_matching
0,PML,ENSE00003521581,74035171,74035201,+,GGACTGGCGAGGAGTGGGCTGGCGAGGAGT,0.472008,2141288911352333675,33.333333,3
0,PML,ENSE00003521581,74035171,74035201,+,GGGCTGGCGAGGAGTGGGCTGGCGAGGAGT,0.487037,2014923799064502481,33.333333,3
0,PML,ENSE00003521581,74035171,74035201,+,GGGCTGGCGAGGAGTGGGCTGGCGAGGACT,0.470899,-9009378886749602645,33.333333,3
0,PML,ENSE00001837919,74045805,74045835,+,AGGACCTGCGGCATCAGCATCCCCTGGGAG,0.467888,-8702657415855574472,93.258577,3
0,PML,ENSE00001837919,74045806,74045836,+,GGACCTGCGGCATCAGCATCCCCTGGGAGC,0.555896,455566179910359767,88.697579,5
0,PML,ENSE00001837919,74046078,74046108,+,GCTCTAGTGTCCCCATCTGTAAAATGGGCT,0.325219,8484732412346007667,50.0,2
0,PML,ENSE00001837919,74044220,74044250,+,GAGTAGGGATGGGAAGGAGGGAGGAGGAGG,0.400415,-515979721010325621,70.57163,2
0,PML,ENSE00001837919,74044220,74044250,+,GATGGGAAGGAGGGAGGAGGAGGAAGGGGC,0.457239,566791295796949599,50.0,2
0,PML,ENSE00001837919,74044220,74044250,+,ATGGGAAGGAGGGAGGAGGAGGAAGGGGCT,0.451201,7022340708380797116,50.0,2
0,PML,ENSE00001837919,74044220,74044250,+,AGAACCTGTTAAAACACAGACTGCTGGGCC,0.519846,6702228232313072200,25.0,4


Spot check the resuls for one entry with multiple potential off-targets:

In [440]:
bowtie_results[bowtie_results["hash"] == -8702657415855574472]

Unnamed: 0,hash,strand,refseq,position,seq,readquality,aligncount,mismatches
843,-8702657415855574472,+,chr15,74045805,AGGACCTGCGGCATCAGCATCCCCTGGGAG,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
844,-8702657415855574472,-,chr5,10602315,CTCCCAGGGGATGCTGATGCCGCAGGTCCT,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,"6:T>A,18:C>T"
845,-8702657415855574472,+,chr5,10480665,AGGACCTGCGGCATCAGCATCCCCTGGGAG,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,"6:C>T,9:A>G"


In [458]:
a = scoreCas9offtarget([0],4,24)
b = scoreCas9offtarget([6,18],4,24)
c = scoreCas9offtarget([6,9],4,24)
d = a+b+c
e = (1/d)*100
print(f"a: {a}\nb: {b}\nc: {c}\nd: {d}\ne: {e}")

a: 1
b: 0.015058510638297874
c: 0.057228915662650606
d: 1.0722874263009485
e: 93.25857745527081
