In [1]:
import pandas as pd
import numpy as np
from regex import compile
from distutils.spawn import find_executable
from subprocess import check_output, check_call
from multiprocessing import cpu_count, Manager, Pool
from sys import stderr, exit
from numba import jit

In [2]:
# Maximum number of Bowtie alignments a spacer may have; the scoring loop further down
# gives a spacer an offtarget_score of 0 when its alignment count exceeds this value.
count_threshold = 10

Retrieve the dataframe of spacers from `find_spacers()`

In [3]:
%store -r spacer_df

In [4]:
spacer_df.head()

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score
0,PML,ENSG00000140464,73994686,73994716,+,GCCCTGAGCCGGCACCTCCCCTTTCGGACA,0.316063
0,PML,ENSG00000140464,73994697,73994727,+,GCACCTCCCCTTTCGGACAGCTCAAGGGAC,0.47208
0,PML,ENSG00000140464,73994698,73994728,+,CACCTCCCCTTTCGGACAGCTCAAGGGACT,0.586884
0,PML,ENSG00000140464,73994712,73994742,+,GACAGCTCAAGGGACTCAGCCAACTGGCTC,0.493562
0,PML,ENSG00000140464,73994780,73994810,+,CTAAACCGAGAATCGAAACTAAGCTGGGGT,0.442002


In [5]:
# Keep only the first row for each spacer sequence, in case there are duplicates.
# (Filtering with `.isin(.unique())` matches every row, so it never removed anything.)
spacer_df = spacer_df.drop_duplicates(subset="spacer")

In [6]:
spacerlist=spacer_df
cpus=0
refgenome="/Users/milessmith/workspace/bowtie/GCA_000001405.15_GRCh38_no_alt_analysis_set"
large_index_size=False
reject=False

In [7]:
program = find_executable("bowtie")

In [8]:
program

'/Users/milessmith/miniconda3/envs/mc/bin/bowtie'

In [9]:
# A cpus setting of 0 means "use every core reported by cpu_count()".
# Compare by value: `is` checks object identity, which is not guaranteed for int
# literals and emits a SyntaxWarning on Python 3.8+.
if cpus == 0:
    cpus = cpu_count()

Adding a hash value makes matching up spacers with their off-target results **much** easier.

In [10]:
# Tag every row with a hash of its full contents; the hash is later used as the FASTA
# record name so Bowtie alignments can be joined back to the spacer they came from.
# NOTE(review): the rows contain strings, and Python salts str hashes per process unless
# PYTHONHASHSEED is fixed, so these values will presumably differ between kernel
# sessions - Bowtie output from an earlier session cannot be matched against them.
spacer_df['hash']=spacer_df.apply(lambda x: hash(tuple(x)), axis = 1)

In [11]:
# Write every spacer to a FASTA file for Bowtie. The row hash is used as the record
# name so that each alignment can be matched back to its spacer afterwards.
# `write` is used because each record is a single string (`writelines` expects an
# iterable of strings and would emit it one character at a time).
with open('temp.fa', 'w') as fasta_file:
    for spacer_hash, spacer_seq in zip(spacerlist['hash'], spacerlist['spacer']):
        fasta_file.write(f">{spacer_hash}\n{spacer_seq}\n")

In [12]:
# Run Bowtie on the spacer FASTA file and leave its report in `bowtie_results_file`.
bowtie_results_file = 'offtargets.fa'

# Build the argument vector as a list instead of splitting a string on whitespace,
# so an executable or index path that contains spaces is passed through intact.
command = [program, "-a", "-p", str(cpus)]
if reject:
    # only pass -m when a (truthy) rejection limit was configured
    command += ["-m", str(reject)]
if large_index_size:
    command.append("--large-index")
command += [refgenome, "-f", "temp.fa", bowtie_results_file]

try:
    check_call(command)
except Exception as err:
    # `Exception` (rather than a bare except) lets KeyboardInterrupt propagate, and the
    # underlying error is reported instead of being discarded.
    stderr.write(f"Bowtie encountered an error. Please check the log file.\n{err}\n")
    exit(-1)

print("Bowtie finished.")

Bowtie finished.


Bowtie returns a tab-delimited file we can use.

In [13]:
# Names given to the columns of the tab-delimited Bowtie report, in file order.
bowtie_columns = [
    "hash",
    "strand",
    "refseq",
    "position",
    "seq",
    "readquality",
    "aligncount",
    "mismatches",
]

# The report has no header row, so the names above are supplied explicitly.
bowtie_results = pd.read_csv(
    bowtie_results_file,
    sep="\t",
    header=None,
    names=bowtie_columns,
)

  interactivity=interactivity, compiler=compiler, result=result)


If an alignment has no mismatches, Bowtie leaves the last column empty, which pandas reads as `NaN`. Replace those values with 0 so the downstream comparisons and mismatch parsing work.

In [14]:
bowtie_results = bowtie_results.fillna(0)

In [15]:
bowtie_results.head()

Unnamed: 0,hash,strand,refseq,position,seq,readquality,aligncount,mismatches
0,2273848149162870664,+,chr15,73994686,GCCCTGAGCCGGCACCTCCCCTTTCGGACA,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
1,5251915309045097405,+,chr15,73994697,GCACCTCCCCTTTCGGACAGCTCAAGGGAC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
2,5797773873868003637,+,chr15,73994698,CACCTCCCCTTTCGGACAGCTCAAGGGACT,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
3,-1134622095285696753,+,chr15,73994712,GACAGCTCAAGGGACTCAGCCAACTGGCTC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
4,-6377816250002101177,+,chr15,73994780,CTAAACCGAGAATCGAAACTAAGCTGGGGT,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0


In [38]:
spacer_df[spacer_df['hash'] == 5797773873868003637]

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score,hash,offtarget_score,number_matching
0,PML,ENSG00000140464,73994698,73994728,+,CACCTCCCCTTTCGGACAGCTCAAGGGACT,0.586884,5797773873868003637,100.0,1


In [39]:
bowtie_results[bowtie_results['hash'] == 5797773873868003637].head()

Unnamed: 0,hash,strand,refseq,position,seq,readquality,aligncount,mismatches
2,5797773873868003637,+,chr15,73994698,CACCTCCCCTTTCGGACAGCTCAAGGGACT,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0


In [18]:
%store -r nuclease_info

In [19]:
# TODO: adjust this to account for other PAMs. The pattern was written for SpCas9; the
# original guard (`nuclease_info['nuclease'].values[0]` being 'SpCas9') is disabled, so
# the setup below now runs unconditionally and must sit at top level.
# Bowtie lists mismatches as "<position>:<ref base>><read base>" entries separated by
# commas; the pattern pulls out the numeric positions.
mmpos = '[0-9]{1,}'
mmpos_re = compile(mmpos)
# we only need to score the spacer portion of the sequence

In [20]:
# NOTE(review): this re-declares the count_threshold already set in the configuration
# cell near the top of the notebook; keep the two values in sync or remove one of them.
count_threshold = 10

Calculate the likelihood a Cas9 protospacer will cut at a particular off-target site
Equation from http://crispr.mit.edu/about

The mismatch scoring algorithm from the Zhang group has three terms:
* the effect of a mismatch at a particular position
* the product of the mismatch scores, weighted by the mean distance between each mismatch
* a penalty for the number of mismatches

Score is from 0 to 1, with higher scores indicating a higher likelihood the off-target will be cut.
e.g. the score for mismatches at [15,16,17,18,19] is infinitesimally small, indicating that those
mismatches are highly destabilizing


In [104]:
def scoreCas9offtarget(mismatched_positions: list,
                       start: int,
                       end: int) -> float:
    """Score how likely Cas9 is to cut an off-target site, given its mismatches.

    Uses the three-term mismatch score described at http://crispr.mit.edu/about:
    a per-position penalty, a factor based on the mean distance between the
    mismatches, and a penalty on the number of mismatches.

    Args:
        mismatched_positions: mismatch positions within the full aligned sequence,
            as ints or numeric strings, in the 1-based numbering this function
            has always assumed.
        start: position just before the spacer in the aligned sequence (4 for the
            4N-spacer-NGG-3N layout); position ``start + 1`` maps to spacer index 0.
        end: last position that is still part of the spacer. Mismatches outside
            ``start + 1 .. end`` are ignored.

    Returns:
        A value from 0 to 1; higher means the off-target is more likely to be cut.
        1 is returned when no mismatch falls inside the spacer.

    Raises:
        IndexError: if a mismatch maps beyond the 20 positions covered by the
            penalty table (i.e. ``end - start`` is larger than 20).

    NOTE(review): Bowtie's manual describes mismatch offsets as 0-based from the 5'
    end of the read, while this function shifts by ``start + 1`` as if they were
    1-based - confirm which convention the caller actually passes in.
    """
    # experimentally determined weighting penalty for a mismatch at each spacer position
    M = [0, 0, 0.014, 0, 0, 0.395, 0.317, 0, 0.389, 0.079, 0.445, 0.508, 0.613,
         0.851, 0.731, 0.828, 0.615, 0.804, 0.685, 0.583]

    # Derive the shift from `start` rather than hard-coding 5, so the first position of
    # the search window always lands on index 0 of M (identical to before for start=4).
    offset = start + 1
    search_region = range(offset, end + 1)
    spacer_positions = [int(pos) - offset
                        for pos in mismatched_positions
                        if int(pos) in search_region]

    if not spacer_positions:
        return 1.0

    # if there is only one mismatch, the distance and count terms do not apply
    if len(spacer_positions) == 1:
        return 1 - M[spacer_positions[0]]

    nmm = len(spacer_positions)
    mean_distance = (max(spacer_positions) - min(spacer_positions)) * 1.0 / (nmm - 1)
    term_2 = (1 / ((((19 - mean_distance) / 19) * 4) + 1))
    term_3 = 1.0 / (nmm ** 2)
    term_1 = 1
    for n in spacer_positions:
        # index M directly; the positions are already 0-based spacer indices
        term_1 *= 1 - M[n]
    return term_1 * term_2 * term_3

Add all of the potential off-target scores together so that the higher the offtarget score, the more desirable the spacer.  I know this is the reverse of above, but eventually less confusing - you want spacers with a high on-target and high off-target score.

In [184]:
def sumofftargets(offtargetlist: list,
                  start: int,
                  end: int) -> float:
    """Turn the combined cut likelihood of a spacer's off-targets into a 0-100 score.

    Args:
        offtargetlist: one list of mismatch positions per off-target alignment.
        start: forwarded to scoreCas9offtarget for every alignment.
        end: forwarded to scoreCas9offtarget for every alignment.

    Returns:
        100 when the individual scores add up to 0 (including an empty list);
        otherwise ``(1 - total) * 100``, floored at 0. Higher is more desirable.
    """
    total = 0
    for mismatches in offtargetlist:
        total += scoreCas9offtarget(mismatches, start, end)

    # nothing is predicted to be cut: best possible score
    if total == 0:
        return 100

    scaled = (1 - total) * 100
    # never report a negative score
    return scaled if scaled > 0 else 0

In [83]:
(1 / 1-scoreCas9offtarget([3], 4, 24))

range(4, 25)
[]


0.0

Create two new columns to hold the mismatch information

In [32]:
spacer_df['offtarget_score'] = np.repeat(0,spacer_df.shape[0])
spacer_df['number_matching'] = np.repeat(0,spacer_df.shape[0])

In [33]:
spacer_df.head()

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score,hash,offtarget_score,number_matching
0,PML,ENSG00000140464,73994686,73994716,+,GCCCTGAGCCGGCACCTCCCCTTTCGGACA,0.316063,2273848149162870664,0,0
0,PML,ENSG00000140464,73994697,73994727,+,GCACCTCCCCTTTCGGACAGCTCAAGGGAC,0.47208,5251915309045097405,0,0
0,PML,ENSG00000140464,73994698,73994728,+,CACCTCCCCTTTCGGACAGCTCAAGGGACT,0.586884,5797773873868003637,0,0
0,PML,ENSG00000140464,73994712,73994742,+,GACAGCTCAAGGGACTCAGCCAACTGGCTC,0.493562,-1134622095285696753,0,0
0,PML,ENSG00000140464,73994780,73994810,+,CTAAACCGAGAATCGAAACTAAGCTGGGGT,0.442002,-6377816250002101177,0,0


Loop through each spacer identified by `find_guides()`, match it up with a result from Bowtie by using the hash.  Skip potentials that have a number of offtargets above a certain threshold, give a perfect score for potentials with only one match, score the rest.

In [34]:
int(nuclease_info["end"][0])

24

In [35]:
74045040 in range(int(spacer_df.loc[spacer_df["hash"] == i, "start"]),int(spacer_df.loc[spacer_df["hash"] == i, "stop"]))

False

In [186]:
# Score every spacer against its Bowtie alignments, matched up through the shared hash,
# and store the result in the offtarget_score / number_matching columns of spacer_df.
# for each spacer
for i in spacer_df["hash"].unique():
    # get every Bowtie alignment reported for that ["hash"]
    matching_locations = bowtie_results[bowtie_results["hash"] == i]
    
    # reject the spacer (score of 0) if it has more alignments than count_threshold
    # or if there is more than one perfect match
    # (perfect matches have mismatches == 0 because the NaNs were filled with 0 earlier)
    if matching_locations.shape[0] > count_threshold or \
    len(matching_locations[matching_locations["mismatches"] == 0].index) > 1:
        score = 0
    # if there is only one entry - no offtargets - assign the best score of 100
    elif matching_locations.shape[0] == 1:
        score = 100
    # otherwise drop the on-target alignment, then collect the mismatch positions of each
    # remaining alignment (one list per alignment) and score them together
    else:
        bounds = range(int(spacer_df.loc[spacer_df["hash"] == i, "start"]), \
                       int(spacer_df.loc[spacer_df["hash"] == i, "stop"])) # ideally, this would take refseq into consideration
        # remove alignments whose position lies inside the spacer's own start..stop range
        matching_locations = matching_locations.drop(
            matching_locations[matching_locations["position"].isin(bounds)].index)
        # mmpos_re extracts the numeric positions from each "pos:ref>read" mismatch string
        mmpos = [mmpos_re.findall(str(_[1]['mismatches'])) for _ in matching_locations.iterrows()]
#         try:
        score = sumofftargets(mmpos,
                              start=nuclease_info["start"][0],
                              end=nuclease_info["end"][0])
#         except:
#             print(f"mmpos: {mmpos}, start: {nuclease_info['start'][0]}, end: {nuclease_info['end'][0]}")
    spacer_df.loc[spacer_df["hash"] == i,"offtarget_score"] = score
    # in the last branch this counts the alignments left after the on-target one was
    # dropped; in the other branches it counts every alignment for the spacer
    spacer_df.loc[spacer_df["hash"] == i,"number_matching"] = matching_locations.shape[0]


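An offtarget_score of 100 means the only alignment is the single perfect on-target match - the ideal situation.  Spacers with more than one perfect match, or with more alignments than `count_threshold`, are given a score of 0.  For everything else the score is `(1 - sum of the off-target scores) * 100`, floored at 0, so multiple imperfect matches will lower the score; but if a mismatch sits at a heavily penalized position of the spacer, it is probably highly destabilizing and this only slightly lowers the score.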

In [187]:
alpha = spacer_df[spacer_df.loc[:,"offtarget_score"] < 100]
alpha[alpha.loc[:,"offtarget_score"] > 0]

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score,hash,offtarget_score,number_matching
0,PML,ENSG00000140464,73995889,73995919,+,GATTACAGGCCACGCCACCATACCTGGCTA,0.527816,-1666741231031362062,93.397922,4
0,PML,ENSG00000140464,73996370,73996400,+,CCACAATATATGAGACTCAAAGAAAGGTCA,0.573831,-104102213709905272,74.16413,3
0,PML,ENSG00000140464,73996889,73996919,+,GAGGATCTTGACTGCAAATTGTTCAGGTTC,0.363465,-7049932654528120854,99.519604,1
0,PML,ENSG00000140464,73997220,73997250,+,TTGTTACTGAGGCATGGAGAGTTGCGGTTT,0.56641,-9193367021944657541,99.584321,1
0,PML,ENSG00000140464,74002083,74002113,+,GTTTGGTTTCTTGTTCTGTCACCCAGGCTG,0.47217,-1805983178347146301,38.9,1
0,PML,ENSG00000140464,74002997,74003027,+,TGCCTCTCGTCCCAGCTATTCGGGAGGTGG,0.691249,4331339279305684228,83.85,3
0,PML,ENSG00000140464,74004379,74004409,+,TAACCTCGAACTCCTAGACTCAAGAGGTCT,0.644141,6799596336830874743,1.4,1
0,PML,ENSG00000140464,74004417,74004447,+,TAGTCTCTGAGTAGCTAGGACTATAGGTGC,0.453995,3396430527101221689,96.996774,1
0,PML,ENSG00000140464,74004750,74004780,+,TCTTGGAATATAGTGGCATGATCTTGGCTC,0.428682,-156144272632155828,1.4,1
0,PML,ENSG00000140464,74006495,74006525,+,CCAGAGATGAGTAACTCAAAGCGTTGGTTA,0.587441,5408481661682160326,99.212231,2


Spot check the results for one entry with multiple potential off-targets:

In [188]:
bowtie_results[bowtie_results["hash"] == -1666741231031362062]

Unnamed: 0,hash,strand,refseq,position,seq,readquality,aligncount,mismatches
2846248,-1666741231031362062,+,chr15,73995889,GATTACAGGCCACGCCACCATACCTGGCTA,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
2846249,-1666741231031362062,-,chr6,57571502,TAGCCAGGTATGGTGGCGTGGCCTGTAATC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,"21:C>T,24:G>A"
2846250,-1666741231031362062,-,chr6,113844346,TAGCCAGGTATGGTGGCGTGGCCTGTAATC,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,"10:C>G,13:T>C"
2846251,-1666741231031362062,+,chr14,35209671,GATTACAGGCCACGCCACCATACCTGGCTA,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,"12:T>C,20:C>T"
2846252,-1666741231031362062,+,chr12,14311809,GATTACAGGCCACGCCACCATACCTGGCTA,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,"11:G>A,21:G>A"


In [189]:
a = scoreCas9offtarget([0],4,24)
b = scoreCas9offtarget([15,27],4,24)
c = scoreCas9offtarget([15,27],4,24)
d = a+b+c
e = (1/d)*100
print(f"a: {a}\nb: {b}\nc: {c}\nd: {d}\ne: {e}")

a: 1
b: 0.5549999999999999
c: 0.5549999999999999
d: 2.11
e: 47.39336492890995


In [190]:
spacer_df.to_csv("/Users/milessmith/workspace/merrycrispr/merrycrispr/data/scored_spacers.csv")

In [191]:
spacer_df[spacer_df['spacer'] == "TTGTTACTGAGGCATGGAGAGTTGCGGTTT"]

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score,hash,offtarget_score,number_matching
0,PML,ENSG00000140464,73997220,73997250,+,TTGTTACTGAGGCATGGAGAGTTGCGGTTT,0.56641,-9193367021944657541,99.584321,1


In [192]:
bowtie_results[bowtie_results['seq'] == "TTGTTACTGAGGCATGGAGAGTTGCGGTTT"]

Unnamed: 0,hash,strand,refseq,position,seq,readquality,aligncount,mismatches
476,-9193367021944657541,+,chr15,73997220,TTGTTACTGAGGCATGGAGAGTTGCGGTTT,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0


In [193]:
bowtie_results[bowtie_results['hash'] == -9193367021944657541]

Unnamed: 0,hash,strand,refseq,position,seq,readquality,aligncount,mismatches
476,-9193367021944657541,+,chr15,73997220,TTGTTACTGAGGCATGGAGAGTTGCGGTTT,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,0
477,-9193367021944657541,-,chr10,47063782,AAACCGCAACTCTCCATGCCTCAGTAACAA,IIIIIIIIIIIIIIIIIIIIIIIIIIIIII,0,"18:T>C,24:C>G"


In [198]:
sumofftargets([[18,24]],4,24)

99.58432147887324

In [195]:
sumofftargets([[],[]],4,24)

0

In [196]:
(1-sum([scoreCas9offtarget([21,24],4,24), 
scoreCas9offtarget([10],4,24),
scoreCas9offtarget([12,20],4,24),
scoreCas9offtarget([11,21],4,24)]))*100

35.013417976668585

In [197]:
1/(1-4)

-0.3333333333333333