In [2]:
import gc
import pyfaidx
import pandas as pd
import numpy as np
from Bio import SeqIO
from multiprocessing import cpu_count, Manager, Pool

import progressbar
from Bio.Alphabet import IUPAC, single_letter_alphabet
from Bio.Restriction import RestrictionBatch
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import GC
from csv import writer
from functools import partial
from operator import attrgetter
from regex import compile
from subprocess import check_output

from azimuth import model_comparison

In [3]:
input_sequences='/Users/milessmith/workspace/merrycrispr/merrycrispr/data/PML.fa'
outfile='/Users/milessmith/workspace/merrycrispr/merrycrispr/data/PML_spacers.csv'
restriction_sites=["EcoRI","BamHI"]
largeindex=False
cutoff=0
offtargetcutoff=0
nuclease='SpCas9'
return_limit=9
reject=False
paired=False
rules=2
numcores=0
number_upstream_spacers=0
number_downstream_spacers=0
boundary=0

In [5]:
itemlist = pyfaidx.Fasta(input_sequences)
nucleases = pd.read_csv('data/nuclease_list.csv')
nuclease_info = nucleases[nucleases['nuclease'] == nuclease]

In [11]:
nuclease_info

Unnamed: 0,nuclease,pam,spacer_regex,start,end
0,SpCas9,NGG,(?i)[ACGT]{25}[G]{2}[ACGT]{3},4.0,24.0


In [6]:
spacer_regex = compile(nuclease_info['spacer_regex'].item())
spacer_start = int(nuclease_info['start'].item())
spacer_end = int(nuclease_info['end'].item())

In [7]:
print(f'{len(itemlist.keys())} sequences to search for spacers.')

46 sequences to search for spacers.


In [8]:
rsb = RestrictionBatch(restriction_sites)

In [9]:
spacer_df = pd.DataFrame(columns=['gene_name','feature_id','start','stop','strand','spacer'])

In [10]:
for item in itemlist.keys():
    # have to use the alternative Regex module instead of Re so that findall can detect overlapping
    # sequences
    spacers = (spacer_regex.findall(itemlist[item][:].seq, overlapped=True) +
                   spacer_regex.findall(itemlist[item][:].reverse.complement.seq, overlapped=True))

    info = dict(zip(['gene_name', 'feature_id', 'strand', 'start', 'end'], item.split("_")))

    for ps in spacers:
        # Note that ps[4:24] is the actual protospacer.  I need the rest of the sequence for scoring
        ps_seq = Seq(ps[spacer_start:spacer_end], IUPAC.unambiguous_dna)
        ps_full_seq = Seq(ps, IUPAC.unambiguous_dna)

        # Get rid of anything with T(4+) as those act as RNAPIII terminators
        if "TTTT" in ps:
            # TODO Should this also eliminate anything with G(4)?
            pass
        # Get rid of anything that has the verboten restriction sites
        elif bool([y for y in rsb.search(ps_full_seq).values() if y != []]):
            pass
        # BsmBI/Esp3I is used in most of the new CRISPR vectors, especially for library construction.
        # Biopython misses potential restriction sites as it tries to match GAGACGN(5), whereas we need to find
        # matches of just the GAGACG core.  The next four lines take care of that.
        elif 'GAGACG' in ps[spacer_start:spacer_end]:
            pass
        elif 'CGTCTC' in ps[spacer_start:spacer_end]:
            pass
        # Eliminate potentials with a GC content <20 or >80%
        elif GC(ps_seq) <= 20 or GC(ps_seq) >= 80:
            pass
        else:
            ps_start = itemlist[item][:].seq.find(ps) + int(info['start'])
            spacer_data = {'gene_name': [info['gene_name']], 
                           'feature_id': [info['feature_id']], 
                           'start': [ps_start], 
                           'stop': [ps_start+len(ps)], 
                           'strand': [info['strand']], 
                           'spacer': [ps]}
            _ = pd.DataFrame.from_dict(spacer_data)
            # TODO change the spacer here to include 'NGG' so that it is taken into account by Bowtie?
            spacer_df = pd.concat([spacer_df,_])

In [12]:
spacer_df.head()

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer
0,PML,ENSE00001363480,73994686,73994716,+,GCCCTGAGCCGGCACCTCCCCTTTCGGACA
0,PML,ENSE00001363480,73994697,73994727,+,GCACCTCCCCTTTCGGACAGCTCAAGGGAC
0,PML,ENSE00001363480,73994698,73994728,+,CACCTCCCCTTTCGGACAGCTCAAGGGACT
0,PML,ENSE00001363480,73994712,73994742,+,GACAGCTCAAGGGACTCAGCCAACTGGCTC
0,PML,ENSE00001363480,73994780,73994810,+,CTAAACCGAGAATCGAAACTAAGCTGGGGT


In [20]:
spacer_df['spacer'].values

array(['GCCCTGAGCCGGCACCTCCCCTTTCGGACA', 'GCACCTCCCCTTTCGGACAGCTCAAGGGAC',
       'CACCTCCCCTTTCGGACAGCTCAAGGGACT', ...,
       'TGGGGCCTTCCAGCTGGAGGTCACTGGACT', 'TCACTGCTGCTGTCATCCAGCTCTCGGGAG',
       'CACTGCTGCTGTCATCCAGCTCTCGGGAGG'], dtype=object)

In [16]:
predicted_scores = model_comparison.predict(spacer_df['spacer'].values)

No model file specified, using V3_model_nopos


In [17]:
spacer_df['score'] = predicted_scores

In [18]:
spacer_df.head()

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score
0,PML,ENSE00001363480,73994686,73994716,+,GCCCTGAGCCGGCACCTCCCCTTTCGGACA,0.316063
0,PML,ENSE00001363480,73994697,73994727,+,GCACCTCCCCTTTCGGACAGCTCAAGGGAC,0.47208
0,PML,ENSE00001363480,73994698,73994728,+,CACCTCCCCTTTCGGACAGCTCAAGGGACT,0.586884
0,PML,ENSE00001363480,73994712,73994742,+,GACAGCTCAAGGGACTCAGCCAACTGGCTC,0.493562
0,PML,ENSE00001363480,73994780,73994810,+,CTAAACCGAGAATCGAAACTAAGCTGGGGT,0.442002


In [19]:
spacer_df[spacer_df['score'] > 0.75].head()

Unnamed: 0,gene_name,feature_id,start,stop,strand,spacer,score
0,PML,ENSE00003598656,74024862,74024892,+,CTGTATCCAAGAAAGCCAGCCCAGAGGCTG,0.768965
0,PML,ENSE00001837919,74044220,74044250,+,TCTGCATTTCTAAGAAGCTCCCAGGGGATG,0.766544
0,PML,ENSE00002591339,74046787,74046817,+,ACAGACTCTGCTCAGCATCCCCAGAGGAAC,0.771509
0,PML,ENSE00002591339,74044220,74044250,+,TCTGCATTTCTAAGAAGCTCCCAGGGGATG,0.766544
0,PML,ENSE00003621588,74034707,74034737,+,CACCCCAGCCCTCCCACTACACCAGGGCCA,0.760352
