In [1]:
import pickle
import argparse
import re
import numpy as np
import pandas as pd

In [20]:
class CFD_calculator:
    """Compute the CFD score of one aligned guide / off-target pair.

    The score is the product of per-position mismatch weights and one PAM
    weight, both looked up in pickled tables (see ``get_mm_pam_scores``).
    The result is computed at construction time and stored in
    ``self.cfd_score``.
    """

    # Cache of (mismatch_scores, pam_scores) shared by all instances so the
    # two pickle files are read from disk only once per kernel session.
    _score_tables = None

    def __init__(self, wt, off, DNA_bulge = False):
        """Score ``off`` against ``wt`` and store the result in ``cfd_score``.

        Args:
            wt: Guide sequence aligned to ``off``. It may contain '-' for
                bulge positions; case is ignored. Only the positions that
                line up with the protospacer part of ``off`` are compared.
            off: Aligned off-target sequence whose last three characters are
                the PAM. The last two PAM characters are the key used in the
                PAM score table.
            DNA_bulge: When True, the first ``len(wt) - 23`` positions of
                both sequences are dropped before scoring.
        """
        wt = wt.upper()
        off = off.upper()
        # The PAM key must be read from the untrimmed target. Reading it after
        # the PAM has been stripped would pick up protospacer bases instead.
        pam = off[-2:]
        if DNA_bulge:
            # 23 is the length of an ungapped site (protospacer + PAM); clamp
            # at 0 so short input is never sliced from the wrong end.
            DNA_bulge_size = max(len(wt) - 23, 0)
            wt = wt[DNA_bulge_size:]
            sg = off[DNA_bulge_size:-3]
        else:
            sg = off[:-3]
        self.cfd_score = self.calc_cfd(wt, sg, pam)

    # Reverse complements a given string
    def revcom(self, s):
        """Return the reverse complement of ``s``.

        'U' is complemented to 'A' and '-' is kept as '-'. Returns None when
        ``s`` contains any character outside A/C/G/T/U/-.
        """
        basecomp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'U': 'A', '-':'-'}
        reversed_seq = s[::-1]
        try:
            return ''.join([basecomp[base] for base in reversed_seq])
        except KeyError:
            return None

    # Unpickle mismatch scores and PAM scores
    def get_mm_pam_scores(self):
        """Return ``(mm_scores, pam_scores)`` loaded from the pickle files.

        The tables are read from ``./mismatch_score.pkl`` and
        ``./PAM_scores.pkl`` (relative to the current working directory) on
        first use and cached on the class afterwards.

        Raises:
            Exception: if either file cannot be opened or unpickled; the
                underlying error is attached as ``__cause__``.
        """
        if CFD_calculator._score_tables is None:
            # NOTE: pickle.load can execute arbitrary code; only use score
            # files that come from a trusted source.
            try:
                with open('./mismatch_score.pkl', 'rb') as mm_file:
                    mm_scores = pickle.load(mm_file)
                with open('./PAM_scores.pkl', 'rb') as pam_file:
                    pam_scores = pickle.load(pam_file)
            except Exception as err:
                raise Exception("Could not find file with mismatch scores or PAM scores") from err
            # Only successful loads are cached, so a later call can retry.
            CFD_calculator._score_tables = (mm_scores, pam_scores)
        return CFD_calculator._score_tables

    # Calculates CFD score
    def calc_cfd(self, wt, off, pam):
        """Return the CFD score for an aligned guide / target pair.

        Args:
            wt: Guide sequence; must be at least as long as ``off``.
            off: Target protospacer (PAM already removed), aligned to ``wt``.
            pam: Key into the PAM score table.

        Returns:
            The product of the mismatch weights and the PAM weight. The
            mismatch product is forced to 0 when ``off`` contains a character
            that ``revcom`` cannot complement. Returns -1 (after printing a
            message) when ``pam`` is not in the PAM table.

        Raises:
            IndexError: if ``off`` is longer than ``wt``.
        """
        mm_scores, pam_scores = self.get_mm_pam_scores()
        score = 1
        # Mismatch keys are written with RNA letters on the guide side.
        wt = wt.replace('T', 'U')
        off = off.replace('T', 'U')
        for i, target_base in enumerate(off):
            guide_base = wt[i]
            if guide_base == target_base:
                continue
            complement = self.revcom(target_base)
            if complement is None:
                # IUPAC / unknown character in the target: no score defined.
                score = 0
                break
            key = 'r' + guide_base + ':d' + complement + ',' + str(i + 1)
            try:
                score *= mm_scores[key]
            except KeyError:
                # Deliberate best-effort: pairs missing from the table (e.g.
                # '-' in the first position) leave the score unchanged.
                pass
        try:
            pam_score = pam_scores[pam]
        except KeyError:
            print(wt, off, pam, 'no such PAM')
            return -1
        return score * pam_score

# Mismatch

In [8]:
# Mismatch-only example: `wt` is the guide followed by an 'NNN' PAM placeholder,
# `off` is the aligned off-target with its PAM ('AGG') at the end.
# Lower-case letters in `off` mark the positions that differ from the guide;
# CFD_calculator upper-cases both strings before scoring.
wt = 'GGCACTGCGGCTGGAGGTGGNNN'
off = 'GGCACTGCcGCTGcAGGcGcAGG'
cfd = CFD_calculator(wt, off, DNA_bulge=False)
# Display the computed score as the cell output.
cfd.cfd_score

0.04238618533415564

# RNA bulge

In [9]:
# RNA-bulge example. Aligned pair (guide, target):
#   GGCACTGCGGCTGGAGGTGGNNN
#   GGCAgcGaGGCTGGAGGa-GAGG
# The '-' sits in the target (`off`), so the guide is ungapped and
# DNA_bulge stays False; lower-case letters mark mismatching positions.
wt = 'GGCACTGCGGCTGGAGGTGGNNN'
off = 'GGCAgcGaGGCTGGAGGa-GAGG'
cfd = CFD_calculator(wt, off, DNA_bulge=False)
# Display the computed score as the cell output.
cfd.cfd_score

0.015007215013226525

# DNA bulge

In [21]:
# DNA-bulge example: here the '-' characters sit in the guide (`wt`), so both
# aligned strings are 25 characters long instead of 23.
# With DNA_bulge=True the constructor drops the first len(wt) - 23 = 2
# positions of both sequences before scoring.
wt = 'GGCACTGCGGCTG-GAGGTG-GNNN'
off = 'cGCcCTGtGGCTGGAGGcGGAGTTA'
cfd = CFD_calculator(wt, off, DNA_bulge=True)
# Display the computed score as the cell output.
cfd.cfd_score

0.0