### Initial Guidelines

1.	Targeting sequences are 20bp long and must be complementary to the target coding strand.
    a.	For example if the target is AGA…GTT, the cognate sgRNA will be AAC…UCU
2.	Targeted sequences must have a PAM motif (NGG) in the antisense strand at the 5’ end, i.e. they must begin with CCN before the 20-mer target sequence.
3.	Targeting sequences should avoid off-target effects by having less than 9bp of complementarity with any other sites containing a PAM motif.
    - This usually cannot be avoided, so an alternative is to loosen the restriction to allow off-target binding to neutral regions (away from regulatory elements and on the template (non-coding) strand of coding regions)
4.	Avoid sequences possessing the following “bad seed” substrings in the last 5 positions of the sgRNA targeting sequence (i.e. the complement of the first 5 positions of the targeted sequence, excluding PAM)
    - Bad seeds listed in bad_seed_list.csv
5.	Between 0 and 10 mismatches can be introduced to accomplish partial repression; the mismatched guides should still avoid off-targets.


### Outline of Algorithm

1.	Download MG1655 .gb file, modify with any integrated constructs in base strain
2.	Enumerate all strings in target gene which satisfy [C]{2}[ATGC]{21} (regular expression, you will need to use these)
3.	Filter out all strings with bad seeds
4.	Make sure all target strings do not have more than 8bp of matching in the seed sequence (if there are, eliminate both)
 - This may be merged with the next step
5.	Find off-target sites
 - determine pam sites for which the 9nt seed sequence is identical
 - eliminate those in the template strand of a coding region or in non-annotated regions
 - if there are still off-target sites, eliminate the guide
6.	For each light off target site, determine the annotation (CDS-template, CDS-coding, regulatory, no annotation)
7.	Eliminate any target sequences with a light off target with the CDS-coding or regulatory annotation
8.	Generate all possible single bp mismatches for each targeting sequence and repeat the above off-target analysis to determine valid guides
9.	Repeat for 2,3,…,10 mismatches; use random sampling if enumerating all possible combinations is too computationally costly; bottleneck each group to 100 variants if necessary. 


### Implementation (for gfpmut2) 

In [None]:
import copy
import csv
import random
import re
import sys

import numpy as np
import pandas as pd
from Bio import SeqIO, pairwise2
from Bio.Alphabet import IUPAC, generic_dna, generic_protein
from Bio.Seq import Seq
from Bio.SeqFeature import FeatureLocation, SeqFeature

In [None]:
# Parse the GenBank reference as a single record (SeqIO.read expects exactly one record in the file).
genome = SeqIO.read("./CRISPRi_reference_genome.gb", "gb")

In [None]:
type(genome)

In [None]:
# Coordinates of the target region within the reference record, used as a Python
# slice (0-based, end-exclusive).
# NOTE(review): presumably the gfpmut2 locus named in the section heading -- confirm.
ref_start = 807859
ref_end = 808636
target = genome[ref_start:ref_end]

In [None]:
def find_strand_pams(seqrecord, strand, startcoord=0):
    """Find every 20-nt site that directly follows a ``CCN`` motif on one strand.

    Args:
        seqrecord: Object whose ``.seq`` can be converted with ``str()`` and
            offers ``reverse_complement()`` (e.g. a Biopython ``SeqRecord``).
        strand: ``1`` scans the sequence as given; any other value scans its
            reverse complement.
        startcoord: Offset added to the reported coordinates, for records that
            are a slice of a longer sequence.

    Returns:
        A list of ``[start, end, sequence, strand]`` entries. ``start``/``end``
        are 0-based, end-exclusive coordinates on the forward strand;
        ``sequence`` is the 20-mer as read on the scanned strand.
    """
    if strand == 1:
        seq = str(seqrecord.seq)
    else:
        seq = str(seqrecord.seq.reverse_complement())
    seq_len = len(seq)

    # Zero-width lookahead: a plain "CC" pattern consumes both bases, so in a
    # run such as "CCC" the motif starting one base later would be skipped.
    pam_reg = re.compile(r"(?=CC)")

    pam_list = []
    for match in pam_reg.finditer(seq):
        # Skip the 3-nt CCN motif; the site is the following 20 nt.
        site_start = match.start() + 3
        site_end = site_start + 20
        if site_end > seq_len:
            # Not enough sequence left for a full 20-mer.
            continue
        if strand == 1:
            start = startcoord + site_start
            end = startcoord + site_end
        else:
            # Map reverse-complement positions back onto the forward strand.
            start = startcoord + seq_len - site_end
            end = startcoord + seq_len - site_start
        pam_list.append([start, end, seq[site_start:site_end], strand])
    return pam_list


def find_pams(seqrecord, startcoord=0):
    """Collect the candidate sites of both strands into one DataFrame.

    Forward-strand hits come first, followed by reverse-strand hits; the
    columns are ``start``, ``end``, ``sequence`` and ``strand``.
    """
    site_rows = []
    for strand in (1, -1):
        site_rows.extend(find_strand_pams(seqrecord, strand, startcoord=startcoord))
    return pd.DataFrame(site_rows, columns=["start", "end", "sequence", "strand"])


def remove_bad_seeds(pam_df, bad_seed_path):
    """Drop candidate sites whose first five bases match a listed bad seed.

    Args:
        pam_df: DataFrame with a ``sequence`` column of target-site strings.
        bad_seed_path: Path to a CSV file with a ``seeds`` column.

    Returns:
        The rows of ``pam_df`` whose ``sequence[:5]`` is not the reverse
        complement of any listed seed.
    """
    bad_seed_df = pd.read_csv(bad_seed_path)
    # Seeds are reverse-complemented so they can be compared against the
    # target sequence directly. No alphabet argument is passed to Seq: that
    # parameter was removed from Biopython together with Bio.Alphabet.
    bad_seeds = {
        str(Seq(seed.upper()).reverse_complement()) for seed in bad_seed_df["seeds"]
    }

    has_bad_seed = pam_df["sequence"].str[:5].isin(bad_seeds)
    return pam_df[~has_bad_seed]

In [None]:
# Enumerate candidate sites inside the target slice; startcoord=ref_start shifts
# the reported coordinates so they refer to the full reference record.
target_pam_df = find_pams(target, startcoord=ref_start)
# Discard candidates whose leading five bases match an entry of the bad-seed list.
target_pam_df = remove_bad_seeds(target_pam_df, "./bad_seed_list.csv")

In [None]:
target_pam_df

In [None]:
# Every candidate site in the whole reference record (both strands); this is the
# pool that target candidates are screened against for off-target matches.
genome_pam_df = find_pams(genome)

In [None]:
def str_to_int(string):
    """Encode a nucleotide string as an integer array (A=0, C=1, G=2, T=3).

    Raises ``KeyError`` for any character other than upper-case A, C, G or T.
    """
    base_codes = dict(zip("ACGT", range(4)))
    return np.array([base_codes[base] for base in string])


def compare_seqs(
    target_df, reference_df, subseq_range=None, remove_matching_starts=True
):
    """Return, per target site, its best positional agreement with any reference site.

    Args:
        target_df: DataFrame with ``sequence`` and ``start`` columns.
        reference_df: DataFrame with ``sequence`` and ``start`` columns.
        subseq_range: Optional column index (e.g. ``range(0, 9)``) restricting
            the comparison to those positions of every sequence.
        remove_matching_starts: When true, a (target, reference) pair with the
            same ``start`` is treated as a self-hit and scored 0.

    Returns:
        1-D integer array with one entry per row of ``target_df``: the largest
        number of positions at which that target equals some reference
        sequence.
    """
    target_int_arr = np.array(
        [str_to_int(seq) for seq in target_df["sequence"].values], dtype="uint8"
    )
    reference_int_arr = np.array(
        [str_to_int(seq) for seq in reference_df["sequence"].values], dtype="uint8"
    )

    if subseq_range is not None:
        target_int_arr = target_int_arr[:, subseq_range]
        reference_int_arr = reference_int_arr[:, subseq_range]

    # Broadcast (T, 1, L) against (1, R, L) and count equal positions -> (T, R).
    bool_arr = target_int_arr[:, np.newaxis, :] == reference_int_arr[np.newaxis, :, :]
    agreement_arr = np.sum(bool_arr, axis=2, dtype=int)

    if remove_matching_starts:
        self_rows, self_cols = np.where(
            target_df["start"].values[:, np.newaxis]
            == reference_df["start"].values[np.newaxis, :]
        )
        # Zero only the paired (target, reference) cells. Zeroing whole
        # reference columns would hide one target's site from every other
        # target, masking seed collisions between candidates.
        agreement_arr[self_rows, self_cols] = 0

    return np.max(agreement_arr, axis=1)

In [None]:
# Best agreement over positions 0-8 of each target candidate against every
# genome-wide candidate (self-hits are ignored by compare_seqs' default).
most_agreement = compare_seqs(target_pam_df, genome_pam_df, range(0, 9))

In [None]:
most_agreement

In [None]:
# True where no other site agrees at all 9 compared positions.
past_threshold = most_agreement < 9

In [None]:
# Keep only the candidates that passed the 9-position agreement screen.
target_pam_df_nooff = target_pam_df[past_threshold]

In [None]:
import copy
import itertools

In [None]:
def generate_all_mismatchs(in_str, num_mismatch):
    """Enumerate every variant of ``in_str`` whose first ``num_mismatch`` bases all differ.

    Each of the leading ``num_mismatch`` positions is replaced by one of its
    three alternative bases; the remainder of the string is left unchanged.
    Returns the ``3 ** num_mismatch`` resulting strings.
    """
    # Per-base substitution order determines the order of the output.
    substitutions = {"A": "TCG", "T": "ACG", "C": "TAG", "G": "TCA"}
    unchanged_tail = in_str[num_mismatch:]
    choices_per_position = [substitutions[in_str[pos]] for pos in range(num_mismatch)]

    variants = []
    for combo in itertools.product(*choices_per_position):
        variants.append("".join(combo) + unchanged_tail)
    return variants


def generate_mismatch(in_str, num_mismatch):
    flip_dict = {
        "A": ["T", "C", "G"],
        "T": ["A", "C", "G"],
        "C": ["T", "A", "G"],
        "G": ["T", "C", "A"],
    }
    list_str = list(in_str)
    new_str = copy.copy(list_str)
    for i in range(num_mismatch):
        new_char = np.random.choice(flip_dict[list_str[i]])
        new_str[i] = new_char
    new_str = "".join(new_str)
    return new_str


def generate_mismatch_df(pam_df, k=(1, 2, 4, 8, 10)):
    """Build a table of mismatched variants for every site in ``pam_df``.

    Args:
        pam_df: DataFrame with ``start``, ``end``, ``sequence`` and ``strand``
            columns.
        k: Iterable of mismatch counts to generate for each site.

    Returns:
        DataFrame with columns ``start``, ``end``, ``sequence``, ``strand`` and
        ``num_mismatch``; ``sequence`` holds the mismatched variant, the other
        columns are copied from the originating site.
    """
    rows = []
    for _, site in pam_df.iterrows():
        seq = site["sequence"]
        for num_mismatch in k:
            if num_mismatch < 5:
                # Few enough combinations to enumerate exhaustively.
                variants = generate_all_mismatchs(seq, num_mismatch)
            else:
                # Too many combinations: draw 50 random variants and de-duplicate.
                variants = sorted(
                    {generate_mismatch(seq, num_mismatch) for _ in range(50)}
                )
            rows += [
                [site["start"], site["end"], variant, site["strand"], num_mismatch]
                for variant in variants
            ]

    return pd.DataFrame(
        rows, columns=["start", "end", "sequence", "strand", "num_mismatch"]
    )

In [None]:
generate_mismatch_df(target_pam_df_nooff)

In [None]:
# Scratch cell: build the mismatch table for a single site and filter it.
# NOTE(review): pams_past_threshold, test, get_query_mask and all_genome_pams are
# not defined in the visible part of this notebook -- this cell cannot run as
# shown; confirm where they come from.
seq = pams_past_threshold["sequence"][0]
start = pams_past_threshold["start"][0]
end = pams_past_threshold["end"][0]
strand = pams_past_threshold["strand"][0]

# Load the bad-seed list.
bad_seed_df = pd.read_csv("./bad_seed_list.csv")
bad_seed_list = bad_seed_df["seeds"].tolist()
# Reverse complement each seed so it can be compared against the target sequence.
bad_seed_list = [
    str(Seq(item.upper(), IUPAC.unambiguous_dna).reverse_complement())
    for item in bad_seed_list
]

# Variants are generated from `test`, not from `seq` read above.
mismatch_df = []
for k in [1, 2, 4, 8, 10]:
    if k < 5:
        # Exhaustive enumeration for small mismatch counts.
        mismatch_list = generate_all_mismatchs(test, k)
    else:
        # 100 random draws, de-duplicated, for larger mismatch counts.
        mismatch_list = list(set([generate_mismatch(test, k) for i in range(100)]))
    mismatch_df += [[start, end, item, strand, k] for item in mismatch_list]
mismatch_df = pd.DataFrame(
    mismatch_df, columns=["start", "end", "sequence", "strand", "num_mismatch"]
)
# Drop variants whose first five bases match a (reverse-complemented) bad seed.
mismatch_df = mismatch_df[
    mismatch_df["sequence"].apply(lambda x: x[:5] not in bad_seed_list)
]
# Keep the rows selected by get_query_mask and renumber the index.
mismatch_df = mismatch_df[get_query_mask(mismatch_df, all_genome_pams)].reset_index(
    drop=True
)

In [None]:
mismatch_df

In [None]:
from matplotlib import pyplot as plt

plt.hist(most_agreement)

In [None]:
past_threshold

In [None]:
generate_mismatch_seqs(test, 100, 10)

In [None]:
genome[58:78].reverse_complement()

In [None]:
agreement_arr

In [None]:
# Flatten the record's feature table into one row per annotated feature.
features_df = []
for feature in genome.features:
    start_idx = feature.location.start.real
    end_idx = feature.location.end.real
    strand = feature.location.strand
    category = feature.type
    meta = feature.qualifiers
    # Features without a "gene" qualifier get an empty name.
    name = meta.get("gene", [""])[0]
    features_df.append([start_idx, end_idx, strand, category, name])
features_df = pd.DataFrame(
    features_df, columns=["start", "stop", "strand", "type", "name"]
)  # Biopython locations are 0-based and end-exclusive, so "stop" is NOT inclusive

In [None]:
# Restrict the feature table to CDS entries and renumber the rows from 0.
CDS_df = features_df[features_df["type"] == "CDS"].reset_index(drop=True)

In [None]:
CDS_df

In [None]:
### annotate the PAM sites

In [None]:
def check_if_template(pams_df, CDS_df):
    """Flag, per site, whether no enclosing CDS lies on the site's own strand.

    Args:
        pams_df: DataFrame of sites with a ``strand`` column and either
            ``start``/``end`` columns (as produced by ``find_pams``) or a
            legacy ``position`` column, from which a 20-nt interval is derived
            (``position`` is the start for strand 1 and the end otherwise).
        CDS_df: DataFrame with ``start``, ``stop`` and ``strand`` columns;
            intervals are treated as 0-based and end-exclusive.

    Returns:
        A list with one bool per site: ``True`` when no CDS that fully
        contains the site has the same strand as the site (this includes
        sites contained in no CDS at all), ``False`` otherwise.
    """
    # Pull the CDS columns out once instead of on every iteration.
    cds_start = CDS_df["start"].values
    cds_stop = CDS_df["stop"].values
    cds_strand = CDS_df["strand"].values
    has_bounds = "start" in pams_df.columns and "end" in pams_df.columns

    targets_template_list = []
    for _, site in pams_df.iterrows():
        if has_bounds:
            start_idx = site["start"]
            end_idx = site["end"]
        elif site["strand"] == 1:
            start_idx = site["position"]
            end_idx = start_idx + 20
        else:
            start_idx = site["position"] - 20
            end_idx = site["position"]

        # Inclusive bounds: with end-exclusive intervals a site flush against
        # either CDS boundary is still fully inside the CDS.
        contained = (cds_start <= start_idx) & (end_idx <= cds_stop)
        on_site_strand = bool(np.any(cds_strand[contained] == site["strand"]))
        targets_template_list.append(not on_site_strand)
    return targets_template_list

In [None]:
check_if_template(all_genome_pams[:10], CDS_df)

In [None]:
all_genome_pams[:100]

In [None]:
# Scratch cell: all-against-all agreement over the first 8 positions.
# NOTE(review): all_pams is not defined in the visible part of this notebook.
all_pam_str_arr = all_pams["sequence"].values
# Encode every sequence as a row of small integers.
all_pam_int_arr = np.array(list(map(str_to_int, all_pam_str_arr)), dtype="uint8")
# Only the first 8 positions take part in the comparison.
all_pam_seed_arr = all_pam_int_arr[:, :8]
# Broadcast (N, 1, 8) against (1, N, 8) -> (N, N, 8) boolean array.
bool_arr = all_pam_seed_arr[:, np.newaxis, :] == all_pam_seed_arr[np.newaxis, :, :]
# Number of agreeing positions for every pair of sites.
agreement_arr = np.sum(bool_arr, axis=2, dtype=int)
# Ignore each site's comparison with itself.
agreement_arr[np.eye(agreement_arr.shape[0], dtype=bool)] = 0
# Highest agreement each site has with any other site.
most_agreement = np.max(agreement_arr, axis=0)

In [None]:
all_pams["sequence"].values

In [None]:
moo = all_pam["sequence"]

In [None]:
moo.values

In [None]:
all_pam_int_arr[0] == all_pam_int_arr[1]

In [None]:
most_agreement

In [None]:
plt.hist(most_agreement)
plt.show()

In [None]:
?right

In [None]:
np.where(closest_dist > 10)[0]

In [None]:
from matplotlib import pyplot as plt

In [None]:
plt.imshow(hamming_arr)

In [None]:
bool_arr = all_pam_int_arr[:, np.newaxis, :] == all_pam_int_arr[np.newaxis, :, :]

In [None]:
all_pam_str_arr = np.array(all_pams)

# Worked example of the broadcast comparison on random sequences.
# NOTE(review): with N = 100000 and L = 20 the (N, N, L) boolean array below
# holds N * N * L one-byte elements (about 200 GB) -- reduce N before running.
ex_1 = np.apply_along_axis(''.join,1,np.random.choice(["A","C","G","T"],size=(100000,20))) #example array of strings
ex_2 = np.apply_along_axis(''.join,1,np.random.choice(["A","C","G","T"],size=(100000,20)))# example array of strings
ex_1_int = np.array(list(map(str_to_int,ex_1)),dtype="uint8") #conversion to (N, L) array of integers
ex_2_int = np.array(list(map(str_to_int,ex_2)),dtype="uint8") #conversion to (N, L) array of integers
ex_1_broadcast = np.array(ex_1_int[:,np.newaxis,:]) #reshaping for broadcast operation (N, L) -> (N, 1, L)
ex_2_broadcast =  np.array(ex_2_int[np.newaxis,:,:]) #reshaping for broadcast operation (N, L) -> (1, N, L)
bool_arr = (ex_1_broadcast==ex_2_broadcast) #broadcast comparison (N, N, L)
match_arr = np.sum(bool_arr,axis=2) #summing over L (N, N)

In [None]:
# Load the bad-seed list and reverse complement each entry.
bad_seed_df = pd.read_csv("./bad_seed_list.csv")
seed_list = bad_seed_df["seeds"].tolist()
# Reverse complement to match the target sequence. No alphabet argument is
# passed to Seq: that parameter was removed from Biopython with Bio.Alphabet.
# NOTE(review): seeds are lower-cased here but upper-cased in remove_bad_seeds;
# lower-case seeds will not equal upper-case target sequences -- confirm intent.
seed_list = [
    str(Seq(item.lower()).reverse_complement())
    for item in seed_list
]