In [94]:
import itertools
import numpy as np
import pandas as pd

## Generating Good Bit Sequences

Taking all positive sequences from the original MERFISH papers.

In [None]:
# Positive bit sequences (see the note above this cell for their origin).
# They are placed first in the combined candidate list built in a later cell,
# so their list indices are 0 .. len(pos_seqs) - 1 and the selection step
# always keeps them.
pos_seqs = [
    "ACACTACCACCATTTCCTAT",
    "AAACACACACTAAACCACCC",
    "ATCCTCCTTCAATACATCCC",
    "TATCTCATCAATCCCACACT",
    "ACTCCACTACTACTCACTCT",
    "AACTCATCTCAATCCTCCCA",
    "ACCACAACCCATTCCTTTCA",
    "TCTATCATCTCCAAACCACA",
    "ACCCTCTAACTTCCATCACA",
    "AATACTCTCCCACCTCAACT",
    "TTTCTACCACTAATCAACCC",
    "TCCAACTCATCTCTAATCTC",
    "TCCTATTCTCAACCTAACCT",
    "ATAAATCATTCCCACTACCC",
    "ACCCTTTACAAACACACCCT",
    "TTCCTAACAAATCACATCCC",
    "TATCCTTCAATCCCTCCACA",
    "ACCCAACACTCATAACATCC",
    "TTTACTCCCTACACCTCCAA",
    "ACTTTCCACATACTATCCCA",
    "ACATTACACCTCATTCTCCC",
    "TACTACAAACCCATAATCCC",
    "TTCTCCCTCTATCAACTCTA",
    "TTCTTCCCTCAATCTTCATC",
    "TCCTAACAACCAACTACTCC",
    "ACCTTTCTCCATACCCAACT",
    "ACCCTTACTACTACATCATC",
    "AATCTCACCTTCCACTTCAC",
    "TCTATCATTACCCTCCTCCT",
    "TCCTCATCTTACTCCCTCTA",
]

In [None]:
# Fixed seed so the random candidate pool, and everything derived from it,
# can be regenerated exactly on a fresh kernel.
RANDOM_SEQ_SEED = 0
seq_rng = np.random.default_rng(RANDOM_SEQ_SEED)

# Draw 100,000 random 20-mers over the three-letter alphabet A/T/C (no G),
# with C twice as likely as A or T.
random_seqs = seq_rng.choice(
    ["A", "T", "C"], size=(100000, 20), p=[0.25, 0.25, 0.5]
)
random_seqs = np.array(["".join(row) for row in random_seqs])

# GC fraction per sequence. The alphabet has no G, so this is the C fraction;
# for 20-mers the only value strictly between 0.4 and 0.5 is 9/20.
GC_content = np.array(
    [(item.count("G") + item.count("C")) / len(item) for item in random_seqs]
)
GC_filtered = random_seqs[(GC_content < 0.5) & (0.4 < GC_content)]

# Drop sequences containing a run of four C's.
# NOTE(review): the variable name suggests this is meant to avoid G-runs on the
# complementary strand - confirm.
# dtype=bool keeps the mask usable with `~` even if GC_filtered is empty.
GGG_stretch = np.array(["CCCC" in item for item in GC_filtered], dtype=bool)
GGG_filtered = GC_filtered[~GGG_stretch]

In [None]:
# Positive probes first, so they keep indices 0 .. len(pos_seqs) - 1; the
# filtered random candidates follow.
filtered_list_and_orig_probes = [*pos_seqs, *GGG_filtered.tolist()]

In [None]:
# Preview only: the combined list holds every filtered random candidate, so
# displaying it in full floods the notebook output. The first 40 entries cover
# the positive probes plus the first few random candidates.
filtered_list_and_orig_probes[:40]

In [None]:
# Write each candidate as a FASTA record whose header is its list index; the
# BLAST-parsing cell later uses those integer ids directly as array indices.
with open("./seqs.fasta", "w") as outfile:
    outfile.writelines(
        f">{i}\n{seq}\n" for i, seq in enumerate(filtered_list_and_orig_probes)
    )

Run BLAST (all candidates against themselves) from the shell:
```
makeblastdb -in seqs.fasta -dbtype nucl

blastn -db ./seqs.fasta -query ./seqs.fasta -out ./blast_result.out -word_size 10 -soft_masking False -dust no -outfmt "6 qseqid sseqid nident"
```

In [None]:
# Build a (query x subject) matrix of identical-base counts from the tabular
# BLAST output (columns: qseqid, sseqid, nident). Sequence ids are the integer
# FASTA headers, i.e. indices into filtered_list_and_orig_probes.
num_seqs = len(filtered_list_and_orig_probes)
output_arr = np.zeros((num_seqs, num_seqs), dtype=int)
with open("./blast_result.out", "r") as infile:
    for line in infile:
        # Strip the line terminator explicitly: slicing off the last character
        # would eat the final digit of nident when the file does not end in a
        # newline.
        record = line.rstrip("\r\n")
        if not record.strip():
            continue  # tolerate blank lines
        query_idx, subject_idx, n_identical = (
            int(field) for field in record.split("\t")[:3]
        )
        # BLAST writes one row per HSP, so a query/subject pair can appear more
        # than once; keep the strongest match instead of whichever came last.
        output_arr[query_idx, subject_idx] = max(
            output_arr[query_idx, subject_idx], n_identical
        )

In [None]:
# Greedy selection of a mutually non-homologous probe set.
#
# Invariants maintained below:
#   * seq_pool / pool_members hold indices that are neither included nor excluded;
#   * columns of hsp_bool_arr belonging to EXCLUDED sequences are cleared, so any
#     remaining True entry that points outside the pool is a conflict with an
#     already INCLUDED sequence.

# A pair counts as cross-homologous when BLAST reported more than 12 identical bases.
hsp_bool_arr = output_arr > 12
inclusion_indices = []
exclusion_indices = []
seq_pool = list(range(num_seqs))
pool_members = set(seq_pool)  # mirrors seq_pool for O(1) membership tests
n_pos = len(pos_seqs)
selection_rng = np.random.default_rng(0)  # fixed seed -> reproducible selection

# Step 1: the positive probes (indices 0 .. n_pos - 1) are always included.
for i in range(n_pos):
    inclusion_indices.append(i)
    seq_pool.remove(i)
    pool_members.discard(i)
    new_exclusion_indices = np.where(hsp_bool_arr[i])[0]
    # Never "exclude" the probe itself or another positive probe: positive
    # probes are all kept, and their columns must stay set so that later
    # candidates homologous to them are still rejected.
    new_exclusion_indices = new_exclusion_indices[new_exclusion_indices >= n_pos]
    hsp_bool_arr[:, new_exclusion_indices] = False
    new_exclusion_indices = new_exclusion_indices.tolist()
    exclusion_indices += new_exclusion_indices
    # Take excluded candidates out of the pool, as step 2 does, so they cannot
    # be drawn later (previously this relied on the BLAST hits being symmetric).
    for exclusion_idx in new_exclusion_indices:
        if exclusion_idx in pool_members:
            seq_pool.remove(exclusion_idx)
            pool_members.discard(exclusion_idx)

# Step 2: up to 1000 random draws; a draw is skipped when the candidate is
# homologous to something already included.
for j in range(1000):
    if not seq_pool:
        break  # nothing left to draw from
    chosen_idx = int(selection_rng.choice(seq_pool))
    new_exclusion_indices = np.where(hsp_bool_arr[chosen_idx])[0]
    new_exclusion_indices = new_exclusion_indices[new_exclusion_indices != chosen_idx]
    if any(item not in pool_members for item in new_exclusion_indices.tolist()):
        continue

    seq_pool.remove(chosen_idx)
    pool_members.discard(chosen_idx)
    inclusion_indices.append(chosen_idx)
    hsp_bool_arr[:, new_exclusion_indices] = False
    new_exclusion_indices = new_exclusion_indices.tolist()
    exclusion_indices += new_exclusion_indices
    for exclusion_idx in new_exclusion_indices:
        seq_pool.remove(exclusion_idx)
        pool_members.discard(exclusion_idx)

In [None]:
# Display the selected indices (positions in filtered_list_and_orig_probes).
inclusion_indices

In [None]:
# Look up the selected sequences, preserving selection order.
chosen_seq_list = [filtered_list_and_orig_probes[idx] for idx in inclusion_indices]

In [None]:
# FASTA headers are positions in chosen_seq_list (not in the full candidate
# list), so BLAST query ids from this file index chosen_seq_list.
with open("./chosen_seqs.fasta", "w") as outfile:
    outfile.writelines(f">{i}\n{seq}\n" for i, seq in enumerate(chosen_seq_list))

Run BLAST (chosen sequences against the MG1655 genome) from the shell:
```
makeblastdb -in MG1655.fasta -dbtype nucl

blastn -db ./MG1655.fasta -query ./chosen_seqs.fasta -out ./blast_result_MG1655.out -word_size 13 -soft_masking False -dust no -outfmt "6 qseqid sseqid nident"
```

In [None]:
# Collect the query id (first tab-separated column) of every BLAST hit against
# MG1655; ids are positions in chosen_seq_list. A sequence with several hits
# appears several times.
num_seqs = len(chosen_seq_list)
with open("./blast_result_MG1655.out", "r") as infile:
    hit_query_ids = [line[:-1].split("\t")[0] for line in infile]
in_MG1655 = [int(query_id) for query_id in hit_query_ids]

Keeping only sequences with 12 or fewer bp of continuous homology to any site in MG1655 (i.e. sequences with no hit at `-word_size 13`)

In [None]:
# Indices of chosen sequences with no hit in MG1655, in ascending order.
not_in_MG1655 = sorted(set(range(num_seqs)).difference(in_MG1655))

In [None]:
# Display the indices of chosen sequences that produced no BLAST hit against MG1655.
not_in_MG1655

In [None]:
# not_in_MG1655 holds positions in chosen_seq_list (the BLAST query ids come
# from chosen_seqs.fasta), so the lookup must go through chosen_seq_list.
# Indexing the full candidate list here would return unrelated sequences that
# never passed the cross-homology or genome screens.
# The first 200 hit-free entries are skipped, as before.
neg_seqs = np.array(chosen_seq_list)[not_in_MG1655[200:]].tolist()
# Compare by value: `is` on string literals tests object identity, which only
# works by accident of interning and raises a SyntaxWarning on Python >= 3.8.
neg_seqs_AT_start = [item for item in neg_seqs if item[0] in ("A", "T")][:30]

In [None]:
# Show the first 30 candidate negative sequences.
neg_seqs[:30]

## Designing Barcode Assembly

### Loading Bit Sequences

Listing bit sequences previously determined, as derived using the above code.

In [5]:
# Positive bit sequences; entry i is paired with neg_seqs[i] when the barcode
# positions are built further down.
pos_seqs = [
    "ACACTACCACCATTTCCTAT",
    "AAACACACACTAAACCACCC",
    "ATCCTCCTTCAATACATCCC",
    "TATCTCATCAATCCCACACT",
    "ACTCCACTACTACTCACTCT",
    "AACTCATCTCAATCCTCCCA",
    "ACCACAACCCATTCCTTTCA",
    "TCTATCATCTCCAAACCACA",
    "ACCCTCTAACTTCCATCACA",
    "AATACTCTCCCACCTCAACT",
    "TTTCTACCACTAATCAACCC",
    "TCCAACTCATCTCTAATCTC",
    "TCCTATTCTCAACCTAACCT",
    "ATAAATCATTCCCACTACCC",
    "ACCCTTTACAAACACACCCT",
    "TTCCTAACAAATCACATCCC",
    "TATCCTTCAATCCCTCCACA",
    "ACCCAACACTCATAACATCC",
    "TTTACTCCCTACACCTCCAA",
    "ACTTTCCACATACTATCCCA",
    "ACATTACACCTCATTCTCCC",
    "TACTACAAACCCATAATCCC",
    "TTCTCCCTCTATCAACTCTA",
    "TTCTTCCCTCAATCTTCATC",
    "TCCTAACAACCAACTACTCC",
    "ACCTTTCTCCATACCCAACT",
    "ACCCTTACTACTACATCATC",
    "AATCTCACCTTCCACTTCAC",
    "TCTATCATTACCCTCCTCCT",
    "TCCTCATCTTACTCCCTCTA",
]

# Negative bit sequences, one per barcode position, in the same order as pos_seqs.
neg_seqs = [
    "TCACCTTTCTCCTTTCCTCT",
    "CCCTCTACTCTCCATCTTAT",
    "AACCTCCTCTCTCCATCATA",
    "TCACCATAATTCCTCCTCCT",
    "ACCAACTTCCACACATCACT",
    "CCCTCTTACTTATCTACCCA",
    "ACATCTTCTCTCCAACCTTC",
    "TATCATCCTCCTTCTCTCAC",
    "CTTCTTCTCTTACACCCTCT",
    "TCCCACCTTCACTTCACTAT",
    "CACCCTAACATACAACTCTC",
    "AAACTTCATCACTCTCCTCC",
    "TCAATCCACCATTCCTCAAC",
    "TAAAACCCATCCCACATCCT",
    "TTAAACAACCCATCCCACCA",
    "CATAACCCTACACACAACAC",
    "CTCTCTACACCCACCAATAA",
    "ATTCCATACCCACTCTCTTC",
    "CCCTTACCAACAACAATCCT",
    "TCAACTCATTACCCACAACC",
    "CATATCCAACCACAACCTCA",
    "CAACCACACTCAACTACCAT",
    "ACCTTCTACTCCCAACATTC",
    "CCTCTTCATCCTCTTTCAAC",
    "AACTCACAAACACCTCACCT",
    "CCCAAAACCACACACCAATT",
    "ATCCATATCCTTCTCACCCT",
    "CTCTTAACTACCCTCATTCC",
    "TTTCCTTCTTCCCACCAACT",
    "CAACCACCAACTTCAATCTC",
]

### Checking BsaI compatibility

Making sure that no junction between adjacent bits creates a BsaI recognition site

In [6]:
# Build every possible junction between consecutive bit positions: each of the
# two choices (negative / positive) at position i followed by each of the two
# choices at position i + 1, joined by a single "G".
# The number of junctions is derived from the list lengths instead of being
# hard-coded, so the check keeps covering all positions if the lists change.
# NOTE(review): the barcode assembled further down joins the reverse-complemented
# bits with "C"; confirm this forward-orientation, "G"-joined order corresponds
# to the junctions actually present in the final construct.
seq_pairs = []
n_junctions = min(len(pos_seqs), len(neg_seqs)) - 1
for i in range(n_junctions):
    for upstream in (neg_seqs[i], pos_seqs[i]):
        for downstream in (neg_seqs[i + 1], pos_seqs[i + 1]):
            seq_pairs.append(upstream + "G" + downstream)

Checking for BsaI sites

In [7]:
# Report any junction containing the BsaI recognition sequence on either strand.
# A site on the opposite strand reads as the REVERSE complement (GAGACC) on this
# strand; "CCAGAG" is only the complement and would never flag such a site.
for item in seq_pairs:
    if "GGTCTC" in item or "GAGACC" in item:
        print(item)

### Cycled Ligation Assembly Design

based on https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0107329

In [8]:
import string

# Base-complement lookup. Lowercase bases are mapped as well, so a lowercase
# input is complemented rather than silently returned merely reversed.
tab = str.maketrans("ACTGactg", "TGACtgac")

def reverse_complement_table(seq):
    """Return the reverse complement of a DNA sequence.

    Parameters
    ----------
    seq : str
        DNA sequence. A/C/G/T (either case) are complemented; any other
        character is left unchanged.

    Returns
    -------
    str
        The complemented sequence, reversed. Case is preserved.
    """
    return seq.translate(tab)[::-1]

In [53]:
## Fixed (constant) sequences of the barcode and the order of all positions
handle_1 = "ACAGTAACGTTAGCTAGCCT"
handle_2 = "ATCACATTGCCATCAGTAAT"
spacer = "TAGCAATTACATAACAGATA"

# One (positive, negative) pair per bit, both reverse-complemented.
zipped_rc_bits = [
    (reverse_complement_table(pos_bit), reverse_complement_table(neg_bit))
    for pos_bit, neg_bit in zip(pos_seqs, neg_seqs)
]

# Barcode layout: handle_1, first 15 bits, spacer, remaining bits, handle_2.
# Every position is a tuple of its allowed sequences (1-tuples for constants).
all_seqs = [
    (handle_1,),
    *zipped_rc_bits[:15],
    (spacer,),
    *zipped_rc_bits[15:],
    (handle_2,),
]

num_seqs = len(all_seqs)
# For each position, the option indices (0, 1, ...) matching its tuple of sequences.
all_seqs_binary_rep = [tuple(range(len(options))) for options in all_seqs]

In [115]:
fragment_groups = []
all_fragments = []

binary_reps = []  # not used below; kept so this cell defines the same names as before
all_binary_fragments = []

bit_step = 3  # number of consecutive barcode positions combined into one fragment

group_starts = range(0, num_seqs, bit_step)
last_iter = group_starts[-1]
## generate all fragment sequences in both sequence and binary representation
### Iterating through sequences in steps of `bit_step` positions
for i in group_starts:
    print(i)
    is_last_group = i == last_iter
    if is_last_group:
        print("last!")
    # Slicing past the end of a list is harmless, so the final group needs no
    # special-cased slice.
    working_seqs = all_seqs[i : i + bit_step]  ## sequences of interest
    working_bin = all_seqs_binary_rep[i : i + bit_step]  ## their option indices
    # Positions inside a fragment are joined by a single "C"; every fragment
    # except those of the final group also ends with that "C".
    trailing_spacer = "" if is_last_group else "C"
    working_frags = [
        "C".join(item) + trailing_spacer for item in itertools.product(*working_seqs)
    ]  ## all possible sequence combinations for this group
    working_bin_frags = list(
        itertools.product(*working_bin)
    )  ## the equivalent option-index combinations, in the same order
    print("strlen:" + str(len(working_frags[0])))
    fragment_groups.append(working_frags)  ## one list per group (for SOC generation)
    all_binary_fragments += working_bin_frags
    all_fragments += working_frags

### Generate fragment labels (the index of the group each fragment belongs to)
frag_nums = [[k] * len(item) for k, item in enumerate(fragment_groups)]
all_frag_nums = [group_idx for group in frag_nums for group_idx in group]

## make all SOCs, based on the fragment starts and ends
# The distinct last / first 20 characters of each group's fragments.
# dict.fromkeys de-duplicates while keeping first-seen order; a set would make
# the SOC order (and hence the exported row order) depend on Python's per-process
# string hash randomisation.
fragment_ends_list = []
fragment_starts_list = []
for fragment_group in fragment_groups:
    fragment_ends = tuple(dict.fromkeys(fragment[-20:] for fragment in fragment_group))
    fragment_starts = tuple(dict.fromkeys(fragment[:20] for fragment in fragment_group))
    fragment_ends_list.append(fragment_ends)
    fragment_starts_list.append(fragment_starts)

all_SOCs = []
all_SOC_fragments = []

# One SOC per (end of group i, start of group i + 1) combination: the end of the
# upstream fragment followed directly by the start of the downstream fragment.
for i in range(len(fragment_starts_list) - 1):
    possible_SOCs = [
        upstream_end + downstream_start
        for upstream_end, downstream_start in itertools.product(
            fragment_ends_list[i], fragment_starts_list[i + 1]
        )
    ]
    n_SOCs_in_group = len(possible_SOCs)
    SOC_fragments = [str(i) + "-" + str(i + 1)] * n_SOCs_in_group  # e.g. "0-1"

    all_SOCs += possible_SOCs
    all_SOC_fragments += SOC_fragments

## make all complementary fragments
comp_all_fragments = [reverse_complement_table(item) for item in all_fragments]
all_frag_seqs = all_fragments + comp_all_fragments

print("Number of fragments: " + str(len(all_fragments)))
print("Number of complementary fragments: " +  str(len(comp_all_fragments)))
print("Number of SOCs: " + str(len(all_SOCs)))

0
strlen:63
3
strlen:63
6
strlen:63
9
strlen:63
12
strlen:63
15
strlen:63
18
strlen:63
21
strlen:63
24
strlen:63
27
strlen:63
30
last!
strlen:62
Number of fragments: 76
Number of complementary fragments: 76
Number of SOCs: 40


#### Checking Barcode ScaI and BsaI Sites

- Should return nothing if design is successful
- In this instance, one ScaI site was tolerated, since ScaI is only used as a diagnostic to linearize the plasmid in rare cases

In [42]:
## ScaI and BsaI sites are searched for in all fragments and SOCs, on both strands
forbidden_rest_site_list = ["AGTACT","GGTCTC"]
for rest_site in forbidden_rest_site_list:
    # Searching for the site and its reverse complement covers both strands.
    site_variants = (rest_site, reverse_complement_table(rest_site))

    # Fragments carrying a handle are extended with the vector sequence that
    # flanks the barcode, so sites spanning the barcode/vector boundary are seen.
    flanked_fragments = []
    for fragment in all_fragments:
        candidate = fragment
        if handle_1 in candidate:
            candidate = "ATAGGG" + candidate  ## vector sequence upstream of the barcode
        if handle_2 in candidate:
            candidate = candidate + "AACCCC"  ## vector sequence downstream of the barcode
        flanked_fragments.append(candidate)

    # Fragments are reported first, then SOCs.
    for candidate in [*flanked_fragments, *all_SOCs]:
        if any(site in candidate for site in site_variants):
            print("Restriction Site " + rest_site + " detected in " + candidate)

Restriction Site AGTACT detected in GGATTATGGGTTTGTAGTACTAGAGTTGATAGAGGGAGAA


#### Export

In [121]:
# Rows are laid out as: sense fragments, then their complements (same order),
# then SOCs. SOC rows get -1 in the three bit columns.
binary_matrix = np.array(all_binary_fragments)
n_fragments = len(all_fragments)
n_SOCs = len(all_SOCs)

# The complement block repeats the sense block's option indices, hence `* 2`.
bit_1_list, bit_2_list, bit_3_list = (
    list(binary_matrix[:, col]) * 2 + [-1] * n_SOCs for col in range(3)
)
fragment_type = ["Sense"] * n_fragments + ["Complement"] * n_fragments + ["SOC"] * n_SOCs
sequences_out = all_fragments + comp_all_fragments + all_SOCs
# Fragment rows carry their group index; SOC rows carry an "i-j" junction label.
fragment_n = all_frag_nums + all_frag_nums + all_SOC_fragments

output_df = pd.DataFrame(
    {
        "Fragment": fragment_n,
        "Bit 1": bit_1_list,
        "Bit 2": bit_2_list,
        "Bit 3": bit_3_list,
        "Fragment Type": fragment_type,
        "Sequence": sequences_out,
    }
)

In [122]:
# NOTE(review): absolute, user-specific path - consider a configurable data directory.
output_csv_path = "/home/de64/group/de64/CRISPRi_Libraries/dev_notebooks/2024-11-23_Figure_Notebooks/Data/MARLIN_Design/FISH_barcode_cycled_ligation_assembly.csv"
output_df.to_csv(output_csv_path)