In [None]:
import itertools
import numpy as np

### Determining Bit Sequences

Taking all positive sequences from the original MERFISH papers.

In [None]:
# Positive bit sequences (30 entries, 20 nt each). They are placed at the front of
# the candidate pool built in a later cell, so indices 0..len(pos_seqs)-1 of that
# pool always refer to these sequences.
pos_seqs = [
    "ACACTACCACCATTTCCTAT",
    "AAACACACACTAAACCACCC",
    "ATCCTCCTTCAATACATCCC",
    "TATCTCATCAATCCCACACT",
    "ACTCCACTACTACTCACTCT",
    "AACTCATCTCAATCCTCCCA",
    "ACCACAACCCATTCCTTTCA",
    "TCTATCATCTCCAAACCACA",
    "ACCCTCTAACTTCCATCACA",
    "AATACTCTCCCACCTCAACT",
    "TTTCTACCACTAATCAACCC",
    "TCCAACTCATCTCTAATCTC",
    "TCCTATTCTCAACCTAACCT",
    "ATAAATCATTCCCACTACCC",
    "ACCCTTTACAAACACACCCT",
    "TTCCTAACAAATCACATCCC",
    "TATCCTTCAATCCCTCCACA",
    "ACCCAACACTCATAACATCC",
    "TTTACTCCCTACACCTCCAA",
    "ACTTTCCACATACTATCCCA",
    "ACATTACACCTCATTCTCCC",
    "TACTACAAACCCATAATCCC",
    "TTCTCCCTCTATCAACTCTA",
    "TTCTTCCCTCAATCTTCATC",
    "TCCTAACAACCAACTACTCC",
    "ACCTTTCTCCATACCCAACT",
    "ACCCTTACTACTACATCATC",
    "AATCTCACCTTCCACTTCAC",
    "TCTATCATTACCCTCCTCCT",
    "TCCTCATCTTACTCCCTCTA",
]

In [None]:
# Draw 100,000 random 20-mers over the alphabet {A, T, C}, with C twice as likely
# as A or T. The draw uses NumPy's global RNG and is not seeded, so every run
# produces a different candidate pool.
random_bases = np.random.choice(["A", "T", "C"], size=(100000, 20), p=[0.25, 0.25, 0.5])
random_seqs = np.array(["".join(base_row) for base_row in random_bases])

# Fraction of G/C bases per sequence (G never occurs in this alphabet, so this is
# effectively the C fraction).
GC_content = np.array(
    [(seq.count("G") + seq.count("C")) / len(seq) for seq in random_seqs]
)
# Keep sequences whose GC fraction lies strictly between 0.4 and 0.5.
in_gc_window = (0.4 < GC_content) & (GC_content < 0.5)
GC_filtered = random_seqs[in_gc_window]

# Drop sequences containing a run of four C's.
GGG_stretch = np.array(["CCCC" in seq for seq in GC_filtered])
GGG_filtered = GC_filtered[~GGG_stretch]

In [None]:
# Candidate pool: the original positive sequences first, followed by the filtered
# random sequences. Later cells rely on the positive sequences occupying the
# leading indices.
filtered_list_and_orig_probes = list(pos_seqs)
filtered_list_and_orig_probes.extend(GGG_filtered.tolist())

In [None]:
filtered_list_and_orig_probes

In [None]:
# Write the candidate pool as FASTA; each record ID is the sequence's index in
# filtered_list_and_orig_probes so BLAST hits can be mapped back by position.
with open("./seqs.fasta", "w") as outfile:
    outfile.writelines(
        ">" + str(record_id) + "\n" + sequence + "\n"
        for record_id, sequence in enumerate(filtered_list_and_orig_probes)
    )

using blast here....
```
makeblastdb -in seqs.fasta -dbtype nucl

blastn -db ./seqs.fasta -query ./seqs.fasta -out ./blast_result.out -word_size 10 -soft_masking False -dust no -outfmt "6 qseqid sseqid nident"
```

In [None]:
# Load the all-vs-all BLAST table (columns: qseqid, sseqid, nident) into a dense
# (query, subject) matrix holding the number of identical bases per hit.
num_seqs = len(filtered_list_and_orig_probes)
output_arr = np.zeros((num_seqs, num_seqs), dtype=int)
with open("./blast_result.out", "r") as infile:
    for line in infile:
        # Strip only the line terminator: slicing off the last character would
        # truncate nident when the final line has no trailing newline.
        record = line.rstrip("\r\n")
        if not record:
            continue  # tolerate blank lines
        query_idx, subject_idx, n_ident = (int(field) for field in record.split("\t")[:3])
        # BLAST can report several HSPs for the same query/subject pair; keep the
        # strongest one instead of whichever happens to be listed last.
        output_arr[query_idx, subject_idx] = max(
            output_arr[query_idx, subject_idx], n_ident
        )

In [None]:
# Greedy selection of a mutually non-cross-hitting subset of the candidate pool.
# hsp_bool_arr[q, s] is True when the BLAST hit between q and s has more than 12
# identical bases.
hsp_bool_arr = output_arr > 12
inclusion_indices = []
exclusion_indices = []
# Indices that are still available to be drawn.
seq_pool = [i for i in range(num_seqs)]
# Pass 1: the original positive sequences (the leading indices of the pool) are
# always included; every other sequence they hit is recorded as excluded.
for i in range(len(pos_seqs)):
    inclusion_indices.append(i)
    seq_pool.remove(i)
    new_exclusion_indices = np.where(hsp_bool_arr[i])[0]
    # Ignore the self-hit.
    new_exclusion_indices = new_exclusion_indices[new_exclusion_indices != i]
    # Clear the excluded sequences' columns so that hits against them no longer
    # count for any other row.
    hsp_bool_arr[:, new_exclusion_indices] = False
    new_exclusion_indices = new_exclusion_indices.tolist()
    # NOTE(review): indices excluded here are appended to exclusion_indices but are
    # not removed from seq_pool, so the sampling loop below can still draw them —
    # confirm they are reliably rejected there.
    exclusion_indices += new_exclusion_indices

# Pass 2: 1000 random draws from the remaining pool (NumPy global RNG, unseeded,
# so the chosen set differs between runs).
for j in range(1000):
    chosen_idx = np.random.choice(seq_pool)
    new_exclusion_indices = np.where(hsp_bool_arr[chosen_idx])[0]
    new_exclusion_indices = new_exclusion_indices[new_exclusion_indices != chosen_idx]
    # Despite its name, this flag is set whenever any hit partner has already left
    # the pool (whether it was included or excluded).
    in_inclusion_set = False
    for item in new_exclusion_indices:
        if item not in seq_pool:
            in_inclusion_set = True
    if in_inclusion_set:
        # Reject the candidate; it stays in the pool and may be drawn again.
        continue

    # Accept the candidate and retire all of its hit partners from the pool.
    seq_pool.remove(chosen_idx)
    inclusion_indices.append(chosen_idx)
    hsp_bool_arr[:, new_exclusion_indices] = False
    new_exclusion_indices = new_exclusion_indices.tolist()
    exclusion_indices += new_exclusion_indices
    for exclusion_idx in new_exclusion_indices:
        seq_pool.remove(exclusion_idx)

In [None]:
inclusion_indices

In [None]:
# Look up the selected sequences, preserving selection order (positive sequences
# first, then the randomly accepted candidates).
chosen_seq_list = [filtered_list_and_orig_probes[idx] for idx in inclusion_indices]

In [None]:
# Write the selected sequences as FASTA; record IDs are positions in
# chosen_seq_list (not positions in the original candidate pool).
with open("./chosen_seqs.fasta", "w") as outfile:
    fasta_text = "".join(
        ">" + str(record_id) + "\n" + sequence + "\n"
        for record_id, sequence in enumerate(chosen_seq_list)
    )
    outfile.write(fasta_text)

using blast here....
```
makeblastdb -in MG1655.fasta -dbtype nucl

blastn -db ./MG1655.fasta -query ./chosen_seqs.fasta -out ./blast_result_MG1655.out -word_size 13 -soft_masking False -dust no -outfmt "6 qseqid sseqid nident"
```

In [None]:
# Collect the query IDs (positions in chosen_seq_list) of every sequence that
# produced at least one BLAST hit against MG1655. An ID appears once per hit.
num_seqs = len(chosen_seq_list)
in_MG1655 = []
with open("./blast_result_MG1655.out", "r") as infile:
    for line in infile:
        record = line.rstrip("\r\n")
        if not record:
            continue  # tolerate blank lines instead of failing on int("")
        # Only the first column (qseqid) is needed here.
        in_MG1655.append(int(record.split("\t")[0]))

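Keeping only the sequences with no reported BLAST hit in MG1655, i.e. those without a stretch of continuous homology long enough to seed a hit (the search above was run with `-word_size 13`).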

In [None]:
# Sorted positions (in chosen_seq_list) of the sequences that had no MG1655 hit.
not_in_MG1655 = sorted(set(range(num_seqs)).difference(in_MG1655))

In [None]:
not_in_MG1655

In [None]:
# not_in_MG1655 holds positions in chosen_seq_list (the BLAST query IDs come from
# chosen_seqs.fasta), so the sequences must be looked up there, not in the full
# candidate pool.
# The first 200 hit-free entries are skipped, as in the original analysis.
neg_seqs = [chosen_seq_list[idx] for idx in not_in_MG1655[200:]]
# First 30 negative sequences that start with A or T. Uses an equality-based test:
# `is` compares object identity and is not a reliable string comparison.
neg_seqs_AT_start = [item for item in neg_seqs if item.startswith(("A", "T"))][:30]

In [None]:
# Display the first 30 negative sequences.
# NOTE(review): this shows `neg_seqs`, not the `neg_seqs_AT_start` list computed in
# the previous cell — confirm which list was meant to be carried forward.
neg_seqs[:30]

### Loading Bit Sequences

Listing bit sequences previously determined, as derived using the above code.

In [None]:
import itertools
import numpy as np
import string
import random

In [None]:
# Positive bit sequences, re-declared here so this section can be run without
# executing the sequence-determination section. Index i is bit i.
pos_seqs = [
    "ACACTACCACCATTTCCTAT",
    "AAACACACACTAAACCACCC",
    "ATCCTCCTTCAATACATCCC",
    "TATCTCATCAATCCCACACT",
    "ACTCCACTACTACTCACTCT",
    "AACTCATCTCAATCCTCCCA",
    "ACCACAACCCATTCCTTTCA",
    "TCTATCATCTCCAAACCACA",
    "ACCCTCTAACTTCCATCACA",
    "AATACTCTCCCACCTCAACT",
    "TTTCTACCACTAATCAACCC",
    "TCCAACTCATCTCTAATCTC",
    "TCCTATTCTCAACCTAACCT",
    "ATAAATCATTCCCACTACCC",
    "ACCCTTTACAAACACACCCT",
    "TTCCTAACAAATCACATCCC",
    "TATCCTTCAATCCCTCCACA",
    "ACCCAACACTCATAACATCC",
    "TTTACTCCCTACACCTCCAA",
    "ACTTTCCACATACTATCCCA",
    "ACATTACACCTCATTCTCCC",
    "TACTACAAACCCATAATCCC",
    "TTCTCCCTCTATCAACTCTA",
    "TTCTTCCCTCAATCTTCATC",
    "TCCTAACAACCAACTACTCC",
    "ACCTTTCTCCATACCCAACT",
    "ACCCTTACTACTACATCATC",
    "AATCTCACCTTCCACTTCAC",
    "TCTATCATTACCCTCCTCCT",
    "TCCTCATCTTACTCCCTCTA",
]

# Negative bit sequences, hard-coded rather than regenerated (the selection code is
# stochastic). Later cells pair them index-wise with pos_seqs, so entry i is the
# negative variant of bit i.
neg_seqs = [
    "TCACCTTTCTCCTTTCCTCT",
    "CCCTCTACTCTCCATCTTAT",
    "AACCTCCTCTCTCCATCATA",
    "TCACCATAATTCCTCCTCCT",
    "ACCAACTTCCACACATCACT",
    "CCCTCTTACTTATCTACCCA",
    "ACATCTTCTCTCCAACCTTC",
    "TATCATCCTCCTTCTCTCAC",
    "CTTCTTCTCTTACACCCTCT",
    "TCCCACCTTCACTTCACTAT",
    "CACCCTAACATACAACTCTC",
    "AAACTTCATCACTCTCCTCC",
    "TCAATCCACCATTCCTCAAC",
    "TAAAACCCATCCCACATCCT",
    "TTAAACAACCCATCCCACCA",
    "CATAACCCTACACACAACAC",
    "CTCTCTACACCCACCAATAA",
    "ATTCCATACCCACTCTCTTC",
    "CCCTTACCAACAACAATCCT",
    "TCAACTCATTACCCACAACC",
    "CATATCCAACCACAACCTCA",
    "CAACCACACTCAACTACCAT",
    "ACCTTCTACTCCCAACATTC",
    "CCTCTTCATCCTCTTTCAAC",
    "AACTCACAAACACCTCACCT",
    "CCCAAAACCACACACCAATT",
    "ATCCATATCCTTCTCACCCT",
    "CTCTTAACTACCCTCATTCC",
    "TTTCCTTCTTCCCACCAACT",
    "CAACCACCAACTTCAATCTC",
]

### Checking BsaI compatibility

Making sure that no junction between adjacent bits creates a BsaI recognition site.

In [None]:
# Build every possible junction between neighbouring bits: for each adjacent pair
# (i, i + 1), combine the negative/positive variant of bit i with the
# negative/positive variant of bit i + 1, joined by a single "G".
# NOTE(review): the construct assembled later joins the reverse complements of the
# bits with "C" — confirm that bit i + "G" + bit i+1 is the junction orientation
# intended for this check.
seq_pairs = []
# Derive the number of junctions from the data instead of hard-coding 29.
n_junctions = min(len(pos_seqs), len(neg_seqs)) - 1
for i in range(n_junctions):
    left_variants = (neg_seqs[i], pos_seqs[i])
    right_variants = (neg_seqs[i + 1], pos_seqs[i + 1])
    # product() yields neg-neg, neg-pos, pos-neg, pos-pos, in that order.
    for left, right in itertools.product(left_variants, right_variants):
        seq_pairs.append(left + "G" + right)

Checking BSAI

In [None]:
# Print every junction that contains a BsaI recognition site on either strand.
# BsaI recognises GGTCTC; a site on the opposite strand appears in this string as
# the reverse complement, GAGACC (not the plain complement CCAGAG).
for item in seq_pairs:
    if "GGTCTC" in item or "GAGACC" in item:
        print(item)

### Cycled Ligation Assembly Design

based on https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0107329

Structure of this assembly:
#### handle_1-Bits(0-8)-spacer_1-Bits(9-13)-spacer_2-Bits(14-18)-spacer_3-Bits(19-21)-spacer_4-Bits(22-26)-spacer_5-Constant Bits (27-29)-handle_2

Note: the 5-3-5 bit structure was chosen to prevent a restriction site from being formed.

In [None]:
# Translation table mapping each upper-case DNA base to its complement.
tab = str.maketrans("ACTG", "TGAC")


def reverse_complement_table(seq):
    """Return the reverse complement of an upper-case DNA string.

    Each of A, C, G and T is swapped for its complement and the result is
    reversed. Any other character is passed through unchanged.
    """
    complemented = seq.translate(tab)
    return "".join(reversed(complemented))

In [None]:
reverse_complement_table("ACACTTAAGCTATTAAAGAA")

In [None]:
### Handle sequences that flank the construct
handle_1 = "ACAGTAACGTTAGCTAGCCT"
handle_2 = "ATCACATTGCCATCAGTAAT"

### Spacer sequences that separate the groups of bits
spacer_1 = "TAGCAATTACATAACAGATA"
spacer_2 = "ACAAATAAGAATCTACGATT"
spacer_3 = "GTATAATTAAACAGTCACAA"
spacer_4 = "ATAAAGCTAATCAACGTAAA"
spacer_5 = "ACACTTAAGCTATTAATGAA"

### Pair up the positive and negative variant of every bit, reverse complementing
### each probe sequence to yield the corresponding mRNA (and thus DNA) sequence.
zipped_rc_bits = [
    (reverse_complement_table(pos_probe), reverse_complement_table(neg_probe))
    for pos_probe, neg_probe in zip(pos_seqs, neg_seqs)
]
### Bits 27-29 are constant: keep only their positive variant (1-tuples)
zipped_rc_bits[27:30] = [(bit[0],) for bit in zipped_rc_bits[27:30]]


### Define the structure of the construct:
### handle_1, bits 0-8, spacer_1, bits 9-13, spacer_2, bits 14-18, spacer_3,
### bits 19-21, spacer_4, bits 22-26, spacer_5, bits 27-29, handle_2
bit_blocks = [(0, 9), (9, 14), (14, 19), (19, 22), (22, 27), (27, 30)]
spacers = [spacer_1, spacer_2, spacer_3, spacer_4, spacer_5]
all_seqs = [(handle_1,)]
for block_idx, (start, stop) in enumerate(bit_blocks):
    if block_idx > 0:
        # every block after the first is preceded by its spacer
        all_seqs.append((spacers[block_idx - 1],))
    all_seqs.extend(zipped_rc_bits[start:stop])
all_seqs.append((handle_2,))
### Binary representation: each position lists the indices of its available variants
all_seqs_binary_rep = [tuple(range(len(variants))) for variants in all_seqs]
num_seqs = len(all_seqs)

In [None]:
all_seqs

In [None]:
num_seqs

In [None]:
print(all_seqs_binary_rep[:])

#### Assembling 

In [None]:
fragment_groups = []
all_fragments = []

binary_reps = []
all_binary_fragments = []

### Start index of the final group of sequences
last_iter = range(0, num_seqs, 3)[-1]
### Walk through the construct three sequences at a time
for i in range(0, num_seqs, 3):
    print(i)
    if i >= last_iter:
        ## final group: take whatever remains; no trailing "C" is appended
        print("last!")
        working_seqs = all_seqs[i:]
        working_bin = all_seqs_binary_rep[i:]
        trailing_spacer = ""
    else:
        ## regular group of three; each fragment also ends in a "C" spacer
        working_seqs = all_seqs[i : i + 3]
        working_bin = all_seqs_binary_rep[i : i + 3]
        trailing_spacer = "C"
    ## every combination of variants, joined with "C" spacers
    working_frags = [
        "C".join(combo) + trailing_spacer
        for combo in itertools.product(*working_seqs)
    ]
    ## the equivalent combinations in the binary representation
    working_bin_frags = list(itertools.product(*working_bin))
    print("strlen:" + str(len(working_frags[0])))
    ## keep the fragments grouped as well (needed for SOC generation)
    fragment_groups.append(working_frags)
    all_fragments.extend(working_frags)
    all_binary_fragments.extend(working_bin_frags)
### Fragment labels: the index of the group each fragment belongs to
frag_nums = [[k] * len(group) for k, group in enumerate(fragment_groups)]
all_frag_nums = list(itertools.chain.from_iterable(frag_nums))

In [None]:
print(all_frag_nums)

In [None]:
### Generate SOCs for assembly
### For every fragment group, collect the distinct 20-nt ends and 20-nt starts.
### They are sorted so the SOC order is reproducible: iterating a set of strings
### directly gives an order that can change between interpreter sessions.
fragment_ends_list = []
fragment_starts_list = []
for fragment_group in fragment_groups:
    fragment_ends = tuple(sorted({fragment[-20:] for fragment in fragment_group}))
    fragment_starts = tuple(sorted({fragment[:20] for fragment in fragment_group}))
    fragment_ends_list.append(fragment_ends)
    fragment_starts_list.append(fragment_starts)

all_SOCs = []

### Each SOC spans one junction: an end of group k followed by a start of group k + 1
for group_ends, next_group_starts in zip(fragment_ends_list, fragment_starts_list[1:]):
    possible_SOCs = [
        end + start
        for end, start in itertools.product(group_ends, next_group_starts)
    ]
    print(len(possible_SOCs))
    all_SOCs += possible_SOCs

In [None]:
all_SOCs

In [None]:
len(all_SOCs)

In [None]:
### Reverse complement of every fragment; these strands are ordered as well
comp_all_fragments = list(map(reverse_complement_table, all_fragments))

In [None]:
len(comp_all_fragments)

In [None]:
comp_all_fragments

In [None]:
# All oligos for the fragments: forward strands first, then their reverse complements.
all_frag_seqs = [*all_fragments, *comp_all_fragments]

In [None]:
len(all_frag_seqs)

In [None]:
all_frag_seqs

#### Checking BsaI Sites

In [None]:
all_frag_seqs

In [None]:
all_SOCs

In [None]:
# Print every fragment that contains the restriction site once the flanking
# "ATAGGG" prefix or "AACCCC" suffix is attached.
# NOTE(review): the section heading says BsaI, but this site (AGTACT) differs from
# the GGTCTC site checked elsewhere in the notebook — confirm the intended enzyme.
rest_site = "AGTACT"
rc_rest_site = reverse_complement_table(rest_site)
for item in all_frag_seqs:
    flanked_variants = ("ATAGGG" + item, item + "AACCCC")
    # Searching a strand for the site and for its reverse complement already covers
    # both strands, so the reverse-complemented variants need no separate scan.
    # Each offending fragment is reported once rather than once per matching test.
    if any(
        rest_site in variant or rc_rest_site in variant
        for variant in flanked_variants
    ):
        print(item)

In [None]:
# Print every SOC that contains a BsaI recognition site on either strand.
# BsaI recognises GGTCTC; a site on the opposite strand appears in this string as
# the reverse complement, GAGACC (not the plain complement CCAGAG).
for item in all_SOCs:
    if "GGTCTC" in item or "GAGACC" in item:
        print(item)

In [None]:
# Reverse complement of the GGTCTC site, i.e. how the site reads when it lies on
# the opposite strand (evaluates to "GAGACC").
reverse_complement_table("GGTCTC")