In [None]:
import random
import toml
import pygsheets
from tqdm.auto import tqdm
import Bio.Restriction as Restriction
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from itertools import count

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
from paulssonlab.api.util import base_url
import paulssonlab.cloning.registry as registry
import paulssonlab.cloning.workflow as workflow
import paulssonlab.cloning.sequence as sequence
import paulssonlab.cloning.enzyme as enzyme
import paulssonlab.cloning.design as design
import paulssonlab.cloning.primers as primers
import paulssonlab.cloning.io as cio
import paulssonlab.api.geneious as geneious

# Setup

In [None]:
# Seed Python's global `random` generator so anything drawing from it is repeatable
# across kernel restarts.
random.seed(75)

In [None]:
# Load notebook settings from config.toml (path is relative to the working directory).
# The "geneious" and "registry" tables are read by the cells below.
config = toml.load("config.toml")

In [None]:
# Authorize a pygsheets client from the service-account key file credentials.json.
# NOTE(review): that file holds secrets; make sure it is not committed with the notebook.
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Connect to Geneious, passing the "geneious" table of config.toml as keyword arguments.
# Presumably returns a session factory (going by the variable name) — TODO confirm.
geneious_sessionmaker = geneious.connect(**config["geneious"])

In [None]:
# Build the registry object from the Google Sheets client, the registry folder named in
# config.toml, and the Geneious session factory (stored under the "registry" Geneious folder).
reg = registry.Registry(
    gc,
    config["registry"]["folder"],
    geneious_sessionmaker=geneious_sessionmaker,
    geneious_folder="registry",
)

# Config

In [None]:
# Look up the registry entries used later in the notebook. Keys are tuples; what each
# element selects (library prefix, collection kind, optional sheet name) is inferred
# from the values — TODO confirm against registry.Registry.
olib_oligos = reg[("oLIB", "oligos")]
plib_plasmids = reg[("pLIB", "plasmids")]
plib_maps = reg[("pLIB", "maps")]
flib_fragments = reg[("fLIB", "fragments")]
part_types = reg[("fLIB", "fragments", "Part types")]

# Primers to make FP parts

In [None]:
# Overhangs associated with the "CDS_CD" part type (looked up in the part-types table).
gg_overhangs = workflow.overhangs_for(part_types["CDS_CD"])

In [None]:
# (prefix, suffix) pair read from the "Sequence" field of the two fLIB fragment records
# named JUMP_storage_vector_prefix / JUMP_storage_vector_suffix.
storage_flanks = (
    flib_fragments.find({"Name": "JUMP_storage_vector_prefix"})["Sequence"],
    flib_fragments.find({"Name": "JUMP_storage_vector_suffix"})["Sequence"],
)

In [None]:
# Sequences used to locate each FP insert in its source plasmid. Presumably the shared
# upstream RBS context and downstream C-terminal context of the plasmid series — TODO confirm.
ua_rbs = "tctagatttaagaaggagatatacat"
cluzel_cterm = "atgtccagacctgcaggcatgcaagctctagaggcat"
# flanks = (upstream context + "atg", "taa" + downstream context); the "Extract FP inserts"
# cell passes flanks[0] and the reverse complement of flanks[1] to amplicon_location.
flanks = (ua_rbs + "atg", "taa" + cluzel_cterm)

## Source plasmids

In [None]:
# %%time
# plasmids = {
#     row["Names"]: plib_maps[id_]
#     for id_, row in plib_plasmids.items()
#     if "cluzel-fp" in row["Tags"]
# }

In [None]:
%%time
# Fetch the item list for the Addgene article, then download the first "addgene_full"
# sequence of every item, keyed by plasmid name.
addgene_publication = api.addgene.get_addgene(
    "https://www.addgene.org/browse/article/28192043/"
)
plasmids = {}
for entry in tqdm(addgene_publication["items"]):
    full_sequence_url = entry["sequence_urls"]["addgene_full"][0]
    plasmids[entry["plasmid"]] = cio.read_http(full_sequence_url)

In [None]:
# Display the names of the plasmids that were downloaded.
plasmids.keys()

## Extract FP inserts

In [None]:
%%time
# Locate each FP insert in its source plasmid from the two flanking sequences.
# The reverse complement of the downstream flank does not depend on the plasmid, so it
# is computed once here rather than once per plasmid.
forward_flank = flanks[0]
reverse_flank = sequence.reverse_complement(flanks[1])
locations = {
    name: sequence.amplicon_location(seq, forward_flank, reverse_flank)
    for name, seq in plasmids.items()
}

In [None]:
# Cut each plasmid down to the region found by amplicon_location; the stored location
# is unpacked into positional arguments of `slice`.
inserts = {name: seq.slice(*locations[name]) for name, seq in plasmids.items()}

## Check restriction sites

In [None]:
# For each enzyme, list the inserts in which enzyme.re_search reports at least one hit.
for enzyme_name in ("BsaI", "BsmBI", "BbsI", "AarI"):
    names_with_cuts = [
        insert_name
        for insert_name, insert_seq in inserts.items()
        if enzyme.re_search(insert_seq, enzyme_name)
    ]
    print(f"{enzyme_name} ({len(names_with_cuts)}): {', '.join(names_with_cuts)}")

## Find FP common ends

In [None]:
# Number of bases taken from each end of every insert when comparing insert ends.
max_end_length = 40

In [None]:
# Lower-case each insert once (the original called seq_lower() twice per insert).
lowercase_inserts = {name: seq.seq_lower() for name, seq in inserts.items()}
# For every insert keep (first max_end_length bases, last max_end_length bases reversed).
# The tail is reversed (not reverse-complemented) so that index 0 is the final base and
# both ends can be compared position by position, outermost base first.
insert_ends = {
    name: (bases[:max_end_length], bases[-max_end_length:][::-1])
    for name, bases in lowercase_inserts.items()
}

In [None]:
from collections import ChainMap


def cluster_by_prefix(d, start_index=0, num_subseqs=None, max_length=None):
    """Recursively group named sequence tuples by the prefixes they share.

    Each value of ``d`` is an indexable collection of subsequences (e.g. a
    ``(head, reversed_tail)`` pair). Position ``i`` counts as shared only when
    *every* compared subsequence has the same character at ``i`` across all
    entries.

    Parameters
    ----------
    d : mapping
        Name -> tuple of subsequences.
    start_index : int
        First position to compare; positions before it are assumed to have
        been handled by the caller (this is how the recursion advances).
    num_subseqs : int, optional
        How many leading subsequences of each entry to compare. Defaults to
        the smallest number of subsequences any entry has.
    max_length : int, optional
        Number of positions to compare. Defaults to, and is always capped at,
        the length of the shortest compared subsequence, so entries of unequal
        length are clustered over their common window instead of raising
        ``IndexError``.

    Returns
    -------
    dict or collections.ChainMap
        A tree. Every node carries ``"_size"`` (number of entries below it).
        Other keys are tuples holding one prefix segment per subsequence; their
        values are either a tuple of names (entries identical over the compared
        window) or a nested node. Empty input yields ``{"_size": 0}``.
    """
    seqs = list(d.values())
    if not seqs:
        # Nothing to cluster; previously this raised ValueError from min().
        return {"_size": 0}
    if num_subseqs is None:
        num_subseqs = min(len(seq) for seq in seqs)
    # Never index past the shortest compared subsequence.
    shortest = min(
        (len(subseq) for seq in seqs for subseq in seq[:num_subseqs]), default=0
    )
    if max_length is None or max_length > shortest:
        max_length = shortest

    reference = seqs[0]
    others = seqs[1:]

    def column_is_shared(pos):
        # True when every entry agrees with the first one at `pos` in all subsequences.
        return all(
            seq[sub][pos] == reference[sub][pos]
            for sub in range(num_subseqs)
            for seq in others
        )

    # First position at or after start_index where the entries disagree;
    # max_length when they agree all the way to the end of the window.
    idx = next(
        (pos for pos in range(start_index, max_length) if not column_is_shared(pos)),
        max_length,
    )

    if idx > start_index or idx >= max_length:
        # A shared segment exists (or the window is exhausted): emit one node for it.
        common_key = tuple(
            reference[sub][start_index:idx] for sub in range(num_subseqs)
        )
        if idx >= max_length:
            values = tuple(d.keys())
        else:
            values = cluster_by_prefix(
                d,
                start_index=idx,
                num_subseqs=num_subseqs,
                max_length=max_length,
            )
        return {common_key: values, "_size": len(d)}

    # The entries diverge right at start_index: split them by the characters found
    # there, then recurse into each group from the same position.
    clusters = {}
    for name, seq in d.items():
        key = tuple(seq[sub][idx : idx + 1] for sub in range(num_subseqs))
        clusters.setdefault(key, {})[name] = seq
    # The leading map wins lookups, so "_size" reports this node's size rather
    # than the size of the first sub-cluster.
    return ChainMap(
        {"_size": len(d)},
        *[
            cluster_by_prefix(
                cluster,
                start_index=idx,
                num_subseqs=num_subseqs,
                max_length=max_length,
            )
            for cluster in clusters.values()
        ],
    )


# Cluster all inserts by their end sequences (both ends of each insert compared together).
c = cluster_by_prefix(insert_ends)

In [None]:
import textwrap


def print_clusters(clusters, length=0, indent_level=0, extra_indent=4, wrap_width=100):
    """Print a cluster tree (as built by ``cluster_by_prefix``) as an indented outline.

    Parameters
    ----------
    clusters : mapping
        Node whose keys are tuples of sequence segments (plus a ``"_size"``
        entry, which is skipped) and whose values are either a tuple of names
        (leaf) or a nested node that has its own ``"_size"``.
    length : int
        Prefix length accumulated by the ancestors of this node.
    indent_level : int
        Number of spaces before each heading at this depth; grows by 2 per level.
    extra_indent : int
        Additional spaces before the wrapped list of names under a leaf heading.
    wrap_width : int
        Maximum line width used when wrapping the list of names.

    Both ``extra_indent`` and ``wrap_width`` are forwarded to nested levels, so
    the whole tree is formatted with the caller's settings.
    """
    indent_str = " " * indent_level
    member_indent = " " * (indent_level + extra_indent)
    for key, cluster in clusters.items():
        if key == "_size":
            continue
        segment = "/".join(key)
        is_leaf = isinstance(cluster, tuple)
        num_seqs = len(cluster) if is_leaf else cluster["_size"]
        # Reported length is based on the first subsequence's segment.
        new_length = length + len(key[0])
        print(f"{indent_str}{segment} ({new_length}nt x {num_seqs}):")
        if is_leaf:
            print(
                textwrap.fill(
                    ", ".join(cluster),
                    width=wrap_width,
                    initial_indent=member_indent,
                    subsequent_indent=member_indent,
                    break_long_words=True,
                )
            )
            print()
        else:
            print_clusters(
                cluster,
                length=new_length,
                indent_level=indent_level + 2,
                extra_indent=extra_indent,
                wrap_width=wrap_width,
            )


# Print the cluster tree as an indented outline.
print_clusters(c)

## Design primers

In [None]:
import primer3plus

In [None]:
# Combine the part overhangs with the storage-vector flanks for primer design.
# Stored under its own name: rebinding `flanks` here would overwrite the
# (upstream, downstream) pair that the "Extract FP inserts" cell reads, so re-running
# that cell after this one would silently search with the wrong sequences.
primer_flanks = workflow.concatenate_flanks(gg_overhangs, storage_flanks)
primers.primer3_amplicon(inserts["pEB1-SCFP3A"], primer_flanks, return_many=3)

In [None]:
# TODO: make find_primer_binding_site more general,
# allow specifying score func so can find amplicons with overhangs on both sides?

In [None]:
# USE CASES:
# 1) take desired product, template seq, find overhangs
# 2) take amplicon, optional overhangs

# TODO:
# tm/ta settings for Q5/phusion


# NOTE(review): `primer3_amplicon_primers` is not defined or imported in any cell visible
# here, so unless it is defined elsewhere this call raises NameError on a fresh kernel.
# It looks like a sketch of a planned API (the earlier cell calls
# `primers.primer3_amplicon` with pre-concatenated flanks) — confirm the intended
# function and signature before relying on this cell.
primer3_amplicon_primers(
    inserts["pEB1-SCFP3A"], [gg_overhangs, storage_flanks], return_many=3
)