In [None]:
import Bio.Restriction as Restriction
import pygsheets
import requests
import toml
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to the paulssonlab package are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
import paulssonlab.api.geneious as geneious
import paulssonlab.cloning.design as design
import paulssonlab.cloning.enzyme as enzyme
import paulssonlab.cloning.primers as primers
import paulssonlab.cloning.registry as registry
import paulssonlab.cloning.sequence as sequence
import paulssonlab.cloning.thermodynamics as thermodynamics
import paulssonlab.cloning.viennarna as viennarna
import paulssonlab.cloning.workflow as workflow

# Setup

In [None]:
# Load notebook settings from config.toml (resolved relative to the working directory).
# Later cells read config["geneious"] and config["registry"]["folder"].
config = toml.load("config.toml")

In [None]:
# Authorize the pygsheets client with the service-account key file credentials.json
# (resolved relative to the working directory; keep that file out of version control).
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Connect to Geneious, passing every key of config["geneious"] as a keyword argument.
# NOTE(review): the name suggests the return value is a session factory — confirm in paulssonlab.api.geneious.
geneious_sessionmaker = geneious.connect(**config["geneious"])

In [None]:
# Registry handle used by the rest of the notebook. It receives the Sheets client,
# the registry folder from the config, and the Geneious session maker;
# geneious_folder is set to "registry".
reg = registry.Registry(
    gc,
    config["registry"]["folder"],
    geneious_sessionmaker=geneious_sessionmaker,
    geneious_folder="registry",
)

# Config

In [None]:
# Handles to the registry collections used below, looked up by (library, kind) key.
olib_oligos, plib_plasmids, plib_maps, flib_fragments = (
    reg[collection_key]
    for collection_key in (
        ("oLIB", "oligos"),
        ("pLIB", "plasmids"),
        ("pLIB", "maps"),
        ("fLIB", "fragments"),
    )
)
# Part-type table of the fLIB fragments; indexed by part-type name in the config cell.
part_types = reg[("fLIB", "fragments", "Part types")]

In [None]:
# --- degradation tags ---
tag_part_type = "ClpXP_tag"
# overhang pair for this part type: [0] is used as upstream, [1] as downstream overhang
tag_overhangs = workflow.overhangs_for(part_types[tag_part_type])
tag_enzyme = Restriction.BsaI
placeholder_enzyme = Restriction.BsaI
library_reverse_primer = "oLIB46"
# random bases to add between enzyme binding site and end of DNA
num_random_bases = 6

# Region lengths; *_aa values are in amino acids, the others in nucleotides.
nt_per_codon = 3
head_length = 6  # nt
# part of the SspB binding site that is mutagenized with NNK's
# (note we keep first two alanines fixed)
sspb_length_aa = 6  # aa
sspb_length = nt_per_codon * sspb_length_aa  # nt
tail_length_aa = 3  # aa
tail_length = nt_per_codon * tail_length_aa  # nt

# Citation stored in the "Reference" column of the registered fragments.
reference = "Andersen, J. B., Sternberg, C., Poulsen, L. K., Bjørn, S. P., Givskov, M., & Molin, S. (1998). New unstable variants of green fluorescent protein for studies of transient gene expression in bacteria. Applied and environmental microbiology, 64(6), 2240-2246."

# --- placeholders ---
# minimum melting temperatures passed to the primer search (binding site / homology arm)
tm_binding = 60
tm_homology = 55
# passed through as min_mfe to the primer search
min_mfe = None

# Deg tags

## Data

In [None]:
# FROM: Andersen, J. B., Sternberg, C., Poulsen, L. K., Bjørn, S. P., Givskov, M., & Molin, S. (1998). New unstable variants of green fluorescent protein for studies of transient gene expression in bacteria. Applied and environmental microbiology, 64(6), 2240-2246.
# The paper gives reverse-complement sequences, so they are stored in that
# orientation first and flipped afterwards.
tags_wt_rc = {
    tag_name: Seq(tag_seq_rc)
    for tag_name, tag_seq_rc in (
        ("LAA", "AGCTGCTAAAGCGTAGTTTTCGTCGTTTGCTGC"),
        ("AAV", "AACTGCTGCAGCGTAGTTTTCGTCGTTTGCTGC"),
        # ("LVA", "AGCTACTAAAGCGTAGTTTTCGTCGTTTGCTGC"),  # paper implies it behaves similarly to LAA
        ("ASV", "AACTGATGCAGCGTAGTTTTCGTCGTTTGCTGC"),
    )
}
# Same tags, reverse-complemented back from the orientation printed in the paper.
tags_wt = {
    tag_name: sequence.reverse_complement(tag_seq_rc)
    for tag_name, tag_seq_rc in tags_wt_rc.items()
}

In [None]:
# Sanity check: display the translation of the AAV tag.
# NOTE(review): assumes sequence.reverse_complement returns an object with a Biopython-style .translate() — confirm.
tags_wt["AAV"].translate()

## Tags

In [None]:
def _prefix_before(seq, overhang):
    """Return the part of ``seq`` that precedes the first occurrence of ``overhang``.

    Raises:
        ValueError: if ``overhang`` does not occur in ``seq``. With str/Seq-style
            ``find`` a miss returns -1, and ``seq[:-1]`` would otherwise silently
            produce a wrong prefix (everything but the last base).
    """
    index = seq.find(overhang)
    if index < 0:
        raise ValueError(f"overhang {overhang!r} not found in {seq!r}")
    return seq[:index]


# Normalize every wild-type tag, then take whatever precedes the upstream overhang.
tag_normalized = {name: workflow.normalize_seq(seq) for name, seq in tags_wt.items()}
tag_prefixes = {
    name: _prefix_before(seq, tag_overhangs[0]) for name, seq in tag_normalized.items()
}
tag_prefixes_list = list(tag_prefixes.values())
# All tags are expected to share one prefix; the first is used as the reference.
tag_prefix = tag_prefixes_list[0]
assert all(
    s == tag_prefix for s in tag_prefixes_list
), f"tags do not share a common prefix: {tag_prefixes}"

In [None]:
# Suffix appended to every tag part: the "taa" stop codon combined with the downstream overhang.
# NOTE(review): smoosh_sequences presumably merges overlapping ends rather than plainly concatenating — confirm.
tag_suffix = sequence.smoosh_sequences(workflow.normalize_seq("taa"), tag_overhangs[1])

In [None]:
# Region lengths measured on the part sequences, which have the shared prefix
# stripped from the front and the suffix appended at the end.
head_length_without_prefix = head_length - len(tag_prefix)  # nt
tail_length_with_suffix = tail_length + len(tag_suffix)  # nt

In [None]:
# Part sequence per tag: drop the shared prefix and append the suffix.
# NOTE(review): tag_prefix was measured on the normalized sequences (tag_normalized)
# but is applied to the raw tags_wt here; this is only equivalent if normalize_seq
# preserves length — confirm.
tag_parts = {name: seq[len(tag_prefix) :] + tag_suffix for name, seq in tags_wt.items()}

In [None]:
# Display the part sequences for visual inspection.
tag_parts

In [None]:
# Build the table of tag fragments to register: for every wild-type tag the
# fixed sequence plus a library with the SspB-binding region replaced by NNK
# codons, followed by two libraries derived from the first tag only.
tags = {}
for name, seq in tag_parts.items():
    # Each part must be exactly head + SspB-binding region + tail/suffix.
    # Checked before anything is added so a wrong length never yields a
    # library whose NNK window is shifted.
    assert (
        len(seq) - head_length_without_prefix - tail_length_with_suffix == sspb_length
    ), f"unexpected length for tag {name}: {len(seq)} nt"
    tags[f"degtag_{name}"] = {
        "Sequence": seq,
        "Description": f"Wild-type {name} ClpXP degradation tag from Andersen 1998.",
    }
    tags[f"degtag_{name}_NNK"] = {
        "Sequence": seq[:head_length_without_prefix]
        + "NNK" * sspb_length_aa
        + seq[-tail_length_with_suffix:],
        "Description": f"ClpXP degradation tag library with 2x alanines, {sspb_length_aa}x NNK's, and the {name} tail from Andersen 1998.",
    }

# The remaining libraries only reuse the head and the suffix (plus, for the
# tail library, the binding region) of the first tag.
seq0 = next(iter(tag_parts.values()))
# tail_length_with_suffix - tail_length == len(tag_suffix) by construction, so
# both libraries slice the suffix the same way.
suffix_length = len(tag_suffix)
# Binding region and tail are both randomized here, so the NNK count is their sum.
num_nnk_all = sspb_length_aa + tail_length_aa
tags["degtag_all_NNK"] = {
    "Sequence": seq0[:head_length_without_prefix]
    + "NNK" * num_nnk_all
    + seq0[-suffix_length:],
    # Previously reported only sspb_length_aa NNK's, which did not match the sequence.
    # NOTE(review): if a row with the old description is already in the registry,
    # check how upsert() treats the changed text.
    "Description": f"ClpXP degradation tag library with 2x alanines and {num_nnk_all}x NNK's.",
}
tags["degtag_tail_NNK"] = {
    "Sequence": seq0[: head_length_without_prefix + sspb_length]
    + "NNK" * tail_length_aa
    + seq0[-suffix_length:],
    "Description": f"ClpXP degradation tag library with 2x alanines, wildtype sspB binding site, and {tail_length_aa}x NNK's.",
}

In [None]:
# Display the assembled tag table for visual inspection.
tags

In [None]:
# Normalized reverse complement of the library reverse primer; it is appended
# to the oligos that contain degenerate bases.
library_primer_record = reg.get(library_reverse_primer)
library_primer_seq_rc = workflow.normalize_seq(
    sequence.reverse_complement(library_primer_record["Sequence"])
)

In [None]:
# Stage, for every tag, either one library oligo (sequence contains degenerate
# bases) or a sense/antisense oligo pair (fixed sequence), plus the matching
# fLIB fragment row. Nothing is committed in this cell.

# Fields shared by every new row.
base = {"Author": "Jacob Quinn Shenker", "Date": workflow.date()}

oligo_base = {
    **base,
    "Order date": workflow.date(),
    "Vendor": "IDT",
    "Type": "Primer",
}

fragment_base = {
    **base,
    "Reference": reference,
}

# Options forwarded to upsert() as apply=/overwrite=.
# NOTE(review): the meaning of apply={"Name": None} is defined by the registry module — confirm.
apply_oligo = {"Name": None}  # {"Sequence": workflow.normalize_seq}
overwrite_oligo = False
apply_fragment = {"Name": None}
overwrite_fragment = False

# NOTE: `name`, `row` and `usage` stay bound to the last tag after this loop;
# the unfinished placeholder-part cell at the end of the notebook reads `row` and `usage`.
for name, row in tags.items():
    # add BsaI flanks
    oligo_seq = (
        design.type2s_with_spacer(tag_enzyme, len(tag_overhangs[0]))
        + row["Sequence"]
        + sequence.reverse_complement(
            design.type2s_with_spacer(tag_enzyme, len(tag_overhangs[1]))
        )
    )
    oligo_seq = workflow.normalize_seq(oligo_seq)
    # prepend random bases; the sequence itself is passed as the seed
    oligo_seq = design.random_bases(num_random_bases, seed=oligo_seq) + oligo_seq
    if workflow.DEGENERATE_BASES_REGEX.search(workflow.normalize_seq(row["Sequence"])):
        # is library with degenerate bases, append library primer
        oligo_seq = oligo_seq + library_primer_seq_rc
        # **row is spread first so the explicit "Name"/"Sequence" below override it
        # NOTE(review): this branch upper-cases with .upper() while the other branch
        # uses workflow.normalize_seq_upper — confirm the two are meant to differ.
        oligo_row = {
            **oligo_base,
            **row,
            "Name": f"{name}_v2",
            "Sequence": oligo_seq.upper(),
        }
        oligo_id = olib_oligos.upsert(
            oligo_row, apply=apply_oligo, overwrite=overwrite_oligo
        )
        # usage string: library oligo, reverse primer in angle brackets, then the enzyme
        usage = f"{oligo_id}<{library_reverse_primer}>/{tag_enzyme}"
    else:
        # not library, just order two oligos to anneal
        # in this case, there's no reverse primer binding site,
        # so we need to add random bases to ensure efficient cutting
        oligo_seq = oligo_seq + design.random_bases(num_random_bases, seed=oligo_seq)
        oligo_id = olib_oligos.upsert(
            {
                **oligo_base,
                **row,
                "Name": f"{name}_sense",
                "Sequence": workflow.normalize_seq_upper(oligo_seq),
            },
            apply=apply_oligo,
            overwrite=overwrite_oligo,
        )
        # the antisense oligo is the reverse complement of the full sense oligo
        oligo_id2 = olib_oligos.upsert(
            {
                **oligo_base,
                **row,
                "Name": f"{name}_antisense",
                "Sequence": workflow.normalize_seq_upper(
                    sequence.reverse_complement(oligo_seq)
                ),
            },
            apply=apply_oligo,
            overwrite=overwrite_oligo,
        )
        # usage string: the two oligo ids joined by "=", then the enzyme
        usage = f"{oligo_id}={oligo_id2}/{tag_enzyme}"
    # fragment row stores the bare tag sequence (no flanks) and how to produce it
    fragment_row = {
        **fragment_base,
        **row,
        "Name": name,
        "Sequence": workflow.normalize_seq(row["Sequence"]),
        "Usage": usage,
        "Type": tag_part_type,
        "Upstream overhang": tag_overhangs[0],
        "Downstream overhang": tag_overhangs[1],
        "Species/codon usage": "E. coli",
    }
    flib_fragments.upsert(
        fragment_row, apply=apply_fragment, overwrite=overwrite_fragment
    )

In [None]:
# Display flib_fragments.local after the upserts above.
# NOTE(review): presumably the locally staged, not-yet-committed rows — confirm.
flib_fragments.local

In [None]:
# Display olib_oligos.local after the upserts above.
# NOTE(review): presumably the locally staged, not-yet-committed rows — confirm.
olib_oligos.local

In [None]:
# olib_oligos.commit()
# flib_fragments.commit()

## Placeholders

In [None]:
# Registry names of the parts that get a degradation-tag placeholder.
# The commented-out lines are alternative selections.
# part_names = ["sigW", "rsiW", "ECF20_992", "AS20_992", "sfGFP"]  # TODO: pick correct FPs
part_names = ["sigW", "rsiW"]
# part_names = ["rsiW"]

In [None]:
# Placeholder cassette: the shared tag prefix followed by the output of
# design.golden_gate_placeholder for placeholder_enzyme and the two tag overhangs.
placeholder_with_downstream_overhang = tag_prefix + design.golden_gate_placeholder(
    placeholder_enzyme, None, *tag_overhangs
)
# remove downstream overhang because it is already in the storage vector
downstream_overhang_length = len(tag_overhangs[1])
placeholder = placeholder_with_downstream_overhang[:-downstream_overhang_length]

In [None]:
%%time
# For every part: locate it in its source plasmid, design a primer pair that
# inserts the placeholder just before the part's stop codon, and simulate the
# resulting storage vector (PCR followed by Gibson assembly).
# Results are keyed by part name.
placeholder_primers = {}
storage_vector_seqs = {}
source_plasmid_names = {}
for part_name in part_names:
    part = reg.get(part_name)
    part_seq = part["_seq"]
    cds_start, cds_stop = workflow.find_coding_sequence(part_seq)
    cds_stop -= 3  # want to insert placeholder before stop codon
    plasmid_name = workflow.get_source_plasmid(reg, part["Usage"])
    source_plasmid_names[part_name] = plasmid_name
    plasmid_seq = reg.get(plasmid_name)["_seq"]
    # min_score=len(part_seq): NOTE(review) presumably demands a full-length match — confirm
    part_start, part_stop, _, _ = sequence.find_subsequence(
        plasmid_seq, part_seq, min_score=len(part_seq)
    )
    # NOTE(review): `start` is not used anywhere else in this cell
    start = part_start + cds_start
    # plasmid coordinate of the insertion point (just before the stop codon)
    stop = part_stop - (len(part_seq) - cds_stop)
    downstream_overhang_start = part_stop - len(part["Downstream overhang"])
    # we want 5' end of forward primer binding site to start with the downstream overhang
    # ("aggt" for CDS parts)
    # NOTE(review): reindex(i) presumably rotates the plasmid so position i becomes index 0 — confirm
    plasmid_seq_forward = workflow.normalize_seq(
        plasmid_seq.reindex(downstream_overhang_start)
    )
    plasmid_seq_reverse = workflow.normalize_seq(
        plasmid_seq.reindex(stop).reverse_complement()
    )
    # Each next(...) below takes the first candidate the iterator yields and
    # raises StopIteration if there is none.
    # homology arm: first candidate reaching tm_homology, without a GC clamp
    homology = next(
        primers.iter_primers(
            plasmid_seq_forward,
            min_tm=tm_homology,
            min_mfe=min_mfe,
            anchor="5prime",
            gc_clamp=False,
        )
    )
    # forward primer: same template, at least as long as the homology arm, reaching tm_binding
    forward_primer = next(
        primers.iter_primers(
            plasmid_seq_forward,
            min_length=len(homology),
            min_tm=tm_binding,
            min_mfe=min_mfe,
            anchor="5prime",
        )
    )
    # reverse primer carries the placeholder plus the homology arm as its overhang,
    # reverse-complemented because the primer is on the opposite strand
    overhang = sequence.reverse_complement(placeholder + homology.binding)
    reverse_primer = next(
        primers.iter_primers(
            plasmid_seq_reverse,
            overhang=overhang,
            min_tm=tm_binding,
            min_mfe=min_mfe,
            anchor="5prime",
        )
    )
    primer_pair = primers.PrimerPair(forward_primer, reverse_primer)
    placeholder_primers[part_name] = primer_pair
    # NOTE: the entries for "sigW" and "rsiW" are overwritten further down with
    # sequences built from already-ordered oligos.
    storage_vector_seqs[part_name] = sequence.pcr(
        plasmid_seq, str(primer_pair.primer1), str(primer_pair.primer2)
    ).assemble(method="gibson")

### Oligos

In [None]:
# Stage the forward/reverse placeholder primers of every part as oLIB oligos
# and remember the ids that upsert() returns for them.
placeholder_base = {"Author": "Jacob Quinn Shenker", "Date": workflow.date()}

# Oligo rows extend the shared fields with ordering information.
placeholder_oligo_base = dict(placeholder_base)
placeholder_oligo_base.update(
    {
        "Order date": workflow.date(),
        "Vendor": "IDT",
        "Type": "Primer",
    }
)

# Options forwarded to upsert().
apply_oligo = {"Name": None}  # {"Sequence": workflow.normalize_seq}
overwrite_oligo = False
# part name -> (forward primer id, reverse primer id)
placeholder_oligo_names = {}

for part_name in part_names:
    name = f"degtag_ph_{part_name}"
    description = f"Adds a deg tag placeholder to {source_plasmid_names[part_name]} ({part_name})."
    designed_pair = placeholder_primers[part_name]

    # forward primer
    forward_row = {
        **placeholder_oligo_base,
        "Name": f"{name}_f",
        "Sequence": workflow.normalize_seq_upper(designed_pair.primer1),
        "Description": description,
    }
    forward_primer_id = olib_oligos.upsert(
        forward_row, apply=apply_oligo, overwrite=overwrite_oligo
    )

    # reverse primer
    reverse_row = {
        **placeholder_oligo_base,
        "Name": f"{name}_r",
        "Sequence": workflow.normalize_seq_upper(designed_pair.primer2),
        "Description": description,
    }
    reverse_primer_id = olib_oligos.upsert(
        reverse_row, apply=apply_oligo, overwrite=overwrite_oligo
    )

    placeholder_oligo_names[part_name] = (forward_primer_id, reverse_primer_id)

In [None]:
# Display olib_oligos.local after staging the placeholder primers.
# NOTE(review): presumably the locally staged, not-yet-committed rows — confirm.
olib_oligos.local

### Storage vectors/parts

In [None]:
# use already-generated oligos
storage_vector_seqs["sigW"] = reg.eval_command("@Gib(pLIB214<oLT100,oLT101>)")["_seq"]
storage_vector_seqs["rsiW"] = reg.eval_command("@Gib(pLIB215<oLT100,oLT102>)")["_seq"]

In [None]:
# Preview workflow.re_digest_part on the sigW storage vector with BsaI. The result is
# only displayed here; the same call is repeated when the fragment is registered below.
workflow.re_digest_part(storage_vector_seqs["sigW"], Restriction.BsaI)

In [None]:
# Store the storage-vector sequences as pLIB maps under hard-coded plasmid ids.
# NOTE(review): ids are fixed by hand — confirm pLIB321/pLIB322 are the intended entries before re-running.
plib_maps["pLIB321"] = storage_vector_seqs["sigW"]
plib_maps["pLIB322"] = storage_vector_seqs["rsiW"]

In [None]:
# Display plib_maps.local before committing.
# NOTE(review): presumably the locally staged, not-yet-committed maps — confirm.
plib_maps.local

In [None]:
# Commit the pLIB map changes.
# NOTE(review): presumably writes to the shared registry — confirm before re-running the notebook.
plib_maps.commit()

In [None]:
def _upsert_tagged_part(fragment_name, storage_key):
    """Upsert the BsaI re-digested storage vector ``storage_key`` as fLIB fragment ``fragment_name``.

    Rows are matched on the "Name" column and overwritten if already present.
    Returns whatever ``flib_fragments.upsert`` returns.
    """
    digested_part = workflow.re_digest_part
    return flib_fragments.upsert(
        {
            "Name": fragment_name,
            "Date": workflow.date(),
            "Sequence": digested_part(
                storage_vector_seqs[storage_key], Restriction.BsaI
            ).seq_lower(),
        },
        key_columns=["Name"],
        overwrite=True,
    )


_upsert_tagged_part("sigW+ClpXP_tag", "sigW")
# last expression of the cell, so its return value is still what the cell displays
_upsert_tagged_part("rsiW+ClpXP_tag", "rsiW")

In [None]:
# Display flib_fragments.local before committing.
# NOTE(review): presumably the locally staged, not-yet-committed rows — confirm.
flib_fragments.local

In [None]:
# Commit the fLIB fragment changes.
# NOTE(review): the tag fragments staged by the earlier tag loop go through this same
# flib_fragments object, and their commit call was left commented out. If commit()
# pushes everything staged, this line commits them too — confirm that is intended.
flib_fragments.commit()

In [None]:
# NOTE(review): this cell is an unfinished stub and does not run as written:
#   * `lib_parts` is never defined in this notebook, so the assignment at the
#     bottom of the loop raises NameError;
#   * `row` and `usage` are not set here — they are leftovers from the tag
#     oligo loop (values of the last tag), not values for the current part;
#   * `name` is computed but never added to part_row.
placeholder_part_base = {
    **placeholder_base,
}

apply = {"Name": None}  # only referenced by the commented-out upsert below

for part_name in part_names:
    # The two markers below look like placeholders for code still to be written
    # (computing the part sequence and its usage string).
    #### PART SEQUENCE
    #### USAGE
    name = f"{part_name}_degtag_ph"
    part = reg.get(part_name)
    # usage = f"{oligo_id}={oligo_id2}/{part_enzyme}"
    part_row = {
        **placeholder_part_base,
        "Sequence": workflow.normalize_seq(row["Sequence"]),  # stale `row`, see note at top
        "Usage": usage,  # stale `usage`, see note at top
        "Type": part["Type"],
        "Upstream overhang": tag_overhangs[0],
        "Downstream overhang": tag_overhangs[1],
        "Species/codon usage": "E. coli",
    }
    lib_parts[name] = part_row
    # lib_parts.upsert(part_row, apply=apply)

In [None]:
# storage vector maps
# storage vector plasmid entry
# storage vector strain entry