In [None]:
import numpy as np
import pandas as pd
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import seaborn as sns
import toml
import re
from zipfile import ZipFile
import urllib
from datetime import datetime
import string
import pygsheets
import requests
from tqdm.auto import tqdm
import Bio.Restriction as Restriction
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import Bio.Entrez as Entrez
import benchlingapi

In [None]:
# Re-import edited modules automatically before each cell runs (autoreload mode 2),
# so changes to the project packages imported below are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
import paulssonlab.api.benchling as bapi
from paulssonlab.api.util import base_url
import paulssonlab.cloning.registry as registry
import paulssonlab.cloning.workflow as workflow
import paulssonlab.cloning.sequence as sequence
import paulssonlab.cloning.enzyme as enzyme
import paulssonlab.cloning.viennarna as viennarna
import paulssonlab.cloning.thermodynamics as thermodynamics
import paulssonlab.cloning.primers as primers
import paulssonlab.cloning.ncbi as ncbi

In [None]:
# Activate the Bokeh backend for HoloViews/hvplot output.
hv.extension("bokeh")

# Setup

In [None]:
# Load notebook settings from config.toml; the "benchling" and "registry" tables
# are read by the cells below.
config = toml.load("config.toml")

In [None]:
# Authorize the pygsheets client with the service-account key in credentials.json.
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Open a Benchling API session with the API key from the config, then resolve the
# configured project through bapi.get_project_root (presumably its root folder —
# confirm against paulssonlab.api.benchling).
bench_session = benchlingapi.Session(config["benchling"]["api_key"])
benchling_folder = bapi.get_project_root(bench_session, config["benchling"]["project"])

In [None]:
# Build the registry handle from the Sheets client, the configured registry folder,
# and the Benchling folder resolved above.
reg = registry.Registry(gc, config["registry"]["folder"], benchling_folder)

# Config

In [None]:
# Registry entries used throughout the notebook, looked up by key tuple:
# the pLIB plasmids and maps, the LIB parts, and the "Part types" entry under LIB parts.
plib_plasmids = reg[("pLIB", "plasmids")]
plib_maps = reg[("pLIB", "maps")]
lib_parts = reg[("LIB", "parts")]
part_types = reg[("LIB", "parts", "Part types")]

In [None]:
# NOTE(review): `gg_overhangs` is only assigned in the commented-out line below, yet
# the "Design primers" cells further down read it — those cells fail on a fresh kernel.
# gg_overhangs = workflow.overhangs_for(part_types["CDS_CD"])
# Overhangs for the "Deg_tag" part type, as returned by workflow.overhangs_for.
degtag_overhangs = workflow.overhangs_for(part_types["Deg_tag"])

In [None]:
# storage_flanks = (
#     lib_parts["JUMP_storage_vector_prefix"]["Sequence"],
#     lib_parts["JUMP_storage_vector_suffix"]["Sequence"],
# )

In [None]:
# ua_rbs = "tctagatttaagaaggagatatacat"
# cluzel_cterm = "atgtccagacctgcaggcatgcaagctctagaggcat"
# flanks = (ua_rbs + "atg", "taa" + cluzel_cterm)

# Test

In [None]:
# Names of the parts whose sequences are pulled from the registry in the next cell.
# NOTE(review): the trailing comment reads like an open TODO — confirm the FP choice.
part_names = ["sigW", "rsiW", "ECF20_992", "AS20_992", "sfGFP"]  # pick correct FPs

In [None]:
# Map each part name to the "_seq" field of its registry record.
part_seqs = {name: reg.get(name)["_seq"] for name in part_names}

In [None]:
# Spot-check the "_seq" field of a single registry record.
reg.get("ECF20_992")["_seq"]

# Deg tags

## Data

In [None]:
# Wild-type degradation tags, taken from:
# Andersen, J. B., Sternberg, C., Poulsen, L. K., Bjørn, S. P., Givskov, M., & Molin, S. (1998). New unstable variants of green fluorescent protein for studies of transient gene expression in bacteria. Applied and environmental microbiology, 64(6), 2240-2246.
# The paper gives reverse-complement sequences, so they are stored in that
# orientation first and flipped to the forward strand afterwards.
tags_wt_rc = {
    "LAA": Seq("AGCTGCTAAAGCGTAGTTTTCGTCGTTTGCTGC"),
    "AAV": Seq("AACTGCTGCAGCGTAGTTTTCGTCGTTTGCTGC"),
    # "LVA": "AGCTACTAAAGCGTAGTTTTCGTCGTTTGCTGC", # paper implies it behaves similarly to LAA
    "ASV": Seq("AACTGATGCAGCGTAGTTTTCGTCGTTTGCTGC"),
}
# Forward-strand version of every tag, keyed by the same tag name.
tags_wt = {
    tag_name: sequence.reverse_complement(rc_seq)
    for tag_name, rc_seq in tags_wt_rc.items()
}

In [None]:
# Translate the forward-strand AAV tag to inspect its amino-acid sequence.
tags_wt["AAV"].translate()

## Tags

In [None]:
# Layout of each NNK library tag built below: the first `head_length` nt and the last
# `tail_length` nt of the wild-type tag are kept, and `nnk_length` NNK codons are
# placed between them.
head_length = 6  # nt
tail_length = 9  # nt
nnk_length = 6  # aa's

In [None]:
# Build the tag table: for every wild-type tag, one entry with the unmodified
# sequence and one "_NNK" library entry whose middle codons are randomised.
tags = {}
for name, seq in tags_wt.items():
    # Validate before storing anything so a mis-sized tag never leaves a
    # half-populated pair of entries behind: the tag must be exactly
    # head + nnk_length codons + tail long.
    assert len(seq) - head_length - tail_length == nnk_length * 3, (
        f"{name}: tag is {len(seq)} nt, expected "
        f"{head_length} + {nnk_length * 3} + {tail_length} nt"
    )
    tags[f"degtag_{name}"] = {
        "Sequence": seq,
        "Description": f"Wild-type {name} ClpXP degradation tag from Andersen 1998.",
    }
    tags[f"degtag_{name}_NNK"] = {
        # Slice the tail from an explicit start index: `seq[-tail_length:]` would
        # return the WHOLE sequence when tail_length is 0.
        "Sequence": seq[:head_length]
        + "NNK" * nnk_length
        + seq[len(seq) - tail_length :],
        # The NNK count comes from `nnk_length` so the text cannot drift from the
        # sequence. NOTE(review): "2x alanines" is still hard-coded and is only
        # accurate for the current head_length — confirm before changing it.
        "Description": f"ClpXP degradation tag library with 2x alanines, {nnk_length}x NNK's, and the {name} tail from Andersen 1998.",
    }

In [None]:
# Display the assembled tag definitions.
tags

In [None]:
# Full citation stored in the "Reference" field of the oligo metadata below.
reference = "Andersen, J. B., Sternberg, C., Poulsen, L. K., Bjørn, S. P., Givskov, M., & Molin, S. (1998). New unstable variants of green fluorescent protein for studies of transient gene expression in bacteria. Applied and environmental microbiology, 64(6), 2240-2246."

# Metadata shared by the oligo entries.
# NOTE(review): `description` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. `oligo_base` is also never used by the loop
# below, which spreads an empty `base` dict instead — confirm which was intended.
oligo_base = {
    "Author": "Jacob Quinn Shenker",
    "Date": workflow.date(),
    "Order date": workflow.date(),
    "Vendor": "IDT",
    "Type": "Primer",
    "Description": description,
    "Reference": reference,
}

# NOTE(review): this loop looks unfinished. `product` (presumably itertools.product),
# `enzymes`, `olt` and `unses` are not defined or imported in the visible code, and
# `seq` / `name` are whatever values the tag-building loop above left behind.
# As written, every iteration stores an entry with an empty "Name" under a new
# olt.next_id() key and prints the loop variables.
for enzyme_name, flipped, upstream in product(enzymes, (False, True), (False, True)):
    base = {}
    olt[olt.next_id()] = {
        **base,
        "Name": "",
        "Sequence": seq,
        "Description": description,
    }
    print(name, enzyme_name, flipped, unses)

## Placeholders

In [None]:
# Ad-hoc check of sequence.find_aligned_substring on a toy string, searching for
# "xyz" with last=True.
sequence.find_aligned_substring("abcabcxyzabcxyzabc?", "xyz", last=True)

In [None]:
# Ad-hoc check of workflow.find_coding_sequence on a toy string that contains
# "atg", six placeholder characters, then "taa" followed by "tga".
workflow.find_coding_sequence("xxyxyxyyyxyxyyxxyxyatg******taatga??")

In [None]:
"xxyxyxyyyxyxyyxxyxyatg******taatga??"[19:34]

In [None]:
# find stop, extract CDS
# generate Tm>50 reverse primer abutting stop codon
# generate Tm>50 forward primer starting on stop codon
# extend forward primer to generate Tm>60-65 overhang
# add placeholder+overhang to reverse primer

In [None]:
# primers

In [None]:
# storage vectors

# Old

## Source plasmids

In [None]:
%%time
plasmids = {
    row["Names"]: plib_maps[id_]
    for id_, row in plib_plasmids.items()
    if "cluzel-fp" in row["Tags"]
}

In [None]:
# List the keys of the selected source plasmids.
plasmids.keys()

## Extract FP inserts

In [None]:
%%time
locations = {
    name: sequence.amplicon_location(
        seq, flanks[0], sequence.reverse_complement(flanks[1])
    )
    for name, seq in plasmids.items()
}

In [None]:
# Extract the located region from each plasmid by unpacking its entry in
# `locations` into seq.slice().
inserts = {name: seq.slice(*locations[name]) for name, seq in plasmids.items()}

## Check restriction sites

In [None]:
# For each listed restriction enzyme, print how many inserts give a non-empty
# enzyme.re_search result, followed by the names of those inserts.
for enzyme_name in ("BsaI", "BsmBI", "BbsI", "AarI"):
    names_with_cuts = [
        name for name, seq in inserts.items() if enzyme.re_search(seq, enzyme_name)
    ]
    print(f"{enzyme_name} ({len(names_with_cuts)}): {', '.join(names_with_cuts)}")

## Find FP common ends

In [None]:
# Insert names as a NumPy array.
# NOTE(review): `names` is not read again in the visible part of the notebook.
names = np.array(list(inserts.keys()))

In [None]:
# Number of leading positions of each insert that are compared when clustering.
max_end_length = 40

In [None]:
# (prefix, name) pairs: the first `max_end_length` items of each insert's
# seq_lower() result (presumably the lower-cased sequence — confirm), with its name.
seqs = [(v.seq_lower()[:max_end_length], k) for k, v in inserts.items()]

In [None]:
def cluster_by_prefix(seqs):
    """Recursively group sequences by their shared prefix.

    Parameters
    ----------
    seqs : list of (sequence, label) pairs
        ``sequence`` must support ``len`` and slicing (e.g. ``str``); ``label``
        is carried along untouched.

    Returns
    -------
    (node, idx)
        When every sequence is identical, ``node`` is ``seqs`` itself and
        ``idx`` is the index of the last position (``max_length - 1``).
        Otherwise ``idx`` is the first position at which the sequences disagree
        and ``node`` is a dict mapping each distinct ``sequence[: idx + 1]``
        to the result of clustering the pairs that share that prefix.

    Raises
    ------
    ValueError
        If ``seqs`` is empty.
    """
    if not seqs:
        raise ValueError("cluster_by_prefix() requires at least one sequence")
    max_length = max(len(s) for s, _ in seqs)
    first = seqs[0][0]
    for idx in range(max_length):
        # Compare one-element slices instead of indexing, so a sequence that has
        # already ended counts as a mismatch rather than raising IndexError.
        base0 = first[idx : idx + 1]
        if any(s[idx : idx + 1] != base0 for s, _ in seqs[1:]):
            break
    else:
        # The loop finished without a mismatch: all sequences are identical.
        # Using for/else keeps this distinct from "first mismatch at the last
        # position", which would otherwise be mistaken for a single leaf.
        return seqs, max_length - 1
    clusters = {}
    for item in seqs:
        clusters.setdefault(item[0][: idx + 1], []).append(item)
    # Each cluster is strictly smaller than `seqs`, so the recursion terminates.
    return {key: cluster_by_prefix(members) for key, members in clusters.items()}, idx


# Cluster the insert prefixes collected in `seqs`.
# NOTE(review): `c` is rebound to a DsSeqRecord in the sequence-matching test
# section further down, so this result is lost once that cell runs.
c = cluster_by_prefix(seqs)

## Design primers

In [None]:
import primer3plus

In [None]:
# Combine the overhangs with the storage-vector flanks, then ask primer3 for up to
# three primer designs (return_many=3) for the pEB1-SCFP3A insert.
# NOTE(review): `gg_overhangs` and `storage_flanks` are only assigned in commented-out
# cells near the top, so this cell raises NameError on a fresh kernel. `flanks` is
# also read earlier, by the cell that computes `locations`.
flanks = workflow.concatenate_flanks(gg_overhangs, storage_flanks)
primers.primer3_amplicon(inserts["pEB1-SCFP3A"], flanks, return_many=3)

In [None]:
# TODO: make find_primer_binding_site more general;
# allow specifying a score function so it can find amplicons with overhangs on both sides?

In [None]:
# USE CASES:
# 1) take desired product, template seq, find overhangs
# 2) take amplicon, optional overhangs

# TODO:
# tm/ta settings for Q5/phusion


# NOTE(review): the bare name `primer3_amplicon_primers` is not defined or imported in
# the visible notebook (the previous cell calls primers.primer3_amplicon instead), and
# `gg_overhangs` / `storage_flanks` exist only in commented-out cells — this call
# looks like a sketch of the planned API rather than runnable code.
primer3_amplicon_primers(
    inserts["pEB1-SCFP3A"], [gg_overhangs, storage_flanks], return_many=3
)

# Sequence matching test

In [None]:
# Ad-hoc check of sequence.longest_contiguous_matching on two toy strings.
# NOTE(review): `a` and `b` are rebound to DsSeqRecord objects a few cells below.
a = "zzaaaaaxxx"
b = "12aaayyxx"
sequence.longest_contiguous_matching(a, b)

In [None]:
import random


def randdna(n):
    """Return a random string of ``n`` characters drawn from "atcg" with replacement."""
    bases = random.choices("atcg", k=n)
    return "".join(bases)

In [None]:
# Fixtures for the matching checks below:
#   a – circular record: a fixed 16-nt motif with 30 random bases on each side
#   b – the motif on its own
#   c – the motif with its last base changed (g -> c)
# NOTE(review): no random seed is set, so the flanking bases differ on every run.
a = sequence.DsSeqRecord(
    Seq(randdna(30) + "aaaaggggttttgggg" + randdna(30)), circular=True
)
b = sequence.DsSeqRecord(Seq("aaaaggggttttgggg"))
c = sequence.DsSeqRecord(Seq("aaaaggggttttgggc"))

In [None]:
# Display the circular test record.
a

In [None]:
# Display the motif-only record.
b

In [None]:
# Ad-hoc check: enumerate matches between the circular record and the motif.
sequence.enumerate_matches(a, b)

In [None]:
# NOTE(review): in the string passed to Seq when `a` was built, the fixed motif
# occupies [30:46]; this slice starts one position earlier — confirm which
# indexing convention is being checked here.
a[29:45]

In [None]:
# Slice covering all 16 bases that `b` was built from.
b[0:16]

In [None]:
# Ad-hoc check: primer binding sites of the exact motif `b` on the circular record.
sequence.enumerate_primer_binding_sites(a, b)

In [None]:
# Same check with `c`, whose last base differs from the motif, and with
# require_3prime_clamp turned off.
sequence.enumerate_primer_binding_sites(a, c, require_3prime_clamp=False)

In [None]:
# Fixtures for the amplicon_tails check:
#   e – circular record: a fixed motif with one random base on each side
#   f – the motif on its own
#   g – the motif with "NNNN" prepended and "MMMM" appended
# NOTE(review): `f` is not used by the visible call below.
e = sequence.DsSeqRecord(
    Seq(randdna(1) + "atgcgcgggaaaatgcgcacaacattagcgacctagc" + randdna(1)),
    circular=True,
)
f = sequence.DsSeqRecord(Seq("atgcgcgggaaaatgcgcacaacattagcgacctagc"))
g = sequence.DsSeqRecord(Seq("NNNNatgcgcgggaaaatgcgcacaacattagcgacctagcMMMM"))

In [None]:
# Ad-hoc check of sequence.amplicon_tails with the circular template `e`, the
# tailed product `g`, and min_score=20.
sequence.amplicon_tails(e, g, min_score=20)