In [None]:
import pandas as pd
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import seaborn as sns
import toml
import re
import urllib
from datetime import datetime
import string
import pygsheets
from tqdm.auto import tqdm
import Bio.Restriction as Restriction
from Bio.Seq import Seq
import benchlingapi

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
from paulssonlab.api.util import base_url
import paulssonlab.cloning.workflow as workflow
import paulssonlab.cloning.util as cloning_util
import paulssonlab.cloning.sequence as sequence
import paulssonlab.cloning.registry as registry
import paulssonlab.cloning.enzyme as enzyme

In [None]:
hv.extension("bokeh")

# Setup

In [None]:
config = toml.load("config.toml")

In [None]:
gc = pygsheets.authorize(service_account_file="credentials.json")

# Registry

In [None]:
# temp: get parts, plasmid seqs
# check that GG works
# gibson
# command: primer design for plasmid -> part storage vector (oligodest=oLIT)
# PCR with flanks
# primer design for fusion parts
# commands: GG, Gib (need to specify recipient strain!)
# get parts, plasmid seqs

In [None]:
reg = registry.Registry(gc, config["registry"]["folder"])

In [None]:
reg.registry

In [None]:
%%timeit
"J23101" in df2

In [None]:
reg.get("LIB93")

In [None]:
%%time
reg.duplicate_collection("LIB", "TESTB", clear=False)

# PCR simulation test

In [None]:
seq1 = "TTTT"
seq1a = "TTTTA"
seq1b = "TTTTC"
seq1c = "CCCCCCCCCCCTTTTC"
seq2 = "AAAATTTTAAAATTTTAAAA"
seq3 = "AGTGATTTTTTTCTCCATTCTTTGTGTGTTTTTTTTGTTTTATGAATTTTTTTAACTGATACCCGTTTTTTTGGAAGGAGACCCGTTTTTTTGGAAG"
seq4 = "TTTTTTAAAAAAAGGGGGGGGGGGGGGTTTTTTCCCCCCCCCCCCAAAAAAATTTTTTAAAAAAA"
seq4p = "GGGGGGGGGGGGGG"

In [None]:
def count_matching_chars(a, b):
    """Count the positions at which ``a`` and ``b`` hold equal elements.

    Only the overlapping prefix is compared: positions beyond the end of the
    shorter input are ignored.
    """
    return sum(1 for left, right in zip(a, b) if left == right)


def count_contiguous_matching_chars(a, b, right=False):
    """Length of the unbroken run of equal elements between ``a`` and ``b``.

    The comparison is index-by-index over the first ``min(len(a), len(b))``
    positions. With ``right=False`` the run is counted upward from index 0;
    with ``right=True`` it is counted downward from the last shared index.
    Counting stops at the first mismatch.
    """
    overlap = min(len(a), len(b))
    if right:
        pos, step = overlap - 1, -1
    else:
        pos, step = 0, 1
    run = 0
    while 0 <= pos < overlap and a[pos] == b[pos]:
        run += 1
        pos += step
    return run


# TODO: LINEAR!!
# TODO: add min_tm option?
def find_primer_binding_site(
    primer, template, linear=False, try_reverse_complement=True, min_score=8
):
    """Scan ``template`` for alignments where the 3' end of ``primer`` matches.

    The primer is slid across the template (space-padded on both sides so it
    may overhang either end). At each offset the number of contiguous matching
    bases, counted from the primer's last base backwards, is the score.

    Parameters:
        primer: primer sequence, written 5'->3'.
        template: template sequence to search.
        linear: accepted but currently not used by the search.
        try_reverse_complement: when true, the reverse complement of the
            template is searched as well (reported with sense -1).
        min_score: minimum score (inclusive) for an alignment to be reported.

    Returns:
        A list of ``(sense, loc, score)`` tuples. ``sense`` is 1 for the
        template as given and -1 for its reverse complement. ``loc`` is the
        index just past the base aligned with the primer's last base, measured
        on the strand that was searched.
    """
    primer_len = len(primer)
    pad = " " * (primer_len - 1)
    hits = []
    for sense in (1, -1) if try_reverse_complement else (1,):
        strand = template if sense == 1 else sequence.reverse_complement(template)
        padded = pad + strand + pad
        for loc in range(1, len(strand) + primer_len):
            # Window of primer length whose last position is strand[loc - 1]
            window = padded[loc - 1 : loc - 1 + primer_len]
            score = count_contiguous_matching_chars(primer, window, right=True)
            if score < min_score:
                continue
            hits.append((sense, loc, score))
    return hits


find_primer_binding_site(seq4p, seq4)

In [None]:
sequence.slice_seq?

In [None]:
# TODO: make work with SeqRecords
# TODO: make sure join_seqs sets topology correctly?
def simulate_pcr(primer1, primer2, template, linear=False, min_score=6):
    """Predict the product of a PCR with two primers on ``template``.

    Each primer must have exactly one binding site as reported by
    ``find_primer_binding_site`` (either strand, at least ``min_score``
    contiguous 3' matches). The two sites must lie on opposite strands.

    Parameters:
        primer1, primer2: primer sequences written 5'->3'; they may be given
            in either order (forward/reverse or reverse/forward).
        template: template sequence.
        linear: forwarded to ``find_primer_binding_site``.
        min_score: minimum 3' match length for a binding site.

    Returns:
        The top-strand product: forward primer, then the template region
        between the two binding sites, then the reverse complement of the
        reverse primer.

    Raises:
        ValueError: if a primer does not bind exactly once, or if the primers
            do not form a forward/reverse pair.
    """
    bound = []
    for primer in (primer1, primer2):
        sites = find_primer_binding_site(
            primer,
            template,
            linear=linear,
            try_reverse_complement=True,
            min_score=min_score,
        )
        if len(sites) != 1:
            raise ValueError(
                f"expecting a unique primer binding site, instead found {len(sites)}"
            )
        # Keep each primer paired with its site so reordering moves both together
        bound.append((primer, sites[0]))
    # Put the primer that binds the template as given (sense +1) first
    if bound[0][1][0] == -1:
        bound.reverse()
    (forward_primer, (sense1, loc1, _)), (reverse_primer, (sense2, loc2, _)) = bound
    if sense1 != -sense2:
        raise ValueError("expecting a forward/reverse primer pair")
    # loc1 is just past the forward primer's 3' end on the top strand; loc2 is
    # the same quantity on the reverse-complement strand, so it is mirrored
    # back into top-strand coordinates here.
    start = loc1
    stop = len(template) - loc2
    # NOTE(review): assumes slice_seq(template, start, stop) returns
    # template[start:stop] and join_seqs concatenates in order -- confirm.
    amplicon = sequence.slice_seq(template, start, stop)
    # The reverse primer anneals to the top strand, so it appears in the
    # top-strand product as its reverse complement.
    product = sequence.join_seqs(
        [forward_primer, amplicon, sequence.reverse_complement(reverse_primer)]
    )
    return product


# Toy PCR: each primer is a 5-nt 5' extension (TTTTT / CCCCC) followed by 6 nt
# that match the template. p1 ends in AGAAGA, the first 6 nt of seq5; p2 ends
# in AGAGGG, the first 6 nt of the reverse complement of seq5.
p1 = "TTTTTAGAAGA"
p2 = "CCCCCAGAGGG"
seq5 = "AGAAGACCCCGGGGCCCTCT"
simulate_pcr(p1, p2, seq5)

In [None]:
sequence.reverse_complement(seq5)

# DsSeqRecord test

In [None]:
# DsSeqRecord (store circularity)
# fix linear=/circular= handling
# Registry.get_entry ->
#    {"_type": "plasmid", "plasmid": {row...}, "plasmid_map": DsSeqRecord}
# Registry.get_seq -> DsSeqRecord
# re_digest -> DsSeqRecord
# ligate_seqs([seq1, seq2, seq3], method="goldengate")

In [None]:
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [None]:
b = Seq("aaaagg" + ("X" * 800) + "ggcccc")

In [None]:
hasattr(b, "complement")

In [None]:
a = sequence.DsSeqRecord(
    Seq("aaaagg" + ("X" * 800) + "ggcccc"), upstream_overhang=1, downstream_overhang=-2
)

In [None]:
a

In [None]:
print(a)

In [None]:
print(a.reverse_complement())

In [None]:
s = "GGTCTCAaatgAGCTGTCCTGAACAAATTGTGCAGCTTATGCATATGCATCTTGATGGAGATATCCTTCCAAAAGATGAACACGTATTAAATGAACATCTGGAGACATGCGAGAAATGCAGAAAGCATTTTTACGAGATGGAGAAATCCATAGCGCTCGTACGGAGCACATCGCATGTCGAAGCCCCCGCGGATTTTACCGCTAATGTCATGGCAAAATTGCCTAAGGAGAAGAAAAGAGCTTCTGTAAAAAGATGGTTCAGAACCCATTAAaggtTGAGACC"
s = "GGTCTCAaaaaGGTCTCAccccGGTCTCAggggGGTCTCAttttgggg"
# s = "GGTCTCAaaaaGGTCTAAccccGGTCTAAggggGGTCTAAttttgggg"
seq = Seq(s)
dsdna = sequence.DsSeqRecord(seq, circular=False)

In [None]:
enzyme.re_search(dsdna, Restriction.BsaI)

In [None]:
for dna in enzyme.re_digest(dsdna, Restriction.BsaI):
    print(repr(dna))

# Append test

In [None]:
# Load the example construct; the context manager closes the file handle
# as soon as its contents have been read.
with open(
    "/Users/jacob/Google Drive/Paulsson shared/Documents/FISH/example_construct.gb"
) as f:
    seq = workflow.read_sequence(f.read())

In [None]:
seq

In [None]:
seq = sequence.DsSeqRecord(seq)
seq

In [None]:
seq.features

# GG test

In [None]:
import dnacauldron as dc

In [None]:
parts = {
    # hhh
}

repository = dc.SequenceRepository(parts=parts)

In [None]:
# gibson.assemble -> hhh

In [None]:
seq1 = get_plib_seq(drive_service, 1)
seq2 = get_plib_seq(drive_service, 82)
seq3 = get_plib_seq(drive_service, 23)
seq4 = get_plib_seq(drive_service, 95)
seq5 = get_plib_seq(drive_service, 110)

In [None]:
to_join = [
    (sequence.reverse_complement(seq1), Restriction.BsaI, "Name1", "promoter"),
    (sequence.reverse_complement(seq2), Restriction.BsaI, "Name2", "RBS"),
    (seq3, Restriction.BsaI, "Name3", "CDS"),
    (seq4, Restriction.BsaI, "Name4", "terminator"),
    (sequence.reverse_complement(seq5), Restriction.BsaI, "Name5", "misc_feature"),
]

assembly = golden_gate.assemble(to_join, linear=False)
assembly

In [None]:
with open("/Users/jacob/Downloads/test3.gb", "w") as f:
    f.write(assembly.format("gb"))

# Gibson test

In [None]:
# gibson two test seqs
# gibson two GG'ed TUs
# check we can PCR UNSes onto JUMP vector
# check we can circularize a real 3G rxn

In [None]:
to_join = [
    (seq, storage_enzyme),
    (storage_vector_seq, storage_enzyme),
]
plasmid_map = golden_gate.assemble(to_join, linear=False)

# 3G

# Command parsing

In [None]:
import tatsu

In [None]:
# Directives prepended when the reference grammar is compiled on its own.
grammar_preamble = """@@grammar::CLONING
@@whitespace :: //"""

# Rules for sequence references (names, PCRs, restriction digests).
# Raw string: the regex escapes (\w) must reach the grammar compiler verbatim
# and are not valid Python string escapes.
reference_grammar = r"""reference
    =
    | pcr
    | restriction_digest
#    | assembly
    | name
    ;

name = name:/\w+/ ;

pcr = template:reference '~' ~ primer1:name ',' primer2:name ;

restriction_digest = input:reference '/' ~ enzyme:name ;

#assembly = assembly+:name {'-' ~ assembly+:name}+ ;
"""

# Full command grammar; the reference rules are interpolated at the end.
# Raw f-string: regex escapes (\s, \w, \d) stay literal, while doubled braces
# still produce the single literal braces the grammar needs.
grammar = rf"""start = command $ ;

argument
    =
    | quoted_string
    | command
    | float
    | int
    | lookup
    | reference
    ;

ws = /\s*/ ;

command_name = '@' ~ @:/\w+/ ;

command_arglist = '(' ~ ws @+:argument ws {{',' ws @+:argument ws }}* ')' ;

command = command_name:command_name arguments:command_arglist ;

quoted_string = '"' ~ quoted_string:/[^"]*/ '"' ;

float = float:/\d+\.\d+/ ;

int = int:/\d+/ ;

lookup = '$' ~ name ;

{reference_grammar}
"""

In [None]:
parser = tatsu.compile(grammar)
command = "@Gib(@GG(UNS1, J23101, BCD11, UNS5), pLIB47~oLIB22,oLIB24/BsaI)"
ast = parser.parse(command)
ast

In [None]:
# command = (
#     "@3G(UNS1-J23101-BCD11-mVenus-L3S3P11-UNS5, UNS5-J23150-CFP-BCD16-L3S2P55-UNS10)"
# )
# command = "@Gib(@GG(UNS1, J23101, BCD11, UNS5), pLIB47~oLIB22,oLIB24/BsaI)"
# command = "@Gib(@GG(UNS1, J23101, BCD11, UNS5), @PCR(pLIB47, oLIB22, oLIB24)/BsaI)"
# command = (
#     "@Gib(@GG(UNS1, J23101, BCD11, UNS5), @RE(@PCR(pLIB47, oLIB22, oLIB24), BsaI))"
# )

In [None]:
part_sheet.sync?

In [None]:
def get_named_sequence(name, part_sheet, plasmid_maps):
    """Stub for looking up a sequence by name; not implemented yet.

    Planned lookup order: plasmid, strain, part. Currently always returns
    None and ignores all of its arguments.
    """
    return None


def goldengate(*args):
    """Placeholder handler: return the received arguments tagged with "gg"."""
    tagged = ("gg", args)
    return tagged


def threeg(*args):
    """Placeholder handler: return the received arguments tagged with "3g"."""
    tagged = ("3g", args)
    return tagged


# Dispatch table from command name to its handler function.
commands = {"GG": goldengate, "3G": threeg}

# NOTE(review): get_named_sequence is defined with three required parameters
# (name, part_sheet, plasmid_maps); this one-argument call raises TypeError
# as written.
get_named_sequence("pLIB27")

In [None]:
reference_parser = tatsu.compile(grammar_preamble + reference_grammar)


class CloningCommandSemantics(object):
    """Semantic actions for parsed cloning commands.

    Each method receives the AST node produced for the grammar rule it is
    named after and returns the value that replaces that node.

    Parameters:
        commands: mapping from command name (without the leading "@") to the
            callable that implements it.
    """

    def __init__(self, commands):
        self.commands = commands

    def command(self, ast):
        """Dispatch a parsed command to its handler.

        The handler is called with the parsed argument list as a single
        positional argument.

        Raises:
            tatsu.semantics.SemanticError: if the command name is not
                registered on this instance.
        """
        if ast.command_name not in self.commands:
            # List the commands registered on this instance, not whatever
            # module-level ``commands`` happens to exist in the notebook.
            raise tatsu.semantics.SemanticError(
                "command must be one of: {}".format(
                    ", ".join([f"@{k}" for k in self.commands.keys()])
                )
            )
        command = self.commands[ast.command_name]
        # NOTE(review): handlers in this notebook take *args; confirm whether
        # the argument list should be unpacked here.
        return command(ast.arguments)

    def int_(self, ast):
        """Convert the text captured under the ``int`` key to an int."""
        return int(ast.int)

    def float_(self, ast):
        """Convert the text captured under the ``float`` key to a float."""
        return float(ast.float)

    def name(self, ast):
        """Return the text captured under the ``name`` key."""
        return ast.name

    def assembly(self, ast):
        """Return the value captured under the ``assembly`` key."""
        return ast.assembly


# parser.parse(command, semantics=CloningCommandSemantics())

In [None]:
# GG
command = "@GG(J23101, BCD11, mVenus, L3S3P11, p121/BsaI)"
# 3G
# command = "@3G(@GG(UNS1, J23101, BCD11, mVenus, L3S3P11, UNS5), @GG(UNS5, J23150, CFP, BCD16, L3S2P55, UNS10), JUMP_p15a_UNS1_UNS10)"

In [None]:
parser.parse(command, semantics=CloningCommandSemantics(commands))