In [None]:
import toml
import re
import urllib
from datetime import datetime
import pygsheets
import benchlingapi
import requests_html

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
from paulssonlab.api.util import base_url
import paulssonlab.cloning.workflow as workflow
import paulssonlab.cloning.util as cloning_util

# Setup

In [None]:
# Local settings file; it must provide config["benchling"]["api_key"] (read below).
config = toml.load("config.toml")

In [None]:
# Benchling API session created from the key stored in config.toml.
session = benchlingapi.Session(config["benchling"]["api_key"])

In [None]:
# pygsheets client authorised with a service-account key file in the working directory.
# NOTE(review): the name `gc` would shadow the stdlib `gc` module if it were ever imported here.
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Look up the "LIB" strain collection. `col` is indexed by "strains", "plasmids",
# "parts" (below) and "plasmid_maps" (further down the notebook).
col = workflow.get_strain_collection_sheets(gc.drive.service, "LIB")
col

In [None]:
# One worksheet per spreadsheet; worksheet() is called without arguments, so this
# is whichever sheet pygsheets selects by default.
strain_sheet = gc.open_by_key(col["strains"]).worksheet()
plasmid_sheet = gc.open_by_key(col["plasmids"]).worksheet()
part_sheet = gc.open_by_key(col["parts"]).worksheet()

# 3G/JUMP/Densmore

In [None]:
# Addgene pages for whole kits. Rows are matched to a kit by looking for the
# kit's base URL inside the row's "Source*" field (see `part_rules` and the
# Densmore renaming loop).
threeg_kit = "https://www.addgene.org/1000000161/"
marionette_kit = "https://www.addgene.org/1000000137/"
densmore_kit = "https://www.addgene.org/1000000059/"

# Individual Addgene plasmid pages treated as the JUMP set, kept in sorted order.
jump_plasmids = sorted(
    [
        "https://www.addgene.org/126956/",
        "https://www.addgene.org/126959/",
        "https://www.addgene.org/126960/",
        "https://www.addgene.org/126961/",
        "https://www.addgene.org/126962/",
        "https://www.addgene.org/126963/",
        "https://www.addgene.org/126964/",
        "https://www.addgene.org/126965/",
        "https://www.addgene.org/126966/",
        "https://www.addgene.org/126967/",
        "https://www.addgene.org/126973/",
        "https://www.addgene.org/126974/",
        "https://www.addgene.org/126975/",
        "https://www.addgene.org/126976/",
        "https://www.addgene.org/126991/",
        "https://www.addgene.org/126996/",
        "https://www.addgene.org/127015/",
        "https://www.addgene.org/127047/",
        "https://www.addgene.org/127051/",
        "https://www.addgene.org/127025/",
        "https://www.addgene.org/127000/",
        "https://www.addgene.org/126983/",
    ]
)

# Plate wells of interest in the Densmore kit, one plate row per source line.
densmore_wells = [
    "A1", "A5", "A9",
    "B1", "B5", "B9",
    "C1", "C5", "C9",
    "D1", "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12",
    "E1", "E2", "E3", "E4", "E5", "E6", "E7",
]

# Densmore renaming

In [None]:
# Rewrite the "Aliases*" cell of every Densmore-kit row in both sheets, turning
# "<a> (<b>)<c>" into "<a><c>,<b><c>".
for sheet in (strain_sheet, plasmid_sheet):
    records = sheet.get_all_records()
    # sheet.unlink()
    # +1 turns the 0-based header position into the column number handed to
    # update_value (presumably 1-based sheet addressing).
    alias_column = list(records[0].keys()).index("Aliases*") + 1
    # Row numbers start at 2: the first record is assumed to sit directly
    # under a single header row.
    for row_number, record in enumerate(records, start=2):
        if base_url(densmore_kit) not in record["Source*"]:
            continue
        renamed_aliases = re.sub(
            r"([^()]*)\s\(([^()]*)\)([^()]*)", r"\1\3,\2\3", record["Aliases*"]
        )
        sheet.update_value((row_number, alias_column), renamed_aliases, parse=False)
    # sheet.link() # TODO: this gives 500 error, not sure why

# Ingest parts

In [None]:
# All plasmid records from the sheet, one dict per row; reused by the ingestion loop below.
rows = plasmid_sheet.get_all_records()

In [None]:
# Drive API service object reached through the worksheet's client; used to list
# and download plasmid map files.
service = plasmid_sheet.client.drive.service

In [None]:
def import_threeg_part(plasmid, seq_file):
    """Build a part record for a plasmid that comes from the 3G kit.

    The part starts out as ``plasmid_to_part(plasmid)``. Its name is then
    replaced either by a hand-picked override (keyed on the default name) or,
    failing that, by the token that precedes the first " (" in the plasmid's
    description.

    Args:
        plasmid: Row dict from the plasmid sheet; "Description" is read here
            in addition to whatever ``plasmid_to_part`` reads.
        seq_file: Identifier of the plasmid map file. Currently unused; it is
            accepted so that every import rule shares one signature.

    Returns:
        The part dict with "Name*" and "Author*" filled in.

    Raises:
        ValueError: If there is no override for the part and no name can be
            extracted from the description.
    """
    part = plasmid_to_part(plasmid)
    # Default names that get an explicit replacement.
    name_overrides = {
        "P18m": "pT7",
        "P33m": "pMutalik_med",
        "P34m": "pMutalik_weak",
        "C31m": "Bxb1",
        "C40m": "random_blank",
        "C71m": "CinR-CIDDHYRTC",
        "C95m": "T7_RNAP",
        "C114m": "Cas9_recoded",
        "UC16m": "gQi_gRNA_BD",
        "UC17m": "gV1_gRNA_BD",
        "UC20m": "gN2_gRNA_BD",
        "UCT1m": "gQi_gRNA_BE",
    }
    name = name_overrides.get(part["Name*"])
    if name is None:
        # Example description (the regex picks out "BCD24"):
        # MoClo golden gate assembly BC part for BCD24 (low expression bi-cistronic RBS, engineered for downstream context-independence; see https://doi.org/10.1038/nmeth.2404).
        match = re.search(
            r"(\S+)(?: (?:RBS|terminator|integrase|fusion|protease))? \(",
            plasmid["Description"],
        )
        if match is None:
            # Previously this surfaced as AttributeError on None.group(...).
            raise ValueError(
                f"cannot derive a part name for {part['Name*']!r} "
                f"from description {plasmid['Description']!r}"
            )
        name = match.group(1)
    part["Name*"] = name
    part["Author*"] = "Richard Murray lab"
    return part


def import_densmore_part(plasmid, seq_file):
    """Build a part record for a Densmore-kit plasmid.

    The record is ``plasmid_to_part(plasmid)`` with the author set to the
    Densmore lab. ``seq_file`` is accepted for signature parity with the other
    import rules and is not used.
    """
    densmore_part = plasmid_to_part(plasmid)
    densmore_part.update({"Author*": "Douglas Densmore lab"})
    return densmore_part


def import_jump_part(plasmid, seq_file):
    """Build a part record for a JUMP plasmid.

    The part name is the plasmid's full "Names" value with a leading
    "pJUMP<digits>-" prefix removed. ``seq_file`` is accepted for signature
    parity with the other import rules and is not used.
    """
    jump_part = plasmid_to_part(plasmid)
    stripped_name = re.sub(r"^pJUMP\d+-", "", plasmid["Names"])
    jump_part.update({"Name*": stripped_name})
    jump_part.update({"Author*": "Marcos Valenzuela-Ortega, Christopher French"})
    return jump_part


# TODO: accept extra columns via an `overrides` argument, e.g. overrides={"Tags": "foo"}
# Tags are passed through unchanged from the plasmid row (see `plasmid_to_part`).


def plasmid_to_part(plasmid):
    """Create the base part record shared by all import rules.

    Args:
        plasmid: Row dict providing "Names", "Tags" and "Description".

    Returns:
        A new dict whose name is the last comma-separated entry of "Names",
        whose tags and description are copied from the plasmid, and whose
        date is the current local time. The overhang and sequence fields hold
        the fixed strings "aaa", "bbb" and "aaaseqbbb" (these look like
        placeholders until real sequence handling is wired in).
    """
    # part["Plasmid/Oligos (Cutter)*"] = ""
    # part["Author*"] = ""
    return {
        "Name*": plasmid["Names"].split(",")[-1],
        "Tags": plasmid["Tags"],
        "Date*": datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"),
        "Upstream overhang*": "aaa",
        "Downstream overhang*": "bbb",
        "Sequence*": "aaaseqbbb",
        "Organism/codon usage*": "E. coli",
        "Description": plasmid["Description"],
    }


# Ordered (predicate, importer) pairs. The ingestion loop applies the importer
# of the first predicate that accepts a plasmid row.
part_rules = [
    # 3G kit rows, except those whose "Names" value starts with "V".
    (
        lambda row: base_url(threeg_kit) in row["Source*"] and row["Names"][0] != "V",
        import_threeg_part,
    ),
    # Any row sourced from the Densmore kit.
    (
        lambda row: base_url(densmore_kit) in row["Source*"],
        import_densmore_part,
    ),
    # JUMP plasmids, except rows described as "(Empty Backbone)".
    (
        lambda row: any(base_url(url) in row["Source*"] for url in jump_plasmids)
        and "(Empty Backbone)" not in row["Description"],
        import_jump_part,
    ),
]

In [None]:
# Listing of the Drive folder that holds the plasmid map files.
plasmid_folder = col["plasmid_maps"]
plasmid_maps = api.google.list_drive(service, root=plasmid_folder)

In [None]:
# Scratch: inspect the first plasmid row.
rows[0]

In [None]:
# Scratch: regex_key with an empty pattern.
api.util.regex_key(plasmid_maps, "")

In [None]:
# For every plasmid row, apply the first matching rule from `part_rules` and
# print the resulting part. Nothing is written back to a sheet here.
for row in rows:
    for predicate, rule in part_rules:
        if predicate(row):
            # Find the map file whose name contains "<ID>." and take its "id".
            # NOTE(review): row["ID*"] goes into the regex unescaped; consider re.escape.
            seq_file = api.util.regex_key(
                plasmid_maps, f'{row["ID*"]}\\.', check_duplicates=True
            )["id"]
            part = rule(row, seq_file)
            # print(row["Names"], part["Name*"])
            print(part)
            # Only the first matching rule is applied to a row.
            break

# Part cutting

In [None]:
from Bio import Restriction

In [None]:
# Download the map file whose name matches "pLIB1." from Drive, decode it as
# UTF-8 and parse it with api.read_sequence.
seq1 = api.read_sequence(
    service.files()
    .get_media(
        fileId=api.util.regex_key(plasmid_maps, r"pLIB1\.", check_duplicates=True)["id"]
    )
    .execute()
    .decode("utf8")
)

In [None]:
# NOTE(review): `seq` is not assigned by any earlier cell in this notebook (only
# `seq1` is), so this cell depends on leftover kernel state.
seq

In [None]:
from Bio import Seq

In [None]:
# `SeqRecord` was never imported in this notebook (only `Seq` was), so the
# original cell failed with NameError on a fresh kernel.
from Bio import Seq, SeqRecord

# Short lowercase test fragment. It is wrapped in a Seq because SeqRecord
# expects a Seq object rather than a bare str.
seqt = SeqRecord.SeqRecord(
    Seq.Seq(
        "atttctggaattcgcggccgcttctagagactagtgggtctcaggagtttacagctagctcagtcctaggtattatgctagctactagagacctactagtagcg"
    )
)

In [None]:
seqt

In [None]:
# NOTE(review): `search_re` is only defined much further down (in the "Old"
# section), so this cell fails on a fresh top-to-bottom run.
search_re(Restriction.BsaI, seqt, linear=False)

In [None]:
# Scratch: inspect how Biopython describes BsaI.
Restriction.BsaI.elucidate()

In [None]:
Restriction.BsaI.site

In [None]:
# NOTE(review): `enzyme` is not assigned by any earlier cell; leftover kernel state.
enzyme.site

In [None]:
def _search_re(enzyme, seq, linear=True):
    compsite = re.compile(
        enzyme.compsite.pattern, enzyme.compsite.flags | re.IGNORECASE
    )
    if not linear:
        seq = seq + seq[1 : enzyme.size]
    re_sites = [
        (i.start(), i.group(1) is not None) for i in re.finditer(compsite, str(seq.seq))
    ]
    return re_sites


def _re_digest_cuts(binding_locs, enzyme):
    cuts = []
    for loc, sense in binding_locs:
        for cut5, cut3 in ((enzyme.fst5, enzyme.fst3), (enzyme.scd5, enzyme.scd3)):
            if cut5 is None and cut3 is None:
                continue
            if sense:
                if cut5 is not None:
                    cut5_loc = loc + cut5
                else:
                    cut5_loc = None
                if cut3 is not None:
                    cut3_loc = loc + enzyme.size + cut3
                else:
                    cut3_loc = None
            else:
                if cut3 is not None:
                    cut5_loc = loc - cut3
                else:
                    cut5_loc = None
                if cut5 is not None:
                    cut3_loc = loc + enzyme.size - cut5
                else:
                    cut3_loc = None
            # is_5prime_overhang is true if cut5 is upstream of cut3
            if cut5 is not None and cut3 is not None:
                is_5prime_overhang = cut5_loc > cut3_loc
            else:
                is_5prime_overhang = None
            cuts.append((cut5_loc, cut3_loc, is_5prime_overhang))
    return cuts


def re_digest(seq, enzyme, linear=True):
    """Return the sorted cut positions of ``enzyme`` on ``seq``.

    Sites are found with ``_search_re`` and converted to cuts with
    ``_re_digest_cuts``. Both cut positions of every cut are then reduced
    modulo ``len(seq)`` (regardless of ``linear``) and the resulting
    ``(cut5, cut3, is_5prime_overhang)`` tuples are returned in sorted order.
    """
    sites = _search_re(enzyme, seq, linear=linear)
    raw_cuts = _re_digest_cuts(sites, enzyme)
    seq_len = len(seq)
    wrapped = [
        (cut5 % seq_len, cut3 % seq_len, flag) for (cut5, cut3, flag) in raw_cuts
    ]
    wrapped.sort()
    return wrapped

In [None]:
len(seq1)

In [None]:
seq1

In [None]:
# Cut positions of BsaI on the circular pLIB1 map.
re_digest(seq1, Restriction.BsaI, linear=False)

In [None]:
# Presumably the cut5/cut3 positions observed from the call above.
# 44/48, 83/87

In [None]:
def anneal_oligos():
    """Placeholder for annealing oligos into a part; not implemented (returns None)."""
    # align, find overhangs
    # add feature to seqrecord with name of part (?)
    pass
    # return (overhang1, SeqRecord, overhang2)

In [None]:
# NOTE(review): `seq` is still not assigned by any earlier cell at this point.
seq

In [None]:
# Scratch: -1 % len(seq) is len(seq) - 1, so this slice is the whole sequence.
seq[0 : (-1 % len(seq)) + 1]

In [None]:
# Python's % takes the sign of the divisor, so this evaluates to 199.
-1 % 200

In [None]:
# IPython help lookup; only useful interactively.
Seq.Seq?

In [None]:
from copy import deepcopy


def _get_overhang(seq, cut5, cut3, is_5prime_overhang):
    """Extract the overhang of one cut and pick the fragment boundary.

    Returns:
        ``((overhang_seq, is_5prime_overhang), loc)``, where ``overhang_seq``
        is ``cloning_util.slice_seq(seq, cut5, cut3)`` and ``loc`` is ``cut3``
        when ``is_5prime_overhang`` is truthy and ``cut5`` otherwise.
    """
    boundary = cut3 if is_5prime_overhang else cut5
    overhang = (cloning_util.slice_seq(seq, cut5, cut3), is_5prime_overhang)
    return (overhang, boundary)


def _digest_for_assembly(seq, cuts):
    cuts.append(cuts[0])
    seqs = []
    for cut1, cut2 in zip(cuts[:-1], cuts[1:]):
        if cut1[2] == True and cut2[2] == False:
            overhang1, loc1 = _get_overhang(seq.seq, *cut1)
            overhang2, loc2 = _get_overhang(seq.seq, *cut2)
            seq = cloning_util.slice_seq(seq, loc1, loc2)
            print("SEQ", len(seq), overhang1, overhang2)
            seqs.append((seq, overhang1, overhang2))
    seqs = sorted(seqs, key=lambda x: len(x[0]))
    return seqs


def digest_for_assembly(seq, enzyme, linear=False):
    """Digest ``seq`` with ``enzyme`` and return the fragments for assembly.

    The cut positions come from ``re_digest`` and are handed straight to
    ``_digest_for_assembly``, whose result is returned.
    """
    return _digest_for_assembly(seq, re_digest(seq, enzyme, linear=linear))


def join_seqs(seqs):
    """Placeholder for concatenating sequences; currently returns ``seqs`` unchanged.

    The comments below are the planned behaviour, none of which is implemented yet.
    """
    # every element of seqs could be a Seq or SeqRecord
    # join all annotations
    # join all letter_annotations (intersection of all)
    # assembly = Seq.SeqRecord("", alphabet)
    # assembly = deepcopy(seqs[0][0])
    return seqs


def _check_seq_compatibility(seq1, seq2):
    _, overhang1_1, overhang1_2 = seq1
    _, overhang2_1, overhang2_2 = seq2
    print("CHECK", overhang1_2, overhang2_1)
    return (overhang1_2[0], not overhang1_2[1]) == overhang2_1


def _reverse_complement_overhangs(seq_with_overhangs):
    seq, overhang1, overhang2 = seq_with_overhangs
    overhang1_rc = (overhang1[0].reverse_complement(), not overhang1[1])
    overhang2_rc = (overhang2[0].reverse_complement(), not overhang2[1])
    return (seq.reverse_complement(), overhang2_rc, overhang1_rc)


def _5prime_overhang(overhang):
    if not overhang[1]:
        return overhang[0].reverse_complement()
    else:
        return overhang[0]


def assemble_sequences(seqs, linear=True):
    """Orient digested fragments so their sticky ends match, then chain them.

    Fragments are kept in the order given. The first two may each be used as
    given or reverse-complemented; every later fragment may be
    reverse-complemented to match its already-oriented predecessor.

    Args:
        seqs: Sequence of ``(sequence, overhang1, overhang2)`` triples as
            produced by ``digest_for_assembly``; each overhang is an
            ``(overhang_seq, flag)`` pair. The input is not modified.
        linear: If true (default) the product is linear and ends with the
            trailing overhang of the last fragment. If false, the last
            fragment must additionally be compatible with the first one and
            no trailing overhang is emitted, because the closing junction is
            already represented by the first fragment's leading overhang.

    Returns:
        The result of ``join_seqs`` on the flat list
        ``[overhang, fragment, overhang, fragment, ...]``.

    Raises:
        ValueError: If fewer than two fragments are given or two neighbouring
            fragments have no compatible orientation.
    """
    # Validate before touching any element; the original read
    # ``seqs[0][0].seq.alphabet`` first (unused, and an IndexError on empty input).
    if len(seqs) < 2:
        raise ValueError("need at least two sequences to assemble")
    # Work on a copy so that re-orienting fragments never alters the caller's list.
    parts = list(seqs)

    first, second = parts[0], parts[1]
    first_rc = _reverse_complement_overhangs(first)
    second_rc = _reverse_complement_overhangs(second)
    # Try all four orientation combinations for the first junction. The last
    # combination used to compare ``second_rc`` with itself by mistake.
    for candidate1, candidate2 in (
        (first, second),
        (first, second_rc),
        (first_rc, second),
        (first_rc, second_rc),
    ):
        if _check_seq_compatibility(candidate1, candidate2):
            parts[0], parts[1] = candidate1, candidate2
            break
    else:
        raise ValueError("overhang mismatch when assembling sequences 0 and 1")

    # Orient each remaining fragment against its predecessor. Indexing into
    # ``parts`` (rather than zipping two slices, which are copies) makes a
    # flipped fragment visible both in the output and at the next junction.
    for idx in range(2, len(parts)):
        previous, current = parts[idx - 1], parts[idx]
        if _check_seq_compatibility(previous, current):
            continue
        current_rc = _reverse_complement_overhangs(current)
        if _check_seq_compatibility(previous, current_rc):
            parts[idx] = current_rc
        else:
            raise ValueError(
                f"overhang mismatch when assembling sequences {idx - 1} and {idx}: {previous[2]} does not match {current[1]} or {current_rc[1]}"
            )

    if not linear:
        # Both ends are already oriented, so the closing junction either fits
        # or it does not; flipping the first fragment again would break the
        # junction with the second one.
        if not _check_seq_compatibility(parts[-1], parts[0]):
            raise ValueError(
                f"overhang mismatch when assembling sequences {len(parts) - 1} and 0: {parts[-1][2]} does not match {parts[0][1]}"
            )

    seqs_to_join = []
    for part in parts:
        seqs_to_join.append(_5prime_overhang(part[1]))
        seqs_to_join.append(part[0])
    if linear:
        seqs_to_join.append(_5prime_overhang(parts[-1][2]))
    # copy SeqRecords, add annotations for each part?? (including overhangs)
    joined_seq = join_seqs(seqs_to_join)
    # circularize?
    return joined_seq


# Fragments to assemble, each paired with the enzyme used to cut it out.
# NOTE(review): `seq2`, `seq3` and `seq4` are not assigned anywhere in the cells
# visible here; this cell only runs with leftover kernel state.
to_join = [
    (seq1.reverse_complement(), Restriction.BsaI),
    (seq2, Restriction.BsaI),
    (seq3, Restriction.BsaI),
    (seq4, Restriction.BsaI),
    # (seq5, Restriction.BsaI),
]
# check sticky end sequence/orientation compatibility = ("AATG", "")
# enumerate sequences with RE binding site outside (sense=True then sense=False?)
seqs_to_assemble = []
# NOTE: this loop rebinds the notebook-level names `seq` and `enzyme`.
for seq, enzyme in to_join:
    seqs = digest_for_assembly(seq, enzyme, linear=False)
    # TODO: ensure we choose the sequence with inward-pointing restriction sites (kwarg!)
    # seqs is sorted by length, so this takes the shortest fragment; it raises
    # IndexError when the digest produced no fragment.
    seqs_to_assemble.append(seqs[0])
assemble_sequences(seqs_to_assemble)

# print(digest_for_assembly(seq1, Restriction.BsaI, linear=False))
# 35/AGTA/CTCC
# digest_for_assembly(seq1.reverse_complement(), Restriction.BsaI, linear=False)

In [None]:
# Scratch cells: manual inspection of seq1, its digest and Biopython enzyme attributes.
seq1[:100]

In [None]:
seq1.reverse_complement()

In [None]:
re_digest(seq1, Restriction.BsaI, linear=False)

In [None]:
# re_digest(seq1.reverse_complement(), Restriction.BsaI, linear=False)

digest_for_assembly(seq1.reverse_complement(), Restriction.BsaI, linear=False)

In [None]:
cloning_util.slice_seq(seq1.reverse_complement(), 2139, 0)

In [None]:
# NOTE(review): `_` is the previous cell's output; this breaks when cells are re-run out of order.
len(_)

In [None]:
# Rotate seq1 by 10 positions: everything from index 10 onwards, then the first 10.
seqc = cloning_util.slice_seq(seq1, 10, None) + cloning_util.slice_seq(seq1, 0, 10)

In [None]:
seqc

In [None]:
# Lengths before and after the rotation, shown side by side.
(len(seq1), len(seqc))

In [None]:
# NOTE(review): unfinished call, no slice bounds are passed.
cloning_util.slice_seq(
    seq,
)

In [None]:
len(seq) - 6

In [None]:
Restriction.BsaI.overhang()

In [None]:
Restriction.BsaI.ovhg

In [None]:
Restriction.BbsI.charac

In [None]:
Restriction.AarI.charac

In [None]:
Restriction.AarI.elucidate()

In [None]:
Restriction.BsmBI.charac

In [None]:
Restriction.SapI.charac

In [None]:
Restriction.SapI.elucidate()

In [None]:
Restriction.BsaI.charac

In [None]:
# IPython source lookup; only useful interactively.
Restriction.BsaI.characteristic??

In [None]:
Restriction.BsaI.compsite

# Test

In [None]:
(
    cloning_util.slice_seq(seq, 0, 55, 0, 100)
    + cloning_util.slice_seq(seq, 55, 70, 0, 100)
).features

In [None]:
cloning_util.slice_seq(seq, 5, 10, 0, 15).features

# Old

In [None]:
# Old scratch cells: IPython source lookups on the record API, then an
# experiment with pydna's Dseqrecord.
seq.letter_annotations??

In [None]:
seq.__add__??

In [None]:
seq.__getitem__??

In [None]:
from pydna.dseqrecord import Dseqrecord

In [None]:
# Circular double-stranded view of `seq`.
dseq = Dseqrecord.from_SeqRecord(seq, circular=True)

In [None]:
dseq[:100].cut(Restriction.BsaI)[0].features

In [None]:
print(dseq[:100].cut(Restriction.BsaI)[0])

In [None]:
# Fragments of the full circular record cut with BsaI; `d` is indexed in later cells.
d = dseq.cut(Restriction.BsaI)

In [None]:
Restriction.BsaI.compsite

In [None]:
def finditer(pattern, size):
    """Unfinished sketch; never called in this notebook.

    NOTE(review): the body refers to ``self``, which is not a parameter, so any
    call raises NameError. ``pattern`` is unused, ``data`` is discarded and the
    function returns None.
    """
    if self.is_linear():
        data = self.data
    else:
        data = self.data + self.data[1:size]
    return


def search_re(enzyme, seq, linear=True):
    """Locate recognition sites of ``enzyme`` in ``seq``.

    Older variant of ``_search_re``: matching uses ``enzyme.compsite`` exactly
    as given (no case-insensitive flag is added) and the raw first regex group
    is returned instead of a boolean.

    Args:
        enzyme: Object exposing ``compsite`` (compiled regex) and ``size``.
        seq: Record-like object supporting ``+``, slicing and ``.seq``.
        linear: If false, sites spanning the origin of a circular sequence
            are found as well.

    Returns:
        A list of ``(start, group1)`` tuples, where ``start`` is the 0-based
        index of the site and ``group1`` is the text matched by the first
        group, or None when a different alternative of the pattern matched.
    """
    # cuts = re.dna.finditer(re.compsite, re.size)
    if not linear:
        # Repeat the first ``size - 1`` bases after the end. ``seq[1 : size]``
        # mirrored Biopython, whose internal string starts with a space; on a
        # plain 0-based sequence it skipped the first base and missed sites
        # that span the origin.
        seq = seq + seq[: enzyme.size - 1]
    return [
        (match.start(), match.group(1))
        for match in re.finditer(enzyme.compsite, str(seq.seq))
    ]


# Scratch call of search_re on the circular sequence.
search_re(Restriction.BsaI, seq, linear=False)

In [None]:
# Design notes for the cut representation:
# sense-binding: sense cuts, antisense cuts
# antisense-binding: antisense cuts, sense cuts
# circular handling
# for each cut: (sense cut, antisense cut)

In [None]:
# NOTE(review): `a` is not assigned in any cell visible here; leftover kernel state.
a[1][1]()

In [None]:
Restriction.BsaI._search??

In [None]:
print(d[0][:5])

In [None]:
Restriction.BsaI.charac

In [None]:
dseq.seq.cut??

In [None]:
d[0]

In [None]:
from Bio import Restriction

In [None]:
# Cut positions from Biopython's own search, with the first one repeated at the
# end so that consecutive pairs also cover the wrap-around fragment.
cuts = sorted(Restriction.BsaI.search(seq.seq, linear=False))
cuts = cuts + cuts[:1]

In [None]:
# NOTE(review): bare `slice_seq` is never imported here; elsewhere the notebook
# calls cloning_util.slice_seq. The -1 presumably converts Biopython's 1-based
# positions to 0-based indices.
fragments = [slice_seq(seq, x1 - 1, x2 - 1) for x1, x2 in zip(cuts[:-1], cuts[1:])]

In [None]:
# include overhangs

In [None]:
Restriction.BsaI.characteristic?

In [None]:
Restriction.BsaI.fst5

In [None]:
Restriction.BsaI.elucidate??