In [None]:
import re
import string
import urllib
from datetime import datetime

import benchlingapi
import Bio.Restriction as Restriction
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import pandas as pd
import pygsheets
import seaborn as sns
import toml
from Bio.Seq import Seq
from tqdm.auto import tqdm

In [None]:
# Enable IPython's autoreload extension in mode 2 for this kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
import paulssonlab.cloning.golden_gate as golden_gate
import paulssonlab.cloning.sequence as sequence
import paulssonlab.cloning.util as cloning_util
import paulssonlab.cloning.workflow as workflow
from paulssonlab.api.util import base_url

In [None]:
# Select bokeh as the HoloViews plotting backend.
hv.extension("bokeh")

# Setup

In [None]:
# Load local settings from config.toml; the Benchling API key is read from
# config["benchling"]["api_key"] when the Benchling session is created.
config = toml.load("config.toml")

In [None]:
# Open a Benchling API session using the key stored in the config file.
session = benchlingapi.Session(config["benchling"]["api_key"])

In [None]:
# Authorize the pygsheets client with the service-account key in credentials.json.
# Note: this name is a pygsheets client, not the stdlib `gc` module.
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Look up the "LIB" strain collection. The result is indexed by the keys
# "strains", "plasmids", "parts", "oligos" and "plasmid_maps" in later cells.
col = workflow.get_strain_collection_sheets(gc.drive.service, "LIB")
col

In [None]:
# Open each spreadsheet of the collection by key and take the worksheet
# returned by worksheet() with its default arguments.
strain_sheet = gc.open_by_key(col["strains"]).worksheet()
plasmid_sheet = gc.open_by_key(col["plasmids"]).worksheet()
part_sheet = gc.open_by_key(col["parts"]).worksheet()
oligo_sheet = gc.open_by_key(col["oligos"]).worksheet()

In [None]:
# Reuse the Drive service of the sheet's client to list the contents of the
# collection's plasmid-map folder.
drive_service = plasmid_sheet.client.drive.service
plasmid_folder = col["plasmid_maps"]
plasmid_maps = api.google.list_drive(drive_service, root=plasmid_folder)

# UNS parts

In [None]:
# Load the oligo worksheet as a DataFrame; later cells read its "Name",
# "Sequence*" and "ID*" columns.
oligos = oligo_sheet.get_as_df()

In [None]:
# Display the column labels of the oligo table.
oligos.columns

In [None]:
# Open the worksheet titled "Sequences" in the parts spreadsheet.
part_sequences_sheet = gc.open_by_key(col["parts"]).worksheet_by_title("Sequences")

In [None]:
# Load the "Sequences" worksheet as a DataFrame; the "Name*" and "Sequence*"
# columns are used to look up the UNS sequences.
part_sequences = part_sequences_sheet.get_as_df()

## UNS sequences

In [None]:
# Keep only the name and sequence columns, restricted to the rows whose name
# starts with "UNS".
uns_df = part_sequences[["Name*", "Sequence*"]].loc[
    lambda df: df["Name*"].str.startswith("UNS")
]

In [None]:
# Map each UNS name to its sequence, in table order (a repeated name keeps
# the sequence of its last row).
uns_seqs = dict(zip(uns_df["Name*"], uns_df["Sequence*"]))

## Check forward UNSes

In [None]:
# Forward (non-reversed) UNS top-strand oligos: the name starts with "UNS",
# contains "Top", and does not contain "_r_".
uns_tops = oligos.loc[
    lambda df: df["Name"].str.startswith("UNS")
    & df["Name"].str.contains("Top")
    & ~df["Name"].str.contains("_r_")
]

In [None]:
# Forward (non-reversed) UNS bottom-strand oligos: the name starts with "UNS",
# contains "Bottom", and does not contain "_r_".
uns_bottoms = oligos.loc[
    lambda df: df["Name"].str.startswith("UNS")
    & df["Name"].str.contains("Bottom")
    & ~df["Name"].str.contains("_r_")
]

In [None]:
# Record the RE binding site sequences added to the 40bp UNSes to make them
# GG parts, keyed by part type ("A" or "E").
re_sites = {}

# Top and bottom oligos are paired by position, so both tables must have the
# same number of rows; otherwise some oligos would silently go unchecked (or
# the pairing would fail with an opaque IndexError).
assert len(uns_tops) == len(uns_bottoms), (
    f"found {len(uns_tops)} UNS top oligos but {len(uns_bottoms)} UNS bottom oligos"
)

for idx in range(len(uns_tops)):
    top = uns_tops.iloc[idx]
    bottom = uns_bottoms.iloc[idx]
    # Each bottom oligo must be the exact reverse complement of its top oligo.
    assert Seq(top["Sequence*"]).reverse_complement() == Seq(bottom["Sequence*"]), (
        f"{bottom['Name']!r} is not the reverse complement of {top['Name']!r}"
    )
    # Names look like "<UNS name>_<part type> Top"; take the first
    # whitespace-separated token and split it on "_".
    uns_num, part_type = top["Name"].split()[0].split("_")
    if part_type == "A":
        # A-type: the UNS comes first and the added site follows it.
        assert top["Sequence*"].startswith(
            uns_seqs[uns_num]
        ), f"{top['Name']!r} does not start with the {uns_num} sequence"
        re_site = top["Sequence*"][len(uns_seqs[uns_num]) :]
    elif part_type == "E":
        # E-type: the added site comes first and the UNS follows it.
        assert top["Sequence*"].endswith(
            uns_seqs[uns_num]
        ), f"{top['Name']!r} does not end with the {uns_num} sequence"
        re_site = top["Sequence*"][: -len(uns_seqs[uns_num])]
    else:
        raise ValueError(f"unexpected part type {part_type!r} in {top['Name']!r}")
    # Every oligo of a given part type must carry the same added site.
    if part_type in re_sites:
        assert re_site == re_sites[part_type], (
            f"{top['Name']!r} has site {re_site!r}, "
            f"expected {re_sites[part_type]!r} for {part_type}-type parts"
        )
    else:
        re_sites[part_type] = re_site

## Create reversed UNSes

In [None]:
# (UNS name, part type) pairs to create reversed parts for: UNS1 and UNS3-UNS10
# (UNS2 is not in the list), both part types, except UNS1 "A" and UNS10 "E".
reversed_unses = [
    (f"UNS{uns_num}", part_type)
    for uns_num in [1, *range(3, 11)]
    for part_type in ("A", "E")
    if (uns_num, part_type) not in [(1, "A"), (10, "E")]
]

In [None]:
# Print the top and bottom oligo names for every reversed UNS part, one per line.
for uns_num, part_type in reversed_unses:
    name = f"{uns_num}_r_{part_type}"
    print(f"{name} Top", f"{name} Bottom", sep="\n")

In [None]:
# Print the top-strand sequence of each reversed UNS part followed by its
# reverse complement (the bottom strand).
for uns_num, part_type in reversed_unses:
    # The reversed part uses the reverse complement of the UNS sequence.
    uns_seq = str(Seq(uns_seqs[uns_num]).reverse_complement())
    if part_type not in ("A", "E"):
        raise ValueError
    # A-type parts carry the added site after the UNS, E-type parts before it.
    seq = (
        uns_seq + re_sites[part_type]
        if part_type == "A"
        else re_sites[part_type] + uns_seq
    )
    print(seq)
    print(Seq(seq).reverse_complement())

## Add UNSes to LIB_parts

In [None]:
# Row of part_sheet handed to workflow.insert_parts in the final cell.
# NOTE(review): hard-coded; the trailing comment shows the call it replaced.
# Confirm the row is still the intended one before re-running the notebook.
first_row = 141  # workflow.get_next_empty_row(part_sheet)
# Restriction enzyme passed to golden_gate.re_digest and recorded as the cutter
# of each part.
enzyme = Restriction.BsaI

In [None]:
# Rebind uns_tops / uns_bottoms to ALL UNS oligos. Unlike the earlier
# definitions there is no "_r_" exclusion here, so reversed oligos are included.
uns_tops = oligos.loc[
    lambda df: df["Name"].str.startswith("UNS") & df["Name"].str.contains("Top")
]
uns_bottoms = oligos.loc[
    lambda df: df["Name"].str.startswith("UNS") & df["Name"].str.contains("Bottom")
]

In [None]:
def _format_seq(seq):
    return str(seq).lower()


def get_bottom_oligo(seq, bottoms):
    """Return the ID of the oligo in `bottoms` that is the reverse complement of `seq`.

    Parameters
    ----------
    seq
        Top-strand sequence (anything accepted by ``Bio.Seq.Seq``).
    bottoms
        DataFrame of candidate bottom oligos with "Sequence*" and "ID*" columns.

    Returns
    -------
    The "ID*" value of the single matching row.

    Raises
    ------
    ValueError
        If the number of rows whose "Sequence*" equals the reverse complement
        of `seq` is not exactly one.
    """
    # Compare against a plain string so the elementwise match does not depend
    # on how pandas treats a Bio.Seq object on the right-hand side of ``==``.
    target = str(Seq(seq).reverse_complement())
    matching = bottoms[bottoms["Sequence*"] == target]
    num_matching = len(matching)
    if num_matching != 1:
        raise ValueError(
            f"expected one reverse-complement, instead found {num_matching}"
        )
    return matching["ID*"].iloc[0]


# Build one part record (a dict keyed by part-sheet column name) for every UNS
# top oligo.
parts = []
for idx in uns_tops.index:
    # The part name is the top oligo's name with " Top" removed.
    name = uns_tops.loc[idx, "Name"].replace(" Top", "")
    full_seq = uns_tops.loc[idx, "Sequence*"]
    subseqs = golden_gate.re_digest(full_seq, enzyme, linear=True)
    print(name, subseqs)
    # Only the first digestion product is used.
    seq, overhang1, overhang2 = subseqs[0]
    top_oligo = uns_tops.loc[idx, "ID*"]
    bottom_oligo = get_bottom_oligo(full_seq, uns_bottoms)
    part = {
        "Name*": name,
        "Tags": "golden-gate 3g",
        "Plasmid/Oligos (Cutter)*": f"{top_oligo}={bottom_oligo}/{enzyme.__name__}",
        "Author*": "Richard Murray lab",
        "Date*": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "Upstream overhang*": _format_seq(overhang1[0]),
        "Downstream overhang*": _format_seq(overhang2[0]),
        "Sequence*": _format_seq(
            overhang1[0] + sequence.get_seq(seq) + overhang2[0]
        ),
        "Organism/codon usage*": "E. coli",
    }
    # The last character of the name is the part type; everything before the
    # preceding separator character names the homology sequence.
    uns_num, part_type = name[:-2], name[-1:]
    if part_type not in ("A", "E"):
        raise ValueError
    type_description = (
        "Upstream (A-type)" if part_type == "A" else "Downstream (E-type)"
    )
    # A trailing "_r" marks a reversed homology sequence; drop it for display.
    homology_description = (
        f"reversed homology sequence {uns_num[:-2]}"
        if uns_num.endswith("_r")
        else f"homology sequence {uns_num}"
    )
    description = (
        f"{type_description} homology part for 3G/Gibson with {homology_description}"
    )
    part["Description"] = description
    part["Reference"] = "Halleran, A. D., Swaminathan, A., & Murray, R. M. (2018). Single day construction of multigene circuits with 3G assembly. ACS synthetic biology, 7(5), 1477-1480."
    parts.append(part)

In [None]:
# Spot-check one generated record (the second-to-last part).
parts[-2]

In [None]:
# Hand the generated part records to workflow.insert_parts together with the
# parts worksheet and the starting row.
# NOTE(review): presumably this writes to the live sheet and is not idempotent —
# confirm first_row before re-running this cell.
workflow.insert_parts(part_sheet, parts, first_row)