In [None]:
import pandas as pd
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import seaborn as sns
import toml
import re
import urllib
from datetime import datetime
import string
import pygsheets
import requests
from tqdm.auto import tqdm
import Bio.Restriction as Restriction
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import benchlingapi

In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# the local paulssonlab package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
import paulssonlab.api.benchling as bapi
from paulssonlab.api.util import base_url
import paulssonlab.cloning.registry as registry
import paulssonlab.cloning.workflow as workflow
import paulssonlab.cloning.sequence as sequence

In [None]:
# Select Bokeh as the HoloViews plotting backend for this notebook.
hv.extension("bokeh")

# Setup

In [None]:
# Load notebook settings from config.toml in the working directory; later cells
# read config["benchling"] and config["registry"] from it.
config = toml.load("config.toml")

In [None]:
# Authorize the Google Sheets client with the service-account key file
# credentials.json, which must be present in the working directory.
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Open a Benchling API session with the key stored in config.toml, then look up
# the root of the configured Benchling project through the paulssonlab helper.
bench_session = benchlingapi.Session(config["benchling"]["api_key"])
benchling_folder = bapi.get_project_root(bench_session, config["benchling"]["project"])

In [None]:
# Build the registry handle from the Sheets client, the configured registry
# folder and the Benchling project folder resolved above.
reg = registry.Registry(gc, config["registry"]["folder"], benchling_folder)

In [None]:
# Fetch the worksheets used throughout the notebook from the registry.
strain_sheet = reg.get_sheet(("LT", "strains"))
plasmid_sheet = reg.get_sheet(("pLT", "plasmids"))
part_sheet = reg.get_sheet(("LT", "parts"))
# NOTE(review): `col` is not defined in any cell shown in this notebook, so this
# line raises NameError on a fresh kernel. It presumably was a mapping from
# collection name to spreadsheet key — confirm and define it in Setup, or switch
# to a `reg` lookup such as the ("LT", "parts", "Part types") one in the Test section.
part_type_sheet = gc.open_by_key(col["parts"]).worksheet_by_title("Part types")

In [None]:
# Reuse the Drive service object that belongs to the plasmid sheet's client.
drive_service = plasmid_sheet.client.drive.service
# NOTE(review): `col` is not defined in any cell shown here (NameError on a
# fresh kernel); presumably a mapping from collection name to Drive/Sheets ID — confirm.
plasmid_folder = col["plasmid_maps"]
# Listing of the plasmid-map folder. The name `plasmid_maps` is rebound to a
# dict of newly generated maps in the "Add to strain collection" section.
plasmid_maps = api.google.list_drive(drive_service, root=plasmid_folder)

# Test

In [None]:
# Smoke test: display the registry handle's `registry` attribute.
reg.registry

In [None]:
# Smoke test: look up the ("LT", "parts") entry by ID.
reg.get_sheet_by_id(("LT", "parts"))

In [None]:
# Smoke test: same lookup with a third key element; presumably this selects the
# "Part types" worksheet of the parts spreadsheet — confirm.
reg.get_sheet_by_id(("LT", "parts", "Part types"))

In [None]:
# Smoke test: the same ("LT", "parts") key through get_sheet, as used in Setup.
reg.get_sheet(("LT", "parts"))

# Voigt terminators

In [None]:
# Download three supplementary xlsx files of the article doi:10.1038/nmeth.2515
# (saved as supptable2-4.xlsx; presumably Supplementary Tables 2-4 — confirm)
# into data/2013terminators_supp/.
# `-b does_not_exist` names a cookie file that does not need to exist; per the
# curl manual, passing -b still activates curl's cookie engine. `-L` follows
# redirects and `-o` sets the output path.
!mkdir -p data/2013terminators_supp
!curl -b does_not_exist -Lo data/2013terminators_supp/supptable2.xlsx "https://static-content.springer.com/esm/art%3A10.1038%2Fnmeth.2515/MediaObjects/41592_2013_BFnmeth2515_MOESM206_ESM.xlsx"
!curl -b does_not_exist -Lo data/2013terminators_supp/supptable3.xlsx "https://static-content.springer.com/esm/art%3A10.1038%2Fnmeth.2515/MediaObjects/41592_2013_BFnmeth2515_MOESM207_ESM.xlsx"
!curl -b does_not_exist -Lo data/2013terminators_supp/supptable4.xlsx "https://static-content.springer.com/esm/art%3A10.1038%2Fnmeth.2515/MediaObjects/41592_2013_BFnmeth2515_MOESM208_ESM.xlsx"

# Data

In [None]:
import io

# Provenance: Table S1 (p. 23) of
# https://www.embopress.org/action/downloadSupplement?doi=10.15252%2Fmsb.20209584&file=msb209584-sup-0001-AppendixFig.pdf
# converted to a spreadsheet with https://www.adobe.com/acrobat/online/pdf-to-excel.html
# and pasted below as whitespace-separated text.
#
# Columns: name (used as the index), sequence, Ts_forward, Ts_reverse.
# The number of tabs between fields is irregular (an artifact of the conversion),
# which is why the table is parsed with a whitespace-run separator.
# NOTE(review): the quoted DT42 sequence contains a line break carried over from
# the conversion; strip it before using that sequence.
voigt_bidirectional_terms_tsv = """
name	sequence	Ts_forward	Ts_reverse
DT3	"CCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTT"		3000				120		
DT5	"TCCGGCAATTAAAAAAGCGGCTAACCACGCCGCTTTTTTTACGTCTGCACTCGGTACCAAATTCCAGAAAAGAGGCCTCCCGAAAGGGGGGCCTTTTTTCGTTTTGGTCC"		4700				50		
DT19	TTCAGCCAAAAAACTTAAGACCGCCGGTCTTGTCCACTACCTTGCAGTAATGCGGTGGACAGGATCGGCGGTTTTCTTTTCTCTTCTCAACTCGGTACCAAAGACGAACAATAAGACGCTGAAAAGCGTCTTTTTTCGTTTTGGTCC	770				1.2			
DT34	GCTGATGCCAGAAAGGGTCCTGAATTTCAGGGCCCTTTTTTTACATGGATTGCTCGGTACCAAATTCCAGAAAAGAGACGCTTTCGAGCGTCTTTTTTCGTTTTGGTCC	570				1.4			
DT36	GATCTAACTAAAAAGGCCGCTCTGCGGCCTTTTTTCTTTTCACTGTAACAACGGAAACCGGCCATTGCGCCGGTTTTTTTTGGCCT	680				3.2			
DT42	"AGTTAACCAAAAAGGGGGGATTTTATCTCCCCTTTAATTTTTCCTCGCAGATAGCAAAAAAGCGCCTTTAGGGCGCTTTTTTACATTG
GTGG"	2500				2.2			
DT54	"GGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCC"		1800				30		
DT56	TACCACCGTCAAAAAAAACGGCGCTTTTTAGCGCCGTTTTTATTTTTCAACCTTCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTC	240				11			
DT60	ACATTTAATAAAAAAAGGGCGGTCGCAAGATCGCCCTTTTTTACGTATGACACAGTGAAAAATGGCGCCCATCGGCGCCATTTTTTTATG	110				29			
DT65	TGCTCGTACCAGGCCCCTGCAATTTCAACAGGGGCCTTTTTTTATCCAATTCCATCGGGTCCGAATTTTCGGACCTTTTCTCCGC	400				1.0			
DT82	"CTTATTCCATAACAAAGCCGGGTAATTCCCGGCTTTGTTGTATCTGAACAATAAATGGATGCCCTGCGTAAGCGGGGCATTTTTCTTCCT"	170				2.8			
DT83	AGCGTCAAAAGGCCGGATTTTCCGGCCTTTTTTATTAGGCAGCATGCTGCCAGGTGATCCCCCTGGCCACCTCTTTT	600				4.4			
DT86	TAATCATTCTTAGCGTGACCGGGAAGTCGGTCACGCTACCTCTTCTGAAGAAACAGCAAACAATCCAAAACGCCGCGTTCAGCGGCGTTTTTTCTGCTTTTCT	210				0.4			
DT100	"GTGAAGTGAAAAATGGCGCACATTGTGCGCCATTTTTTTTGTCTGCCGTTTACCGCTTCTCTGAAAATCAACGGGCAGGTCACTGACTTGCCCGTTTTTTTATCCCTTCTCCACACCG"	4700				12			
DT101	"TCTTTAAAAAGAAACCTCCGCATTGCGGAGGTTTCGCCTTTTGATACTCTGTCTGAAGTAATTCTTGCCGCAGTGAAAAATGGCGCCCATCGGCGCCATTTTTTTATGCTTCCATTAGAAAGCAAAAAGCCTGCTAGAAAGCAGGCTTTTTTGAATTTGGCTCCTCTGAC"		2800				160		
DT103	"AAAGTTCTGAAAAAGGGTCACTTCGGTGGCCCTTTTTTATCGCCACGGTTTGAGCAGTGCACTTGCTTAAAATCCCGCCAGCGGCGGGATTTTTTATTGTCCGGTTTAAGACA"	790				4.0			
DT104	"GCAGACAAAAAAAATGGCGCACAATGTGCGCCATTTTTCACTTCACAGGTACTATTGTTTTGAATTGAAAAGGGCGCTTCGGCGCCCTTTTTGCATTTGTTGACGGCATATATTTGTATATCGAAGCGCCCTGATGGGCGCTTTTTTTATTTAATCGATAACCAGA"		580				101		
"""

# Raw string: "\s" is not a valid escape in a normal string literal and triggers
# a DeprecationWarning/SyntaxWarning on recent Python versions.
voigt_bidirectional_terms = pd.read_csv(
    io.StringIO(voigt_bidirectional_terms_tsv), sep=r"\s+", index_col=0
)

In [None]:
# Keep only the terminators whose Ts_reverse value is at least 10.
selected_terms = voigt_bidirectional_terms.loc[
    lambda terms: terms["Ts_reverse"] >= 10
]

In [None]:
# Inspect the terminators that passed the Ts_reverse filter.
selected_terms

## Makeshift oligo orders

In [None]:
def _format_seq(seq):
    """Return `seq` as a lowercase plain string.

    The input is first passed through `sequence.get_seq`, so anything that
    helper accepts can be given here.
    """
    plain_seq = sequence.get_seq(seq)
    return str(plain_seq).lower()

In [None]:
# Terminators that get special handling below: the first print loop skips these
# names, and the second one prints a sense and an antisense oligo for each.
# (Presumably these cannot be ordered as gBlocks — confirm.)
NO_GBLOCK = ["DT3", "DT56", "DT60"]
# NO_GBLOCK = []

In [None]:
overhangs = ["aggt", "gctt"]
# to add between BsmBI recognition site and ends of oligos
random_bases = ("GCTTCA", "TGCTAA")
# storage vector BsmBI flanks
flanks = ("CGTCTCGGTCTCa", "tGAGACCgGAGACG")

# Map terminator name -> ordered sequence: the lowercased terminator gets the
# overhangs first, then the flank pair and the random-base pair are added.
seqs_to_order = {}
for term_name, row in selected_terms.iterrows():
    with_overhangs = workflow.add_overhangs(row["sequence"].lower(), overhangs)
    seq = workflow.add_flanks(with_overhangs, [flanks, random_bases])
    seqs_to_order[term_name] = seq

In [None]:
# Inspect the flanked sequences, keyed by terminator name.
seqs_to_order

In [None]:
prefix = "oLT"
id_num = 37  # numeric part of the first oligo ID assigned in this section

# Print "ID<TAB>name<TAB>sequence" for every terminator that is not listed in
# NO_GBLOCK. Only the sense strand is emitted here; `id_num` keeps counting and
# is picked up by the following cell.
for term_name, seq in seqs_to_order.items():
    if term_name not in NO_GBLOCK:
        oligo_seq = seq
        id_ = f"{prefix}{id_num}"
        name = f"Voigt_{term_name}"
        print(f"{id_}\t{name}\t{_format_seq(oligo_seq)}")
        id_num += 1

In [None]:
# For the NO_GBLOCK terminators, print both strands as separate oligos.
# IDs continue from the `id_num` left by the previous cell, so re-running this
# cell on its own keeps incrementing the counter.
for term_name, seq in seqs_to_order.items():
    if term_name in NO_GBLOCK:
        for sense in (True, False):
            # the antisense oligo is the reverse complement of the ordered sequence
            oligo_seq = seq if sense else sequence.reverse_complement(seq)
            id_ = f"{prefix}{id_num}"
            name = f"Voigt_{term_name}_{'sense' if sense else 'antisense'}"
            print(f"{id_}\t{name}\t{_format_seq(oligo_seq)}")
            id_num += 1

# Sequence orders

## Config

In [None]:
# Row of the "Part types" sheet from which the overhangs are read further down.
part_type = "Terminator_DE"
# Same spacer bases and flank pair as in the "Makeshift oligo orders" section.
random_bases = ("GCTTCA", "TGCTAA")
flanks = ("CGTCTCGGTCTCa", "tGAGACCgGAGACG")
# Enzyme used to digest the part out, and enzyme used for assembly into the storage vector.
part_enzyme = Restriction.BsaI
storage_enzyme = Restriction.BsmBI
# ID of the storage vector; looked up in the plasmid sheet and the plasmid-map folder.
storage_vector_id = "pLIB112"

# Metadata copied into the generated oligo/part/plasmid/strain rows.
background_strain = "DH5alpha"
tags = "bidirectional-terminators terminators"
author = "Jacob Quinn Shenker"
# Timestamp taken when this cell runs; re-running the cell changes it.
date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
species = "E. coli"
reference = "Park, Y., Espah Borujeni, A., Gorochowski, T. E., Shin, J., & Voigt, C. A. (2020). Precision design of stable genetic circuits carried in highly‐insulated E. coli genomic landing pads. Molecular systems biology, 16(8), e9584."
confirmation_notes = "Sanger sequencing with oLIB203+oLIB204."

oligo_description = "Annealed oligos for Voigt bidirectional terminator parts."

## Generate sequences

In [None]:
# Load the "Part types" worksheet as a DataFrame indexed by its "Type*" column.
part_types = part_type_sheet.get_as_df().set_index("Type*")

In [None]:
# Upper-cased upstream/downstream overhangs of the configured part type.
# This replaces the hard-coded `overhangs` from the "Makeshift oligo orders" section.
overhangs = [
    overhang.upper()
    for overhang in part_types.loc[
        part_type, ["Upstream overhang", "Downstream overhang"]
    ]
]

In [None]:
# TODO: defaults to E. coli
# Codon table by relative frequency, with the "*" entry overridden so that TAA
# is the only stop codon used.
# NOTE(review): `codon` is not imported in any cell shown in this notebook.
aa_to_codons = {**codon.codons_by_relative_frequency(), "*": {"TAA": 1}}

In [None]:
# Collect promoter / sigma / antisigma sequences and turn each one into an
# order item (dict with name, kind, cds_location, initial_seq and, for CDSes, aa_seq).
# NOTE(review): `sigmas`, `antisigmas`, `sigma_promoters`, `sigma_subset_info`,
# `promoter_overhangs`, `cds_overhangs`, `cds_aa_suffix`, `cds_overhang_shift`
# and `codon` are not defined or imported in any cell shown in this notebook, so
# this cell fails on a fresh kernel. It looks carried over from a sigma/antisigma
# workflow rather than the terminator workflow above — confirm.
sigma_sequences_to_order = dict(
    sigmas.loc[sigma_subset_info["Sigma"], "Sequence"].items()
)
antisigma_sequences_to_order = dict(
    antisigmas.loc[sigma_subset_info["Antisigma"], "Sequence"].items()
)
promoter_sequences_to_order = dict(
    sigma_promoters.loc[
        sigma_subset_info["Promoter"], "Promoter sequence (-60 to +20)"
    ].items()
)
_sequences_to_order = {
    "promoter": promoter_sequences_to_order,
    "sigma": sigma_sequences_to_order,
    "antisigma": antisigma_sequences_to_order,
}
# Keyed by name only: an item with the same name in two kinds would overwrite the earlier one.
sequences_to_order = {}
# prepare seq
for kind, seqs in _sequences_to_order.items():
    for name, seq in seqs.items():
        item = {}
        item["name"] = name
        item["kind"] = kind
        if kind == "promoter":
            # Promoters: overhangs, then the flank pair and the random-base pair; no CDS.
            seq = workflow.add_flanks(
                workflow.add_overhangs(seq.upper(), promoter_overhangs),
                [flanks, random_bases],
            )
            item["cds_location"] = None
        else:
            # CDSes: append the amino-acid suffix, back-translate, then add overhangs and flanks.
            aa_seq = seq + cds_aa_suffix
            item["aa_seq"] = aa_seq
            seq = codon.back_translate(aa_seq, aa_to_codons)
            # measured before overhangs/flanks are added, i.e. the bare CDS length
            cds_length = len(seq)
            seq = workflow.add_flanks(
                workflow.add_overhangs(seq.upper(), cds_overhangs), [flanks]
            )
            # because overhang (aATG) has an extra a
            cds_start = len(flanks[0]) + cds_overhang_shift
            cds_end = cds_start + cds_length
            item["cds_location"] = (cds_start, cds_end)
        seq = SeqRecord(Seq(seq))  # ensure our pipeline propagates features correctly
        item["initial_seq"] = seq
        sequences_to_order[name] = item

## Use DnaChisel to optimize sequences ourselves

In [None]:
# Optimize every item that has a CDS with DnaChisel; items without a CDS are
# passed through unchanged.
# NOTE(review): `optimization`, `avoid_enzymes` and `twist_adaptors` are not
# defined or imported in any cell shown in this notebook.
for item in tqdm(sequences_to_order.values()):
    if item["cds_location"] is None:
        # no CDS: final and optimized sequences are simply the initial sequence
        item["final_seq"] = item["optimized_seq"] = item["initial_seq"]
        continue
    seq = item["initial_seq"]
    twist_constraints = optimization.dnachisel_constraints_for_twist(
        seq,
        cds_location=item["cds_location"],
        avoid_enzymes=avoid_enzymes,
        aa_to_codons=aa_to_codons,
        genetic_table="Bacterial",
    )
    new_seq = optimization.dnachisel(seq, *twist_constraints)
    item["optimized_seq"] = new_seq
    # the ordered construct additionally carries the Twist adaptors
    item["final_seq"] = workflow.add_flanks(new_seq, [twist_adaptors])

## Substitute sequences with Twist-optimized sequences

This section is only needed when sequences are optimized manually through the Twist web interface. Otherwise, use DnaChisel and/or the Twist API.

In [None]:
def _format_seq(seq):
    """Lowercase string form of `seq`, obtained via `sequence.get_seq`.

    Same definition as the `_format_seq` in the "Makeshift oligo orders" section.
    """
    as_text = str(sequence.get_seq(seq))
    return as_text.lower()


# Print "name<TAB>optimized sequence" for every non-promoter item.
for item in sequences_to_order.values():
    if item["kind"] == "promoter":
        continue
    print(f"{item['name']}\t{_format_seq(item['optimized_seq'])}")

In [None]:
# TODO: we're cheating and manually running Twist codon optimization through the web interface
# Replace each non-promoter item's optimized sequence with the "Insert sequence"
# exported from Twist (CSV indexed by "Name"). Items without a usable Twist
# entry are reported and left unchanged.
twist_seqs = pd.read_csv("201013voigtsigmas.csv").set_index("Name")
for item in sequences_to_order.values():
    name = item["name"]
    if item["kind"] == "promoter":
        continue
    # Only a missing row/column counts as "not found". The previous bare
    # `except:` also hid unrelated errors and swallowed KeyboardInterrupt.
    try:
        twist_seq = twist_seqs.loc[name, "Insert sequence"]
    except KeyError:
        twist_seq = None
    # An empty cell (NaN) or a duplicated name (Series) is not a usable sequence;
    # keep reporting those the same way as a missing entry.
    if not isinstance(twist_seq, str):
        print(f"could not find Twist sequence for {name}")
        continue
    if _format_seq(item["optimized_seq"]).lower() != twist_seq.lower():
        print(f"substituting codon-optimized Twist sequence for {name}")
    else:
        print(f"adding Twist adapters for {name}")
    # TODO: copy features
    item["optimized_seq"] = SeqRecord(Seq(twist_seq))

## Check restriction sites

In [None]:
# Enzyme name -> number of cuts each optimized sequence is expected to have.
correct_re_site_counts = dict(BsaI=2, BsmBI=2, AarI=0, BbsI=0)

In [None]:
# Report every item/enzyme pair whose number of cuts differs from the expectation.
# NOTE(review): `golden_gate` is not imported in any cell shown in this notebook.
for item in sequences_to_order.values():
    for enzyme, expected_count in correct_re_site_counts.items():
        # enzyme names are resolved to Bio.Restriction enzyme objects by attribute lookup
        cuts = golden_gate.re_search(
            item["optimized_seq"], getattr(Restriction, enzyme)
        )
        if len(cuts) == expected_count:
            continue
        print(
            f"Expected {expected_count} {enzyme} cuts in {item['name']}, instead found cuts at: {cuts}"
        )

## Check that CDSes match expected translations

In [None]:
# For every item that carries an expected amino-acid sequence, translate the CDS
# region of the optimized sequence and report any mismatch.
for item in sequences_to_order.values():
    if "aa_seq" not in item:
        continue
    aa_seq = item["aa_seq"]
    cds_region = slice(*item["cds_location"])
    translation = item["optimized_seq"][cds_region].translate()
    if aa_seq != translation.seq:
        print(
            f"{item['name']}: translation did not match expected amino acid sequence"
        )

## Add to strain collection

In [None]:
# Worksheet that receives the new oligo rows.
# NOTE(review): `col` is not defined in any cell shown in this notebook
# (NameError on a fresh kernel) — confirm where it should come from.
oligo0_sheet = gc.open_by_key(col["oligos"]).worksheet_by_title("Special (oLIB0.x)")

In [None]:
# Descriptions list the corresponding promoter/sigma/antisigma, fold changes, and growth rates; CDS descriptions also note the double stop codon.

In [None]:
# Fetch the storage vector's sequence from the plasmid-map folder on Drive.
# NOTE(review): `col` is not defined in any cell shown here; `plasmid_folder`
# from Setup was assigned from the same col["plasmid_maps"] entry.
storage_vector_seq = workflow.get_drive_seq(
    drive_service, col["plasmid_maps"], storage_vector_id
)

In [None]:
# Plasmid sheet as a DataFrame indexed by plasmid ID ("ID*" column).
plasmids_df = plasmid_sheet.get_as_df().set_index("ID*")

In [None]:
# Field values shared by every generated row of each record type; the
# per-item dicts built later are merged with these.

# marker of the storage vector, reused for both plasmid and strain rows
storage_vector_marker = plasmids_df.loc[storage_vector_id, "Marker*"]

base_oligo = {
    "Date*": date,
    "Author*": author,
    "Description": oligo_description,
}

# NOTE(review): this uses "Author" while base_oligo uses "Author*" — confirm
# that both match the column headers of their sheets.
base_part = {
    "Tags": tags,
    "Author": author,
    "Date*": date,
    "Species/codon usage*": species,
    "Reference": reference,
}

base_plasmid = {
    "Origin*": plasmids_df.loc[storage_vector_id, "Origin*"],
    "Marker*": storage_vector_marker,
}

base_strain = {
    "Species*": species,
    "Background*": background_strain,
    "Parent*": background_strain,
    "Marker*": storage_vector_marker,
}

In [None]:
# Ask each collection sheet for its next free ID and row; the helper's result is
# unpacked as ((prefix, number), row). The number counters are advanced by the
# record-building loop below.
(oligo_prefix, oligo_num), oligo_row = workflow.get_next_collection_id(oligo0_sheet)
(plasmid_prefix, plasmid_num), plasmid_row = workflow.get_next_collection_id(
    plasmid_sheet
)
(strain_prefix, strain_num), strain_row = workflow.get_next_collection_id(strain_sheet)
part_row = workflow.get_next_empty_row(part_sheet)
# Accumulators for the rows/files generated by the loop below.
parts = []
oligos = []
plasmids = []
# NOTE: rebinds `plasmid_maps` (the Drive folder listing from Setup) to a dict
# mapping file name -> GenBank content to upload.
plasmid_maps = {}
strains = []


def _format_seq(seq):
    """Render `seq` (via `sequence.get_seq`) as a lowercase string.

    Same definition as the `_format_seq` in the "Makeshift oligo orders" section.
    """
    extracted = sequence.get_seq(seq)
    lowered = str(extracted).lower()
    return lowered


# Build, for every ordered sequence, one part row, one strain row, one plasmid
# row plus its GenBank map, and one (CDS) or two (promoter) oligo rows.
# strain_num / plasmid_num / oligo_num are incremented as rows are appended and
# several f-strings read them before the increment, so statement order matters.
# NOTE(review): `sigma_subset_info` and `golden_gate` are not defined or
# imported in any cell shown in this notebook.
for item in sequences_to_order.values():
    name = item["name"]
    kind = item["kind"]
    seq = item["final_seq"]
    # description
    # first row of sigma_subset_info whose "Promoter"/"Sigma"/"Antisigma" column equals this name
    row = sigma_subset_info.loc[
        sigma_subset_info.loc[:, kind.capitalize()] == name
    ].iloc[0]
    # NOTE(review): the continuation lines of this triple-quoted f-string keep
    # their 4-space indent, so the description text contains those spaces.
    description = f"""Sigma/antisigma/promoter: {row["Sigma"]}/{row["Antisigma"]}/{row["Promoter"]}
    Sigma/antisigma fold change at max induction: {row["Sigma max"]:.0f}x / {row["Antisigma max"]:.0f}x
    Sigma/antisigma growth rate: {row["Sigma growth"]:.0f}% / {row["Antisigma growth"]:.0f}%"""
    if kind != "promoter":
        description += "\nCDS with double stop codon."
    # part
    # digest the ordered sequence with the part enzyme; the first product is
    # unpacked as (insert, upstream overhang, downstream overhang)
    part_digest = golden_gate.re_digest(seq, part_enzyme, linear=True)
    part_seq, overhang1, overhang2 = part_digest[0]
    usage = f"{plasmid_prefix}{plasmid_num}/{part_enzyme.__name__}"
    if kind == "promoter":
        # promoters can also be used directly as the annealed oligo pair
        usage += f",{oligo_prefix}{oligo_num}={oligo_prefix}{oligo_num+1}/{part_enzyme.__name__}"
    part = {
        "Name*": name,
        "Usage*": usage,
        "Upstream overhang*": _format_seq(overhang1[0]),
        "Downstream overhang*": _format_seq(overhang2[0]),
        "Sequence*": _format_seq(overhang1[0] + part_seq + overhang2[0]),
        "Description": description,
        **base_part,
    }
    parts.append(part)
    # strain
    plasmid_id = f"{plasmid_prefix}{plasmid_num}"
    strain = {
        "ID*": f"{strain_prefix}{strain_num}",
        "Names": name,
        "Plasmid(s)*": plasmid_id,
        **base_strain,
    }
    strains.append(strain)
    strain_num += 1
    # plasmid map
    # assemble the ordered sequence into the storage vector (circular product)
    to_join = [
        (seq, storage_enzyme),
        (storage_vector_seq, storage_enzyme),
    ]
    plasmid_map = golden_gate.assemble(to_join, linear=False)
    filename = f"{plasmid_id}.gbk"
    content = plasmid_map.format("genbank")
    plasmid_maps[filename] = {
        "content": content,
        "mimetype": "chemical/seq-na-genbank",
    }
    # plasmid
    # NOTE(review): the command uses the annealed-pair form (oligo_num=oligo_num+1)
    # for every kind, while construction_notes below names a single oligo for
    # non-promoters. For those items oligo_num+1 is the ID given to the next
    # item's oligo — confirm whether the command should name only oligo_num.
    command = f"@GG({oligo_prefix}{oligo_num}={oligo_prefix}{oligo_num+1}/{storage_enzyme.__name__}, {storage_vector_id}/{storage_enzyme.__name__})"
    if kind == "promoter":
        construction_notes = f"{storage_enzyme.__name__} golden gate of annealed oligos {oligo_prefix}{oligo_num}={oligo_prefix}{oligo_num+1} into storage vector {storage_vector_id}."
    else:
        construction_notes = f"{storage_enzyme.__name__} golden gate of {oligo_prefix}{oligo_num} into storage vector {storage_vector_id}."
    plasmid = {
        "Command": command,
        "ID*": plasmid_id,
        "Names": name,
        "Description": description,
        "Size (bp)": len(plasmid_map),
        "Construction Notes": construction_notes,
        "Confirmation Notes": confirmation_notes,
        **base_plasmid,
    }
    plasmids.append(plasmid)
    plasmid_num += 1
    # oligo
    item[
        "oligo_id"
    ] = f"{oligo_prefix}{oligo_num}"  # for promoters, this only records the first (top) annealed oligo
    if kind == "promoter":
        # two complementary primers (top strand and its reverse complement), one ID each
        for strand, oligo_seq in [("top", seq), ("bottom", seq.reverse_complement())]:
            oligo = {
                "ID*": f"{oligo_prefix}{oligo_num}",
                "Name": f"{name}_{strand}",
                "Vendor*": "Genewiz",
                "Type": "Primer",
                "Sequence*": _format_seq(oligo_seq),
                **base_oligo,
            }
            oligos.append(oligo)
            oligo_num += 1
    else:
        # a single Twist gene fragment; its sequence is the optimized sequence, not final_seq
        oligo = {
            "ID*": f"{oligo_prefix}{oligo_num}",
            "Name": f"{name}",
            "Vendor*": "Twist",
            "Type": "Twist Gene Fragment",
            "Sequence*": _format_seq(item["optimized_seq"]),
            **base_oligo,
        }
        oligos.append(oligo)
        oligo_num += 1

## Genewiz sequences to order

In [None]:
from itertools import product, repeat

# Print "ID<TAB>sequence" for every oligo ordered from Genewiz, with "." in the
# ID replaced by "_". Each oligo is paired with a well from
# cloning_util.well_iterator(), but the well is not part of the printed line.
# NOTE(review): `cloning_util` is not imported in any cell shown in this notebook.
for well, oligo in zip(cloning_util.well_iterator(), oligos):
    if oligo["Vendor*"] != "Genewiz":
        continue
    print(f"{oligo['ID*'].replace('.', '_')}\t{oligo['Sequence*']}")

## Twist sequences to order

In [None]:
# Print "oligo ID<TAB>optimized sequence" for every non-promoter item, using the
# oligo ID recorded on the item while the collection records were built.
for item in filter(
    lambda entry: entry["kind"] != "promoter", sequences_to_order.values()
):
    print(f"{item['oligo_id']}\t{_format_seq(item['optimized_seq'])}")

In [None]:
import pickle

# Snapshot of everything generated above (rows, maps, order items and the sheet
# row positions they were computed for), written to disk before the sheets and
# Drive folder are modified.
data = {
    "oligos": oligos,
    "plasmids": plasmids,
    "plasmid_maps": plasmid_maps,
    "strains": strains,
    "parts": parts,
    "sequences_to_order": sequences_to_order,
    "oligo_row": oligo_row,
    "plasmid_row": plasmid_row,
    "strain_row": strain_row,
    "part_row": part_row,
}
# NOTE(review): the payload is a binary pickle even though the file name ends in
# ".json"; it must be read back with pickle.load, and only from a trusted file.
# Consider renaming once any readers of this file are updated.
with open("201013voigtsigmas.json", "wb") as f:
    pickle.dump(data, f)

In [None]:
# Write the generated plasmid rows to the plasmid sheet at the reserved row.
api.google.insert_sheet_rows(plasmid_sheet, plasmid_row, plasmids)

In [None]:
# Write the generated strain rows to the strain sheet at the reserved row.
api.google.insert_sheet_rows(strain_sheet, strain_row, strains)

In [None]:
# Write the generated oligo rows to the oligo sheet at the reserved row.
# NOTE(review): `parts` and `part_row` are prepared above, but no matching
# insert into `part_sheet` appears in the cells shown — confirm whether that is intended.
api.google.insert_sheet_rows(oligo0_sheet, oligo_row, oligos)

In [None]:
# Upload the generated GenBank maps (file name -> content/mimetype) to the plasmid-map folder.
workflow.upload_plasmid_maps(drive_service, plasmid_maps, plasmid_folder)

In [None]:
# Show the file names of the plasmid maps that were generated.
plasmid_maps.keys()