In [None]:
import pandas as pd
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import seaborn as sns
import toml
import re
import urllib
from datetime import datetime
import string
import pygsheets
import requests
from tqdm.auto import tqdm
import Bio.Restriction as Restriction
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import benchlingapi

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
from paulssonlab.api.util import base_url
import paulssonlab.cloning.workflow as workflow
import paulssonlab.cloning.util as cloning_util
import paulssonlab.cloning.sequence as sequence
import paulssonlab.cloning.golden_gate as golden_gate
import paulssonlab.cloning.codon as codon
import paulssonlab.cloning.optimization as optimization

In [None]:
# Select Bokeh as the HoloViews plotting backend.
hv.extension("bokeh")

# Setup

In [None]:
# Local settings file; the Benchling API key is read from it rather than hardcoded.
config = toml.load("config.toml")

In [None]:
# Benchling session created with the key stored under [benchling] in config.toml.
session = benchlingapi.Session(config["benchling"]["api_key"])

In [None]:
# Google Sheets/Drive client authorized from a service-account credentials file.
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Look up the "LIB" strain collection; the result is indexed below by
# "strains", "plasmids", "parts", "plasmid_maps" and "oligos".
col = workflow.get_strain_collection_sheets(gc.drive.service, "LIB")
col

In [None]:
# Open the default worksheet of each collection spreadsheet; part types live on a
# separately titled worksheet of the parts spreadsheet.
strain_sheet = gc.open_by_key(col["strains"]).worksheet()
plasmid_sheet = gc.open_by_key(col["plasmids"]).worksheet()
part_sheet = gc.open_by_key(col["parts"]).worksheet()
part_type_sheet = gc.open_by_key(col["parts"]).worksheet_by_title("Part types")

In [None]:
# Drive handle and folder used for plasmid maps.
# NOTE(review): `plasmid_maps` is rebound to an empty dict in the
# "Add to strain collection" section, so this listing is not used after that point.
drive_service = plasmid_sheet.client.drive.service
plasmid_folder = col["plasmid_maps"]
plasmid_maps = api.google.list_drive(drive_service, root=plasmid_folder)

# 2017 Khammash OptoT7

In [None]:
# Google Drive folder ID passed as `root` when fetching pLIB sequence files below.
PLASMID_MAPS_FOLDER = "1gpbR9Njm73enshW_U4TN7xbwqgRN-tOo"


def get_plib(service, name, root):
    """Download a sequence file from Google Drive and parse it.

    Args:
        service: Drive API service object; must provide ``files().get_media``.
        name: Path of the file, resolved relative to ``root``.
        root: Drive folder to resolve ``name`` against.

    Returns:
        The result of ``api.read_sequence`` applied to the UTF-8 decoded file content.
    """
    file_id = api.google.get_drive_by_path(service, name, root=root)
    raw_bytes = service.files().get_media(fileId=file_id).execute()
    return api.read_sequence(raw_bytes.decode("utf8"))

In [None]:
# Load the pLIB252 GenBank file; `seq` is used as the template in the alignment cells below.
seq = get_plib(gc.drive.service, "pLIB252.gbk", PLASMID_MAPS_FOLDER)

In [None]:
# Test primer, plus a variant prefixed with eight lowercase "a" bases.
primer1 = "ATGAACACGATTAACATCGCTAAGAACGACT"
primer2 = f"{'a' * 8}{primer1}"

In [None]:
# The biopython distribution is imported under the package name `Bio`.
import Bio

# Sequence orders

In [None]:
# Same two literals as flanks[1] and random_bases[1] in the Config section, concatenated.
"tGAGACCgGAGACG" + "TGCTAA"

In [None]:
# Reverse complement of the concatenation above.
sequence.reverse_complement("tGAGACCgGAGACG" + "TGCTAA")

In [None]:
# Display the sequence loaded with get_plib above.
seq

In [None]:
import Bio.pairwise2 as pw

In [None]:
pw.align.globalxs?

In [None]:
# Toy queries built from a TTTT core, and a short repetitive target.
seq1 = "T" * 4
seq1a = seq1 + "A"
seq1b = seq1 + "C"
seq1c = "C" * 11 + seq1b
seq2 = "AAAATTTT" * 2 + "AAAA"

In [None]:
# Longer target; this rebinds the short seq2 defined in the previous cell.
seq2 = "AGTGATTTTTTTCTCCATTCTTTGTGTGTTTTTTTTGTTTTATGAATTTTTTTAACTGATACCCGTTTTTTTGGAAGGAGACCCGTTTTTTTGGAAG"

In [None]:
# Local alignment of seq1b against seq2 with pairwise2.
# NOTE(review): the two -100 arguments are presumably the gap open/extend
# penalties implied by the "xs" suffix — confirm against the pairwise2 docs.
pw.align.localxs(seq1b, seq2, -100, -100)

In [None]:
from Bio import Align

In [None]:
# Local aligner scoring +1 per match and 0 per mismatch; the -inf gap score
# rules out gapped alignments.
aligner = Align.PairwiseAligner(
    mode="local", match_score=1, mismatch_score=0, gap_score=float("-inf")
)

In [None]:
# Print every local alignment of seq1c against seq2, each preceded by its score.
alignments = aligner.align(seq1c, seq2)
for a in alignments:
    print(f"score: {a.score}", a, sep="\n")

In [None]:
from itertools import repeat
from paulssonlab.cloning.sequence import reverse_complement


def find_primer_binding_site(
    primer, template, linear=False, try_reverse_complement=True, min_length=8
):
    """Locally align a primer (and optionally its reverse complement) to a template.

    Work in progress: no single best site is chosen yet, so every candidate
    alignment is returned.

    Args:
        primer: Primer sequence.
        template: Template sequence; must support slicing and ``+``.
        linear: If False, the first ``len(primer)`` bases of the template are
            appended to it so that sites spanning the origin can be aligned.
        try_reverse_complement: Also align ``reverse_complement(primer)``.
        min_length: Currently unused.

    Returns:
        List of ``(alignment, is_reverse_complement)`` tuples; forward-primer
        alignments come first.

    Raises:
        ValueError: If no alignment was produced.
    """
    # TODO: .upper() or .lower() everything?
    if not linear:
        # NOTE(review): appending len(primer) bases (rather than len(primer) - 1)
        # may report a full-length site at index 0 a second time at index
        # len(template) — confirm whether that is intended.
        template = template + template[: len(primer)]
    # Gaps are penalised at -10 here instead of float("-inf").
    aligner = Align.PairwiseAligner(
        mode="local", match_score=1, mismatch_score=0, gap_score=-10
    )
    candidates = [(hit, False) for hit in aligner.align(primer, template)]
    if try_reverse_complement:
        primer_rc = reverse_complement(primer)
        candidates += [(hit, True) for hit in aligner.align(primer_rc, template)]
    if not candidates:
        raise ValueError("could not align primer to template")
    # TODO: select the best-scoring alignment whose primer end is fully matched, e.g.
    # max(a for a in alignments if a.aligned[0][-1][1] == len(primer), key=lambda x: x[0].score)
    return candidates


def pcr(primer1, primer2, template):
    """Placeholder for a PCR simulation; not implemented yet, so it returns None."""
    return None

In [None]:
# find_primer_binding_site returns a list of (alignment, is_reverse_complement)
# tuples, so `a` is a list rather than a single alignment.
a = find_primer_binding_site("ATTTTTTT", seq.seq, try_reverse_complement=False)  # [0]
a

In [None]:
len(primer2)

In [None]:
a.aligned

In [None]:
a.path

In [None]:
a.aligned[0][-1][1]

In [None]:
print(a)

In [None]:
a.aligned?

In [None]:
# Alignment object of the first candidate (each entry of `a` is an (alignment, flag) tuple).
b = a[0][0]

In [None]:
b.target

In [None]:
a[0][0].score

In [None]:
# Show only the first alignment of primer2 against the plasmid sequence.
for alignment in aligner.align(primer2, seq.seq):
    print(f"Score = {alignment.score:.1f}:")
    print(alignment)
    break  # stop after the first alignment without raising an exception

In [None]:
# Display a default-configured aligner; `Align` is the name bound by `from Bio import Align`.
Align.PairwiseAligner()

## Config

In [None]:
# Part types looked up in the "Part types" sheet to obtain the overhangs.
promoter_part_type = "Promoter_AB"
cds_part_type = "CDS_CD"
# because overhang (aATG) has an extra a
# TODO: autodetect start codon?
cds_overhang_shift = 1
# Extra bases passed to add_flanks together with `flanks`, for promoters only.
random_bases = ("GCTTCA", "TGCTAA")
# (upstream, downstream) flanks added to every ordered sequence; they carry the
# BsmBI (CGTCTC) and BsaI (GGTCTC) recognition sites.
flanks = ("CGTCTCGGTCTCa", "tGAGACCgGAGACG")
# Enzyme used to digest the part out of the ordered sequence.
part_enzyme = Restriction.BsaI
# Enzyme used to assemble the ordered sequence into the storage vector.
storage_enzyme = Restriction.BsmBI
# NOTE(review): avoid_enzymes is not referenced in the visible cells.
avoid_enzymes = [getattr(Restriction, e) for e in ("BsaI", "BsmBI", "BbsI", "AarI")]
cds_aa_suffix = "**"  # add double stop
storage_vector_id = "pLIB112"
# NOTE(review): twist_adaptors is not referenced in the visible cells.
twist_adaptors = ("GAAGTGCCATTCCGCCTGACCT", "AGGCTAGGTGGAGGCTCAGTG")

# Metadata copied into the part/strain/oligo rows built below.
background_strain = "DH5alpha"
tags = "voigt-sigmas sequestration"
author = "Jacob Quinn Shenker"
# Timestamp taken when this cell runs.
date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
species = "E. coli"
reference = "Rhodius, V. A., Segall‐Shapiro, T. H., Sharon, B. D., Ghodasara, A., Orlova, E., Tabakh, H., ... & Voigt, C. A. (2013). Design of orthogonal genetic switches based on a crosstalk map of σs, anti‐σs, and promoters. Molecular systems biology, 9(1), 702."
confirmation_notes = "Sanger sequencing with oLIB203+oLIB204."

oligo_description = "Annealed oligos for Voigt sigma factor promoters."

## Generate sequences

In [None]:
# Part-type table from the "Part types" worksheet, indexed by its "Type*" column.
part_types = part_type_sheet.get_as_df().set_index("Type*")

In [None]:
# Upper-cased [upstream, downstream] overhangs for the promoter and CDS part types.
promoter_overhangs, cds_overhangs = (
    [
        overhang.upper()
        for overhang in part_types.loc[
            part_type, ["Upstream overhang", "Downstream overhang"]
        ]
    ]
    for part_type in (promoter_part_type, cds_part_type)
)

In [None]:
# TODO: defaults to E. coli
aa_to_codons = codon.codons_by_relative_frequency()
# force only using TAA as stop codon
aa_to_codons = {**aa_to_codons, "*": {"TAA": 1}}

In [None]:
# Collect the sequences to order, keyed by name, for each kind of part.
# NOTE(review): `sigmas`, `antisigmas`, `sigma_promoters` and `sigma_subset_info`
# are not defined in any visible cell — confirm where they are loaded.
sigma_sequences_to_order = dict(
    sigmas.loc[sigma_subset_info["Sigma"], "Sequence"].items()
)
antisigma_sequences_to_order = dict(
    antisigmas.loc[sigma_subset_info["Antisigma"], "Sequence"].items()
)
promoter_sequences_to_order = dict(
    sigma_promoters.loc[
        sigma_subset_info["Promoter"], "Promoter sequence (-60 to +20)"
    ].items()
)
_sequences_to_order = {
    "promoter": promoter_sequences_to_order,
    "sigma": sigma_sequences_to_order,
    "antisigma": antisigma_sequences_to_order,
}
sequences_to_order = {}
# prepare seq
# NOTE(review): the loop variable `seq` rebinds the plasmid `seq` loaded earlier in
# the notebook. Only "initial_seq" is stored here, while later cells read
# "optimized_seq" and "final_seq" — the step that sets those is not visible.
for kind, seqs in _sequences_to_order.items():
    for name, seq in seqs.items():
        item = {}
        item["name"] = name
        item["kind"] = kind
        if kind == "promoter":
            # Promoters: DNA is used as given, wrapped in overhangs, flanks and random bases.
            seq = workflow.add_flanks(
                workflow.add_overhangs(seq.upper(), promoter_overhangs),
                [flanks, random_bases],
            )
            item["cds_location"] = None
        else:
            # Sigmas/antisigmas: treated as amino-acid sequences, extended with the
            # stop suffix and back-translated with the codon table.
            aa_seq = seq + cds_aa_suffix
            item["aa_seq"] = aa_seq
            seq = codon.back_translate(aa_seq, aa_to_codons)
            cds_length = len(seq)
            seq = workflow.add_flanks(
                workflow.add_overhangs(seq.upper(), cds_overhangs), [flanks]
            )
            # because overhang (aATG) has an extra a
            cds_start = len(flanks[0]) + cds_overhang_shift
            cds_end = cds_start + cds_length
            item["cds_location"] = (cds_start, cds_end)
        seq = SeqRecord(Seq(seq))  # ensure our pipeline propagates features correctly
        item["initial_seq"] = seq
        sequences_to_order[name] = item

## Check restriction sites

In [None]:
# Expected number of cut sites per enzyme in each sequence to order.
correct_re_site_counts = {"BsaI": 2, "BsmBI": 2, "AarI": 0, "BbsI": 0}

In [None]:
# Report every sequence whose number of cut sites differs from the expected count.
for item in sequences_to_order.values():
    for enzyme, expected_count in correct_re_site_counts.items():
        cuts = golden_gate.re_search(item["optimized_seq"], getattr(Restriction, enzyme))
        if len(cuts) == expected_count:
            continue
        print(
            f"Expected {expected_count} {enzyme} cuts in {item['name']}, instead found cuts at: {cuts}"
        )

## Add to strain collection

In [None]:
# Worksheet of the oligo spreadsheet that holds the special oLIB0.x oligos.
oligo0_sheet = gc.open_by_key(col["oligos"]).worksheet_by_title("Special (oLIB0.x)")

In [None]:
# Descriptions list the corresponding promoter/sigma/antisigma, fold changes and growth rates; CDS entries also note the double stop codon.

In [None]:
# Sequence of the storage vector, fetched from the plasmid-maps Drive folder.
storage_vector_seq = workflow.get_drive_seq(
    drive_service, col["plasmid_maps"], storage_vector_id
)

In [None]:
# Plasmid sheet as a DataFrame indexed by the "ID*" column.
plasmids_df = plasmid_sheet.get_as_df().set_index("ID*")

In [None]:
# Fields shared by every oligo row.
base_oligo = {"Date*": date, "Author*": author, "Description": oligo_description}

# Fields shared by every part row.
# NOTE(review): the key here is "Author" while base_oligo uses "Author*" —
# confirm against the part sheet's column header.
base_part = {
    "Tags": tags,
    "Author": author,
    "Date*": date,
    "Species/codon usage*": species,
    "Reference": reference,
}

# New plasmids take origin and marker from the storage vector's row.
base_plasmid = {
    "Origin*": plasmids_df.loc[storage_vector_id, "Origin*"],
    "Marker*": plasmids_df.loc[storage_vector_id, "Marker*"],
}

# Fields shared by every strain row; the marker is that of the storage vector.
base_strain = {
    "Species*": species,
    "Background*": background_strain,
    "Parent*": background_strain,
    "Marker*": plasmids_df.loc[storage_vector_id, "Marker*"],
}

In [None]:
# Next free ID, as (prefix, number), and the insertion row for each collection sheet.
(oligo_prefix, oligo_num), oligo_row = workflow.get_next_collection_id(oligo0_sheet)
(plasmid_prefix, plasmid_num), plasmid_row = workflow.get_next_collection_id(
    plasmid_sheet
)
(strain_prefix, strain_num), strain_row = workflow.get_next_collection_id(strain_sheet)
part_row = workflow.get_next_empty_row(part_sheet)
# Accumulators filled by the loop below.
parts = []
oligos = []
plasmids = []
# Maps "<plasmid id>.gbk" to its GenBank content and mimetype; this rebinds the
# Drive listing stored under the same name in the Setup section.
plasmid_maps = {}
strains = []


def _format_seq(seq):
    """Return ``sequence.get_seq(seq)`` converted to a lowercase string."""
    raw_seq = sequence.get_seq(seq)
    return str(raw_seq).lower()


# Build the part, strain, plasmid (plus GenBank map) and oligo records for every
# sequence to order, advancing the strain/plasmid/oligo counters as IDs are used.
# NOTE(review): this reads item["final_seq"], item["optimized_seq"] and
# `sigma_subset_info`, none of which are set in the visible cells — confirm that
# the cells defining them run first.
for item in sequences_to_order.values():
    name = item["name"]
    kind = item["kind"]
    is_promoter = kind == "promoter"
    seq = item["final_seq"]
    plasmid_id = f"{plasmid_prefix}{plasmid_num}"
    first_oligo_id = f"{oligo_prefix}{oligo_num}"
    # Promoters are two annealed oligos (top=bottom); sigmas/antisigmas are a single
    # fragment, so only one oligo ID may be referenced for them (the next number
    # belongs to the following item).
    if is_promoter:
        insert_id = f"{first_oligo_id}={oligo_prefix}{oligo_num+1}"
    else:
        insert_id = first_oligo_id
    # description
    row = sigma_subset_info.loc[
        sigma_subset_info.loc[:, kind.capitalize()] == name
    ].iloc[0]
    # Lines are joined explicitly so that source indentation does not end up
    # inside the description text.
    description_lines = [
        f"Sigma/antisigma/promoter: {row['Sigma']}/{row['Antisigma']}/{row['Promoter']}",
        f"Sigma/antisigma fold change at max induction: {row['Sigma max']:.0f}x / {row['Antisigma max']:.0f}x",
        f"Sigma/antisigma growth rate: {row['Sigma growth']:.0f}% / {row['Antisigma growth']:.0f}%",
    ]
    if not is_promoter:
        description_lines.append("CDS with double stop codon.")
    description = "\n".join(description_lines)
    # part
    part_digest = golden_gate.re_digest(seq, part_enzyme, linear=True)
    part_seq, overhang1, overhang2 = part_digest[0]
    usage = f"{plasmid_id}/{part_enzyme.__name__}"
    if is_promoter:
        usage += f",{insert_id}/{part_enzyme.__name__}"
    part = {
        "Name*": name,
        "Usage*": usage,
        "Upstream overhang*": _format_seq(overhang1[0]),
        "Downstream overhang*": _format_seq(overhang2[0]),
        "Sequence*": _format_seq(overhang1[0] + part_seq + overhang2[0]),
        "Description": description,
        **base_part,
    }
    parts.append(part)
    # strain
    strain = {
        "ID*": f"{strain_prefix}{strain_num}",
        "Names": name,
        "Plasmid(s)*": plasmid_id,
        **base_strain,
    }
    strains.append(strain)
    strain_num += 1
    # plasmid map
    to_join = [
        (seq, storage_enzyme),
        (storage_vector_seq, storage_enzyme),
    ]
    plasmid_map = golden_gate.assemble(to_join, linear=False)
    filename = f"{plasmid_id}.gbk"
    content = plasmid_map.format("genbank")
    plasmid_maps[filename] = {
        "content": content,
        "mimetype": "chemical/seq-na-genbank",
    }
    # plasmid
    command = f"@GG({insert_id}/{storage_enzyme.__name__}, {storage_vector_id}/{storage_enzyme.__name__})"
    if is_promoter:
        construction_notes = f"{storage_enzyme.__name__} golden gate of annealed oligos {insert_id} into storage vector {storage_vector_id}."
    else:
        construction_notes = f"{storage_enzyme.__name__} golden gate of {insert_id} into storage vector {storage_vector_id}."
    plasmid = {
        "Command": command,
        "ID*": plasmid_id,
        "Names": name,
        "Description": description,
        "Size (bp)": len(plasmid_map),
        "Construction Notes": construction_notes,
        "Confirmation Notes": confirmation_notes,
        **base_plasmid,
    }
    plasmids.append(plasmid)
    plasmid_num += 1
    # oligo
    # for promoters, this only records the first (top) annealed oligo
    item["oligo_id"] = first_oligo_id
    if is_promoter:
        for strand, oligo_seq in [("top", seq), ("bottom", seq.reverse_complement())]:
            oligo = {
                "ID*": f"{oligo_prefix}{oligo_num}",
                "Name": f"{name}_{strand}",
                "Vendor*": "Genewiz",
                "Type": "Primer",
                "Sequence*": _format_seq(oligo_seq),
                **base_oligo,
            }
            oligos.append(oligo)
            oligo_num += 1
    else:
        oligo = {
            "ID*": f"{oligo_prefix}{oligo_num}",
            "Name": f"{name}",
            "Vendor*": "Twist",
            "Type": "Twist Gene Fragment",
            "Sequence*": _format_seq(item["optimized_seq"]),
            **base_oligo,
        }
        oligos.append(oligo)
        oligo_num += 1

## Genewiz sequences to order

In [None]:
from itertools import product, repeat

# Tab-separated "ID<TAB>sequence" lines for the Genewiz order ("." in IDs becomes "_").
# Well positions are not part of this output, so iterate over the oligos directly;
# zipping with a well iterator would silently drop oligos once the wells run out.
for oligo in oligos:
    if oligo["Vendor*"] == "Genewiz":
        print(f"{oligo['ID*'].replace('.', '_')}\t{oligo['Sequence*']}")

## Twist sequences to order

In [None]:
# Tab-separated "oligo ID<TAB>sequence" lines for the Twist order: every
# non-promoter item, printed with its optimized sequence.
for item in sequences_to_order.values():
    if item["kind"] == "promoter":
        continue
    print(f"{item['oligo_id']}\t{_format_seq(item['optimized_seq'])}")

In [None]:
import pickle

# Snapshot of the generated records together with the sheet rows they target.
data = dict(
    oligos=oligos,
    plasmids=plasmids,
    plasmid_maps=plasmid_maps,
    strains=strains,
    parts=parts,
    sequences_to_order=sequences_to_order,
    oligo_row=oligo_row,
    plasmid_row=plasmid_row,
    strain_row=strain_row,
    part_row=part_row,
)
# NOTE(review): the file content is a pickle even though the name ends in ".json".
with open("201013voigtsigmas.json", "wb") as f:
    f.write(pickle.dumps(data))

In [None]:
# Write the new plasmid rows into the plasmid sheet, starting at plasmid_row.
api.google.insert_sheet_rows(plasmid_sheet, plasmid_row, plasmids)

In [None]:
# Write the new strain rows.
api.google.insert_sheet_rows(strain_sheet, strain_row, strains)

In [None]:
# Write the new oligo rows into the special oLIB0.x worksheet.
# NOTE(review): `parts` / `part_row` are computed above but no visible cell
# inserts them into part_sheet — confirm whether that is intentional.
api.google.insert_sheet_rows(oligo0_sheet, oligo_row, oligos)

In [None]:
# Upload the generated GenBank maps to the plasmid-maps Drive folder.
workflow.upload_plasmid_maps(drive_service, plasmid_maps, plasmid_folder)

In [None]:
# Filenames of the uploaded plasmid maps.
plasmid_maps.keys()