In [None]:
import numpy as np
import pandas as pd
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import seaborn as sns
import toml
import re
from zipfile import ZipFile
import urllib
from datetime import datetime
import string
import pygsheets
import requests
from tqdm.auto import tqdm
import Bio.Restriction as Restriction
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import Bio.pairwise2 as pairwise2
import benchlingapi

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
import paulssonlab.api.benchling as bapi
from paulssonlab.api.util import base_url
import paulssonlab.cloning.registry as registry
import paulssonlab.cloning.workflow as workflow
import paulssonlab.cloning.sequence as sequence

In [None]:
# Select Bokeh as the HoloViews/hvplot rendering backend for this notebook.
hv.extension("bokeh")

# Setup

In [None]:
# Notebook settings are read from config.toml in the working directory; the
# "benchling" and "registry" tables are consumed by the cells below.
config = toml.load("config.toml")

In [None]:
# Google Sheets client authorised with the service-account key in credentials.json.
# Note: `gc` here is the pygsheets client, not the standard-library gc module.
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Benchling session created from the API key in config.toml, and the project
# root returned by bapi.get_project_root for the configured project.
bench_session = benchlingapi.Session(config["benchling"]["api_key"])
benchling_folder = bapi.get_project_root(bench_session, config["benchling"]["project"])

In [None]:
# Registry handle built from the Sheets client, the configured registry folder
# and the Benchling folder. It is indexed later as reg[(prefix, kind)].
reg = registry.Registry(gc, config["registry"]["folder"], benchling_folder)

# Data

## 2013 Chen

In [None]:
# Fetch the two supplementary spreadsheets for doi:10.1038/nmeth.2515 into
# data/2013terminators_supp/ (saved as supptable2.xlsx and supptable3.xlsx).
# -L follows redirects, -o names the output file.
# NOTE(review): `-b does_not_exist` presumably just switches on curl's cookie
# handling without loading any cookies — confirm it is still needed.
!mkdir -p data/2013terminators_supp
!curl -b does_not_exist -Lo data/2013terminators_supp/supptable2.xlsx "https://static-content.springer.com/esm/art%3A10.1038%2Fnmeth.2515/MediaObjects/41592_2013_BFnmeth2515_MOESM206_ESM.xlsx"
!curl -b does_not_exist -Lo data/2013terminators_supp/supptable3.xlsx "https://static-content.springer.com/esm/art%3A10.1038%2Fnmeth.2515/MediaObjects/41592_2013_BFnmeth2515_MOESM207_ESM.xlsx"

In [None]:
# Load both supplementary tables, using the first spreadsheet column as the index.
natural_terminators = pd.read_excel(
    "data/2013terminators_supp/supptable2.xlsx", index_col=0
)
synthetic_terminators = pd.read_excel(
    "data/2013terminators_supp/supptable3.xlsx", index_col=0
)
# Add a "Length" column (character count of "Sequence") to each table in place.
for terminators in (natural_terminators, synthetic_terminators):
    terminators["Length"] = terminators["Sequence"].str.len()

## 2020 Park

In [None]:
# FROM: used https://www.adobe.com/acrobat/online/pdf-to-excel.html to convert
# Table S1 (p. 23) from https://www.embopress.org/action/downloadSupplement?doi=10.15252%2Fmsb.20209584&file=msb209584-sup-0001-AppendixFig.pdf

voigt_bidirectional_terms_tsv = """
name	sequence	Ts_forward	Ts_reverse
DT3	"CCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTT"		3000				120		
DT5	"TCCGGCAATTAAAAAAGCGGCTAACCACGCCGCTTTTTTTACGTCTGCACTCGGTACCAAATTCCAGAAAAGAGGCCTCCCGAAAGGGGGGCCTTTTTTCGTTTTGGTCC"		4700				50		
DT19	TTCAGCCAAAAAACTTAAGACCGCCGGTCTTGTCCACTACCTTGCAGTAATGCGGTGGACAGGATCGGCGGTTTTCTTTTCTCTTCTCAACTCGGTACCAAAGACGAACAATAAGACGCTGAAAAGCGTCTTTTTTCGTTTTGGTCC	770				1.2			
DT34	GCTGATGCCAGAAAGGGTCCTGAATTTCAGGGCCCTTTTTTTACATGGATTGCTCGGTACCAAATTCCAGAAAAGAGACGCTTTCGAGCGTCTTTTTTCGTTTTGGTCC	570				1.4			
DT36	GATCTAACTAAAAAGGCCGCTCTGCGGCCTTTTTTCTTTTCACTGTAACAACGGAAACCGGCCATTGCGCCGGTTTTTTTTGGCCT	680				3.2			
DT42	"AGTTAACCAAAAAGGGGGGATTTTATCTCCCCTTTAATTTTTCCTCGCAGATAGCAAAAAAGCGCCTTTAGGGCGCTTTTTTACATTG
GTGG"	2500				2.2			
DT54	"GGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCC"		1800				30		
DT56	TACCACCGTCAAAAAAAACGGCGCTTTTTAGCGCCGTTTTTATTTTTCAACCTTCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTC	240				11			
DT60	ACATTTAATAAAAAAAGGGCGGTCGCAAGATCGCCCTTTTTTACGTATGACACAGTGAAAAATGGCGCCCATCGGCGCCATTTTTTTATG	110				29			
DT65	TGCTCGTACCAGGCCCCTGCAATTTCAACAGGGGCCTTTTTTTATCCAATTCCATCGGGTCCGAATTTTCGGACCTTTTCTCCGC	400				1.0			
DT82	"CTTATTCCATAACAAAGCCGGGTAATTCCCGGCTTTGTTGTATCTGAACAATAAATGGATGCCCTGCGTAAGCGGGGCATTTTTCTTCCT"	170				2.8			
DT83	AGCGTCAAAAGGCCGGATTTTCCGGCCTTTTTTATTAGGCAGCATGCTGCCAGGTGATCCCCCTGGCCACCTCTTTT	600				4.4			
DT86	TAATCATTCTTAGCGTGACCGGGAAGTCGGTCACGCTACCTCTTCTGAAGAAACAGCAAACAATCCAAAACGCCGCGTTCAGCGGCGTTTTTTCTGCTTTTCT	210				0.4			
DT100	"GTGAAGTGAAAAATGGCGCACATTGTGCGCCATTTTTTTTGTCTGCCGTTTACCGCTTCTCTGAAAATCAACGGGCAGGTCACTGACTTGCCCGTTTTTTTATCCCTTCTCCACACCG"	4700				12			
DT101	"TCTTTAAAAAGAAACCTCCGCATTGCGGAGGTTTCGCCTTTTGATACTCTGTCTGAAGTAATTCTTGCCGCAGTGAAAAATGGCGCCCATCGGCGCCATTTTTTTATGCTTCCATTAGAAAGCAAAAAGCCTGCTAGAAAGCAGGCTTTTTTGAATTTGGCTCCTCTGAC"		2800				160		
DT103	"AAAGTTCTGAAAAAGGGTCACTTCGGTGGCCCTTTTTTATCGCCACGGTTTGAGCAGTGCACTTGCTTAAAATCCCGCCAGCGGCGGGATTTTTTATTGTCCGGTTTAAGACA"	790				4.0			
DT104	"GCAGACAAAAAAAATGGCGCACAATGTGCGCCATTTTTCACTTCACAGGTACTATTGTTTTGAATTGAAAAGGGCGCTTCGGCGCCCTTTTTGCATTTGTTGACGGCATATATTTGTATATCGAAGCGCCCTGATGGGCGCTTTTTTTATTTAATCGATAACCAGA"		580				101		
"""

import io

# Parse the embedded table into a DataFrame indexed by terminator name (the
# first column). Fields are split on runs of whitespace.
# The separator is a regular expression, so it is written as a raw string:
# "\s" inside a normal string literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
# NOTE(review): with a regex separator pandas may not honour the double quotes
# around some sequences (one entry, DT42, is split across two lines) — inspect
# the parsed frame before relying on it.
voigt_bidirectional_terms = pd.read_csv(
    io.StringIO(voigt_bidirectional_terms_tsv), sep=r"\s+", index_col=0
)

## 2019 Hudson

In [None]:
# Download the supplementary-data archive to data/2019hudson_supp.zip.
# NOTE(review): the URL carries Expires/Signature/Key-Pair-Id query parameters,
# and the Expires value is a Unix timestamp in July 2021. It is presumably a
# time-limited signed link that no longer works; a fresh link (or a cached copy
# of the zip) is probably needed to re-run this cell.
# NOTE(review): the directory created here is not the path curl writes to.
!mkdir -p data/2019hudson_supp
!curl -b does_not_exist -Lo data/2019hudson_supp.zip "https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/synbio/4/1/10.1093_synbio_ysz026/2/ysz026_supplementary_data.zip?Expires=1625620668&Signature=ce7~qRkVxcwAzjs98YDSNU03XBjhxBoyL6Mr2RsQEordmOg8N8Fh5u0trhiAxYSZtzbF~U~x2DjUG5dEYLjtMiF-YTejDIOMqtPdmVd6-n4WK5wn02j8JS3whJt57SL6fCAYAlcBiVKPyPH-tTyctP84SK6v8~zjS07hmob6LY-MmBt-XDgaTMiks6Pqkw3yUfnaUZP4IbXnO0Nt~p5uFlE~iStyoaTz~Y4uGvLSjcA2YUmiCvUVQOlNVcXwPMpHkot6CnD0~ZhKMxE8w4dYhGs4DJk2xsrecIfKHS1WCWzyd6N20lAB43hHQNQEKr61oNSazYD9rCRfGIckt5Uepg__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA"

In [None]:
# Read Supplemental Table 3A straight out of the downloaded archive.
# NOTE: the handle name `zip` rebinds the built-in zip() at notebook scope.
with ZipFile("data/2019hudson_supp.zip") as zip:
    with zip.open(
        "Supplemental Table 3A - TermSeq FACS Binning Data_allover200reads.csv"
    ) as f:
        # Column names come from the third row of the file; "," is treated as
        # a thousands separator and "#DIV/0!" cells are read as NaN.
        hudson_terminators = pd.read_csv(
            f, header=[2], thousands=",", na_values=["#DIV/0!"]
        )
# Missing "Ave TS" values are replaced with +inf.
hudson_terminators.loc[pd.isnull(hudson_terminators["Ave TS"]), "Ave TS"] = np.inf
# These columns hold strings ending in "%"; strip the sign and rescale to fractions.
for col in ("Ave", "SD", "CI (95% CL)"):
    hudson_terminators[col] = (
        pd.to_numeric(hudson_terminators[col].str.rstrip("%")) / 100
    )
hudson_terminators.rename(
    {"Terminator Sequence (5' to 3')": "Sequence"}, axis=1, inplace=True
)
# Reverse complement of every sequence, as a plain string, for the self-join below.
hudson_terminators["Sequence_rev"] = hudson_terminators["Sequence"].map(
    lambda x: str(sequence.reverse_complement(x))
)
# Self-join: look each row's reverse complement up among the forward sequences
# and attach that row's statistics with a "_rev" suffix. Rows whose reverse
# complement is not in the table get NaN in the "_rev" columns.
hudson_terminators = hudson_terminators.join(
    hudson_terminators[
        ["Sequence", "Ave TS", "Ave", "SD", "SD/AVE", "Term ID"]
    ].set_index("Sequence"),
    on="Sequence_rev",
    rsuffix="_rev",
)
# Row-wise minimum of the forward and reverse values.
hudson_terminators["Ave TS_min"] = hudson_terminators[["Ave TS", "Ave TS_rev"]].min(
    axis=1
)
hudson_terminators["Ave_min"] = hudson_terminators[["Ave", "Ave_rev"]].min(axis=1)
# Keep rows with no reverse partner, or whose "Ave" is at least the partner's.
# NOTE(review): with ">=", both orientations of a pair survive when their "Ave"
# values are equal. The third clause tests isinf on "Ave", but +inf was assigned
# to "Ave TS" above; as written, any row it matches already passes one of the
# first two clauses. Confirm whether the Term ID tie-break was meant to apply
# to ties (or to "Ave TS").
hudson_terminators = hudson_terminators[
    pd.isnull(hudson_terminators["Ave_rev"])
    | (hudson_terminators["Ave"] >= hudson_terminators["Ave_rev"])
    | (
        (np.isinf(hudson_terminators["Ave"]))
        & (hudson_terminators["Term ID"] >= hudson_terminators["Term ID_rev"])
    )
].copy()
# From here on rows are addressed by Term ID.
hudson_terminators.set_index("Term ID", inplace=True)

# Voigt bidirectional terms

In [None]:
# Keep only the terminators whose reverse-direction strength ("Ts_reverse")
# is at least 10.
selected_bidirectional_terms = voigt_bidirectional_terms[
    voigt_bidirectional_terms["Ts_reverse"] >= 10
]
# Later cells (homology matrices, candidate pools, oligo orders) read this
# selection under the name `selected_terms`, which no cell assigns. Bind it
# here so the notebook runs top to bottom on a fresh kernel.
# NOTE(review): confirm this is the table those cells were written against.
selected_terms = selected_bidirectional_terms

In [None]:
selected_bidirectional_terms

# Hudson bidirectional terms

In [None]:
# Terminators whose reverse complement was also measured (non-null
# "Ave TS_rev"), ranked by "Ave_min" with the highest first.
hudson_bidirectional = hudson_terminators[
    ~pd.isnull(hudson_terminators["Ave TS_rev"])
].copy()
hudson_bidirectional.sort_values("Ave_min", ascending=False, inplace=True)

In [None]:
hudson_bidirectional

In [None]:
# Scatter of forward "Ave TS" against reverse "Ave TS_rev" for every row.
hudson_bidirectional.plot.scatter("Ave TS", "Ave TS_rev")

In [None]:
# Scatter of forward "Ave" against reverse "Ave_rev", restricted to rows whose
# "Ave_min" exceeds 0.995.
hudson_bidirectional[hudson_bidirectional["Ave_min"] > 0.995].plot.scatter(
    "Ave", "Ave_rev"
)

In [None]:
# "Ave TS" against log10(1 - "Ave"); rows with "Ave" equal to 1 evaluate
# log10(0), i.e. -inf.
plt.scatter(hudson_bidirectional["Ave TS"], np.log10(1 - hudson_bidirectional["Ave"]))

In [None]:
%%time
# Pairwise similarity matrix over all bidirectional Hudson sequences.
# NOTE(review): map_pairwise and similarity are defined further down, under
# "# Homology"; on a fresh kernel this cell raises NameError unless those
# definitions are run first.
hudson_sim = map_pairwise(
    similarity, hudson_bidirectional["Sequence"], hudson_bidirectional["Sequence"]
)

In [None]:
%%time
# Walk the ranked sequences and pick 2 of them with a similarity threshold of
# 0.7 and no pre-existing sequences to avoid.
# NOTE(review): get_nonrepetitive and similarity are defined further down the
# notebook; on a fresh kernel this cell raises NameError unless those
# definitions are run first.
selected_hudson = get_nonrepetitive(
    similarity, 0.7, hudson_bidirectional["Sequence"], [], 2
)

In [None]:
selected_hudson

In [None]:
# Show three terminators by Term ID; T97 and T540 are the ones assigned to the
# backbone insulation sites below.
hudson_bidirectional.loc[["T799", "T97", "T540"]]

In [None]:
# Oligo collections from the registry: oLIB is read for flank sequences,
# oLT receives the new rows created below.
olib = reg[("oLIB", "oligos")]
olt = reg[("oLT", "oligos")]

In [None]:
# One (first, second) flank pair per insulation site, taken from the
# "Sequence" field of oLIB179-oLIB182. The first element of each pair is
# reverse-complemented; the second is used as stored.
backbone_flanks = {
    "upstream": (
        sequence.reverse_complement(olib["oLIB179"]["Sequence"]),
        olib["oLIB180"]["Sequence"],
    ),
    "downstream": (
        sequence.reverse_complement(olib["oLIB181"]["Sequence"]),
        olib["oLIB182"]["Sequence"],
    ),
}

# Hudson Term ID placed at each site; keys match backbone_flanks.
backbone_insulation_terminators = {
    "upstream": "T97",
    "downstream": "T540",
}

In [None]:
# Today's date as day/month/year without zero padding. It is built from the
# datetime fields rather than strftime("%-d/%-m/%Y"), because the "-" flag is
# a platform-specific extension that is not available everywhere.
date = "{0.day}/{0.month}/{0.year}".format(datetime.now())

# Fields shared by every oligo row created in this cell.
base_row = {
    "Author": "Jacob Quinn Shenker",
    "Vendor": "IDT",
    "Type": "Primer",
    "Date": date,
    "Order date": date,
}

rows = []

# For each insulation site, build the flanked terminator and emit one row for
# the sense strand and one for the antisense strand.
for orientation, terminator in backbone_insulation_terminators.items():
    # Measurements for this terminator; identical for both strands, so looked
    # up once per site.
    term = hudson_bidirectional.loc[terminator]
    seq = hudson_bidirectional.loc[terminator, "Sequence"]
    if orientation == "upstream":
        # The upstream site takes the terminator reverse-complemented.
        seq = sequence.reverse_complement(seq)
    seq = workflow.add_flanks(seq, [backbone_flanks[orientation]])
    for antisense in (False, True):
        if antisense:
            oligo_seq = sequence.reverse_complement(seq)
        else:
            oligo_seq = seq
        oligo_seq = str(oligo_seq)
        sense_str = "antisense" if antisense else "sense"
        name = f"JUMP_{orientation}_{terminator}_{sense_str}"
        description = "2019 Hudson terminator {sense_str} oligo.\nTerminator efficiency: {te:.4f} / {te_rev:.4f} (reverse)\nTerminator strength (TS): {ts:.0f} / {ts_rev:.0f} (reverse)".format(
            sense_str=sense_str,
            te=term["Ave"],
            te_rev=term["Ave_rev"],
            ts=term["Ave TS"],
            ts_rev=term["Ave TS_rev"],
        )
        row = {
            "Name": name,
            "Sequence": oligo_seq,
            "Description": description,
            **base_row,
        }
        rows.append(row)

# Store each row under a freshly requested ID.
# NOTE(review): every execution of this cell requests new IDs, so re-running it
# presumably adds the same oligos again — confirm before re-running.
for row in rows:
    olt[olt.next_id()] = row

In [None]:
# Save the oLT collection after the rows assigned above.
# NOTE(review): presumably this writes to the shared registry — confirm before
# re-running.
olt.save()

In [None]:
rows

In [None]:
print(rows[-1]["Description"])

In [None]:
# NOTE(review): `upstream_backbone_insulation` is not assigned anywhere in the
# visible notebook; this cell raises NameError on a fresh kernel.
str(upstream_backbone_insulation)

In [None]:
# NOTE(review): `downstream_backbone_insulation` is not assigned anywhere in
# the visible notebook; this cell raises NameError on a fresh kernel.
str(downstream_backbone_insulation)

In [None]:
hudson_bidirectional[["Ave TS", "Ave TS_rev"]][:20]

In [None]:
show_heatmap(hudson_sim)

# Homology

In [None]:
synthetic_terminators

In [None]:
# First three synthetic terminator sequences, used by the scratch alignment
# cells at the end of this section.
ts = synthetic_terminators["Sequence"][:3]

In [None]:
def align(a, b):
    """Return the local alignments of `a` against `b` from pairwise2.

    Scoring is match +1 and mismatch -0.5, with gap-open and gap-extend
    penalties of -0.5 each. Callers take element 0 of the result.
    """
    match_score, mismatch_score = 1, -0.5
    gap_open, gap_extend = -0.5, -0.5
    return pairwise2.align.localms(
        a, b, match_score, mismatch_score, gap_open, gap_extend
    )


def show_alignment(a, b):
    """Print the first alignment of `a` against `b`, showing full sequences."""
    first_alignment = align(a, b)[0]
    rendered = pairwise2.format_alignment(*first_alignment, full_sequences=True)
    print(rendered)


def homology(a, b):
    """Return the better local-alignment score of `a` against `b` in either orientation.

    `a` is aligned against `b` as given and against the reverse complement of
    `b`; the larger score of the first alignment in each case is returned.
    """
    orientations = (b, str(sequence.reverse_complement(b)))
    return max(align(a, target)[0].score for target in orientations)


def min_length(a, b):
    """Return the length of the shorter of `a` and `b`."""
    len_a, len_b = len(a), len(b)
    return len_a if len_a <= len_b else len_b


def similarity(a, b):
    """Return the homology score of `a` and `b` divided by the shorter length."""
    score = homology(a, b)
    shorter = min_length(a, b)
    return score / shorter

In [None]:
def map_pairwise(func, xs, ys):
    """Evaluate `func` on every pair drawn from two Series.

    Parameters
    ----------
    func : callable
        Called as ``func(x, y)``; its result is stored in a float matrix.
    xs, ys : pandas.Series
        Row and column inputs. Their indexes label the result.

    Returns
    -------
    pandas.DataFrame
        ``len(xs)`` by ``len(ys)`` frame indexed by ``xs.index`` with columns
        ``ys.index``.

    Notes
    -----
    When ``xs.equals(ys)`` only the pairs below the diagonal are evaluated and
    each value is mirrored above it, so `func` is assumed to be symmetric. The
    diagonal is left at 0 in that case.
    """
    mat = np.zeros((len(xs), len(ys)))
    # Elements are fetched by position with .iloc. Plain xs[i] is a label
    # lookup, which only worked on label-indexed Series through pandas'
    # deprecated positional fallback and fails on integer-labelled indexes.
    if xs.equals(ys):
        for i in range(len(xs)):
            for j in range(i):
                mat[i, j] = mat[j, i] = func(xs.iloc[i], ys.iloc[j])
    else:
        for i in range(len(xs)):
            for j in range(len(ys)):
                mat[i, j] = func(xs.iloc[i], ys.iloc[j])
    return pd.DataFrame(mat, index=xs.index, columns=ys.index)

In [None]:
def show_heatmap(df):
    """Display `df` with an "RdPu" background gradient scaled over the whole frame.

    The pandas row and column display limits are lifted while displaying.
    """
    no_limits = ("display.max_rows", None, "display.max_columns", None)
    with pd.option_context(*no_limits):
        styled = df.style.background_gradient(cmap="RdPu", axis=None)
        display(styled)

In [None]:
%%time
# Raw homology score of every synthetic terminator (rows) against every
# selected terminator (columns).
hmat = map_pairwise(
    homology, synthetic_terminators["Sequence"], selected_terms["sequence"]
)

In [None]:
# Length of the shorter sequence for each synthetic/selected pair.
lengths = map_pairwise(
    min_length, synthetic_terminators["Sequence"], selected_terms["sequence"]
)

In [None]:
# Homology scores divided element-wise by the shorter-sequence lengths.
nhmat = hmat / lengths

In [None]:
import seaborn as sns

In [None]:
synthetic_terminators.columns

In [None]:
# "Average Strength" against "Length", synthetic terminators first and natural
# terminators overlaid on the same axes.
plt.scatter(synthetic_terminators["Average Strength"], synthetic_terminators["Length"])
plt.scatter(natural_terminators["Average Strength"], natural_terminators["Length"])

In [None]:
def get_terminator(terminators, names):
    """Concatenate the sequences named in `names`, in order.

    `terminators` is looked up with ``.loc``. A name ending in "_r" selects
    the entry without that suffix and contributes its reverse complement as a
    string; any other name contributes the stored entry as is.
    """
    seq = ""
    for name in names:
        if not name.endswith("_r"):
            seq = seq + terminators.loc[name]
            continue
        forward = terminators.loc[name[:-2]]
        seq = seq + str(sequence.reverse_complement(forward))
    return seq

In [None]:
get_terminator(synthetic_terminators["Sequence"], ["L3S2P22_r", "L3S2P21", "L3S2P56"])

In [None]:
# NOTE(review): `_` is IPython's "last output" variable, so this value depends
# on which cell was executed immediately before this one.
len(_)

In [None]:
get_terminator(
    synthetic_terminators["Sequence"], ["L3S1P56_r", "L3S2P22_r", "L3S2P21", "L3S2P56"]
)

In [None]:
# NOTE(review): `_` is IPython's "last output" variable, so this value depends
# on which cell was executed immediately before this one.
len(_)

In [None]:
# The 50 synthetic terminators with the highest "Average Strength", keeping
# only the strength, length and sequence columns.
strong_synthetic = synthetic_terminators.sort_values(
    "Average Strength", ascending=False
)[["Average Strength", "Length", "Sequence"]][:50]
strong_synthetic

In [None]:
# Synthetic and natural tables stacked, then the 60 rows with the highest
# "Average Strength".
all_terminators = pd.concat((synthetic_terminators, natural_terminators))
strong_all = all_terminators.sort_values("Average Strength", ascending=False)[
    ["Average Strength", "Length", "Sequence"]
][:60]
strong_all

In [None]:
# One Series holding the strong synthetic sequences followed by the selected
# terminators' sequences.
pool = pd.concat((strong_synthetic["Sequence"], selected_terms["sequence"]))

In [None]:
%%time
pool_sim = map_pairwise(similarity, pool, pool)

In [None]:
from itertools import chain


def get_nonrepetitive(similarity_func, max_similarity, candidates, others, num):
    """Greedily pick `num` candidates that are mutually dissimilar.

    Candidates are visited once, in order. A candidate is accepted when its
    highest similarity to every previously accepted candidate and to every
    element of `others` does not exceed `max_similarity`.

    Parameters
    ----------
    similarity_func : callable
        Called as ``similarity_func(candidate, other)``; returns a number.
    max_similarity : number
        Largest similarity still considered acceptable (inclusive).
    candidates : pandas.Series
        Ordered pool to choose from; its name and index are carried over.
    others : iterable
        Sequences the picks must also be dissimilar to (may be empty). It is
        iterated once per candidate, so it must be re-iterable.
    num : int
        Number of candidates to accept.

    Returns
    -------
    pandas.Series
        The accepted candidates, indexed by their original index labels.

    Raises
    ------
    ValueError
        If the pool is exhausted before `num` candidates are accepted.
    """
    accepted = []
    accepted_keys = []
    idx = 0
    for i in range(num):
        while idx < len(candidates):
            candidate = candidates.iloc[idx]
            key = candidates.index[idx]
            # Consume the candidate whether or not it is accepted. Previously
            # an accepted candidate was examined again on the next pick, and
            # was accepted a second time whenever its similarity to itself was
            # within the threshold.
            idx += 1
            # With nothing to compare against, the similarity counts as 0.
            worst = max(
                (similarity_func(candidate, seq) for seq in chain(accepted, others)),
                default=0,
            )
            if worst <= max_similarity:
                accepted.append(candidate)
                accepted_keys.append(key)
                break
        else:
            raise ValueError(f"ran out of candidates after accepting {i}")
    series = pd.Series(accepted, name=candidates.name, index=accepted_keys)
    series.index.name = candidates.index.name
    return series

In [None]:
# NOTE(review): pandas exposes no module-level `append`, so this scratch cell
# presumably raises AttributeError; its intent is unclear. Consider removing it.
pd.append(strong_synthetic["Sequence"][[0]])

In [None]:
%%time
# Pick 8 sequences from the strongest terminators (synthetic and natural) with
# a similarity threshold of 0.7, also comparing against the selected
# terminators' sequences.
selected_pool = get_nonrepetitive(
    similarity, 0.7, strong_all["Sequence"], selected_terms["sequence"], 8
)

In [None]:
show_alignment(selected_pool["L3S1P56"], selected_pool["L3S3P00"])

In [None]:
show_alignment(selected_pool["L3S3P47"], selected_pool["L3S3P00"])

In [None]:
show_alignment(selected_pool["L3S3P47"], selected_pool["L3S3P41"])

In [None]:
%%time
show_heatmap(map_pairwise(similarity, selected_pool, selected_pool))

In [None]:
%%time
# NOTE(review): same call as the previous cell; one of the two can be removed.
show_heatmap(map_pairwise(similarity, selected_pool, selected_pool))

In [None]:
show_heatmap(pool_sim)

In [None]:
%%time
# Rebinds `hmat`: homology of the strong synthetic terminators against
# themselves. The inputs are equal, so only one triangle is computed and the
# diagonal stays 0.
hmat = map_pairwise(
    homology, strong_synthetic["Sequence"], strong_synthetic["Sequence"]
)

In [None]:
# NOTE(review): `lengths` is computed for synthetic terminators against the
# selected terminators, while the cell above builds `hmat` from the strong
# synthetic terminators against themselves. The division aligns on row and
# column labels, so the two frames do not describe the same pairs — confirm
# which inputs were intended.
lengths = map_pairwise(
    min_length, synthetic_terminators["Sequence"], selected_terms["sequence"]
)
nhmat = hmat / lengths

In [None]:
# NOTE(review): looks up the empty string as a terminator name; unless the
# table has an entry labelled "", this raises KeyError. Presumably an
# unfinished scratch cell.
get_terminator(synthetic_terminators["Sequence"], [""])

In [None]:
# Show the normalised homology matrix with an orange gradient and the pandas
# row/column display limits lifted. Unlike show_heatmap, no `axis=None` is
# passed to background_gradient here.
cm = sns.light_palette("orange", as_cmap=True)
with pd.option_context(
    "display.max_rows", None, "display.max_columns", None
):  # more options can be specified also
    display(nhmat.style.background_gradient(cmap=cm))

In [None]:
hmat.idxmin()

In [None]:
hmat["DT5"].values

In [None]:
# Local alignment (pairwise2 localxs, gap penalties -0.5 / -0.5) of synthetic
# terminator L1U8H11 against selected terminator DT5; prints the first alignment.
a = synthetic_terminators.loc["L1U8H11"]["Sequence"]
b = selected_terms.loc["DT5"]["sequence"]
alignments = pairwise2.align.localxs(a, b, -0.5, -0.5)
print(pairwise2.format_alignment(*alignments[0]))

In [None]:
# Same alignment for synthetic terminator L3S2P21 against DT5, printed with the
# full sequences shown.
a = synthetic_terminators.loc["L3S2P21"]["Sequence"]
b = selected_terms.loc["DT5"]["sequence"]
alignments = pairwise2.align.localxs(a, b, -0.5, -0.5)
print(pairwise2.format_alignment(*alignments[0], full_sequences=True))

In [None]:
# Score of the first local alignment between the first and third sequences in
# `ts`. They are selected by position with .iloc: `ts` keeps the terminator
# table's index labels, and integer keys in plain [] only worked through
# pandas' deprecated positional fallback.
pairwise2.align.localxs(ts.iloc[0], ts.iloc[2], -0.5, -0.5)[0].score

In [None]:
# Print the first local alignment between the first and third sequences in
# `ts`. They are selected by position with .iloc: `ts` keeps the terminator
# table's index labels, and integer keys in plain [] only worked through
# pandas' deprecated positional fallback.
alignments = pairwise2.align.localxs(ts.iloc[0], ts.iloc[2], -0.5, -0.5)
print(pairwise2.format_alignment(*alignments[0]))

In [None]:
# Print the first local alignment between the first sequence in `ts` and the
# reverse complement of the third. They are selected by position with .iloc:
# `ts` keeps the terminator table's index labels, and integer keys in plain []
# only worked through pandas' deprecated positional fallback.
alignments = pairwise2.align.localxs(
    ts.iloc[0], str(sequence.reverse_complement(ts.iloc[2])), -0.5, -0.5
)
print(pairwise2.format_alignment(*alignments[0]))

In [None]:
# FROM: https://github.com/ViennaRNA/ViennaRNA/issues/64

# read DNA parameters
# The parameter file is taken from the active conda environment
# ($CONDA_PREFIX/share/ViennaRNA/dna_mathews2004.par), so CONDA_PREFIX must be set.
# NOTE(review): neither `RNA` nor `os` is imported in the visible notebook; on
# a fresh kernel this cell raises NameError until both are imported.
RNA.read_parameter_file(
    f"{os.environ['CONDA_PREFIX']}/share/ViennaRNA/dna_mathews2004.par"
)


def primer_secondary_structure(seq):
    """Return ``(mfe_monomer, mfe_homodimer)`` for `seq`.

    The first value is the second element of ``mfe()`` for `seq` folded on its
    own; the second is the same for the two-copy input ``"{seq}&{seq}"``.
    Both fold compounds share one default ``RNA.md()`` model.
    """
    model = RNA.md()
    monomer = RNA.fold_compound(seq, model)
    _structure, mfe_monomer = monomer.mfe()
    homodimer = RNA.fold_compound(f"{seq}&{seq}", model)
    _structure, mfe_homodimer = homodimer.mfe()
    return mfe_monomer, mfe_homodimer

# Old

## Makeshift oligo orders

In [None]:
def _format_seq(seq):
    """Return `seq`, passed through ``sequence.get_seq``, as a lower-case string."""
    text = str(sequence.get_seq(seq))
    return text.lower()

In [None]:
# Terminator names handled separately by the print loops below: they are
# skipped by the first loop and printed as sense/antisense pairs by the second.
NO_GBLOCK = ["DT3", "DT56", "DT60"]
# NO_GBLOCK = []

In [None]:
# Build the sequence to order for every selected terminator: the lower-cased
# terminator gets the two overhangs via workflow.add_overhangs, and the result
# is passed to workflow.add_flanks with `flanks` and then `random_bases`.
overhangs = ["aggt", "gctt"]
random_bases = (
    "GCTTCA",
    "TGCTAA",
)  # to add between BsmBI recognition site and ends of oligos
flanks = ("CGTCTCGGTCTCa", "tGAGACCgGAGACG")  # storage vector BsmBI flanks
# Maps terminator name to the fully flanked sequence.
seqs_to_order = {}
for term_name, row in selected_terms.iterrows():
    seq = row["sequence"]
    seq = workflow.add_flanks(
        workflow.add_overhangs(seq.lower(), overhangs),
        [flanks, random_bases],
    )
    seqs_to_order[term_name] = seq

In [None]:
seqs_to_order

In [None]:
# Print one tab-separated line (ID, name, lower-case sequence) per terminator
# that is NOT in NO_GBLOCK, sense strand only. IDs are `prefix` followed by a
# counter starting at 37.
# `prefix` and the final value of `id_num` are reused by the next cell.
prefix = "oLT"
id_num = 37
for term_name, seq in seqs_to_order.items():
    if term_name in NO_GBLOCK:
        continue
    # for sense in (False, True):
    for sense in (True,):
        if sense:
            oligo_seq = seq
        else:
            oligo_seq = sequence.reverse_complement(seq)
        id_ = f"{prefix}{id_num}"
        name = f"Voigt_{term_name}"
        print(f"{id_}\t{name}\t{_format_seq(oligo_seq)}")
        # print(f"{name}\t{_format_seq(oligo_seq)}")
        id_num += 1

In [None]:
# Print sense and antisense lines for the terminators that ARE in NO_GBLOCK.
# NOTE(review): numbering continues from whatever `id_num` currently holds, so
# the IDs printed here are only right when this cell runs exactly once,
# directly after the previous cell.
for term_name, seq in seqs_to_order.items():
    if term_name not in NO_GBLOCK:
        continue
    for sense in (True, False):
        if sense:
            oligo_seq = seq
        else:
            oligo_seq = sequence.reverse_complement(seq)
        id_ = f"{prefix}{id_num}"
        name = f"Voigt_{term_name}_{'sense' if sense else 'antisense'}"
        print(f"{id_}\t{name}\t{_format_seq(oligo_seq)}")
        id_num += 1