In [None]:
import numpy as np
import pandas as pd
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import seaborn as sns
import toml
import re
from zipfile import ZipFile
import urllib
from datetime import datetime
import string
import pygsheets
import requests
from tqdm.auto import tqdm
import Bio.Restriction as Restriction
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import Bio.Entrez as Entrez
import benchlingapi

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
import paulssonlab.api.benchling as bapi
from paulssonlab.api.util import base_url
import paulssonlab.cloning.registry as registry
import paulssonlab.cloning.workflow as workflow
import paulssonlab.cloning.sequence as sequence
import paulssonlab.cloning.enzyme as enzyme
import paulssonlab.cloning.viennarna as viennarna
import paulssonlab.cloning.thermodynamics as thermodynamics
import paulssonlab.cloning.primers as primers
import paulssonlab.cloning.ncbi as ncbi

In [None]:
hv.extension("bokeh")

# Setup

In [None]:
# Read notebook settings from config.toml in the working directory; later
# cells read its "benchling" and "registry" tables.
config = toml.load("config.toml")

In [None]:
# Authorize the pygsheets client with the service-account key file
# credentials.json (a secret: keep it out of version control).
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Open a Benchling API session with the key from config.toml, then look up
# the configured project's root through bapi.get_project_root.
bench_session = benchlingapi.Session(config["benchling"]["api_key"])
benchling_folder = bapi.get_project_root(bench_session, config["benchling"]["project"])

In [None]:
# Registry handle built from the Sheets client, the configured registry
# folder and the Benchling folder; the cells below index it with tuple keys.
reg = registry.Registry(gc, config["registry"]["folder"], benchling_folder)

# Primers to make FP parts

In [None]:
# Registry lookups used throughout this section. Keys look like
# (prefix, collection[, sheet]) -- inferred from usage; TODO confirm against
# registry.Registry.
plib_plasmids = reg[("pLIB", "plasmids")]
plib_maps = reg[("pLIB", "maps")]
lib_parts = reg[("LIB", "parts")]
part_types = reg[("LIB", "parts", "Part types")]

In [None]:
def overhangs_for(x):
    """Return the ``(upstream, downstream)`` overhang pair of a part-type row.

    ``x`` is any mapping that provides the ``"Upstream overhang"`` and
    ``"Downstream overhang"`` keys; a missing key raises ``KeyError``.
    """
    upstream = x["Upstream overhang"]
    downstream = x["Downstream overhang"]
    return upstream, downstream

In [None]:
# Overhang pair registered for the "CDS_CD" part type
# ("gg" presumably stands for Golden Gate -- TODO confirm).
gg_overhangs = overhangs_for(part_types["CDS_CD"])

In [None]:
# (prefix, suffix) sequences of the JUMP storage vector, read from the
# LIB parts registry entries of the same names.
storage_flanks = tuple(
    lib_parts[part_name]["Sequence"]
    for part_name in ("JUMP_storage_vector_prefix", "JUMP_storage_vector_suffix")
)

In [None]:
# Anchor sequences used below to locate the FP insert in each source plasmid:
# ua_rbs is searched as-is and cluzel_cterm is searched reverse-complemented.
# (Presumably these flank the coding sequence on either side -- TODO confirm.)
ua_rbs = "tctagatttaagaaggagatatacat"
cluzel_cterm = "atgtccagacctgcaggcatgcaagctctagaggcat"
# Earlier flank definition that also included the atg/taa codons; disabled.
# flanks = (ua_rbs + "atg", "taa" + cluzel_cterm)

## Source plasmids

In [None]:
%%time
# Map each selected plasmid's "Names" value to its entry in plib_maps,
# keeping only rows whose "Tags" contain "cluzel-fp".
# NOTE(review): if row["Tags"] is a plain string this is a substring test
# (e.g. it would also match "not-cluzel-fp") -- confirm the column type.
plasmids = {
    row["Names"]: plib_maps[id_]
    for id_, row in plib_plasmids.items()
    if "cluzel-fp" in row["Tags"]
}

In [None]:
plasmids.keys()

## Extract FP inserts

In [None]:
%%time
# For every source plasmid, find the amplicon bounded by ua_rbs on one side
# and the reverse complement of cluzel_cterm on the other.
locations = {
    plasmid_name: sequence.amplicon_location(
        plasmid_seq,
        ua_rbs,
        sequence.reverse_complement(cluzel_cterm),
    )
    for plasmid_name, plasmid_seq in plasmids.items()
}

In [None]:
# Cut every plasmid down to the amplicon location stored in `locations`.
inserts = {name: seq.slice(*locations[name]) for name, seq in plasmids.items()}

In [None]:
# Pick a single plasmid to spot-check the insert extraction by hand.
s = plasmids["pEB1-SCFP3A"]

In [None]:
# Re-run the amplicon search on the spot-check plasmid with the same anchors
# used to build `locations`. `flanks` is only assigned in the primer-design
# section further down, so referencing it here fails on a fresh kernel.
sequence.amplicon_location(s, ua_rbs, sequence.reverse_complement(cluzel_cterm))

In [None]:
# Slice the spot-check plasmid at hard-coded coordinates -- presumably the
# location reported by the amplicon search in the previous cell; TODO confirm.
s.slice(261, 972)

In [None]:
# Print the raw sequence of the spot-check slice. The slice is recomputed
# explicitly instead of reading IPython's `_` (last cell output), whose value
# depends on which cell happened to run most recently.
print(str(s.slice(261, 972).seq))

In [None]:
# check for BsmBI/BsaI/BbsI/AarI

In [None]:
plasmids["pEB1-SCFP3A"]

In [None]:
# For each enzyme, list the inserts for which enzyme.re_search reports a
# non-empty (truthy) result, together with how many there are.
for enzyme_name in ("BsaI", "BsmBI", "BbsI", "AarI"):
    names_with_cuts = [
        name for name, seq in inserts.items() if enzyme.re_search(seq, enzyme_name)
    ]
    print(f"{enzyme_name} ({len(names_with_cuts)}): {', '.join(names_with_cuts)}")

## Find FP common ends

In [None]:
# Insert names as an array so they can be selected with boolean masks below.
names = np.array(list(inserts.keys()))

In [None]:
# Number of letters compared at each end of every insert.
max_end_length = 40

In [None]:
# One row per insert, one column per position, over the first max_end_length
# letters of s.seq_lower().
# NOTE(review): assumes every insert has at least max_end_length letters;
# shorter inserts would give rows of unequal length -- TODO confirm.
letters = np.array([list(s.seq_lower()[:max_end_length]) for s in inserts.values()])
# counts[j, k] = number of inserts (including j itself) whose letter at
# position k equals insert j's letter at position k.
counts = (letters[np.newaxis, :, :] == letters[:, np.newaxis, :]).sum(axis=0)
# Running minimum along the positions: entry [j, k] is the smallest per-position
# count seen for insert j over positions 0..k.
forward_cumulative_counts = np.minimum.accumulate(counts, axis=1)

In [None]:
# Same tally for the last max_end_length letters of every insert. Columns are
# reversed so that column 0 is the final letter and the running minimum walks
# inward from the end of the sequence.
reverse_letters = np.array(
    [list(s.seq_lower()[-max_end_length:]) for s in inserts.values()]
)[:, ::-1]
# reverse_counts[j, k] = number of inserts sharing insert j's letter at the
# k-th position counted from the end.
reverse_counts = (
    reverse_letters[np.newaxis, :, :] == reverse_letters[:, np.newaxis, :]
).sum(axis=0)
reverse_cumulative_counts = np.minimum.accumulate(reverse_counts, axis=1)

In [None]:
# Element-wise minimum of the two running counts, so an entry is only as
# large as the weaker of the two ends.
cumulative_counts = np.minimum(forward_cumulative_counts, reverse_cumulative_counts)

In [None]:
# For each column, the largest running count reached by any insert.
majority_size = cumulative_counts.max(axis=0)

In [None]:
majority_size

In [None]:
# Columns at which majority_size differs from the next column (the last column
# of each constant run), plus the final column.
idxs = np.concatenate(
    (np.where(np.diff(majority_size) != 0)[0], [len(majority_size) - 1])
)

In [None]:
idxs

In [None]:
majority_size[idxs]

In [None]:
# For each breakpoint column, list the inserts whose running count falls short
# of the largest one at that column.
# NOTE(review): idx is a 0-based column, so it spans idx + 1 letters; the
# "length" label may be off by one -- confirm the intended meaning.
for idx in idxs:
    omitted = names[cumulative_counts[:, idx] < majority_size[idx]]
    omitted_str = ", ".join(omitted)
    print(f"length {idx} all except ({len(omitted)}): {omitted_str}")

In [None]:
# Inspect one column by hand. 13 is hard-coded, presumably picked from the
# breakpoints printed by the loop above -- TODO confirm.
idx = 13
names[cumulative_counts[:, idx] < majority_size[idx]]

## Design primers

In [None]:
import primer3plus

In [None]:
# Combine the overhang pair with the storage-vector flanks, then request primer
# candidates for one insert (return_many=3 presumably asks for three
# candidates -- TODO confirm against primers.primer3_amplicon).
flanks = workflow.concatenate_flanks(gg_overhangs, storage_flanks)
primers.primer3_amplicon(inserts["pEB1-SCFP3A"], flanks, return_many=3)

In [None]:
# TODO: make find_primer_binding_site more general -- could it accept a custom
# score function so that it can find amplicons with overhangs on both sides?

In [None]:
# USE CASES:
# 1) take desired product, template seq, find overhangs
# 2) take amplicon, optional overhangs

# TODO:
# tm/ta settings for Q5/phusion


# `primer3_amplicon_primers` is not defined or imported in the visible part of
# this notebook, so calling it raises NameError. Use the existing
# primers.primer3_amplicon entry point with the overhangs and storage flanks
# concatenated via workflow.concatenate_flanks.
# NOTE(review): if the list-of-flanks call was a sketch of a planned API,
# move it into the TODO comment above instead.
primers.primer3_amplicon(
    inserts["pEB1-SCFP3A"],
    workflow.concatenate_flanks(gg_overhangs, storage_flanks),
    return_many=3,
)

# Sequence matching test

In [None]:
# Two toy strings with shared runs ("aaa", "xx") at different offsets, used to
# exercise sequence.longest_contiguous_matching by eye.
a = "zzaaaaaxxx"
b = "12aaayyxx"
sequence.longest_contiguous_matching(a, b)

In [None]:
import random


def randdna(n):
    """Return a random lowercase DNA string of ``n`` letters from "atcg".

    Letters are drawn with replacement from the module-level ``random``
    generator, so results vary between runs unless it is seeded.
    """
    bases = random.choices("atcg", k=n)
    return "".join(bases)

In [None]:
# Fixtures for the matching helpers:
#   a - circular record with a fixed 16-letter motif between two random
#       30-letter stretches (unseeded, so `a` differs on every run)
#   b - the motif on its own
#   c - the motif with its final letter changed from g to c
a = sequence.DsSeqRecord(
    Seq(randdna(30) + "aaaaggggttttgggg" + randdna(30)), circular=True
)
b = sequence.DsSeqRecord(Seq("aaaaggggttttgggg"))
c = sequence.DsSeqRecord(Seq("aaaaggggttttgggc"))

In [None]:
a

In [None]:
b

In [None]:
sequence.enumerate_matches(a, b)

In [None]:
a[29:45]

In [None]:
b[0:16]

In [None]:
sequence.enumerate_primer_binding_sites(a, b)

In [None]:
sequence.enumerate_primer_binding_sites(a, c, require_3prime_clamp=False)

In [None]:
# Fixtures for extract_matching_subsequence:
#   e - circular record: the 37-letter target with one random letter on each side
#   f - the target on its own
#   g - the target padded with "NNNN" / "MMMM" at its two ends
e = sequence.DsSeqRecord(
    Seq(randdna(1) + "atgcgcgggaaaatgcgcacaacattagcgacctagc" + randdna(1)),
    circular=True,
)
f = sequence.DsSeqRecord(Seq("atgcgcgggaaaatgcgcacaacattagcgacctagc"))
g = sequence.DsSeqRecord(Seq("NNNNatgcgcgggaaaatgcgcacaacattagcgacctagcMMMM"))

In [None]:
sequence.extract_matching_subsequence(e, g, min_score=20)