In [None]:
import numpy as np
import pandas as pd
import toml
import pygsheets
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import Restriction

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.cloning.registry as registry
import paulssonlab.cloning.workflow as workflow
import paulssonlab.cloning.design as design
import paulssonlab.cloning.sequence as sequence
import paulssonlab.cloning.enzyme as enzyme
import paulssonlab.cloning.codon as codon
import paulssonlab.api.geneious as geneious

# Download 2022 Lagator data

In [None]:
!curl -Lo 2022lagator.zip https://github.com/szarma/Thermoters/archive/refs/heads/master.zip && unzip 2022lagator.zip && mv Thermoters-master 2022lagator && rm 2022lagator.zip

# Setup

In [None]:
# Load notebook settings from config.toml; the cells below read config["geneious"]
# and config["registry"]["folder"].
config = toml.load("config.toml")

In [None]:
# Authorize pygsheets with the service-account key file credentials.json
# (path is relative to the notebook's working directory).
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Connect to Geneious, passing every key of config["geneious"] as a keyword argument.
geneious_sessionmaker = geneious.connect(**config["geneious"])

In [None]:
# Registry handle used throughout the notebook. It is built from the pygsheets
# client `gc`, the folder named in config["registry"]["folder"], and the Geneious
# session maker; Geneious items live under the "registry" folder.
reg = registry.Registry(
    gc,
    config["registry"]["folder"],
    geneious_sessionmaker=geneious_sessionmaker,
    geneious_folder="registry",
)

In [None]:
# Registry collections, looked up by tuple key.
olib_oligos = reg[("oLIB", "oligos")]  # oligo rows are upserted/committed here in the "Oligos" cells
plib_plasmids = reg[("pLIB", "plasmids")]
plib_maps = reg[("pLIB", "maps")]
flib_fragments = reg[("fLIB", "fragments")]  # part rows are upserted/committed here in the "Oligos" cells
part_types = reg[("fLIB", "fragments", "Part types")]  # indexed by part-type name (see `part_type`)

# Config

In [None]:
# Tunable design parameters for this notebook.
placeholder_enzyme = Restriction.BsaI  # second enzyme argument to design.golden_gate_placeholder
promoter_enzyme = Restriction.BsaI  # enzyme whose sites flank each promoter oligo
part_type = "Promoter_AB"  # key into `part_types`; determines the part overhangs
library_reverse_primer = "oLIB46"  # registry ID; its reverse complement is appended to each pool oligo
num_random_bases = 6  # length passed to design.random_bases for padding

# Placeholder

In [None]:
# Overhangs for the configured part type; later cells index this as
# [0] = upstream overhang and [1] = downstream overhang.
part_overhangs = workflow.overhangs_for(part_types[part_type])

In [None]:
# Placeholder built from the two enzymes and the two part overhangs (unpacked as
# separate positional arguments). NOTE(review): both enzymes are currently BsaI —
# confirm that is intended for golden_gate_placeholder.
promoter_placeholder = design.golden_gate_placeholder(promoter_enzyme, placeholder_enzyme, *part_overhangs)

In [None]:
promoter_placeholder

# Salis calculator

In [None]:
reg.eval_expr("fLIB255/BsaI")

In [None]:
# Upstream context for the calculator: the last 100 positions of the "_seq" of
# pLIB122/BsaI, converted with seq_lower() (presumably a lowercase string — confirm).
# Earlier alternative, kept for reference:
#upstream_sequence = part_overhangs[0]
upstream_sequence = reg.eval_expr("pLIB122/BsaI")["_seq"][-100:].seq_lower()

In [None]:
# Downstream (transcribed) context: first 100 characters of the seq_lower() form of
# the "_seq" produced by the @GG(...) registry command
# (presumably a Golden Gate assembly of the three listed parts — confirm).
transcript = reg.eval_command("@GG(oLT60/BsaI, B0032m_RiboJ, AM19_LacIAM)")["_seq"].seq_lower()[:100]

In [None]:
# Mode string written verbatim into each calculator input line below.
optimization_mode = "targeted_forward_TSS"

In [None]:
# Value written into the `tir_r` column of every input line
# (presumably the reverse-direction target rate — confirm against the calculator docs).
tir_r = 0

In [None]:
# Constraint string of 40 "N" characters, written as the last field of each input line
# (presumably 40 unconstrained positions — confirm).
constraints = "N" * 40

In [None]:
# Name prefix for the designs; each line is named "<base_name>.<4-digit index>".
base_name = "salisprom1"

In [None]:
# Print one comma-separated calculator input line per target forward rate.
# Targets are 10 log-spaced values from 10 to 100,000; names are numbered from 0001.
for num, tir_f in enumerate(np.geomspace(10, 100_000, 10)):
    design_name = f"{base_name}.{num + 1:04d}"
    fields = (
        design_name,
        upstream_sequence,
        transcript,
        f"{tir_f:.0f}",
        tir_r,
        optimization_mode,
        constraints,
    )
    print(",".join(f"{field}" for field in fields))

# 2022 Lagator data

## 36N

In [None]:
# Load the 36N constitutive table from the downloaded repository snapshot.
# NOTE(review): the directory is spelled "36N_seqences" here — verify it matches the
# layout of the downloaded 2022lagator/ folder.
prom_ns = pd.read_csv("2022lagator/36N_seqences/36N_constitutive.csv")

In [None]:
# Histogram (50 bins) of the "estimate_bin" column.
prom_ns["estimate_bin"].plot.hist(bins=50)

## PR

In [None]:
# Load the Pr constitutive table; later cells read its "Mean", "Std", "Coverage"
# and "sequence" columns.
prom_pr = pd.read_csv("2022lagator/Pr_Pl_sequences/Pr_constitutive.csv")

In [None]:
def uniform_subset_mask(values, min_dist=0.02):
    """Greedily select entries that are at least ``min_dist`` apart from each other.

    Entries are visited in their given order; an entry is kept when its absolute
    distance to every previously kept entry is at least ``min_dist``. The result
    therefore depends on input order (earlier entries win).

    Parameters
    ----------
    values : array-like of numbers
        One-dimensional sequence (list, ndarray, pandas Series, ...). A Series is
        read positionally, so any index (e.g. after filtering) is supported.
    min_dist : float, optional
        Minimum allowed absolute difference between two kept entries.

    Returns
    -------
    numpy.ndarray of bool
        Mask of length ``len(values)``; ``True`` marks the kept entries.
    """
    # Convert once so indexing is positional. Indexing a pandas Series with
    # ``values[idx]`` is label-based and fails (or reads the wrong row) whenever the
    # index is not the default 0..n-1 range.
    values = np.asarray(values)
    mask = np.zeros(len(values), dtype=bool)
    accepted = []
    for idx, value in enumerate(values):
        # all() over an empty list is True, so the first entry is always kept;
        # it also stops at the first neighbour that is too close.
        if all(abs(value - kept) >= min_dist for kept in accepted):
            mask[idx] = True
            accepted.append(value)
    return mask

In [None]:
# Boolean row mask over prom_pr: keeps rows whose "Mean" values are pairwise at least
# 0.02 apart (the default min_dist), scanning rows in file order.
mask = uniform_subset_mask(prom_pr["Mean"])

In [None]:
mask.sum()

In [None]:
# Histogram (50 bins) of "Mean" for the rows selected by `mask`.
prom_pr[mask]["Mean"].plot.hist(bins=50)

In [None]:
# Histogram (50 bins) of "Mean" restricted to rows with Coverage > 30.
# NOTE(review): 30 is an unexplained threshold — consider naming it in the config cell.
prom_pr[prom_pr["Coverage"] > 30]["Mean"].plot.hist(bins=50)

In [None]:
# Mean vs. Coverage for all rows except the first two.
# NOTE(review): presumably the first two rows are skipped as outliers — confirm.
prom_pr[2:].plot.scatter("Mean", "Coverage")

In [None]:
# Mean vs. Std for every row.
prom_pr.plot.scatter("Mean", "Std")

In [None]:
# Mean vs. Coverage for the masked subset, skipping its first row.
# NOTE(review): presumably the first selected row is skipped as an outlier — confirm.
prom_pr[mask][1:].plot.scatter("Mean", "Coverage")

In [None]:
# Mean vs. Std for the masked subset.
prom_pr[mask].plot.scatter("Mean", "Std")

# Order oligo pool

In [None]:
# Normalized reverse complement of the library reverse primer's "Sequence" field;
# it is appended to the 3' end of every pool oligo.
library_primer_seq_rc = workflow.normalize_seq(
    sequence.reverse_complement(reg.get(library_reverse_primer)["Sequence"])
)
# A single random prefix of num_random_bases bases, generated with a fixed seed (57)
# and prepended to every pool oligo.
random_prefix = design.random_bases(num_random_bases, seed=57)

In [None]:
random_prefix

In [None]:
# Assemble one pool oligo per promoter selected by `mask`:
#   random prefix + enzyme flank + upstream overhang + promoter + downstream overhang
#   + reverse-complemented enzyme flank + reverse complement of the library primer
oligo_seqs = []
for promoter_seq in prom_pr[mask]["sequence"]:
    # BsaI (promoter_enzyme) flanks sized for each overhang
    upstream_flank = design.type2s_with_spacer(promoter_enzyme, len(part_overhangs[0]))
    downstream_flank = sequence.reverse_complement(
        design.type2s_with_spacer(promoter_enzyme, len(part_overhangs[1]))
    )
    flanked_part = (
        upstream_flank
        + part_overhangs[0]
        + promoter_seq
        + part_overhangs[1]
        + downstream_flank
    )
    full_oligo = random_prefix + flanked_part + library_primer_seq_rc
    oligo_seqs.append(workflow.normalize_seq(full_oligo))

In [None]:
# Pool label; currently only referenced by the commented-out print in the next cell.
pool_name = "2022lagator_promoter_pr"

In [None]:
# Print the oligos, one per line, for pasting into an order form.
# NOTE(review): only the first 100 entries of oligo_seqs are printed — confirm that
# truncation is intended when the mask selects more rows.
for seq in oligo_seqs[:100]:
    # Alternative tab-separated format with the pool name as the first column:
    #print(f"{pool_name}\t{seq}")
    print(seq)

# Old (legacy cells; several reference names that are not defined in this notebook)

In [None]:
# NOTE(review): `fp_placeholder_types` is not defined in any visible cell, so this
# line raises NameError on a fresh kernel. The comprehension's `part_type` is local to
# the comprehension and does not overwrite the config value of the same name.
fp_placeholder_overhangs = [workflow.overhangs_for(part_type) for part_type in fp_placeholder_types]
# Same lookup as in the "Placeholder" section (recomputed here).
part_overhangs = workflow.overhangs_for(part_types[part_type])

In [None]:
# Legacy, unfinished cell that fills `seqs` (name -> {"Sequence", "Description"}).
# NOTE(review): not runnable as-is. `product`, `enzymes`, `fp_overhangs`, `tag_enzyme`,
# `tag_overhangs`, `new_name` and `description` are not defined/imported in any visible
# cell; `row` and `seq` only exist as loop variables leaked from earlier cells.
# NOTE(review): the loop rebinds `enzyme` (shadowing the paulssonlab.cloning.enzyme
# module alias) and `fp_overhangs` (the very iterable passed to product()).
seqs = {}
for enzyme, fp_overhangs in product(enzymes, fp_overhangs):
    # ATG-mScarletI_ph-TAATAA
    # Flank row["Sequence"] with tag_enzyme sites (forward upstream, reverse-complemented downstream).
    oligo_seq = (design.type2s_with_spacer(tag_enzyme, len(tag_overhangs[0]))
        + row["Sequence"]
        + sequence.reverse_complement(
            design.type2s_with_spacer(tag_enzyme, len(tag_overhangs[1]))
        )
    )
    oligo_seq = workflow.normalize_seq(oligo_seq)
    # Prepend random bases, seeded with the oligo sequence itself.
    oligo_seq = design.random_bases(num_random_bases, seed=oligo_seq) + oligo_seq
    # ATG-ClpP_ph-linker-mScarletI_ph-TAATAA
    # flipped: ATG-mScarletI_ph-linker-ClpP_ph-TAATAA
    # ATG-mScarletI_ph-linker-refFP_ph-TAATAA
    # flipped: ATG-refFP_ph-linker-mScarletI_ph-TAATAA
    ##########
    # Commented-out draft kept from an earlier notebook (defines seq/new_name/description):
    # entry = reg.get(name)
    # new_name = re.sub(r"_BC$", "_RiboJ", name)
    # if use_storage_vector:
    #     if part_enzyme != Restriction.BsaI:
    #         raise ValueError("storage vector assumes BsaI part")
    #     seq = (
    #         design.random_bases(num_random_bases)
    #         + lib_parts["JUMP_storage_vector_prefix"]["Sequence"]
    #         + workflow.smoosh_and_normalize_sequences(
    #             part_overhangs[0],
    #             entry["_seq"].trim_overhangs(),
    #             part_overhangs[1],
    #         )
    #         + lib_parts["JUMP_storage_vector_suffix"]["Sequence"]
    #         + design.random_bases(num_random_bases)
    #     )
    # else:
    #     seq = workflow.smoosh_and_normalize_sequences(
    #         part_overhangs[0],
    #         entry["_seq"].trim_overhangs(),
    #         part_overhangs[1],
    #     )
    #     seq = (
    #         design.random_bases(num_random_bases)
    #         + design.type2s_with_spacer(part_enzyme, len(part_overhangs[0]))
    #         + seq
    #         + sequence.reverse_complement(
    #             design.type2s_with_spacer(part_enzyme, len(part_overhangs[1]))
    #         )
    #         + design.random_bases(num_random_bases)
    #     )
    # seq = workflow.normalize_seq(seq)
    # # include first sentence
    # description = (
    #     entry["Description"].split(".")[0]
    #     + f". Same as {name} but with RiboJ-compatible overhangs."
    # )

    # NOTE(review): stores `seq`, not the `oligo_seq` computed above — confirm which is intended.
    seqs[new_name] = {"Sequence": seq, "Description": description}

## Oligos

In [None]:
# Legacy cell: for every entry in `tags`, register the oligo(s) to order in oLIB and
# the corresponding part in fLIB.
# NOTE(review): `tags`, `part_tags`, `reference`, `tag_enzyme`, `tag_overhangs` and
# `tag_part_type` are not defined in any visible cell.

# Fields shared by every row written in this cell.
base = {"Author": "Jacob Quinn Shenker", "Date": workflow.date()}

oligo_base = {
    **base,
    "Order date": workflow.date(),
    "Vendor": "IDT",
    "Type": "Primer",
}

part_base = {
    **base,
    "Tags": part_tags,
    "Reference": reference,
}

# `apply`/`overwrite` options forwarded to upsert(); oligos are not overwritten, parts are.
apply_oligo = {"Name": None}#{"Sequence": workflow.normalize_seq}
overwrite_oligo = False
apply = {"Name": None}
overwrite = True

for name, row in tags.items():
    # add BsaI flanks
    oligo_seq = (design.type2s_with_spacer(tag_enzyme, len(tag_overhangs[0]))
        + row["Sequence"]
        + sequence.reverse_complement(
            design.type2s_with_spacer(tag_enzyme, len(tag_overhangs[1]))
        )
    )
    oligo_seq = workflow.normalize_seq(oligo_seq)
    # Prepend random bases, seeded with the oligo sequence itself.
    oligo_seq = design.random_bases(num_random_bases, seed=oligo_seq) + oligo_seq
    if workflow.DEGENERATE_BASES_REGEX.search(workflow.normalize_seq(row["Sequence"])):
        # is library with degenerate bases, append library primer
        oligo_seq = oligo_seq + library_primer_seq_rc
        # NOTE(review): this branch upper-cases with .upper(), the other branch uses
        # workflow.normalize_seq_upper — confirm the difference is intentional.
        oligo_row = {**oligo_base, **row, "Name": f"{name}_v2", "Sequence": oligo_seq.upper()}
        oligo_id = olib_oligos.upsert(oligo_row, apply=apply_oligo, overwrite=overwrite_oligo)
        # Usage expression: the oligo with the reverse primer in <...>, then the enzyme after "/".
        usage = f"{oligo_id}<{library_reverse_primer}>/{tag_enzyme}"
    else:
        # not library, just order two oligos to anneal
        # in this case, there's no reverse primer binding site,
        # so we need to add random bases to ensure efficient cutting
        oligo_seq = oligo_seq + design.random_bases(num_random_bases, seed=oligo_seq)
        oligo_id = olib_oligos.upsert(
            {
                **oligo_base,
                **row,
                "Name": f"{name}_sense",
                "Sequence": workflow.normalize_seq_upper(oligo_seq),
            },
            apply=apply_oligo,
            overwrite=overwrite_oligo,
        )
        oligo_id2 = olib_oligos.upsert(
            {
                **oligo_base,
                **row,
                "Name": f"{name}_antisense",
                "Sequence": workflow.normalize_seq_upper(
                    sequence.reverse_complement(oligo_seq)
                ),
            },
            apply=apply_oligo,
            overwrite=overwrite_oligo,
        )
        # Usage expression: the sense and antisense oligos joined by "=", then the enzyme after "/".
        usage = f"{oligo_id}={oligo_id2}/{tag_enzyme}"
    # Part row stores the bare (unflanked) sequence plus how to obtain it from the oligos.
    part_row = {
        **part_base,
        **row,
        "Name": name,
        "Sequence": workflow.normalize_seq(row["Sequence"]),
        "Usage": usage,
        "Type": tag_part_type,
        "Upstream overhang": tag_overhangs[0],
        "Downstream overhang": tag_overhangs[1],
        "Species/codon usage": "E. coli",
    }
    flib_fragments.upsert(part_row, apply=apply, overwrite=overwrite)

In [None]:
# Commit the oligo and fragment collections
# (presumably this pushes the pending upserts to the registry — confirm).
olib_oligos.commit()
flib_fragments.commit()

In [None]:
# Legacy cell: for every entry in `seqs`, register a sense/antisense oligo pair in oLIB
# and store the corresponding part row in `lib_parts`.
# NOTE(review): `part_tags`, `reference`, `part_enzyme` and `lib_parts` are not defined
# in any visible cell. This cell also rebinds `base`, `oligo_base`, `part_base` and
# `apply` from the previous "Oligos" cell (vendor is "Genewiz" here, "IDT" there).
base = {"Author": "Jacob Quinn Shenker", "Date": workflow.date()}

oligo_base = {
    **base,
    "Order date": workflow.date(),
    "Vendor": "Genewiz",
    "Type": "Primer",
}

part_base = {
    **base,
    "Tags": part_tags,
    "Reference": reference,
}

# apply = {"Sequence": workflow.normalize_seq}
apply = {"Name": None}

for name, row in seqs.items():
    seq = row["Sequence"]
    # Sense strand as given; no `overwrite` argument is passed here (unlike the cell above).
    oligo_id = olib_oligos.upsert(
        {
            **oligo_base,
            **row,
            "Name": f"{name}_sense",
            "Sequence": workflow.normalize_seq_upper(seq),
        },
        apply=apply,
    )
    # Antisense strand is the reverse complement of the same sequence.
    oligo_id2 = olib_oligos.upsert(
        {
            **oligo_base,
            **row,
            "Name": f"{name}_antisense",
            "Sequence": workflow.normalize_seq_upper(sequence.reverse_complement(seq)),
        },
        apply=apply,
    )
    # Usage expression: the two oligos joined by "=", then the enzyme after "/".
    usage = f"{oligo_id}={oligo_id2}/{part_enzyme}"
    # Part sequence is derived from the full oligo via workflow.re_digest_part with part_enzyme.
    part_seq = workflow.normalize_seq(workflow.re_digest_part(seq, part_enzyme))
    # NOTE(review): unlike the cell above, no "Name" key is set on the part row — confirm.
    part_row = {
        **part_base,
        **row,
        "Sequence": part_seq,
        "Usage": usage,
        "Type": part_type,
        "Upstream overhang": part_overhangs[0],
        "Downstream overhang": part_overhangs[1],
        "Species/codon usage": "E. coli",
    }
    lib_parts[name] = part_row