#### Imports

In [None]:
import re
import copy
import itertools
import random

import pandas as pd
import numpy as np

from Bio import SeqIO
from Bio.Seq import Seq

In [None]:
# Field layout of the RegulonDB promoter table, in file order:
#   1. Promoter identifier assigned by RegulonDB
#   2. Promoter name
#   3. DNA strand where the promoter is located
#   4. Genome map position of the transcription start site (+1)
#   5. Sigma factor that recognizes the promoter
#   6. Promoter sequence (+1 in upper case)
#   7. Evidence that supports the existence of the promoter
#   8. Evidence confidence level (Confirmed, Strong, Weak)
promoter_columns = [
    "Promoter Identifier",
    "Promoter Name",
    "Strand",
    "Transcription Start Site (+1)",
    "Sigma Factor",
    "Promoter Sequence",
    "Evidence",
    "Confidence",
]

# Constant sequences later placed on the 5' and 3' side of every library member.
five_prime_adaptor = "TTGCTCGTACGACGCTCGAG"
three_prime_adaptor = "GGATCCTTAAGGTGCCGGG"

# The first 37 lines of the file are skipped and the column names are supplied
# explicitly (NOTE(review): presumably those lines are the RegulonDB comment
# header -- confirm if the export format changes).
promoter_table = pd.read_csv(
    "./RegulonDB_Promoters.tsv",
    sep="\t",
    skiprows=37,
    names=promoter_columns,
)

# Keep only rows whose sigma-factor field mentions Sigma38 and that come with a
# promoter sequence; rows with a missing sigma factor are excluded (na=False).
has_sigma38 = promoter_table["Sigma Factor"].str.contains("Sigma38", na=False)
has_sequence = promoter_table["Promoter Sequence"].notna()
data = promoter_table[has_sigma38 & has_sequence].reset_index(drop=True)

# Reference genome record used by get_promoter to cut out promoter windows.
genome = SeqIO.read("./RegulonDB_MG1655_Ref.gb", "gb")


def get_promoter(tss_plus1, strand, genome=genome):
    """Return the 100 nt window around a transcription start site (TSS).

    The window contains 79 nt upstream of the TSS, the TSS itself and 20 nt
    downstream, read 5'->3' on the promoter's own strand. The returned string
    is lower case except for the TSS base (string index 79), which is upper
    case.

    Parameters
    ----------
    tss_plus1 : int
        1-based genome coordinate of the TSS (+1).
    strand : str
        Either "forward" or "reverse".
    genome : record
        Sliceable sequence record exposing ``.seq`` and
        ``.reverse_complement()``; defaults to the module-level ``genome``.

    Returns
    -------
    str
        The 100-character promoter window.

    Raises
    ------
    ValueError
        If ``strand`` is not "forward"/"reverse", or if the window does not
        fit inside ``genome`` (TSS too close to either end of the record).
    """
    window_len = 100
    tss_index = 79  # position of the +1 base inside the returned string

    # pandas may hand over numpy integers or integral floats; slicing needs ints.
    tss_plus1 = int(tss_plus1)

    if strand == "forward":
        start, end = tss_plus1 - 80, tss_plus1 + 20
    elif strand == "reverse":
        # Mirror image of the forward window so that, after reverse
        # complementing, the TSS again lands on string index 79.
        start, end = tss_plus1 - 21, tss_plus1 + 79
    else:
        raise ValueError(
            "strand must be 'forward' or 'reverse', got {!r}".format(strand)
        )

    promoter = genome[start:end]
    if strand == "reverse":
        promoter = promoter.reverse_complement()

    # Work on a plain string instead of overwriting the record's .seq attribute.
    sequence = str(promoter.seq).lower()
    if len(sequence) != window_len:
        # A negative start or an end past the record yields a truncated/empty
        # slice; fail loudly rather than emit an oligo of the wrong length.
        raise ValueError(
            "promoter window [{}, {}) for TSS {} does not fit inside the "
            "genome record".format(start, end, tss_plus1)
        )

    return (
        sequence[:tss_index]
        + sequence[tss_index].upper()
        + sequence[tss_index + 1 :]
    )


def generate_random_sequences(num_seqs, str_len=100, init_id=0, seed=None):
    """Build a table of random DNA sequences to use as negative controls.

    Parameters
    ----------
    num_seqs : int
        Number of sequences (rows) to generate; 0 yields an empty table.
    str_len : int, optional
        Length of every sequence in nucleotides.
    init_id : int, optional
        "Promoter ID" of the first row; following rows count up from it.
    seed : int or None, optional
        When given, sequences are drawn from a private
        ``np.random.RandomState(seed)`` so the output is reproducible.
        When None, NumPy's global random state is used.

    Returns
    -------
    pandas.DataFrame
        One row per sequence. Identifier and name are ``"DUMMY_<id>"``, the
        strand is "forward", the TSS is 0, the annotation columns are empty
        strings and the sequence is stored in "Larger Promoter Sequence".
    """
    columns = [
        "Promoter ID",
        "Promoter Identifier",
        "Promoter Name",
        "Strand",
        "Transcription Start Site (+1)",
        "Sigma Factor",
        "Promoter Sequence",
        "Evidence",
        "Confidence",
        "Larger Promoter Sequence",
    ]

    alphabet = ["A", "C", "G", "T"]
    shape = (num_seqs, str_len)
    if seed is None:
        bases = np.random.choice(alphabet, size=shape)
    else:
        bases = np.random.RandomState(seed).choice(alphabet, size=shape)

    # A comprehension (unlike np.apply_along_axis) also copes with zero rows.
    sequences = ["".join(row) for row in bases]

    records = []
    for offset, sequence in enumerate(sequences):
        promoter_id = init_id + offset
        label = "DUMMY_" + str(promoter_id)
        records.append(
            {
                "Promoter ID": promoter_id,
                "Promoter Identifier": label,
                "Promoter Name": label,
                "Strand": "forward",
                "Transcription Start Site (+1)": 0,
                "Sigma Factor": "",
                "Promoter Sequence": "",
                "Evidence": "",
                "Confidence": "",
                "Larger Promoter Sequence": sequence,
            }
        )

    return pd.DataFrame(records, columns=columns)

1. Determine promoter sequences of interest
2. Extract -79 to +20 region (100 bp total)
3. Also include positive (Anderson) and negative (random sequence) controls
4. Add restriction sites and priming sequences

In [None]:
# Give every promoter an integer ID equal to its row position. Any existing
# "Promoter ID" column is dropped first so that re-running this cell does not
# stack a second, duplicate ID column onto the frame.
data = data.drop(columns=["Promoter ID"], errors="ignore").reset_index(drop=True)
data.insert(0, "Promoter ID", data.index)

# Cut the 100 nt window around each TSS out of the reference genome.
data["Larger Promoter Sequence"] = data.apply(
    lambda row: get_promoter(row["Transcription Start Site (+1)"], row["Strand"]),
    axis=1,
)

# check promoters match: dropping the first 19 nt of the extracted window
# should reproduce the sequence listed by RegulonDB for the same promoter.
is_same = data["Larger Promoter Sequence"].str[19:] == data["Promoter Sequence"]
print(is_same.all())

In [None]:
# Positive controls: each 35 bp Anderson promoter is padded with a fixed 46 bp
# upstream and 19 bp downstream random sequence, giving 100 bp in total.

# Largest ID already used by a real promoter; control IDs continue after it.
init_id = data["Promoter ID"].max()
upstream_seq = "TTCTCCGTAGTGCTGACTGTATCGTTCCGTGAAGCGCACCATACCA"
downstream_seq = "AGCTTGTTGAGTATCATGA"

anderson_promoters = {
    "BBa_J23100": "TTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGC",
    "BBa_J23101": "TTTACAGCTAGCTCAGTCCTAGGTATTATGCTAGC",
    "BBa_J23102": "TTGACAGCTAGCTCAGTCCTAGGTACTGTGCTAGC",
    "BBa_J23105": "TTTACGGCTAGCTCAGTCCTAGGTACTATGCTAGC",
    "BBa_J23106": "TTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGC",
    "BBa_J23107": "TTTACGGCTAGCTCAGCCCTAGGTATTATGCTAGC",
    "BBa_J23112": "CTGATAGCTAGCTCAGTCCTAGGGATTATGCTAGC",
    "BBa_J23113": "CTGATGGCTAGCTCAGTCCTAGGGATTATGCTAGC",
    "BBa_J23114": "TTTATGGCTAGCTCAGTCCTAGGTACAATGCTAGC",
    "BBa_J23117": "TTGACAGCTAGCTCAGTCCTAGGGATTGTGCTAGC",
}

control_columns = [
    "Promoter ID",
    "Promoter Identifier",
    "Promoter Name",
    "Strand",
    "Transcription Start Site (+1)",
    "Sigma Factor",
    "Promoter Sequence",
    "Evidence",
    "Confidence",
    "Larger Promoter Sequence",
]

# Numbering starts at init_id + 1: starting at init_id itself would reuse the
# ID of the last real promoter.
anderson_df = pd.DataFrame(
    [
        [
            init_id + k,
            part_name,
            part_name,
            "forward",
            0,
            "",
            "",
            "",
            "",
            upstream_seq + core_seq + downstream_seq,
        ]
        for k, (part_name, core_seq) in enumerate(anderson_promoters.items(), start=1)
    ],
    columns=control_columns,
)

# Negative controls: random sequences. Seed NumPy's global generator so the
# same sequences come out on every Restart & Run All.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
dummy_df = generate_random_sequences(10, init_id=anderson_df["Promoter ID"].max() + 1)

final_df = pd.concat([data, anderson_df, dummy_df], ignore_index=True)
assert final_df["Promoter ID"].is_unique, "duplicate Promoter ID in library"

# Flank every 100 bp insert with the constant adaptors and upper-case the result.
final_df["Sequences To Order"] = (
    five_prime_adaptor + final_df["Larger Promoter Sequence"] + three_prime_adaptor
).str.upper()

In [None]:
# Display every row from position 230 to the end of the table.
# NOTE(review): 230 is hard-coded; presumably chosen to show the control rows
# appended after the RegulonDB promoters -- confirm against the current data.
final_df.iloc[230:]

In [None]:
# Distinct lengths of the final oligos; a single value means every sequence
# to order has the same length.
final_df["Sequences To Order"].map(len).unique()

In [None]:
# Write the complete library table (including the row index) to disk.
library_csv_path = "./2021-02-15_rpos_promoter_library.csv"
final_df.to_csv(library_csv_path)