# Build oligo library design from scratch
The aim of this code is to create a TF reporter oligo library from scratch. A selection of ~30 TFs is made, and for these TFs oligos will be designed, which each have TF binding sites in front of a minimal promoter and a unique barcode in the transcription unit. Various parameters will be explored in the design to optimize the design of the TF reporters. 

In [None]:
## Import libraries
import os
import sys
sys.path.append('/DATA/usr/m.trauernicht/software/')
!conda install pandas -y
import pandas as pd

from functools import reduce

In [2]:
## Import parameters to build DF
os.chdir("/DATA/usr/m.trauernicht/projects/TFreporter/Oligo_Design/")
tf_df = pd.read_csv("TF_motifs.csv", sep=';')
barcodes = pd.read_csv("Barcode.csv")
distance = pd.read_csv("Distance.csv")
promoters = pd.read_csv("Promoter.csv")
spacings = pd.read_csv("Spacing.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/DATA/usr/m.trauernicht/projects/TFreporter/Oligo_Design/'

In [43]:
## Merge all parameters to create large DF 
tf_df['tmp'] = 1
barcodes['tmp'] = 1
spacings['tmp'] = 1
distance['tmp'] = 1
promoters['tmp'] = 1
tf_dfs = [tf_df, barcodes, distance, promoters, spacings]
tf_df_all = reduce(lambda left,right: 
                   pd.merge(left,right,on = ['tmp']), tf_dfs)
del tf_df_all['tmp']

# pd.display.options.XX = value
tf_df_all

Unnamed: 0,TF,Motif,Barcode,Distance,Promoter,Spacing
0,Stat3,TTCCCGGAA,1,10bp,minP,5bp
1,Stat3,TTCCCGGAA,1,10bp,minP,10bp
2,Stat3,TTCCCGGAA,1,10bp,minP,21bp
3,Stat3,TTCCCGGAA,1,10bp,hBGm,5bp
4,Stat3,TTCCCGGAA,1,10bp,hBGm,10bp
...,...,...,...,...,...,...
4990,Atf3,GATGACGT,5,21bp+AA,hBGm,10bp
4991,Atf3,GATGACGT,5,21bp+AA,hBGm,21bp
4992,Atf3,GATGACGT,5,21bp+AA,mCMV,5bp
4993,Atf3,GATGACGT,5,21bp+AA,mCMV,10bp


In [44]:
# Background sequence (bg41 from Davis et al. 2019) - used for spacing, distance & primer adapters
background = "TGTTCAGAAGGGCCAGACAATGCCAAGGACTCAGGGGAGGAGAATTAAGTCAGAGAGTTTCATTACTGAGTGTTGTTTGACTTTGTTGTCACGGATTCATTTAACCATCTCTCTACCATGGTAAAAATGTGTATCCTATGTCCAGTATGAA"

In [60]:
# Adding the DNA sequence from 5' to 3'

## Constant 5' primer sequence
tf_df_all['Primer1_seq'] = background[0:17]

## TF motif#1 - find an unbiased way to select motifs (only high-quality data, threshold frequencies, otherwise assign random)
tf_df_all['motif1'] = tf_df_all['Motif']

# Spacer sequence between TF motifs (5, 10, 21 bp)
tf_df_all['space1'][tf_df_all.spacings == '5bp'] = 'GC' + background[18] + 'AT'
tf_df_all['space2'][tf_df_all.spacings == '10bp'] = 'GC' + background[18:23] + 'AT'
tf_df_all['space3'][tf_df_all.spacings == '21bp'] = 'GC' + background[18:34] + 'AT'


tf_df_all

KeyError: 'space1'

In [3]:



  
# Do the same for the other 3 repeats
tf.df$motif2 <- tf.df$Motif
tf.df$Space2[tf.df$Spacing == "5bp"] <- paste("GC", substr(background, 20, 20), "AT", sep = "")
tf.df$Space2[tf.df$Spacing == "10bp"] <- paste("GC", substr(background, 25, 30), "AT", sep = "")
tf.df$Space2[tf.df$Spacing == "21bp"] <- paste("GC", substr(background, 36, 52), "AT", sep = "")
tf.df$motif3 <- tf.df$Motif
tf.df$Space3[tf.df$Spacing == "5bp"] <- paste("GC", substr(background, 21, 21), "AT", sep = "")
tf.df$Space3[tf.df$Spacing == "10bp"] <- paste("GC", substr(background, 31, 36), "AT", sep = "")
tf.df$Space3[tf.df$Spacing == "21bp"] <- paste("GC", substr(background, 37, 53), "AT", sep = "")
tf.df$motif4 <- tf.df$Motif
  
# Sequence from last TF-motif to start of minimal promoter (10, 21, 21 incl. polyA)
tf.df$Distance_seq[tf.df$Distance == "10bp" & tf.df$Spacing == "5bp"] <- paste("GC", substr(background, 22, 29))
tf.df$Distance_seq[tf.df$Distance == "10bp" & tf.df$Spacing == "10bp"] <- paste("GC", substr(background, 32, 39))
tf.df$Distance_seq[tf.df$Distance == "10bp" & tf.df$Spacing == "21bp"] <- paste("GC", substr(background, 38, 45))
tf.df$Distance_seq[tf.df$Distance == "21bp" & tf.df$Spacing == "5bp"] <- paste("GC", substr(background, 22, 40))
tf.df$Distance_seq[tf.df$Distance == "21bp" & tf.df$Spacing == "10bp"] <- paste("GC", substr(background, 32, 50))
tf.df$Distance_seq[tf.df$Distance == "21bp" & tf.df$Spacing == "21bp"] <- paste("GC", substr(background, 38, 56))
tf.df$Distance_seq[tf.df$Distance == "21bp+AA" & tf.df$Spacing == "5bp"] <- paste("GC", substr(background, 22, 25), "AAAAAAAAAAAAAAA")
tf.df$Distance_seq[tf.df$Distance == "21bp+AA" & tf.df$Spacing == "10bp"] <- paste("GC", substr(background, 32, 35), "AAAAAAAAAAAAAAA")
tf.df$Distance_seq[tf.df$Distance == "21bp+AA" & tf.df$Spacing == "21bp"] <- paste("GC", substr(background, 38, 41), "AAAAAAAAAAAAAAA")

# Minimal promoter
tf.df$Promoter_sequence <- "TAGAGGGTATATAATGGAAGCTCGACTTCCAG"
tf.df$Promoter_sequence[tf.df$Promoter == "mCMV"] <- "GGCGTTTACTATGGGAGGTCTATATAAGCAGAGCTCGTTTAGTGAACCGTCAGATC"
tf.df$Promoter_sequence[tf.df$Promoter == "hBGm"] <- "GGGCTGGGCATAAAAGTCAGGGCAGAGCCATCTATTGCTTACATTTGCTTCT"

# S1 Illumina adapter
tf.df$S1_primer <- "CACGACGCTCTTCCGATCT"

# Adding the barcodes
# Remove barcodes with EcoRI & NheI sites first
barcode.list <- as.data.frame(barcode.list)
barcode.list <- barcode.list[-grep("GAATTC|GCTAGC",barcode.list$barcode.list),]
#Randomizing barcodes
set.seed(123)
barcode.list <- sample(barcode.list)
tf.df$barcode <- barcode.list[1:nrow(tf.df)]

# 3' Primer sequence
tf.df$Primer2_seq <- "GTGTATCCTCTGTCCAGC"

SyntaxError: invalid syntax (<ipython-input-3-7321350c26b9>, line 2)

In [4]:
import os
os.getcwd()

'/DATA/usr/m.trauernicht/projects/tf_activity_reporter/Oligo_Design/MPRA-30-TFs'