In [1]:
library(tidyverse)
library(pheatmap)
library(viridis)
library(stringr)

save_pheatmap_png <- function(x, filename, width=1200, height=1000, res = 200) {
  png(filename, width = width, height = height, res = res)
  grid::grid.newpage()
  grid::grid.draw(x$gtable)
  dev.off()
}

save_pheatmap_pdf <- function(x, filename, width=7, height=7) {
   stopifnot(!missing(x))
   stopifnot(!missing(filename))
   pdf(filename, width=width, height=height)
   grid::grid.newpage()
   grid::grid.draw(x$gtable)
   dev.off()
}

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──

[32m✔[39m [34mggplot2[39m 3.2.1     [32m✔[39m [34mpurrr  [39m 0.3.3
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.4
[32m✔[39m [34mtidyr  [39m 1.0.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: viridisLite



In [2]:
save_dir = '../data/processed/fig4_modelling/vocab_sum_final_post_scan/'


# 4. add MPRA adaptors and barcodes

import in `../data/processed/fig4_modelling/vocab_sum_final_post_scan/mpra_lib_seq.csv`


note that the rs sites chosen from section 4B (8E1.ipynb) are: `[RS_ECORI ,RS_BAMHI , RS_XHOI , RS_NHEI]`
also import over constants

output is `../data/processed/fig4_modelling/vocab_sum_final_post_scan/mpra_oligo_df_filt.csv`

## 4A define constants and helper functions

In [27]:

# define constants
LEN_FILLER = 20
NUM_OLIGO_PER_MOTIF = 10
DESIRED_OLIGO_LENGTH = 135
MAX_OLIGO_LENGTH = 230
MAX_MOTIF_LENGTH = 63
FORWARD_PCR_PRIMER = 'ACTGGCCGCTTCACTG'
REVERSE_PCR_PRIMER = 'AGATCGGAAGAGCGTCG'
# LINKER_SPACER_v10 = 'CAGTATGCGATGCTCATGATTGTGACTGAAAGTCTATGAGTCCATTGA'
RS_ECORI = 'GAATTC' # 5'-3'
RS_BAMHI = 'GGATCC'
RS_XHOI = 'CTCGAG'
RS_NHEI = 'GCTAGC'

RS_LIST = c(RS_ECORI ,RS_BAMHI , RS_XHOI , RS_NHEI)
RS_LIST = paste(RS_LIST, collapse='|')


LETTERS=c('A', 'C', 'G', 'T')

In [28]:
# FUNCTIONS
gen_random_seq <- function (n=10, letters=LETTERS){
  # return(do.call(paste0, replicate(5, sample(letters, n, TRUE), FALSE)))
return(paste(sample(letters, n, TRUE),collapse =''))
}

gen_random_seq_restrict <- function(n=10, letters=LETTERS, rs_list = RS_LIST, max_iter = 100){
  counter = 0
  while (counter < max_iter){
    test = gen_random_seq (n, letters)
    if (!any(str_detect(test, rs_list))){
      return(test)
    }
    counter = counter + 1
    # print(c('failed',counter))
  }
}

read_fasta = function(filepath) {
  df = data.table( name = character(), motif = character())
  # print(df)
  con = file(filepath, "r")
  while ( TRUE ) {
    line = readLines(con, n = 1)
    line = str_replace_all(line, "[\r\n]" , "")
    if ( length(line) == 0 ) {
      break
    } 
    if ( str_detect(line, "^>") ) {
      n = str_split(line, "> ")[[1]][2]
    }
    else{
      m = line
      # print(c(n,m))
      df<-df %>%
        add_row(name=n,motif=m)
    }
    
  }

  close(con)
  return(df)
}


## 4B load barcodes

In [29]:
# load files
barcode_file = read.table('../data/external/barcodes_list_new.txt', header=FALSE)
# check barcodes for restriction sites within barcode and 
barcode_file_checked <- barcode_file %>%
    mutate(pass_restrict = !str_detect(as.character(V1), RS_LIST)) %>%
    filter(pass_restrict==TRUE) %>%
    mutate(barcode = as.character(V1))

# barcode_file_checked <- check_seq(barcode_file, "xbai_on")
barcode_file_checked <- filter(barcode_file_checked,pass_restrict==TRUE )
barcode_df <-  select(barcode_file_checked,barcode)

# shuffle barcodes
str(barcode_df)
barcode_df_shuf <- slice(barcode_df, sample(1:n()))

'data.frame':	399427 obs. of  1 variable:
 $ barcode: chr  "CTTAAGGCTTAGAAGAGTAT" "AAGATGGCTTAGAAGAGTAT" "GTATAGGCTTAGAAGAGTAT" "GATACGGCTTAGAAGAGTAT" ...


## 4C load sequences, 
check the edge cases and change the edges if it's bad

### pre edge manipulation

In [58]:
mpra_lib_seq = read.csv(paste0(save_dir, 'mpra_lib_seq.csv'),stringsAsFactors=F)
dim(mpra_lib_seq)
head(mpra_lib_seq)

Unnamed: 0_level_0,X,chr,name,seq,seq_type,start,stop
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>
1,0,chr14,ARID5B::KLF5+GDSD6+8+A_B,ACTCCCAAAACTTCCTAACAGAAGAGCGGTTGAAGGCAAGTGAGCTGCAAATAGCAATCTCTGCCTCTCCCCAAATAAATCCTCCCAAGGCTCACACTCAAGATTTGTGAAAGTTCCAACTCCTCCGGCGGGTGA,vocab,69253380,69253514
2,1,chr6,ARID5B::KLF5+GDSD6+5+A_B,CAGGTGGCCTTGATCCAGGCAAAATCTGGTTAAGATTCGGTTCACCAGAGAAACAGAACCAATAGGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTAGATGAGAGATTCGTTTTAAGGAATTGGC,vocab,13477381,13477515
3,2,chr6,ARID5B::KLF5+GDSD6+6+A_B,CAGGTGGCCTTGATCCAGGCAAAATCTGAGTGGTAGTTCATTCACCAGAGAAACAGAACCAATAGGGGGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTAGATGAGAGATTCGTTTTAAGGAATTGGC,vocab,13477381,13477515
4,3,chr17,ARID5B::KLF5+GDSD6+10+A_B,GACGGCGGCGACGCGCGGGCGGATTCAGCGGAGCGATCCACGAATTTGCGCCACTTCCCTATTCACCAACCCCGCCCCCGACTATCGGGGACTGCCCCCCCCGCTAAGATACGCCACTACCTCAGCGGCGCTAAC,vocab,7145720,7145854
5,4,chr19,ARID5B::KLF5+GDSD6+7+A_B,TCTTCCCCTCCACGCTGCGGGCTATGGGTACGCAGGGTATGCGTTTAATGAGTTGAATGATGCCCCACCCTAAAATATATGTCCACATTCCAGAACTGTGAATGTGACTTGACTTGGAGTAAGTGTCTTCAAAGA,vocab,14616931,14617065
6,5,chr1,ARID5B::KLF5+GDSD6+2+A_B,CGGAGCTTTCTGTCCCTGCTGTAGCAGGACTGCAACCCCAGACAGTATGGAAGGGAATGGGCGTGGCTCTGTTCCAATACTTCATTTATGGACGCTGAAGTTTGATTTCCACATAATTTTCACATGTCAGAAAGT,vocab,152008405,152008539


In [59]:
# check sequences for edge cases that go bad with addition of adaptors/rs sites
mpra_lib_seq  = mpra_lib_seq%>%
    mutate(pass_restrict_seq = !str_detect(seq, RS_LIST)) %>%
    mutate(pass_restrict_fwd_seq = !str_detect(str_c(FORWARD_PCR_PRIMER,seq), RS_LIST))%>%
    mutate(pass_restrict_seq_xho = !str_detect(str_c(seq,str_sub(RS_XHOI,1,-2)), RS_LIST))
# sequences
dim(mpra_lib_seq)
dim(distinct(select(mpra_lib_seq,name)))
# seq that aren't solvable
dim(distinct(select(filter(mpra_lib_seq, pass_restrict_seq==TRUE),name)))
# seq that aren't solvable b/c of adaptor and Xho addition (the ends of the seq)
dim(distinct(select(filter(mpra_lib_seq, pass_restrict_fwd_seq==TRUE),name)))
dim(distinct(select(filter(mpra_lib_seq, pass_restrict_seq_xho==TRUE),name)))


# filter(mpra_lib_seq, pass_restrict_fwd_seq==FALSE)
# filter(mpra_lib_seq, pass_restrict_seq_xho==FALSE)

In [60]:
unique(nchar(mpra_lib_seq$seq))

In [61]:
# ## testing
# seq = 'AATTCTGATGAATGCATAAGTTAAACATTCAAGCTGGAGAAACTGGTGCCTGAGTACAAGGCCTGGAATGTGAAAACAAATACATTAAGACCTCGCCTGGACTTTCTCAGACCCTAACATCTGATCGAAGGCATT'
# seq
# paste0('CA',str_sub(seq, 3,-1))
# paste0(str_sub(seq, 1,-2),'C')

### post edge manipulation

- for things that miss the fwd_seq filter, change the first two letters to CA
- for things that miss the seq_xho filter, change the last letters to T 

keep modifying until something works

In [62]:

mpra_lib_seq = mpra_lib_seq%>%
    mutate(seq = if_else(pass_restrict_fwd_seq==TRUE, seq, paste0('CA',str_sub(seq, 3,-1))))%>%
    mutate(seq = if_else(pass_restrict_seq_xho==TRUE, seq, paste0(str_sub(seq, 1,-2),'C')))


In [63]:
# check sequences for edge cases that go bad with addition of adaptors/rs sites
mpra_lib_seq  = mpra_lib_seq%>%
    mutate(pass_restrict_seq = !str_detect(seq, RS_LIST)) %>%
    mutate(pass_restrict_fwd_seq = !str_detect(str_c(FORWARD_PCR_PRIMER,seq), RS_LIST))%>%
    mutate(pass_restrict_seq_xho = !str_detect(str_c(seq,str_sub(RS_XHOI,1,-2)), RS_LIST))
# sequences
dim(mpra_lib_seq)
dim(distinct(select(mpra_lib_seq,name)))
# seq that aren't solvable
dim(distinct(select(filter(mpra_lib_seq, pass_restrict_seq==TRUE),name)))
# seq that aren't solvable b/c of adaptor and Xho addition (the ends of the seq)
dim(distinct(select(filter(mpra_lib_seq, pass_restrict_fwd_seq==TRUE),name)))
dim(distinct(select(filter(mpra_lib_seq, pass_restrict_seq_xho==TRUE),name)))



## 6D make lib --> oligo (20x for barcode adding)

In [64]:
# making oligo sequences (try 20 first)
oligo_df <- mpra_lib_seq %>%
   group_by(name,seq, seq_type) %>% 
     expand(count = seq(1:as.numeric(NUM_OLIGO_PER_MOTIF*2))) %>% 
   ungroup()

In [65]:
# check if adding the FWD PRIMER +/- the 
oligo_df <- oligo_df %>%
  mutate(id = str_c(name,count, sep='.')) %>%
  mutate(pass_restrict_seq = !str_detect(seq, RS_LIST)) %>%
  mutate(pass_restrict_fwd_seq = !str_detect(str_c(FORWARD_PCR_PRIMER,seq), RS_LIST))%>%
  mutate(pass_restrict_seq_xho = !str_detect(str_c(seq,str_sub(RS_XHOI,1,-2)), RS_LIST))


In [66]:
# sequences
dim(oligo_df)
dim(distinct(select(oligo_df,name)))
# seq that aren't solvable
dim(distinct(select(filter(oligo_df, pass_restrict_seq==TRUE),name)))
# seq that aren't solvable b/c of adaptor and Xho addition (the ends of the seq)
dim(distinct(select(filter(oligo_df, pass_restrict_fwd_seq==TRUE),name)))
dim(distinct(select(filter(oligo_df, pass_restrict_seq_xho==TRUE),name)))


In [67]:
unique(nchar(oligo_df$seq))

as seen from the above edge manipulation section, all sequences are good

### adding barcodes to oligo library

In [68]:
oligo_df['barcode'] = barcode_df_shuf[1:nrow(oligo_df),]

In [69]:
colnames(oligo_df)
dim(oligo_df)
head(oligo_df)
# unique(oligo_df$name)[1:40]

name,seq,seq_type,count,id,pass_restrict_seq,pass_restrict_fwd_seq,pass_restrict_seq_xho,barcode
<chr>,<chr>,<chr>,<int>,<chr>,<lgl>,<lgl>,<lgl>,<chr>
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,1,ABCF3.1,True,True,True,GAAGTCGGCGTCAATAGTCC
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,2,ABCF3.2,True,True,True,CGATTGCAGGAGAATAGTCC
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,3,ABCF3.3,True,True,True,GTCATAGTCAGAGAAGCTCT
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,4,ABCF3.4,True,True,True,AGTTGGAATGAAGGCCTCGA
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,5,ABCF3.5,True,True,True,GTATAAAGCGCTGGCCTCGA
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,6,ABCF3.6,True,True,True,AGAACTATAAGTCGAGATCT


### adding all the parts of sequences together

In [70]:
oligo_df_annon = oligo_df %>%
  mutate(len_seq = str_length(seq)) %>%
  mutate(len_filler = LEN_FILLER)%>% #max(len_motif_combo) - len_motif_combo) %>%
  group_by(1:n()) %>%
  mutate(filler = gen_random_seq_restrict(n= len_filler))%>%
  ungroup()%>%
  mutate(filler_check = str_length(filler)==len_filler)%>%
  mutate(oligo = str_c(FORWARD_PCR_PRIMER, seq, RS_XHOI, filler, RS_NHEI, barcode, REVERSE_PCR_PRIMER))%>%
  mutate(len_oligo = str_length(oligo))%>%
  mutate(pass_restrict_ECORI = !str_detect(oligo, RS_ECORI)) %>%
  mutate(pass_restrict_BAMHI = !str_detect(oligo, RS_BAMHI)) %>%
  mutate(pass_restrict_XHOI = str_count(oligo, RS_XHOI) == 1) %>%
  mutate(pass_restrict_NHEI = str_count(oligo, RS_NHEI) == 1) %>%
  mutate(pass_all =filler_check& pass_restrict_ECORI & pass_restrict_BAMHI& pass_restrict_XHOI& pass_restrict_NHEI)

In [71]:
# how many errors
nrow(oligo_df_annon)
nrow(distinct(select(filter(oligo_df_annon, pass_all==FALSE),id)))
nrow(distinct(select(filter(oligo_df_annon, pass_restrict_ECORI==FALSE),id)))
nrow(distinct(select(filter(oligo_df_annon, pass_restrict_BAMHI==FALSE),id)))
nrow(distinct(select(filter(oligo_df_annon, pass_restrict_XHOI==FALSE),id)))
nrow(distinct(select(filter(oligo_df_annon, pass_restrict_NHEI==FALSE),id)))


### filter out the bad rs ones and saving


In [72]:
oligo_df_filt <- oligo_df_annon %>%
   filter(pass_all==TRUE)%>%
   group_by(name, seq) %>%
   do(sample_n(.,NUM_OLIGO_PER_MOTIF)) %>% 
   mutate(count =  1:n()) %>%
   ungroup()%>%
  mutate(id = str_c(name,count, sep='.'))%>%
 # recheck
  mutate(pass_restrict_ECORI = !str_detect(oligo, RS_ECORI)) %>%
  mutate(pass_restrict_BAMHI = !str_detect(oligo, RS_BAMHI)) %>%
  mutate(pass_restrict_XHOI = str_count(oligo, RS_XHOI) == 1) %>%
  mutate(pass_restrict_NHEI = str_count(oligo, RS_NHEI) == 1) %>%
  mutate(pass_all = pass_restrict_ECORI & pass_restrict_BAMHI& pass_restrict_XHOI& pass_restrict_NHEI)

In [73]:
# number of fails
nrow(distinct(select(filter(oligo_df_filt, pass_all==FALSE),id))) # should be 0
#number of distinct barcodes
summarize(oligo_df_filt,n_distinct( barcode)) # should be equal to num motifs * 10

n_distinct(barcode)
<int>
97210


In [74]:
colnames(oligo_df_filt)
dim(oligo_df_filt)
head(oligo_df_filt)


name,seq,seq_type,count,id,pass_restrict_seq,pass_restrict_fwd_seq,pass_restrict_seq_xho,barcode,len_seq,⋯,1:n(),filler,filler_check,oligo,len_oligo,pass_restrict_ECORI,pass_restrict_BAMHI,pass_restrict_XHOI,pass_restrict_NHEI,pass_all
<chr>,<chr>,<chr>,<int>,<chr>,<lgl>,<lgl>,<lgl>,<chr>,<int>,⋯,<int>,<chr>,<lgl>,<chr>,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,1,ABCF3.1,True,True,True,AGTTGGAATGAAGGCCTCGA,136,⋯,4,TCCTTACAAAAGAACGGCCG,True,ACTGGCCGCTTCACTGACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCACTCGAGTCCTTACAAAAGAACGGCCGGCTAGCAGTTGGAATGAAGGCCTCGAAGATCGGAAGAGCGTCG,221,True,True,True,True,True
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,2,ABCF3.2,True,True,True,GTCATAGTCAGAGAAGCTCT,136,⋯,3,TCTCTTGACGCGATGCGAGC,True,ACTGGCCGCTTCACTGACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCACTCGAGTCTCTTGACGCGATGCGAGCGCTAGCGTCATAGTCAGAGAAGCTCTAGATCGGAAGAGCGTCG,221,True,True,True,True,True
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,3,ABCF3.3,True,True,True,GATACTTCATGCAATATGCT,136,⋯,19,TGCTATGGGGGATAGATCTT,True,ACTGGCCGCTTCACTGACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCACTCGAGTGCTATGGGGGATAGATCTTGCTAGCGATACTTCATGCAATATGCTAGATCGGAAGAGCGTCG,221,True,True,True,True,True
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,4,ABCF3.4,True,True,True,CGATTGCAGGAGAATAGTCC,136,⋯,2,ACCGTGCCAGGGCTCACCGT,True,ACTGGCCGCTTCACTGACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCACTCGAGACCGTGCCAGGGCTCACCGTGCTAGCCGATTGCAGGAGAATAGTCCAGATCGGAAGAGCGTCG,221,True,True,True,True,True
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,5,ABCF3.5,True,True,True,TGAGAACCAGTTCCAACGCA,136,⋯,11,AGACTGGTAGTGAACCTGAT,True,ACTGGCCGCTTCACTGACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCACTCGAGAGACTGGTAGTGAACCTGATGCTAGCTGAGAACCAGTTCCAACGCAAGATCGGAAGAGCGTCG,221,True,True,True,True,True
ABCF3,ACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCA,pos,6,ABCF3.6,True,True,True,GCTGGCGGCAATCGCTCAAC,136,⋯,8,AGAGTAGGATAAGAGCCGCT,True,ACTGGCCGCTTCACTGACCTTGAGCTCCCAGCCTTTGCCCCACCCCTCGCTCCCGGAACTCCACCTCCCAGAAGGCAGCGAGAACCGCACATGTGGCTAATCTTTCAGCGGAAAAGGTGTCGCGCACGCGCAGAGAGCGCGGCTCCGGCGCACTCGAGAGAGTAGGATAAGAGCCGCTGCTAGCGCTGGCGGCAATCGCTCAACAGATCGGAAGAGCGTCG,221,True,True,True,True,True


In [75]:
unique(nchar(oligo_df_filt$oligo))

oligos are 221 bp long

In [76]:
write.csv(oligo_df_filt, file=paste0(save_dir,'mpra_oligo_df_filt.csv'))
oligo_df_filt = read.csv(paste0(save_dir,'mpra_oligo_df_filt.csv'),stringsAsFactors=F)

## 6E create submission worthy txt files
- `../data/processed/fig4_modelling/vocab_sum_final/mpra_oligo_df_final.txt` - with columns: name, seq_type, id, oligo, seq, filler, barcode
- `../data/processed/fig4_modelling/vocab_sum_final/mpra_oligo_df_final_SUBMIT.txt` - with columns: id, oligo


In [77]:
write.table(select(oligo_df_filt, name, seq_type, id, oligo, seq, filler, barcode), file=paste0(save_dir,'mpra_oligo_df_final.txt'),sep='\t', quote = FALSE,row.names=FALSE)
write.table(select(oligo_df_filt,  id, oligo), file=paste0(save_dir,'mpra_oligo_df_final_SUBMIT.txt'),sep='\t', quote = FALSE,row.names=FALSE)
