<h1><center> 
MFE predictor using ViennaRNA
</center></h1>

<center>
Luis R. Soenksen<sup>1,2*</sup>, Nicolaas M Angenent-Mari<sup>1,2*</sup>, Diogo M. Camacho<sup>2*</sup>, Alexander S. Garruss<sup>2,3*</sup>, Katherine M Collins<sup>1*</sup>, Sameed Siddiqui<sup>1,2</sup>, George Church<sup>1,2,3,4</sup>, Timothy K. Lu<sup>1,4</sup>, and James J. Collins<sup>1,2,3,4</sup>
</center>


<center><font color=gray><font size="1.5">
<sup>1</sup>Massachusetts Institute of Technology, <sup>2</sup>Wyss Institute for Biologically Inspired Engineering, <sup>3</sup>Harvard John A. Paulson School of Engineering and Applied Sciences, and <sup>4</sup>Broad Institute of MIT and Harvard. *Contributed equally
</font></font></center>


---------------------------------------------------------------------------------------------

In [1]:
## Import Libraries
# General system libraries
import os
import numpy as np
import pandas as pd
from time import time
from IPython.display import Image

# DNA/RNA Analysis Libraries (Biopython, ViennaRNA, pysster) 
# Biopython Lib
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_rna, generic_dna, generic_protein, IUPAC
# ViennaRNA Lib
import RNA
# pysster Lib
from pysster import utils
from pysster.Data import Data
from pysster.Grid_Search import Grid_Search
from pysster.One_Hot_Encoder import One_Hot_Encoder
from pysster.Alphabet_Encoder import Alphabet_Encoder

# Import TPOT libs
from tpot import TPOTRegressor

# Import sklearn libs
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_absolute_error
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.metrics import median_absolute_error, r2_score

# Math & Visualization Libs
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Warnings
import warnings

Using TensorFlow backend.


### Import Dataset
Let's see what our file looks like.

In [5]:
# Ensure the local data directory exists (a no-op when it is already there)
data_folder = "data/"
os.makedirs(data_folder, exist_ok=True)

# Build the path to the Toehold dataset (.csv) and load it into a DataFrame
data_filename = "2019-01-24_toehold_dataset.csv"
data_path = f"{data_folder}{data_filename}"
data = pd.read_csv(data_path)

# Preview the first rows of the loaded frame
data.head()

Unnamed: 0,organism,sequence_class,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,stem1,atg,stem2,linker,post_linker,random_switch_stem2,random_mfe,true_mfe,target
0,zika,viral,zika_tile_1,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,AGTTGTTGATCTGTGTGAATCAGACTGCGA,AACCAAACACACAAACGCAC,TCGCAGTCTGATTCACACAGATCAACAACT,AACAGAGGAGA,AGTTGT,ATG,TCTGTGTGA,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAA...,ATAAAAAAAGGCAGCGACAGTGGCTCGCGGGGCTTTTGATCCCCAG...,,-18.6,-18.6
1,zika,viral,zika_tile_2,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TTGATCTGTGTGAATCAGACTGCGACAGTT,AACCAAACACACAAACGCAC,AACTGTCGCAGTCTGATTCACACAGATCAA,AACAGAGGAGA,TTGATC,ATG,GTGAATCAG,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAA...,CCTACCCGCCGTGCCGCGCGCTCGCGTCACAAATGTAGGTGTATCG...,,-17.8,-17.8
2,zika,viral,zika_tile_3,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CTGTGTGAATCAGACTGCGACAGTTCGAGT,AACCAAACACACAAACGCAC,ACTCGAACTGTCGCAGTCTGATTCACACAG,AACAGAGGAGA,CTGTGT,ATG,TCAGACTGC,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAA...,TGGCTCGATAGGGCAAGCCCTTTTCGCCCACGTACATAACATTGTC...,,-21.7,-21.7
3,zika,viral,zika_tile_4,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TGAATCAGACTGCGACAGTTCGAGTTTGAA,AACCAAACACACAAACGCAC,TTCAAACTCGAACTGTCGCAGTCTGATTCA,AACAGAGGAGA,TGAATC,ATG,CTGCGACAG,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAA...,GATGTTCCCTTAAACCGGTTCATGCTGTTTTTGTGCGAGCCGCCAA...,,-24.1,-24.1
4,zika,viral,zika_tile_5,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CAGACTGCGACAGTTCGAGTTTGAAGCGAA,AACCAAACACACAAACGCAC,TTCGCTTCAAACTCGAACTGTCGCAGTCTG,AACAGAGGAGA,CAGACT,ATG,ACAGTTCGA,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAA...,GGCTTCTTCGCTCTCGTCCTGACTGGCGTCTTGTAGTCGCGTCTTT...,,-23.3,-23.3


In [7]:
### Random DNA 59-Nucleotide switch region to get MFE
random_switch_stem2_data = data['random_switch_stem2']

# Compute minimum free energy (MFE) and corresponding structure from each
# sequence (using ViennaRNA). RNA.fold is unpacked below as a
# (structure, mfe) pair.
# see: https://www.tbi.univie.ac.at/RNA/ViennaRNA/doc/html/examples_python.html
# NOTE(review): the previewed sequences use the DNA alphabet (T rather than U);
# confirm the installed ViennaRNA version treats T as U when folding.
random_mfe_data = []
random_ss_data = []
for seq in random_switch_stem2_data:
    random_ss_seq, random_mfe_seq = RNA.fold(seq)
    # Keep both results: these accumulators were previously declared but never
    # filled, so the predicted structures were silently thrown away.
    random_ss_data.append(random_ss_seq)
    random_mfe_data.append(random_mfe_seq)

# Assign the whole column at once (one value per row, in row order) instead of
# writing the frame cell-by-cell inside the loop.
data['random_mfe'] = random_mfe_data

# print output
print("MFEs calculated from Random Sequences!")

MFEs calculated from Random Sequences!


In [9]:
# Save the dataset, now carrying the computed 'random_mfe' values, as a CSV in
# the current working directory.
# NOTE(review): to_csv is called with its defaults, so the frame's index is
# written as an additional leading column next to the existing 'Unnamed: 0'
# column -- confirm downstream readers expect that layout.
data.to_csv(path_or_buf="2019-01-24_toehold_dataset_wRandomMFE.csv")

# Show the first rows of the updated dataframe
data.head()

Unnamed: 0,organism,sequence_class,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,stem1,atg,stem2,linker,post_linker,random_switch_stem2,random_mfe,true_mfe,target
0,zika,viral,zika_tile_1,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,AGTTGTTGATCTGTGTGAATCAGACTGCGA,AACCAAACACACAAACGCAC,TCGCAGTCTGATTCACACAGATCAACAACT,AACAGAGGAGA,AGTTGT,ATG,TCTGTGTGA,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAA...,ATAAAAAAAGGCAGCGACAGTGGCTCGCGGGGCTTTTGATCCCCAG...,-15.3,-18.6,-18.6
1,zika,viral,zika_tile_2,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TTGATCTGTGTGAATCAGACTGCGACAGTT,AACCAAACACACAAACGCAC,AACTGTCGCAGTCTGATTCACACAGATCAA,AACAGAGGAGA,TTGATC,ATG,GTGAATCAG,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAA...,CCTACCCGCCGTGCCGCGCGCTCGCGTCACAAATGTAGGTGTATCG...,-13.4,-17.8,-17.8
2,zika,viral,zika_tile_3,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CTGTGTGAATCAGACTGCGACAGTTCGAGT,AACCAAACACACAAACGCAC,ACTCGAACTGTCGCAGTCTGATTCACACAG,AACAGAGGAGA,CTGTGT,ATG,TCAGACTGC,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAA...,TGGCTCGATAGGGCAAGCCCTTTTCGCCCACGTACATAACATTGTC...,-12.3,-21.7,-21.7
3,zika,viral,zika_tile_4,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TGAATCAGACTGCGACAGTTCGAGTTTGAA,AACCAAACACACAAACGCAC,TTCAAACTCGAACTGTCGCAGTCTGATTCA,AACAGAGGAGA,TGAATC,ATG,CTGCGACAG,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAA...,GATGTTCCCTTAAACCGGTTCATGCTGTTTTTGTGCGAGCCGCCAA...,-12.9,-24.1,-24.1
4,zika,viral,zika_tile_5,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CAGACTGCGACAGTTCGAGTTTGAAGCGAA,AACCAAACACACAAACGCAC,TTCGCTTCAAACTCGAACTGTCGCAGTCTG,AACAGAGGAGA,CAGACT,ATG,ACAGTTCGA,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAA...,GGCTTCTTCGCTCTCGTCCTGACTGGCGTCTTGTAGTCGCGTCTTT...,-9.6,-23.3,-23.3
