In [1]:
## Import Libraries
# General system libraries
import os
import numpy as np
import pandas as pd
from time import time
from IPython.display import Image

# Multiprocessing
import multiprocessing

# DNA/RNA Analysis Libraries (Biopython, ViennaRNA, pysster) 
# Biopython Lib
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import generic_rna, generic_dna, generic_protein, IUPAC
# ViennaRNA Lib
import RNA
# pysster Lib
from pysster import utils
from pysster.Data import Data
from pysster.Grid_Search import Grid_Search
from pysster.One_Hot_Encoder import One_Hot_Encoder
from pysster.Alphabet_Encoder import Alphabet_Encoder

# Import TPOT libs
from tpot import TPOTRegressor

# Import sklearn libs
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_absolute_error
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.metrics import median_absolute_error, r2_score

# Math & Visualization Libs
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Warnings
import warnings

Using TensorFlow backend.


In [2]:
# Make sure the folder that holds the dataset files exists
# (nothing happens when the directory is already there)
data_folder = "data/"
os.makedirs(data_folder, exist_ok=True)

In [3]:
# JOIN FORWARD AND REVERSE READ FILES
# Load the first half of the Toehold dataset (.csv)
data_filename_A = "2019-02-06_toehold_dataset_A.csv"
data_path_A = data_folder + data_filename_A
data_A = pd.read_csv(data_path_A)

# Load the second half of the Toehold dataset (.csv)
data_filename_B = "2019-02-06_toehold_dataset_B.csv"
data_path_B = data_folder + data_filename_B
data_B = pd.read_csv(data_path_B)

# Per-gate read count columns that are summed across the two files
data_fr_list = ['On_Gate1_counts', 'On_Gate2_counts', 'On_Gate3_counts', 'On_Gate4_counts', 'Off_Gate1_counts', 'Off_Gate2_counts', 'Off_Gate3_counts', 'Off_Gate4_counts']

# Start from an independent copy of A: plain assignment would only alias
# data_A, and the summed counts written below would silently overwrite it.
data_fr = data_A.copy()

# Add the counts of both dataframes. With fill_value=0 a count missing in
# only one file is treated as 0; a count missing in both files stays NaN.
# `.values` assigns positionally, so this assumes both files list the same
# rows in the same order -- TODO confirm against the files.
data_fr[data_fr_list] = data_A[data_fr_list].add(data_B[data_fr_list], fill_value=0).values

In [4]:
# Write the joined counts to the data folder under the file name of half A
# with its "_A" suffix removed, then preview the first rows
data_path_join = data_folder + data_filename_A.replace('_A.csv','.csv')
data_fr.to_csv(path_or_buf=data_path_join, index=False)
display(data_fr.head(n=5))

Unnamed: 0,off_id,on_id,source_sequence,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,...,post_linker,nupack_mfe,On_Gate1_counts,On_Gate2_counts,On_Gate3_counts,On_Gate4_counts,Off_Gate1_counts,Off_Gate2_counts,Off_Gate3_counts,Off_Gate4_counts
0,AACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAAAAC...,AACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2626,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,AACTGTTTTCCATTTTTTTTTTTTTTTTTT,AACAACAACAAACAA,AAAAAAAAAAAAAAAAAATGGAAAACAGTT,AACAGAGGAGA,...,TAAAGGAGAA,-13.0,,,,,,,,
1,AACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAGTTA...,TTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2625,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TTAGTAACTGTTTTCCATTTTTTTTTTTTT,AACAACAACAAACAA,AAAAAAAAAAAAATGGAAAACAGTTACTAA,AACAGAGGAGA,...,TAAAGGAGAA,-16.4,,,,,,,,
2,AACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTTAAT...,CTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_4951,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CTAAATTAACAATAGTAGTAATTTTTTTTT,AACAACAACAAACAA,AAAAAAAAATTACTACTATTGTTAATTTAG,AACAGAGGAGA,...,TAAAGGAGAA,-13.2,,,,,,,,
3,AACCAAACACACAAACGCACAAAAAAAATAACGTAGGACTACTACT...,TCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_6492,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TCCAAGTAGTAGTCCTACGTTATTTTTTTT,AACAACAACAAACAA,AAAAAAAATAACGTAGGACTACTACTTGGA,AACAGAGGAGA,...,TAAAGGAGAA,-21.5,10.0,17.0,8.0,7.0,1.0,15.0,8.0,
4,AACCAAACACACAAACGCACAAAAAAAATGGAAAACAGTTACTAAT...,ACATATTAGTAACTGTTTTCCATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2624,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,ACATATTAGTAACTGTTTTCCATTTTTTTT,AACAACAACAAACAA,AAAAAAAATGGAAAACAGTTACTAATATGT,AACAGAGGAGA,...,TAAAGGAGAA,-14.9,,,,,4.0,3.0,3.0,


In [5]:
# Location of the Toehold dataset file (.csv) to process
data_filename = "2019-02-06_toehold_dataset.csv"
data_path = data_folder + data_filename

# Read the file and replace every NaN with zero in one step
data = pd.read_csv(data_path).fillna(value=0)

# Preview the first rows of the dataframe
data.head()

Unnamed: 0,off_id,on_id,source_sequence,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,...,post_linker,nupack_mfe,On_Gate1_counts,On_Gate2_counts,On_Gate3_counts,On_Gate4_counts,Off_Gate1_counts,Off_Gate2_counts,Off_Gate3_counts,Off_Gate4_counts
0,AACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAAAAC...,AACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2626,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,AACTGTTTTCCATTTTTTTTTTTTTTTTTT,AACAACAACAAACAA,AAAAAAAAAAAAAAAAAATGGAAAACAGTT,AACAGAGGAGA,...,TAAAGGAGAA,-13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAGTTA...,TTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2625,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TTAGTAACTGTTTTCCATTTTTTTTTTTTT,AACAACAACAAACAA,AAAAAAAAAAAAATGGAAAACAGTTACTAA,AACAGAGGAGA,...,TAAAGGAGAA,-16.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTTAAT...,CTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_4951,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CTAAATTAACAATAGTAGTAATTTTTTTTT,AACAACAACAAACAA,AAAAAAAAATTACTACTATTGTTAATTTAG,AACAGAGGAGA,...,TAAAGGAGAA,-13.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AACCAAACACACAAACGCACAAAAAAAATAACGTAGGACTACTACT...,TCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_6492,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TCCAAGTAGTAGTCCTACGTTATTTTTTTT,AACAACAACAAACAA,AAAAAAAATAACGTAGGACTACTACTTGGA,AACAGAGGAGA,...,TAAAGGAGAA,-21.5,10.0,17.0,8.0,7.0,1.0,15.0,8.0,0.0
4,AACCAAACACACAAACGCACAAAAAAAATGGAAAACAGTTACTAAT...,ACATATTAGTAACTGTTTTCCATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2624,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,ACATATTAGTAACTGTTTTCCATTTTTTTT,AACAACAACAAACAA,AAAAAAAATGGAAAACAGTTACTAATATGT,AACAGAGGAGA,...,TAAAGGAGAA,-14.9,0.0,0.0,0.0,0.0,4.0,3.0,3.0,0.0


In [6]:
# Count columns of the four gates (bins), for the ON and the OFF state
on_gate_vars = [
    'On_Gate1_counts',
    'On_Gate2_counts',
    'On_Gate3_counts',
    'On_Gate4_counts',
]
off_gate_vars = [
    'Off_Gate1_counts',
    'Off_Gate2_counts',
    'Off_Gate3_counts',
    'Off_Gate4_counts',
]

# Total Counts in Bin N (CountN): column-wise sum over all rows of `data`
countN_on_gates = data.loc[:, on_gate_vars].sum(axis=0)
countN_off_gates = data.loc[:, off_gate_vars].sum(axis=0)

# Total Fraction of cells in Bin N (FractN): fixed values, one per gate
fractN_on_gates = pd.Series(data=[0.2725, 0.3602, 0.2367, 0.1313], index=on_gate_vars)
fractN_off_gates = pd.Series(data=[0.4536, 0.3877, 0.1293, 0.0311], index=off_gate_vars)

# Report each series under its heading, followed by a blank line
for report_title, report_series in (
    ('Calculated CountN sums: ON', countN_on_gates),
    ('Calculated CountN sums: OFF', countN_off_gates),
    ('Defined FractN sums: ON', fractN_on_gates),
    ('Defined FractN sums: OFF', fractN_off_gates),
):
    print(report_title)
    print(report_series)
    print()

Calculated CountN sums: ON
On_Gate1_counts    377075.0
On_Gate2_counts    333920.0
On_Gate3_counts    511699.0
On_Gate4_counts    534370.0
dtype: float64

Calculated CountN sums: OFF
Off_Gate1_counts     802916.0
Off_Gate2_counts     962969.0
Off_Gate3_counts    1217091.0
Off_Gate4_counts     810745.0
dtype: float64

Defined FractN sums: ON
On_Gate1_counts    0.2725
On_Gate2_counts    0.3602
On_Gate3_counts    0.2367
On_Gate4_counts    0.1313
dtype: float64

Defined FractN sums: OFF
Off_Gate1_counts    0.4536
Off_Gate2_counts    0.3877
Off_Gate3_counts    0.1293
Off_Gate4_counts    0.0311
dtype: float64



In [7]:
# Build the processed dataframe on an independent copy of `data`.
# A plain assignment would only alias `data`, so every column inserted below
# would also be written into the original frame.
data_proc = data.copy()

# One entry per switch state:
# (column suffix, gate count columns, FractN per gate, CountN per gate)
gate_groups = (
    ('on', on_gate_vars, fractN_on_gates, countN_on_gates),
    ('off', off_gate_vars, fractN_off_gates, countN_off_gates),
)

# Base adjusted count vectors: Cb<k>_<state> = counts * FractN / CountN.
# The ON columns are inserted first, then the OFF columns.
for state, gate_cols, fractN, countN in gate_groups:
    for bin_no, gate_col in enumerate(gate_cols, start=1):
        data_proc['Cb{}_{}'.format(bin_no, state)] = (
            data[gate_col] * fractN[gate_col] / countN[gate_col]
        )

# Count-normalized adjusted count vectors: each Cb value divided by the
# per-row sum of the Cb values of the same state. The row sum does not
# change inside the loop, so it is computed once per state.
# Rows whose counts are all zero give 0/0 = NaN here; the NaN is left in
# place and propagates into the ON / OFF / ON_OFF columns below.
for state, gate_cols, _fractN, _countN in gate_groups:
    cb_cols = ['Cb{}_{}'.format(bin_no, state) for bin_no in range(1, len(gate_cols) + 1)]
    cb_row_total = data_proc[cb_cols].sum(axis=1)
    for bin_no, cb_col in enumerate(cb_cols, start=1):
        data_proc['Cbn{}_{}'.format(bin_no, state)] = data_proc[cb_col] / cb_row_total

# Collapse the normalized vectors into single ON and OFF values using the
# weights 0, 1/3, 2/3 and 1 for gates 1 to 4, then take their difference.
# (The same collapse applied to the un-normalized Cb columns is the
# alternative that was previously left here as commented-out code.)
data_proc['ON'] = (
    data_proc['Cbn1_on'] * 0
    + data_proc['Cbn2_on'] * (1/3)
    + data_proc['Cbn3_on'] * (2/3)
    + data_proc['Cbn4_on'] * (1)
)
data_proc['OFF'] = (
    data_proc['Cbn1_off'] * 0
    + data_proc['Cbn2_off'] * (1/3)
    + data_proc['Cbn3_off'] * (2/3)
    + data_proc['Cbn4_off'] * (1)
)
data_proc['ON_OFF'] = data_proc['ON'] - data_proc['OFF']


In [11]:
#Show last section of dataframe
#data_proc[['Cbn1_on','Cbn2_on','Cbn3_on','Cbn4_on','Cbn1_off','Cbn2_off','Cbn3_off','Cbn4_off','ON','OFF','ON_OFF']]

In [9]:
# Save the processed dataframe next to the input file, with "_proc" appended
# to the file name, then preview the first rows
data_path_proc = data_folder + data_filename.replace('.csv','_proc.csv')
data_proc.to_csv(path_or_buf=data_path_proc, index=False)
display(data_proc.head(n=5))

Unnamed: 0,off_id,on_id,source_sequence,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,...,Cbn2_on,Cbn3_on,Cbn4_on,Cbn1_off,Cbn2_off,Cbn3_off,Cbn4_off,ON,OFF,ON_OFF
0,AACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAAAAC...,AACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2626,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,AACTGTTTTCCATTTTTTTTTTTTTTTTTT,AACAACAACAAACAA,AAAAAAAAAAAAAAAAAATGGAAAACAGTT,AACAGAGGAGA,...,,,,,,,,,,
1,AACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAGTTA...,TTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2625,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TTAGTAACTGTTTTCCATTTTTTTTTTTTT,AACAACAACAAACAA,AAAAAAAAAAAAATGGAAAACAGTTACTAA,AACAGAGGAGA,...,,,,,,,,,,
2,AACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTTAAT...,CTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_4951,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CTAAATTAACAATAGTAGTAATTTTTTTTT,AACAACAACAAACAA,AAAAAAAAATTACTACTATTGTTAATTTAG,AACAGAGGAGA,...,,,,,,,,,,
3,AACCAAACACACAAACGCACAAAAAAAATAACGTAGGACTACTACT...,TCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_6492,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TCCAAGTAGTAGTCCTACGTTATTTTTTTT,AACAACAACAAACAA,AAAAAAAATAACGTAGGACTACTACTTGGA,AACAGAGGAGA,...,0.591829,0.119432,0.055509,0.075791,0.81019,0.114019,0.0,0.332407,0.346076,-0.013669
4,AACCAAACACACAAACGCACAAAAAAAATGGAAAACAGTTACTAAT...,ACATATTAGTAACTGTTTTCCATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2624,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,ACATATTAGTAACTGTTTTCCATTTTTTTT,AACAACAACAAACAA,AAAAAAAATGGAAAACAGTTACTAATATGT,AACAGAGGAGA,...,,,,0.596826,0.318999,0.084175,0.0,,0.16245,


In [16]:
# Keep only the rows with a defined ON_OFF value and remove the intermediate
# (un-normalized) Cb columns
data_proc_on_off = (
    data_proc
    .dropna(subset=['ON_OFF'])
    .drop(['Cb1_on','Cb2_on','Cb3_on','Cb4_on','Cb1_off','Cb2_off','Cb3_off','Cb4_off'], axis=1)
)

# Save the result with "_proc_on_off" appended to the file name and preview it
data_path_proc_on_off = data_folder + data_filename.replace('.csv','_proc_on_off.csv')
data_proc_on_off.to_csv(path_or_buf=data_path_proc_on_off, index=False)
display(data_proc_on_off.head(n=5))

Unnamed: 0,off_id,on_id,source_sequence,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,...,Cbn2_on,Cbn3_on,Cbn4_on,Cbn1_off,Cbn2_off,Cbn3_off,Cbn4_off,ON,OFF,ON_OFF
3,AACCAAACACACAAACGCACAAAAAAAATAACGTAGGACTACTACT...,TCCAAGTAGTAGTCCTACGTTATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_6492,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TCCAAGTAGTAGTCCTACGTTATTTTTTTT,AACAACAACAAACAA,AAAAAAAATAACGTAGGACTACTACTTGGA,AACAGAGGAGA,...,0.591829,0.119432,0.055509,0.075791,0.81019,0.114019,0.0,0.332407,0.346076,-0.013669
5,AACCAAACACACAAACGCACAAAAAAAATTTGGATTTATTTATGTC...,ATGAGACATAAATAAATCCAAATTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_19684,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,ATGAGACATAAATAAATCCAAATTTTTTTT,AACAACAACAAACAA,AAAAAAAATTTGGATTTATTTATGTCTCAT,AACAGAGGAGA,...,1.0,0.0,0.0,0.244214,0.72517,0.030616,0.0,0.333333,0.262134,0.071199
6,AACCAAACACACAAACGCACAAAAAAACATGAGCTTTGCTTTTTTC...,ACTTGAAAAAAGCAAAGCTCATGTTTTTTTAACCAAACACACAAAC...,human_PROX1,human_PROX1_tile_176,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,ACTTGAAAAAAGCAAAGCTCATGTTTTTTT,AACAACAACAAACAA,AAAAAAACATGAGCTTTGCTTTTTTCAAGT,AACAGAGGAGA,...,0.0,0.12178,0.802118,0.150849,0.161256,0.59571,0.092185,0.883305,0.543077,0.340228
11,AACCAAACACACAAACGCACAAAAAAAGATTTTTTTCCGATGTTGA...,TGTATCAACATCGGAAAAAAATCTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_7220,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TGTATCAACATCGGAAAAAAATCTTTTTTT,AACAACAACAAACAA,AAAAAAAGATTTTTTTCCGATGTTGATACA,AACAGAGGAGA,...,0.139674,0.459203,0.307549,0.0,0.136372,0.863628,0.0,0.660242,0.621209,0.039033
22,AACCAAACACACAAACGCACAAAAAAATGATTTCCATATCTTTGAT...,ACCCATCAAAGATATGGAAATCATTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_8336,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,ACCCATCAAAGATATGGAAATCATTTTTTT,AACAACAACAAACAA,AAAAAAATGATTTCCATATCTTTGATGGGT,AACAGAGGAGA,...,0.10546,0.587911,0.024022,0.377289,0.604974,0.017737,0.0,0.451116,0.213483,0.237633


-----------------------------------------------------------------------------------------------------------------