This is the model trained on only Green et al. 2014 data with the same architecture as the previously described CNN model. We don't expect this to be able to generalize well as there are only 168 data points.

In [64]:
# import statements 

import os

import platform
import random
import shutil
import sys

import math
import itertools
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
from tqdm import tqdm_notebook as tqdm
import keras
from keras.models import load_model

# some visualization imports
from keras import activations

# various imports for the keras model
from keras.layers.core import Permute
from keras import backend as K
from keras.engine.topology import Layer
import keras as keras
from keras.callbacks import TensorBoard
from keras import metrics as metrics
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Conv1D, Concatenate
from keras.optimizers import SGD
from keras.regularizers import l2

# evaluate performance w/ on and off regression separately 
from scipy.stats import pearsonr, spearmanr 

# imports for the grid search and kfold CV
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score

# data one-hot encoding imports
from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical

# Part 1: Load and clean up the data for use.

In [65]:
# Load in data
data_dir = ''
sequence_file = 'Green2014_clean.csv'
sequences = pd.read_csv(data_dir + sequence_file,sep=',')
print(sequences.head(5))

   Unnamed: 0  Toehold ID                                   Toehold sequence  \
0          11          68  AATGTATGTAATAGTTCGTCGAGGTGTCCAAGCAGAGGAGATGGAC...   
1         117         110  ATGATAATGTAGAGGTGCGGAGTGATTGTAAACAGAGGAGATACAA...   
2         108         100  CGAAGTATTGTAAGGTGTAGTGTGCGTTGAGACAGAGGAGATCAAC...   
3         122         116  TAAGTAAATGAAAGTGTATGTATGTTGCTGGACAGAGGAGACAGCA...   
4          17         117  TCAATAAGGCGGAGTTCGTCGAGGTGCCTGAGCAGAGGAGACAGGC...   

                    Switch region                         Trigger  Avg ONOFF  \
0  AATGTATGTAATAGTTCGTCGAGGTGTCCA  TGGACACCTCGACGAACTATTACATACATT       24.8   
1  ATGATAATGTAGAGGTGCGGAGTGATTGTA  TACAATCACTCCGCACCTCTACATTATCAT        9.7   
2  CGAAGTATTGTAAGGTGTAGTGTGCGTTGA  TCAACGCACACTACACCTTACAATACTTCG       13.6   
3  TAAGTAAATGAAAGTGTATGTATGTTGCTG  CAGCAACATACATACACTTTCATTTACTTA        8.7   
4  TCAATAAGGCGGAGTTCGTCGAGGTGCCTG  CAGGCACCTCGACGAACTCCGCCTTATTGA        8.5   

   sdev ONOFF Toehold Rating  
0      

In [66]:
seqs = sequences['Toehold sequence']
onoff_vals = np.array(sequences['Avg ONOFF'])
onoff_vals = preprocessing.MinMaxScaler().fit_transform(onoff_vals.reshape(-1, 1)) #normalize

# Part 2: Get data in the proper format for ML

In [67]:
from pysster.One_Hot_Encoder import One_Hot_Encoder
alph_letters = 'ATCG'
alph = list(alph_letters)

# one-hot encode
one = One_Hot_Encoder(alph_letters)
def _get_one_hot_encoding(seq):
    one_hot_seq = one.encode(seq)                         
    return one_hot_seq

X = np.stack([_get_one_hot_encoding(s) for s in seqs]).astype(np.float32)
nsamples, nx, ny = X.shape # have to flatten the one hot encoded into one dimension

# reformat for CNN if needed
print('input shape: ', X.shape)
alph_len = len(alph)
seq_len = len(seqs[0])
X = X.reshape(X.shape[0], seq_len, alph_len).astype('float32')
print('modified shape: ', X.shape)

input shape:  (168, 59, 4)
modified shape:  (168, 59, 4)


In [68]:
# reshape and normalize if needed (it is not needed)
y = np.array(onoff_vals)
print('target shape: ', y.shape)

target shape:  (168, 1)


# Part 3. Set-up framework for model. Ensure needed parameters can be varied.

In [69]:
from keras import optimizers
def twoheaded_conv1d(conv_layer_parameters, hidden_layers, dropout_rate = 0.2, reg_coeff = 0.0001,learning_rate=0.001, num_features = 59, num_channels = 4): 
    # num_features = seq length, num_channels = alphabet size (i.e. # nucleotides)
    X_in = Input(shape=(num_features,num_channels),dtype='float32')
    prior_layer = X_in 
    for idx, (kernel_width, num_filters) in enumerate(conv_layer_parameters):
        conv_layer = Conv1D(filters=num_filters, kernel_size=kernel_width, padding='same', name='conv_'+str(idx))(prior_layer) # mimic a kmer
        prior_layer = conv_layer
    H = Flatten()(prior_layer)
    for idx, h in enumerate(hidden_layers): 
        H = Dropout(dropout_rate)(H)
        H = Dense(h, activation='relu', kernel_regularizer=l2(reg_coeff),name='dense_'+str(idx))(H)
    out_on = Dense(1,activation="linear",name='on_output')(H)
    model = Model(inputs=[X_in], outputs=[out_on])
    opt = optimizers.adam(lr = learning_rate)
    model.compile(loss={'on_output': 'mse'},optimizer=opt,metrics=['mse'])
    return model


# Part 4. Define desired model features. Build sample model to view architecture.

In [70]:
# [(kernel_width_layer1, #filters_layer1), (kernel_width_layer2, #filters_layer2), ...]
conv_layer_parameters = [(5,10), (3,5)]
hidden_layer_choices = {5: (150, 60, 15),} # dependent on # filters in final convolutional layer before MLP 
hidden_layers = hidden_layer_choices[5]
dropout_rate = 0.1
l2_reg_coeff = 0.0001
learning_rate = 0.0005 

# build sample master model (to be trained completely later)
sample_model = twoheaded_conv1d(conv_layer_parameters=conv_layer_parameters, hidden_layers= hidden_layers, 
                         dropout_rate=dropout_rate, reg_coeff=l2_reg_coeff, 
                         learning_rate= learning_rate)

# print model architecture 
sample_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        (None, 59, 4)             0         
_________________________________________________________________
conv_0 (Conv1D)              (None, 59, 10)            210       
_________________________________________________________________
conv_1 (Conv1D)              (None, 59, 5)             155       
_________________________________________________________________
flatten_16 (Flatten)         (None, 295)               0         
_________________________________________________________________
dropout_46 (Dropout)         (None, 295)               0         
_________________________________________________________________
dense_0 (Dense)              (None, 150)               44400     
_________________________________________________________________
dropout_47 (Dropout)         (None, 150)               0         
__________

# Part 5. Run K-Fold CV to ensure reliability of performance metrics. For on and off values. 

In [71]:
# define kfold object 
num_folds = 5 # really don't need to run 10 folds here since data is so tiny
seed = 0 # set for reproducability 
random.seed(seed)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state= 0)

# define parameters for training 
num_epochs = 150
patience = int(num_epochs * .1)

In [72]:
# functions to evaluate the model

def r2(preds_y, true_y):
    return pearsonr(preds_y, true_y)[0] ** 2

def compute_metrics(preds_y, true_y): 
    r2_score = r2(preds_y, true_y)[0]
    pearson_corr = pearsonr(preds_y, true_y)[0][0]
    spearman_corr = spearmanr(preds_y, true_y)[0]
    print('R2: ', r2_score)
    print('Pearson: ', pearson_corr)
    print('Spearman: ', spearman_corr)
    return [r2_score, pearson_corr, spearman_corr]

def print_summary_results(avg_metrics, std_metrics): 
    print('Average:')
    print('\tR2:', avg_metrics[0], '\n\tPearson:', avg_metrics[1],'\n\tSpearman:', avg_metrics[2],)
    print('Standard deviation:')
    print('\tR2:', std_metrics[0], '\n\tPearson:', std_metrics[1],'\n\tSpearman:', std_metrics[2],)
    

In [73]:
# run kfold 
cv_scores_on=[]
preds_on = []
true_on = []
fold_count=0
for train, test in kfold.split(X, y): 
    print('Beginning fold #', fold_count)
    # create model w/ parameters as defined
    # NOTE: create a model from scratch each time to ensure no weights are carried over per fold  
    kfold_model = twoheaded_conv1d(conv_layer_parameters=conv_layer_parameters, hidden_layers= hidden_layers, 
                             dropout_rate=dropout_rate, reg_coeff=l2_reg_coeff, 
                             learning_rate= learning_rate)
    
    # split data again for validation set (to be used w/ early stopping)
    X_val, X_test, y_val, y_test = train_test_split(X[test], y[test], train_size = 0.5, test_size = 0.5)
    
    # train the model
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=patience, verbose=0, mode='auto')
    kfold_model.fit(X[train], [y[train][:,0]],epochs=num_epochs, batch_size=128,verbose=0, validation_data=(X_val, [y_val[:,0]]), callbacks=[early_stopping])

    # evaluate the model
    y_preds = np.array(kfold_model.predict(X_test))
    
    # get ON/OFF metrics (mark as on_metrics for simplicity)
    print('--- ON/OFF Metrics ---')
    on_metrics = compute_metrics(y_preds,np.expand_dims(y_test[:,0], 1))
    cv_scores_on.append(on_metrics)
    preds_on.append(np.squeeze(y_preds))
    true_on.append(y_test[:,0])
    
    # delete model to ensure no weights are carried over 
    del kfold_model

    fold_count += 1
    
out_dir = 'metrics/'    
np.savetxt(out_dir + 'only_green_trained_preds.csv', preds_on, delimiter=",")
np.savetxt(out_dir + 'only_green_trained_true.csv', true_on, delimiter=",")

Beginning fold # 0
--- ON/OFF Metrics ---
R2:  0.007461835930195169
Pearson:  -0.08638191900042028
Spearman:  -0.07847947520459901
Beginning fold # 1
--- ON/OFF Metrics ---
R2:  0.060966856704937816
Pearson:  0.24691467494852917
Spearman:  0.14705882352941177
Beginning fold # 2
--- ON/OFF Metrics ---
R2:  0.40303155479837793
Pearson:  0.6348476626706425
Spearman:  0.1361128398079764
Beginning fold # 3
--- ON/OFF Metrics ---
R2:  0.23168635828049927
Pearson:  0.4813380914497618
Spearman:  0.27696078431372556
Beginning fold # 4
--- ON/OFF Metrics ---
R2:  0.31140850741985876
Pearson:  0.5580398797755038
Spearman:  0.5343137254901961


# Part 6. Compute average metrics.


In [74]:
avg_metric_folds_on = np.mean(cv_scores_on, axis = 0) # avg over columns 
std_metric_folds_on = np.std(cv_scores_on, axis = 0) # st dev over columns

In [75]:
print('--- ON/OFF Metrics ---')
print_summary_results(avg_metric_folds_on, std_metric_folds_on)

--- ON/OFF Metrics ---
Average:
	R2: 0.20291102262677377 
	Pearson: 0.3669516779688034 
	Spearman: 0.20319333958734215
Standard deviation:
	R2: 0.14899428335654766 
	Pearson: 0.2612613416919032 
	Spearman: 0.20103372741828057


# Part 7. Train model and save for future use.


In [76]:
# train on more of the data (no testing - use metrics from kfold as final metrics)
# have small held-out data for early stopping

# split data again for validation set (to be used w/ early stopping)
train_size = 0.851
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = train_size, test_size = 1-train_size)

# define parameters for training 
num_epochs = 150
patience = int(num_epochs * .1)

In [77]:
# build model
model = twoheaded_conv1d(conv_layer_parameters=conv_layer_parameters, hidden_layers= hidden_layers, 
                         dropout_rate=dropout_rate, reg_coeff=l2_reg_coeff, 
                         learning_rate= learning_rate)

In [78]:
# train model
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=patience, verbose=2, mode='auto')
model.fit(X_train, [y_train[:,0]],epochs=num_epochs, batch_size=128,verbose=2, validation_data=(X_val, [y_val[:,0]]), callbacks=[early_stopping])


Train on 142 samples, validate on 26 samples
Epoch 1/150
 - 2s - loss: 0.5507 - mean_squared_error: 0.5199 - val_loss: 0.2512 - val_mean_squared_error: 0.2203
Epoch 2/150
 - 0s - loss: 0.2555 - mean_squared_error: 0.2246 - val_loss: 0.1453 - val_mean_squared_error: 0.1145
Epoch 3/150
 - 0s - loss: 0.1270 - mean_squared_error: 0.0961 - val_loss: 0.1072 - val_mean_squared_error: 0.0764
Epoch 4/150
 - 0s - loss: 0.1063 - mean_squared_error: 0.0755 - val_loss: 0.0933 - val_mean_squared_error: 0.0625
Epoch 5/150
 - 0s - loss: 0.0924 - mean_squared_error: 0.0615 - val_loss: 0.0892 - val_mean_squared_error: 0.0584
Epoch 6/150
 - 0s - loss: 0.0850 - mean_squared_error: 0.0542 - val_loss: 0.0884 - val_mean_squared_error: 0.0576
Epoch 7/150
 - 0s - loss: 0.0845 - mean_squared_error: 0.0536 - val_loss: 0.0893 - val_mean_squared_error: 0.0585
Epoch 8/150
 - 0s - loss: 0.0844 - mean_squared_error: 0.0536 - val_loss: 0.0902 - val_mean_squared_error: 0.0594
Epoch 9/150
 - 0s - loss: 0.0881 - mean_squ

Epoch 72/150
 - 0s - loss: 0.0467 - mean_squared_error: 0.0168 - val_loss: 0.0723 - val_mean_squared_error: 0.0425
Epoch 73/150
 - 0s - loss: 0.0481 - mean_squared_error: 0.0183 - val_loss: 0.0722 - val_mean_squared_error: 0.0424
Epoch 74/150
 - 0s - loss: 0.0470 - mean_squared_error: 0.0171 - val_loss: 0.0724 - val_mean_squared_error: 0.0426
Epoch 75/150
 - 0s - loss: 0.0437 - mean_squared_error: 0.0139 - val_loss: 0.0729 - val_mean_squared_error: 0.0431
Epoch 76/150
 - 0s - loss: 0.0464 - mean_squared_error: 0.0166 - val_loss: 0.0734 - val_mean_squared_error: 0.0437
Epoch 77/150
 - 0s - loss: 0.0462 - mean_squared_error: 0.0164 - val_loss: 0.0738 - val_mean_squared_error: 0.0440
Epoch 78/150
 - 0s - loss: 0.0432 - mean_squared_error: 0.0134 - val_loss: 0.0739 - val_mean_squared_error: 0.0442
Epoch 79/150
 - 0s - loss: 0.0469 - mean_squared_error: 0.0172 - val_loss: 0.0740 - val_mean_squared_error: 0.0442
Epoch 80/150
 - 0s - loss: 0.0416 - mean_squared_error: 0.0119 - val_loss: 0.074

<keras.callbacks.History at 0x13af77630>

In [79]:
# save model (architecture + weights) 
out_dir = '../models/'
model.save(out_dir + 'only_green_trained_model.h5')
model.save_weights(out_dir + 'only_green_trained_model_weights.h5')