Goal: train, evaluate, and save a single model architecture.

In [39]:
# import statements 

import os

import platform
import random
import shutil
import sys

import math
import itertools
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
from tqdm import tqdm_notebook as tqdm
import keras
from keras.models import load_model

# some visualization imports
from keras import activations

# various imports for the keras model
from keras.layers.core import Permute
from keras import backend as K
from keras.engine.topology import Layer
import keras as keras
from keras.callbacks import TensorBoard
from keras import metrics as metrics
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Conv1D, Concatenate
from keras.optimizers import SGD
from keras.regularizers import l2

# evaluate performance w/ on and off regression separately 
from scipy.stats import pearsonr, spearmanr 

# imports for the grid search and kfold CV
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score

# data one-hot encoding imports
from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical


# Part 1: Load in data.

In [40]:
# Read the toehold table from CSV and preview its first five rows.
data_dir = ''
sequence_file = 'Green2014_clean.csv'
sequence_path = data_dir + sequence_file
sequences = pd.read_csv(sequence_path, sep=',')
preview_rows = sequences.head(5)
print(preview_rows)

   Unnamed: 0  Toehold ID                                   Toehold sequence  \
0          11          68  AATGTATGTAATAGTTCGTCGAGGTGTCCAAGCAGAGGAGATGGAC...   
1         117         110  ATGATAATGTAGAGGTGCGGAGTGATTGTAAACAGAGGAGATACAA...   
2         108         100  CGAAGTATTGTAAGGTGTAGTGTGCGTTGAGACAGAGGAGATCAAC...   
3         122         116  TAAGTAAATGAAAGTGTATGTATGTTGCTGGACAGAGGAGACAGCA...   
4          17         117  TCAATAAGGCGGAGTTCGTCGAGGTGCCTGAGCAGAGGAGACAGGC...   

                    Switch region                         Trigger  Avg ONOFF  \
0  AATGTATGTAATAGTTCGTCGAGGTGTCCA  TGGACACCTCGACGAACTATTACATACATT       24.8   
1  ATGATAATGTAGAGGTGCGGAGTGATTGTA  TACAATCACTCCGCACCTCTACATTATCAT        9.7   
2  CGAAGTATTGTAAGGTGTAGTGTGCGTTGA  TCAACGCACACTACACCTTACAATACTTCG       13.6   
3  TAAGTAAATGAAAGTGTATGTATGTTGCTG  CAGCAACATACATACACTTTCATTTACTTA        8.7   
4  TCAATAAGGCGGAGTTCGTCGAGGTGCCTG  CAGGCACCTCGACGAACTCCGCCTTATTGA        8.5   

   sdev ONOFF Toehold Rating  
0      

In [41]:
# Inputs are the full toehold sequences; the regression target is the average ON/OFF ratio.
seqs = sequences['Toehold sequence']
onoff_vals = np.array(sequences['Avg ONOFF'])

# normalize: MinMaxScaler expects a 2-D column, hence the reshape to (n_samples, 1)
onoff_column = onoff_vals.reshape(-1, 1)
onoff_scaler = preprocessing.MinMaxScaler()
onoff_vals = onoff_scaler.fit_transform(onoff_column)

# Part 2. Transform Data. One-hot encode sequences and extract the target ON/OFF values.

In [42]:
from pysster.One_Hot_Encoder import One_Hot_Encoder

# Nucleotide alphabet handed to the encoder; its length is the channel count of X.
alph_letters = 'ATCG'
alph = [letter for letter in alph_letters]

# one-hot encode
one = One_Hot_Encoder(alph_letters)
def _get_one_hot_encoding(seq):
    """Encode a single sequence with the shared encoder ``one``."""
    return one.encode(seq)

# Encode every sequence and stack the results into one float32 array.
encoded_seqs = list(map(_get_one_hot_encoding, seqs))
X = np.stack(encoded_seqs).astype(np.float32)
nsamples, nx, ny = X.shape

# reformat for CNN if needed: (n_samples, sequence length, alphabet size)
print('input shape: ', X.shape)
alph_len = len(alph)
seq_len = len(seqs[0])
cnn_input_shape = (X.shape[0], seq_len, alph_len)
X = X.reshape(*cnn_input_shape).astype('float32')
print('modified shape: ', X.shape)

input shape:  (168, 59, 4)
modified shape:  (168, 59, 4)


In [43]:
# The target was already min-max scaled above, so it is only copied into `y` here.
y = np.array(onoff_vals)
target_shape = y.shape
print('target shape: ', target_shape)

target shape:  (168, 1)


# Part 3. Set-up framework for model. Ensure needed parameters can be varied.

In [44]:
from keras import optimizers
def twoheaded_conv1d(transfer_model, conv_layer_parameters, hidden_layers, dropout_rate = 0.2, reg_coeff = 0.0001,learning_rate=0.001, num_features = seq_len, num_channels = 4): 
    """Build and compile a Conv1D + MLP regressor initialised from ``transfer_model``.

    Convolutional layers are created with the weights of the matching
    transfer-model layers and are frozen (``trainable=False``). Dense layers
    also start from the transfer-model weights but stay trainable. A new
    single-unit linear output named ``on_output`` is added on top.

    Weights are looked up by position, which assumes ``transfer_model`` is
    laid out as: Input -> conv layers -> Flatten -> (Dropout -> Dense) repeated.

    Args:
        transfer_model: Keras model whose layer weights are reused.
        conv_layer_parameters: iterable of ``(kernel_width, num_filters)``
            pairs, one per conv layer, or ``None`` to build no conv layers.
        hidden_layers: iterable with the number of units of each dense layer.
        dropout_rate: dropout rate applied before every dense layer.
        reg_coeff: L2 coefficient for the dense kernels.
        learning_rate: learning rate of the Adam optimizer.
        num_features: sequence length (input steps). The default is the value
            of the global ``seq_len`` at the time this function was defined.
        num_channels: alphabet size (i.e. number of nucleotides).

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, MSE loss and MSE metric).
    """
    X_in = Input(shape=(num_features, num_channels), dtype='float32')
    prior_layer = X_in
    if conv_layer_parameters is not None:
        for idx, (kernel_width, num_filters) in enumerate(conv_layer_parameters):
            # Layer 0 of the transfer model is its Input layer, so conv layer idx sits at idx + 1.
            transfer_weights = transfer_model.layers[idx + 1].get_weights()
            # The "weights" argument supplies the initial kernel and bias for the new layer.
            conv_layer = Conv1D(filters=num_filters, weights=transfer_weights, trainable=False,
                                kernel_size=kernel_width, padding='same',
                                name='conv_' + str(idx))(prior_layer)  # mimic a kmer
            prior_layer = conv_layer
        # Derive the conv depth from the parameters instead of hard-coding it, so the
        # dense-layer lookup below stays aligned for conv stacks of any depth.
        num_conv_layers_in_tf_model = len(conv_layer_parameters)
    else:
        # Legacy behaviour: with no conv parameters the transfer model is still
        # assumed to contain two conv layers.
        num_conv_layers_in_tf_model = 2
    H = Flatten()(prior_layer)

    # Position of the first dense layer in the transfer model:
    # Input (1) + conv layers + Flatten (1) + the Dropout preceding it (1).
    first_dense_idx = 1 + num_conv_layers_in_tf_model + 1 + 1
    for idx, h in enumerate(hidden_layers):
        # Every dense layer is preceded by one Dropout layer, hence the stride of 2.
        transfer_weights = transfer_model.layers[first_dense_idx + 2 * idx].get_weights()
        H = Dropout(dropout_rate)(H)
        H = Dense(h, activation='relu', weights=transfer_weights,
                  kernel_regularizer=l2(reg_coeff), name='dense_' + str(idx))(H)

    out_on = Dense(1, activation="linear", name='on_output')(H)
    model = Model(inputs=[X_in], outputs=[out_on])
    # Use the canonical class name; the lowercase `adam` alias is not available in every Keras release.
    opt = optimizers.Adam(lr=learning_rate)
    model.compile(loss={'on_output': 'mse'}, optimizer=opt, metrics=['mse'])
    return model


# Part 4a: Load trained model

In [45]:
# Locations of the previously trained model (architecture + weights) and its weights-only file.
transfer_model_dir = '../models/'
transfer_final_model_path = os.path.join(transfer_model_dir, 'onoff_original_model.h5')
transfer_final_weights_path = os.path.join(transfer_model_dir, 'onoff_original_model_weights.h5')

# Restore the full model, then load the separately saved weights on top of it.
transfer_model = load_model(transfer_final_model_path)
transfer_model.load_weights(transfer_final_weights_path)

# visually inspect architecture
transfer_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_22 (InputLayer)        (None, 59, 4)             0         
_________________________________________________________________
conv_0 (Conv1D)              (None, 59, 10)            210       
_________________________________________________________________
conv_1 (Conv1D)              (None, 59, 5)             155       
_________________________________________________________________
flatten_22 (Flatten)         (None, 295)               0         
_________________________________________________________________
dropout_64 (Dropout)         (None, 295)               0         
_________________________________________________________________
dense_0 (Dense)              (None, 150)               44400     
_________________________________________________________________
dropout_65 (Dropout)         (None, 150)               0         
__________

# Part 4. Define desired model features. Build sample model to view architecture.

In [46]:
# One (kernel_width, num_filters) pair per convolutional layer:
# [(kernel_width_layer1, #filters_layer1), (kernel_width_layer2, #filters_layer2), ...]
conv_layer_parameters = [(5,10), (3,5)]

# MLP layer sizes, keyed by the number of filters in the final convolutional layer.
hidden_layer_choices = {5: (150, 60, 15),}
num_filters_last_conv = conv_layer_parameters[-1][1]
hidden_layers = hidden_layer_choices[num_filters_last_conv]

# Regularisation and optimiser settings.
dropout_rate = 0.1
l2_reg_coeff = 0.0001
learning_rate = 0.0005 

# build sample master model (to be trained completely later)
sample_model_kwargs = dict(
    conv_layer_parameters=conv_layer_parameters,
    hidden_layers=hidden_layers,
    dropout_rate=dropout_rate,
    reg_coeff=l2_reg_coeff,
    learning_rate=learning_rate,
)
sample_model = twoheaded_conv1d(transfer_model, **sample_model_kwargs)

# print model architecture 
sample_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, 59, 4)             0         
_________________________________________________________________
conv_0 (Conv1D)              (None, 59, 10)            210       
_________________________________________________________________
conv_1 (Conv1D)              (None, 59, 5)             155       
_________________________________________________________________
flatten_15 (Flatten)         (None, 295)               0         
_________________________________________________________________
dropout_43 (Dropout)         (None, 295)               0         
_________________________________________________________________
dense_0 (Dense)              (None, 150)               44400     
_________________________________________________________________
dropout_44 (Dropout)         (None, 150)               0         
__________

# Part 5. Run K-Fold CV to ensure reliability of the performance metrics for the ON/OFF values.

In [47]:
# define kfold object 
num_folds = 5 # smaller than 10 because 168 data points
seed = 0 # set for reproducibility
random.seed(seed)
# Python's `random` is not what scikit-learn draws from: splitters called without
# `random_state` use NumPy's global RNG, so seed that one as well.
# NOTE(review): TensorFlow's own RNG is not seeded here.
np.random.seed(seed)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# define parameters for training 
num_epochs = 150
patience = int(num_epochs * .1) # early-stopping patience: 10% of the epoch budget

In [48]:
# functions to evaluate the model

def r2(preds_y, true_y):
    """Return the squared Pearson correlation between predictions and targets."""
    pearson_result = pearsonr(preds_y, true_y)
    correlation = pearson_result[0]
    return correlation ** 2

def compute_metrics(preds_y, true_y): 
    """Print and return R2, Pearson and Spearman for one set of predictions.

    Both inputs are flattened first, so 1-D vectors and (n, 1) column vectors
    are handled the same way, whatever array shape ``pearsonr`` returns.

    Args:
        preds_y: predicted values, shape (n,) or (n, 1).
        true_y: ground-truth values, shape (n,) or (n, 1).

    Returns:
        ``[r2_score, pearson_corr, spearman_corr]`` as scalars, where
        ``r2_score`` is the squared Pearson correlation.

    Raises:
        ValueError: if the inputs hold a different number of values.
    """
    flat_preds = np.asarray(preds_y).ravel()
    flat_true = np.asarray(true_y).ravel()
    if flat_preds.shape != flat_true.shape:
        raise ValueError('preds_y and true_y must contain the same number of values, got '
                         + str(flat_preds.size) + ' and ' + str(flat_true.size))

    # Compute Pearson once; R2 is its square (same definition as the r2() helper).
    pearson_corr = pearsonr(flat_preds, flat_true)[0]
    r2_score = pearson_corr ** 2
    spearman_corr = spearmanr(flat_preds, flat_true)[0]
    print('R2: ', r2_score)
    print('Pearson: ', pearson_corr)
    print('Spearman: ', spearman_corr)
    return [r2_score, pearson_corr, spearman_corr]

def print_summary_results(avg_metrics, std_metrics): 
    """Print the mean and standard deviation of the R2, Pearson and Spearman scores.

    Both arguments are indexed as [R2, Pearson, Spearman].
    """
    sections = (('Average:', avg_metrics), ('Standard deviation:', std_metrics))
    for heading, values in sections:
        print(heading)
        print('\tR2:', values[0], '\n\tPearson:', values[1], '\n\tSpearman:', values[2])
    

In [49]:
# run kfold 
cv_scores_on=[]  # one [R2, Pearson, Spearman] entry per fold
preds_on = []    # per-fold predictions on the test half
true_on = []     # per-fold ground truth for the test half
fold_count=0
for train, test in kfold.split(X, y): 
    print('Beginning fold #', fold_count)
    # create model w/ parameters as defined
    # NOTE: create a model from scratch each time to ensure no weights are carried over per fold  
    kfold_model = twoheaded_conv1d(transfer_model, conv_layer_parameters=conv_layer_parameters, hidden_layers= hidden_layers, 
                             dropout_rate=dropout_rate, reg_coeff=l2_reg_coeff, 
                             learning_rate= learning_rate)
    
    # split the held-out fold in half: one half is the validation set for early stopping,
    # the other is the test set. Seeded so the halves are the same on every run.
    X_val, X_test, y_val, y_test = train_test_split(X[test], y[test], train_size = 0.5, test_size = 0.5, random_state = seed)
    
    # train the model
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=patience, verbose=0, mode='auto')
    kfold_model.fit(X[train], [y[train][:,0]],epochs=num_epochs, batch_size=128,verbose=0, validation_data=(X_val, [y_val[:,0]]), callbacks=[early_stopping])

    # evaluate the model
    y_preds = np.array(kfold_model.predict(X_test))
    
    # get ON/OFF metrics (mark as on_metrics for simplicity)
    print('--- ON/OFF Metrics ---')
    on_metrics = compute_metrics(y_preds,np.expand_dims(y_test[:,0], 1))
    cv_scores_on.append(on_metrics)
    preds_on.append(np.squeeze(y_preds))
    true_on.append(y_test[:,0])
    
    # delete model to ensure no weights are carried over 
    del kfold_model

    fold_count += 1

out_dir = 'metrics/'
# Create the output directory if needed so np.savetxt does not fail on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)
# NOTE(review): this writes one row per fold and assumes every fold's test half has the same length.
np.savetxt(out_dir + 'freeze_weights_tf_onoff_preds.csv', preds_on, delimiter=",")
np.savetxt(out_dir + 'freeze_weights_tf_onoff_true.csv', true_on, delimiter=",")

Beginning fold # 0
--- ON/OFF Metrics ---
R2:  0.15109850175371958
Pearson:  0.388713907332526
Spearman:  0.4061352750464285
Beginning fold # 1
--- ON/OFF Metrics ---
R2:  0.004320035178234364
Pearson:  0.06572697450997089
Spearman:  0.10294117647058824
Beginning fold # 2
--- ON/OFF Metrics ---
R2:  0.018765997619138754
Pearson:  0.13698904196737327
Spearman:  0.2034313725490196
Beginning fold # 3
--- ON/OFF Metrics ---
R2:  0.48632365807943845
Pearson:  0.6973690974508682
Spearman:  0.32843137254901966
Beginning fold # 4
--- ON/OFF Metrics ---
R2:  0.24493566323392532
Pearson:  0.4949097526154898
Spearman:  0.7598039215686275


# Part 6. Compute average metrics.


In [50]:
# Rows are folds, columns are [R2, Pearson, Spearman]; aggregate down the columns.
cv_scores_on_arr = np.asarray(cv_scores_on)
avg_metric_folds_on = cv_scores_on_arr.mean(axis=0)  # avg over columns
std_metric_folds_on = cv_scores_on_arr.std(axis=0)  # st dev over columns

In [51]:
# Report the cross-validated ON/OFF summary statistics.
summary_header = '--- ON/OFF Metrics ---'
print(summary_header)
print_summary_results(avg_metrics=avg_metric_folds_on, std_metrics=std_metric_folds_on)

--- ON/OFF Metrics ---
Average:
	R2: 0.1810887711728913 
	Pearson: 0.3567417547752456 
	Spearman: 0.3601486236367367
Standard deviation:
	R2: 0.17648798650892475 
	Pearson: 0.23200019735502342 
	Spearman: 0.2251937303548586


# Part 7. Train model and save for future use.


In [52]:
# train on more of the data (no testing - use metrics from kfold as final metrics)
# have small held-out data for early stopping

# split data again for validation set (to be used w/ early stopping)
# Seeded so the saved model is trained on the same split on every run.
train_size = 0.851
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = train_size, test_size = 1-train_size, random_state = seed)

# define parameters for training 
num_epochs = 150
patience = int(num_epochs * .1) # early-stopping patience: 10% of the epoch budget

In [53]:
# build model: same architecture and hyperparameters as the cross-validated models
final_model_kwargs = {
    'conv_layer_parameters': conv_layer_parameters,
    'hidden_layers': hidden_layers,
    'dropout_rate': dropout_rate,
    'reg_coeff': l2_reg_coeff,
    'learning_rate': learning_rate,
}
model = twoheaded_conv1d(transfer_model, **final_model_kwargs)

In [54]:
# train model; stop early once val_loss has not improved for `patience` epochs
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0,
    patience=patience,
    verbose=2,
    mode='auto',
)
training_history = model.fit(
    X_train,
    [y_train[:,0]],
    epochs=num_epochs,
    batch_size=128,
    verbose=2,
    validation_data=(X_val, [y_val[:,0]]),
    callbacks=[early_stopping],
)
training_history


Train on 142 samples, validate on 26 samples
Epoch 1/150
 - 2s - loss: 0.0872 - mean_squared_error: 0.0830 - val_loss: 0.0670 - val_mean_squared_error: 0.0628
Epoch 2/150
 - 0s - loss: 0.0524 - mean_squared_error: 0.0483 - val_loss: 0.0740 - val_mean_squared_error: 0.0698
Epoch 3/150
 - 0s - loss: 0.0522 - mean_squared_error: 0.0480 - val_loss: 0.0790 - val_mean_squared_error: 0.0748
Epoch 4/150
 - 0s - loss: 0.0519 - mean_squared_error: 0.0478 - val_loss: 0.0803 - val_mean_squared_error: 0.0761
Epoch 5/150
 - 0s - loss: 0.0492 - mean_squared_error: 0.0450 - val_loss: 0.0782 - val_mean_squared_error: 0.0740
Epoch 6/150
 - 0s - loss: 0.0471 - mean_squared_error: 0.0429 - val_loss: 0.0741 - val_mean_squared_error: 0.0699
Epoch 7/150
 - 0s - loss: 0.0433 - mean_squared_error: 0.0391 - val_loss: 0.0703 - val_mean_squared_error: 0.0661
Epoch 8/150
 - 0s - loss: 0.0416 - mean_squared_error: 0.0375 - val_loss: 0.0667 - val_mean_squared_error: 0.0625
Epoch 9/150
 - 0s - loss: 0.0397 - mean_squ

<keras.callbacks.History at 0x1432d29b0>

In [55]:
# save model (architecture + weights), plus a separate weights-only file
out_dir = '../models/'
saved_model_path = os.path.join(out_dir, 'freeze_weights_tf_onoff_model.h5')
saved_weights_path = os.path.join(out_dir, 'freeze_weights_tf_onoff_model_weights.h5')
model.save(saved_model_path)
model.save_weights(saved_weights_path)