Goal: train and evaluate the 2-layer CNN models. This notebook stores the resulting regression metrics. Re-running it is not recommended, because the 5-fold cross-validation (see `num_folds` in Part 5) takes a long time.

In [1]:
# import statements 

import os
#disable CUDA

import platform
import random
import shutil
import sys

import math
import itertools
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
from tqdm import tqdm_notebook as tqdm
import keras

# some visualization imports
from keras import activations

# various imports for the keras model
from keras.layers.core import Permute
from keras import backend as K
from keras.engine.topology import Layer
import keras as keras
from keras.callbacks import TensorBoard
from keras import metrics as metrics
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Conv1D, Concatenate
from keras.optimizers import SGD
from keras.regularizers import l2

# evaluate performance w/ on and off regression separately 
from scipy.stats import pearsonr, spearmanr 

# imports for the grid search and kfold CV
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score

# data one-hot encoding imports (help from Luis)
from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical


Using TensorFlow backend.


# Part 1: Load in data. Filter and sample to avoid bias from experimental errors.

In [2]:
# Location of the toehold dataset, relative to this notebook.
data_dir = '../../data/'
file_name = 'newQC_toehold_data.csv'

# Read the comma-separated table, then preview its first three rows.
data_path = data_dir + file_name
data_df = pd.read_csv(data_path, sep=',')
data_df.head(3)

Unnamed: 0,off_id,on_id,source_sequence,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,...,stem2,linker,post_linker,on_value,off_value,onoff_value,on_qc,off_qc,onoff_qc,switch_sequence
0,AACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAAAAC...,AACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2626,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,AACTGTTTTCCATTTTTTTTTTTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAAAAAAAAAAATGGAAAACAGTT,AACAGAGGAGA,...,CCATTTTTT,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAA,,0.333333,,,,,AAAAAAAAAAAAAAAAAATGGAAAACAGTTAACAGAGGAGAAACTG...
1,AACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAGTTA...,TTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2625,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TTAGTAACTGTTTTCCATTTTTTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAAAAAATGGAAAACAGTTACTAA,AACAGAGGAGA,...,GTTTTCCAT,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAA,,,,,,,AAAAAAAAAAAAATGGAAAACAGTTACTAAAACAGAGGAGATTAGT...
2,AACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTTAAT...,CTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_4951,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CTAAATTAACAATAGTAGTAATTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAATTACTACTATTGTTAATTTAG,AACAGAGGAGA,...,CAATAGTAG,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAA,0.068295,0.0,0.068295,2.0,1.1,1.1,AAAAAAAAATTACTACTATTGTTAATTTAGAACAGAGGAGACTAAA...


In [3]:
# Keep only the switches whose ON and OFF measurements both reach the QC cutoff.
qc_cutoff = 1.1
passes_qc = (data_df['on_qc'] >= qc_cutoff) & (data_df['off_qc'] >= qc_cutoff)

# drop=True discards the old row labels instead of storing them as extra
# 'index' / 'level_0' columns; that also keeps this cell safe to re-run
# (a plain reset_index() fails once those helper columns already exist).
data_df = data_df[passes_qc].reset_index(drop=True)

toehold_seqs = data_df['switch_sequence']
seq_len = len(toehold_seqs[0])
print('Number of remaining sequences: ', len(data_df))

Number of remaining sequences:  91534


Now downsample data to avoid bias.

In [4]:
# Number of bins used to discretise the ON and OFF values.
num_value_bins = 1000

# ON values: bin label per row, plus the (floored) mean number of rows per bin.
on_value_bin_labels = np.arange(num_value_bins)
on_value_bins = pd.cut(
    data_df['on_value'], bins=num_value_bins, labels=on_value_bin_labels)
on_bin_counts = data_df['on_value'].value_counts(bins=num_value_bins)
bin_floor_on = math.floor(on_bin_counts.mean())

# OFF values: same procedure.
off_value_bin_labels = np.arange(num_value_bins)
off_value_bins = pd.cut(
    data_df['off_value'], bins=num_value_bins, labels=off_value_bin_labels)
off_bin_counts = data_df['off_value'].value_counts(bins=num_value_bins)
bin_floor_off = math.floor(off_bin_counts.mean())

In [5]:
def _downsample_bins(value_bins, bin_labels, max_per_bin, rng):
    """Collect row labels bin by bin, capping each bin at ``max_per_bin`` rows.

    Args:
        value_bins: Series holding the bin label assigned to every row.
        bin_labels: iterable of the bin labels to visit.
        max_per_bin: largest number of rows kept from any single bin.
        rng: ``np.random.RandomState`` used to draw rows from over-full bins.

    Returns:
        A list with one entry per bin label; each entry is the list of row
        labels kept for that bin (all rows when the bin is not over-full).
    """
    kept_per_bin = []
    for bin_label in bin_labels:
        bin_indices = value_bins[value_bins == bin_label].index
        if bin_indices.size > max_per_bin:
            # Over-full bin: keep a random subset, drawn without replacement.
            kept = rng.choice(bin_indices, size=max_per_bin, replace=False)
        else:
            kept = bin_indices
        kept_per_bin.append(kept.tolist())
    return kept_per_bin


# Cap every one of the value bins at the mean bin count, separately for the ON
# and OFF values, and record the row labels that survive.
# A dedicated, seeded generator makes the down-sampling repeatable without
# touching NumPy's global random state.
downsample_rng = np.random.RandomState(0)
sample_ids_on = _downsample_bins(
    on_value_bins, on_value_bin_labels, bin_floor_on, downsample_rng)
sample_ids_off = _downsample_bins(
    off_value_bins, off_value_bin_labels, bin_floor_off, downsample_rng)

In [6]:
# Flatten the per-bin lists of row labels into one stream per target.
sample_on = itertools.chain.from_iterable(sample_ids_on)
sample_off = itertools.chain.from_iterable(sample_ids_off)

# Keep every row that was sampled for the ON values OR for the OFF values
# (this is a union of the two samples, not an intersection).
sample_ids_union = set(sample_on).union(sample_off)

# .loc is given a list: newer pandas releases reject a set as an indexer.
sub_df = data_df.loc[list(sample_ids_union)].reset_index(drop=True)

print('New number of remaining seqs:', len(sub_df))

New number of remaining seqs: 81140


In [7]:
# Point the original names at the down-sampled frame so that the cells
# below keep working unchanged.
toehold_seqs = sub_df['switch_sequence']
data_df = sub_df

# Part 2. Transform Data. One-hot encode sequences and extract target on and off values.

In [10]:
# Nucleotide alphabet, sorted (A, C, G, T).
alph_letters = sorted('ATCG')
alph = [letter for letter in alph_letters]

# Shared encoder instance used by every one-hot helper in this notebook.
one = One_Hot_Encoder(alph_letters)


def _get_one_hot_encoding(seq):
    """Return the encoding of ``seq`` produced by the shared encoder ``one``."""
    return one.encode(seq)

# Encode every switch sequence and stack the encodings into one float32 array.
input_col_name = 'switch_sequence'#'switch'
encoded_seqs = [_get_one_hot_encoding(s) for s in toehold_seqs]
X = np.stack(encoded_seqs).astype(np.float32)

# Make the (samples, sequence length, alphabet size) layout explicit for the CNN.
print('input shape: ', X.shape)
alph_len = len(alph)
seq_len = len(data_df[input_col_name][0])
num_seqs = X.shape[0]
X = X.reshape(num_seqs, seq_len, alph_len).astype('float32')
print('modified shape: ', X.shape)

# Regression targets as float32 vectors.
y_on = np.array(data_df['on_value'].astype(np.float32))
y_off = np.array(data_df['off_value'].astype(np.float32))

# Column 0 holds the ON value, column 1 the OFF value.
y = np.stack([y_on, y_off], axis=1)
print('target shape: ', y.shape)

input shape:  (81140, 59, 4)
modified shape:  (81140, 59, 4)
target shape:  (81140, 2)


# Part 3. Set-up framework for model. Ensure needed parameters can be varied.

In [11]:
from keras import optimizers
def twoheaded_conv1d(conv_layer_parameters, hidden_layers, dropout_rate = 0.2, reg_coeff = 0.0001,learning_rate=0.001, num_features = 59, num_channels = 4): 
    """Build and compile a regression network with separate ON and OFF outputs.

    The input passes through an optional stack of Conv1D layers, is flattened,
    then goes through a stack of (Dropout -> Dense/ReLU) layers that feeds two
    single-unit linear outputs named 'on_output' and 'off_output'.

    Args:
        conv_layer_parameters: iterable of (kernel_width, num_filters) pairs,
            one per Conv1D layer, or None to skip the convolutions entirely
            (the same function then builds a plain MLP).
        hidden_layers: iterable with the number of units of each Dense layer.
        dropout_rate: rate of the Dropout layer placed before every Dense layer.
        reg_coeff: L2 coefficient applied to the kernel of every hidden Dense layer.
        learning_rate: learning rate handed to the Adam optimizer.
        num_features: length of the input sequence.
        num_channels: alphabet size (i.e. number of nucleotides).

    Returns:
        A compiled Keras ``Model`` with one input and the outputs
        [on_output, off_output]; both outputs use an MSE loss and report MSE.
    """
    X_in = Input(shape=(num_features, num_channels), dtype='float32')
    prior_layer = X_in
    # Identity check: `!= None` is evaluated element-wise on array-like inputs.
    if conv_layer_parameters is not None:
        for idx, (kernel_width, num_filters) in enumerate(conv_layer_parameters):
            # Each convolution mimics a k-mer detector; 'same' padding keeps the
            # sequence length unchanged.
            # NOTE(review): no activation is passed to Conv1D -- confirm that
            # un-activated convolutions are intended.
            prior_layer = Conv1D(filters=num_filters, kernel_size=kernel_width,
                                 padding='same', name='conv_' + str(idx))(prior_layer)
    H = Flatten()(prior_layer)
    for idx, h in enumerate(hidden_layers):
        H = Dropout(dropout_rate)(H)
        H = Dense(h, activation='relu', kernel_regularizer=l2(reg_coeff),
                  name='dense_' + str(idx))(H)
    # Two regression heads that share all of the layers above.
    out_on = Dense(1, activation='linear', name='on_output')(H)
    out_off = Dense(1, activation='linear', name='off_output')(H)
    model = Model(inputs=[X_in], outputs=[out_on, out_off])
    # Use the Adam class itself rather than the lowercase `adam` alias.
    opt = optimizers.Adam(lr=learning_rate)
    model.compile(loss={'on_output': 'mse', 'off_output': 'mse'}, optimizer=opt, metrics=['mse'])
    return model


# Part 4. Define desired model features. Will ablate parts of model for 1 layer CNN and MLP.

In [12]:
# Tag used in the names of the result files written for this architecture.
model_tags = ['twolayerconv']

# Convolution stack, one (kernel_width, num_filters) pair per layer:
# [(kernel_width_layer1, #filters_layer1), (kernel_width_layer2, #filters_layer2), ...]
master_conv_layer_parameters = [(5, 10), (3, 5)]

# Dense-layer widths, looked up by key.
hidden_layer_choices = {5: (150, 60, 15)}
hidden_layers = hidden_layer_choices[5]

# Regularisation and optimiser settings.
dropout_rate = 0.1
l2_reg_coeff = 1e-4
learning_rate = 5e-4

# Part 5. Run K-Fold CV to ensure reliability of performance metrics. For on and off values. 

In [13]:
# define kfold object
num_folds = 5
seed = 0  # set for reproducibility

# Seed Python's generator (used by random.shuffle) AND NumPy's global
# generator: seeding `random` alone does not cover NumPy-based randomness,
# e.g. scikit-learn calls made without an explicit random_state.
random.seed(seed)
np.random.seed(seed)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# define parameters for training
num_epochs = 150
# Early stopping waits 10% of the epoch budget for an improvement.
patience = int(num_epochs * .1)

In [14]:
# functions to evaluate the model

def r2(preds_y, true_y):
    """Return the squared Pearson correlation between predictions and targets.

    Both arguments may be 1-D arrays or (n, 1) column vectors; they are
    flattened first so that a scalar is returned in either case.
    """
    return pearsonr(np.ravel(preds_y), np.ravel(true_y))[0] ** 2


def compute_metrics(preds_y, true_y):
    """Print and return regression metrics for one model output.

    Args:
        preds_y: predicted values, 1-D or shaped (n, 1).
        true_y: target values, 1-D or shaped (n, 1).

    Returns:
        [r2, pearson, spearman, mse, mae], each a scalar.
    """
    # Flatten up front: the metrics are then scalars for 1-D and for column
    # inputs alike, instead of depending on pearsonr returning a length-1
    # array for (n, 1) inputs.
    preds_flat = np.ravel(preds_y)
    true_flat = np.ravel(true_y)

    pearson_corr = pearsonr(preds_flat, true_flat)[0]
    r2_score = pearson_corr ** 2
    spearman_corr = spearmanr(preds_flat, true_flat)[0]
    mse_val = sklearn.metrics.mean_squared_error(preds_flat, true_flat)
    mae_val = sklearn.metrics.mean_absolute_error(preds_flat, true_flat)

    print('R2: ', r2_score)
    print('Pearson: ', pearson_corr)
    print('Spearman: ', spearman_corr)
    return [r2_score, pearson_corr, spearman_corr, mse_val, mae_val]

def print_summary_results(avg_metrics, std_metrics):
    """Print R2, Pearson and Spearman (items 0-2) of the average and std metrics."""
    sections = (('Average:', avg_metrics), ('Standard deviation:', std_metrics))
    for heading, values in sections:
        print(heading)
        print('\tR2:', values[0], '\n\tPearson:', values[1], '\n\tSpearman:', values[2])
    

In [15]:
# run kfold
avg_metric_folds_on = []
avg_metric_folds_off = []
std_metric_folds_on = []
std_metric_folds_off = []

# Create the output folder BEFORE training, so that a missing directory cannot
# throw away the results of a long cross-validation run at save time.
out_dir = 'regression_table_cross_vals/'
os.makedirs(out_dir, exist_ok=True)

for model_tag in model_tags:
    conv_layer_parameters = master_conv_layer_parameters
    cv_scores_on = []
    cv_scores_off = []
    for fold_count, (train, test) in enumerate(kfold.split(X, y)):
        print('Beginning fold #', fold_count)
        # create model w/ parameters as defined
        # NOTE: create a model from scratch each time to ensure no weights are carried over per fold
        kfold_model = twoheaded_conv1d(conv_layer_parameters=conv_layer_parameters, hidden_layers=hidden_layers,
                                       dropout_rate=dropout_rate, reg_coeff=l2_reg_coeff,
                                       learning_rate=learning_rate)

        # split the held-out fold in half: one half drives early stopping, the
        # other is used for the reported metrics. random_state pins the split
        # so the numbers can be reproduced.
        X_val, X_test, y_val, y_test = train_test_split(
            X[test], y[test], train_size=0.5, test_size=0.5, random_state=seed)

        # train the model
        early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=patience, verbose=0, mode='auto')
        kfold_model.fit(X[train], [y[train][:, 0], y[train][:, 1]], epochs=num_epochs, batch_size=128, verbose=0,
                        validation_data=(X_val, [y_val[:, 0], y_val[:, 1]]), callbacks=[early_stopping])

        # evaluate the model (for ON and OFF separately);
        # predict() returns one array per output head: [on_output, off_output]
        y_preds = np.array(kfold_model.predict(X_test))
        print('--- ON Metrics ---')
        on_metrics = compute_metrics(y_preds[0], np.expand_dims(y_test[:, 0], 1))
        print('--- OFF Metrics ---')
        off_metrics = compute_metrics(y_preds[1], np.expand_dims(y_test[:, 1], 1))

        # keep the raw per-fold scores
        cv_scores_on.append(on_metrics)
        cv_scores_off.append(off_metrics)

        # Drop the model and reset the Keras/TensorFlow session; `del` alone
        # leaves the previous folds' graphs alive in the backend.
        del kfold_model
        K.clear_session()

    # save raw csv scores
    np.savetxt(out_dir + 'cv_scores_on_' + model_tag + '.csv', cv_scores_on, delimiter=',')
    np.savetxt(out_dir + 'cv_scores_off_' + model_tag + '.csv', cv_scores_off, delimiter=',')

    # generate average scores
    avg_metric_folds_on.append(np.mean(cv_scores_on, axis=0))  # avg over columns
    std_metric_folds_on.append(np.std(cv_scores_on, axis=0))  # st dev over columns
    avg_metric_folds_off.append(np.mean(cv_scores_off, axis=0))  # avg over columns
    std_metric_folds_off.append(np.std(cv_scores_off, axis=0))  # st dev over columns

Beginning fold # 0
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
--- ON Metrics ---
R2:  0.69220793
Pearson:  0.83199036
Spearman:  0.8326690627455716
--- OFF Metrics ---
R2:  0.5264068
Pearson:  0.72553897
Spearman:  0.6930938024084969
Beginning fold # 1
--- ON Metrics ---
R2:  0.6829216
Pearson:  0.8263907
Spearman:  0.8266380515547543
--- OFF Metrics ---
R2:  0.5180784
Pearson:  0.71977663
Spearman:  0.6914051032821362
Beginning fold # 2
--- ON Metrics ---
R2:  0.68633085
Pearson:  0.82845086
Spearman:  0.8295668801730957
--- OFF Metrics ---
R2:  0.5106081
Pearson:  0.71456844
Spearman:  0.6852682010088658
Beginning fold # 3
--- ON Metrics ---
R2:  0.6735512
Pearson:  0.82070166
Spearman:  0.8217849723226383
--- OFF Metrics ---
R2:  0.5328806
Pearson:  0.7299867
Spearman:  0.6918757609756563
Begin

# Part 6. Save average metrics for each model architecture.


In [16]:
# The summary cell below slices these by column, so turn the per-model
# lists into 2-D arrays first.
(avg_metric_folds_on,
 avg_metric_folds_off,
 std_metric_folds_on,
 std_metric_folds_off) = (
    np.array(avg_metric_folds_on),
    np.array(avg_metric_folds_off),
    np.array(std_metric_folds_on),
    np.array(std_metric_folds_off),
)

In [17]:

# Column order of the metric arrays, as returned by compute_metrics.
summary_metric_names = ('R2', 'Pearson', 'Spearman', 'MSE', 'MAE')

# ON summary: one row per model; the averages come first, then the std columns.
on_columns = {'Model': model_tags}
for col, metric_name in enumerate(summary_metric_names):
    on_columns[metric_name] = avg_metric_folds_on[:, col]
for col, metric_name in enumerate(summary_metric_names):
    on_columns[metric_name + ' (std)'] = std_metric_folds_on[:, col]
on_df = pd.DataFrame(on_columns)
on_df.to_csv(out_dir + 'summary_on_compare_models_reg_results.csv')

# OFF summary: same layout.
off_columns = {'Model': model_tags}
for col, metric_name in enumerate(summary_metric_names):
    off_columns[metric_name] = avg_metric_folds_off[:, col]
for col, metric_name in enumerate(summary_metric_names):
    off_columns[metric_name + ' (std)'] = std_metric_folds_off[:, col]
off_df = pd.DataFrame(off_columns)
off_df.to_csv(out_dir + 'summary_off_compare_models_reg_results.csv')

# Part 7: Repeat, but with scrambled toehold data set for control.

In [22]:
from random import shuffle

def shuffle_seq(seq):
    """Return ``seq`` with its characters in a random order.

    The characters are permuted in place with ``random.shuffle``, so the
    result has the same length and the same character counts as the input.
    """
    letters = [letter for letter in seq]
    shuffle(letters)
    scrambled = ''.join(letters)
    return scrambled

def _get_one_hot_encoding_and_scramble(seq):
    """Scramble ``seq`` with ``shuffle_seq`` and encode it with the shared encoder ``one``."""
    return one.encode(shuffle_seq(seq))

# Control data set: scramble every switch sequence before encoding it.
input_col_name = 'switch_sequence'
scrambled_encodings = [_get_one_hot_encoding_and_scramble(s) for s in toehold_seqs]
X = np.stack(scrambled_encodings).astype(np.float32)

# Make the (samples, sequence length, alphabet size) layout explicit for the CNN.
print('input shape: ', X.shape)
alph_len = len(alph)
seq_len = len(data_df[input_col_name][0])
num_seqs = X.shape[0]
X = X.reshape(num_seqs, seq_len, alph_len).astype('float32')
print('modified shape: ', X.shape)

# The targets are left untouched; only the inputs are scrambled.
y_on = np.array(data_df['on_value'].astype(np.float32))
y_off = np.array(data_df['off_value'].astype(np.float32))

# Column 0 holds the ON value, column 1 the OFF value.
y = np.stack([y_on, y_off], axis=1)
print('target shape: ', y.shape)

input shape:  (81140, 59, 4)
modified shape:  (81140, 59, 4)
target shape:  (81140, 2)


In [23]:
# run kfold on the scrambled (control) inputs
avg_metric_folds_on = []
avg_metric_folds_off = []
std_metric_folds_on = []
std_metric_folds_off = []

# Create the output folder BEFORE training, so that a missing directory cannot
# throw away the results of a long cross-validation run at save time.
out_dir = 'regression_table_cross_vals/'
os.makedirs(out_dir, exist_ok=True)

for model_tag in model_tags:
    conv_layer_parameters = master_conv_layer_parameters  # only look at 2 layer cnn
    cv_scores_on = []
    cv_scores_off = []
    for fold_count, (train, test) in enumerate(kfold.split(X, y)):
        print('Beginning fold #', fold_count)
        # create model w/ parameters as defined
        # NOTE: create a model from scratch each time to ensure no weights are carried over per fold
        kfold_model = twoheaded_conv1d(conv_layer_parameters=conv_layer_parameters, hidden_layers=hidden_layers,
                                       dropout_rate=dropout_rate, reg_coeff=l2_reg_coeff,
                                       learning_rate=learning_rate)

        # split the held-out fold in half: one half drives early stopping, the
        # other is used for the reported metrics. random_state pins the split
        # so the numbers can be reproduced.
        X_val, X_test, y_val, y_test = train_test_split(
            X[test], y[test], train_size=0.5, test_size=0.5, random_state=seed)

        # train the model
        early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=patience, verbose=0, mode='auto')
        kfold_model.fit(X[train], [y[train][:, 0], y[train][:, 1]], epochs=num_epochs, batch_size=128, verbose=0,
                        validation_data=(X_val, [y_val[:, 0], y_val[:, 1]]), callbacks=[early_stopping])

        # evaluate the model (for ON and OFF separately);
        # predict() returns one array per output head: [on_output, off_output]
        y_preds = np.array(kfold_model.predict(X_test))
        print('--- ON Metrics ---')
        on_metrics = compute_metrics(y_preds[0], np.expand_dims(y_test[:, 0], 1))
        print('--- OFF Metrics ---')
        off_metrics = compute_metrics(y_preds[1], np.expand_dims(y_test[:, 1], 1))

        # keep the raw per-fold scores
        cv_scores_on.append(on_metrics)
        cv_scores_off.append(off_metrics)

        # Drop the model and reset the Keras/TensorFlow session; `del` alone
        # leaves the previous folds' graphs alive in the backend.
        del kfold_model
        K.clear_session()

    # save raw csv scores
    np.savetxt(out_dir + 'cv_scores_scrambled_on_' + model_tag + '.csv', cv_scores_on, delimiter=',')
    np.savetxt(out_dir + 'cv_scores_scrambled_off_' + model_tag + '.csv', cv_scores_off, delimiter=',')

    # generate average scores
    avg_metric_folds_on.append(np.mean(cv_scores_on, axis=0))  # avg over columns
    std_metric_folds_on.append(np.std(cv_scores_on, axis=0))  # st dev over columns
    avg_metric_folds_off.append(np.mean(cv_scores_off, axis=0))  # avg over columns
    std_metric_folds_off.append(np.std(cv_scores_off, axis=0))  # st dev over columns

Beginning fold # 0
--- ON Metrics ---
R2:  0.095986284
Pearson:  0.30981654
Spearman:  0.3128906351619775
--- OFF Metrics ---
R2:  0.10302406
Pearson:  0.3209736
Spearman:  0.2953098160201004
Beginning fold # 1
--- ON Metrics ---
R2:  0.08192284
Pearson:  0.28622165
Spearman:  0.2829859166891824
--- OFF Metrics ---
R2:  0.08851528
Pearson:  0.29751518
Spearman:  0.2839574315369072
Beginning fold # 2
--- ON Metrics ---
R2:  0.098902
Pearson:  0.3144869
Spearman:  0.311184008543745
--- OFF Metrics ---
R2:  0.11255815
Pearson:  0.33549687
Spearman:  0.30234227831953586
Beginning fold # 3
--- ON Metrics ---
R2:  0.101116754
Pearson:  0.3179886
Spearman:  0.31915828046009687
--- OFF Metrics ---
R2:  0.10201382
Pearson:  0.31939602
Spearman:  0.29869866237965575
Beginning fold # 4
--- ON Metrics ---
R2:  0.10458343
Pearson:  0.3233936
Spearman:  0.32138869952420385
--- OFF Metrics ---
R2:  0.10911764
Pearson:  0.3303296
Spearman:  0.31119539339905233


In [24]:
# The summary cell below slices these by column, so turn the per-model
# lists into 2-D arrays first.
(avg_metric_folds_on,
 avg_metric_folds_off,
 std_metric_folds_on,
 std_metric_folds_off) = (
    np.array(avg_metric_folds_on),
    np.array(avg_metric_folds_off),
    np.array(std_metric_folds_on),
    np.array(std_metric_folds_off),
)

In [25]:

# Column order of the metric arrays, as returned by compute_metrics.
summary_metric_names = ('R2', 'Pearson', 'Spearman', 'MSE', 'MAE')

# ON summary for the scrambled control: averages first, then the std columns.
on_columns = {'Model': model_tags}
for col, metric_name in enumerate(summary_metric_names):
    on_columns[metric_name] = avg_metric_folds_on[:, col]
for col, metric_name in enumerate(summary_metric_names):
    on_columns[metric_name + ' (std)'] = std_metric_folds_on[:, col]
on_df = pd.DataFrame(on_columns)
on_df.to_csv(out_dir + 'summary_scrambled_on_compare_models_reg_results.csv')

# OFF summary for the scrambled control: same layout.
off_columns = {'Model': model_tags}
for col, metric_name in enumerate(summary_metric_names):
    off_columns[metric_name] = avg_metric_folds_off[:, col]
for col, metric_name in enumerate(summary_metric_names):
    off_columns[metric_name + ' (std)'] = std_metric_folds_off[:, col]
off_df = pd.DataFrame(off_columns)
off_df.to_csv(out_dir + 'summary_scrambled_off_compare_models_reg_results.csv')