Goal: run a grid search to find the best architecture for predicting the on and off target values per toehold. Uses a randomized parameter search to improve time-efficiency. This notebook is intended for dividing up the share of work of grid search. 

See Part 4 for what parameters to change per run. This same notebook can be used multiple times, so long as those two variables (rand_param_combos and saving_file_tag) are changed. 

In [None]:
# import statements 
# everything MUST be imported and activated in the O2 cluster before running (as a .py file)

import os
#disable CUDA

import platform
import random
import shutil
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
from tqdm import tqdm_notebook as tqdm
import keras

# some visualization imports
from keras import activations

# various imports for the keras model
from keras.layers.core import Permute
from keras import backend as K
from keras.engine.topology import Layer
import keras as keras
from keras.callbacks import TensorBoard
from keras import metrics as metrics
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Conv1D, Concatenate
from keras.optimizers import SGD
from keras.regularizers import l2

# evaluate performance w/ on and off regression separately 
from scipy.stats import pearsonr, spearmanr 

# imports for the grid search and kfold CV
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score

# data one-hot encoding imports (help from Luis)
from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical


# Part 1: Load in data. Filter and sample to avoid bias from experimental errors. 

In [None]:
# Directory and file name of the toehold data CSV (path is relative to the
# working directory the notebook/script is launched from).
data_dir = '../../data/'
file_name = 'newQC_toehold_data.csv'

# Load the comma-separated table and show the first two rows as a sanity check.
data_path = data_dir + file_name
data_df = pd.read_csv(data_path, sep=',')
preview_rows = data_df.head(2)
print(preview_rows)

In [None]:
# Keep only rows whose ON and OFF quality-control scores both reach the cutoff.
qc_cutoff = 1.1
passes_qc = (data_df['on_qc'] >= qc_cutoff) & (data_df['off_qc'] >= qc_cutoff)

# drop=True renumbers the rows 0..n-1 without inserting the old labels as
# extra 'index' / 'level_0' columns. Without it, running this cell a second
# time fails because those columns already exist.
data_df = data_df[passes_qc].reset_index(drop=True)

toehold_seqs = data_df['switch_sequence']
# Sequence length is read from the first remaining row; assumes every
# sequence has the same length -- TODO confirm.
seq_len = len(toehold_seqs.iloc[0])
print('Number of remaining sequences: ', len(data_df))

Now downsample data to avoid bias.

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import math
import itertools
# Number of equal-width bins used to discretise the ON and OFF values.
num_value_bins = 1000

# Integer labels 0..num_value_bins-1, one per bin.
on_value_bin_labels = np.arange(num_value_bins)
off_value_bin_labels = np.arange(num_value_bins)

# Assign every row to a bin of its ON value and, separately, of its OFF value.
on_value_bins = pd.cut(data_df['on_value'], bins=num_value_bins, labels=on_value_bin_labels)
off_value_bins = pd.cut(data_df['off_value'], bins=num_value_bins, labels=off_value_bin_labels)

# Per-bin cap used for downsampling: the mean bin count, rounded down.
on_counts_per_bin = data_df['on_value'].value_counts(bins=num_value_bins)
off_counts_per_bin = data_df['off_value'].value_counts(bins=num_value_bins)
bin_floor_on = math.floor(on_counts_per_bin.mean())
bin_floor_off = math.floor(off_counts_per_bin.mean())

In [None]:
# Walk through every bin and keep at most the mean bin count of rows from it
# (bins at or below the cap are kept whole). The row indices kept for each
# bin are collected as one list per bin, separately for ON and OFF values.
def _sample_capped_bins(binned_values, bin_labels, max_per_bin):
    """Return, for each bin label, a list of at most `max_per_bin` row indices.

    Bins holding more than `max_per_bin` rows are subsampled without
    replacement using numpy's global random generator; smaller bins are
    returned in full.
    """
    ids_per_bin = []
    for label in bin_labels:
        members = binned_values[binned_values == label].index
        if members.size > max_per_bin:
            kept = np.random.choice(members, size=max_per_bin, replace=False)
        else:
            kept = members
        ids_per_bin.append(kept.tolist())
    return ids_per_bin


# ON bins are processed before OFF bins so the order of random draws is fixed.
sample_ids_on = _sample_capped_bins(on_value_bins, on_value_bin_labels, bin_floor_on)
sample_ids_off = _sample_capped_bins(off_value_bins, off_value_bin_labels, bin_floor_off)

In [None]:
# Flatten the per-bin lists of row indices into single iterables.
sample_on = itertools.chain.from_iterable(sample_ids_on)
sample_off = itertools.chain.from_iterable(sample_ids_off)

# A row is kept if it was sampled for its ON bin or for its OFF bin.
sample_ids_union = set(sample_on).union(sample_off)

# .loc needs a list-like indexer: recent pandas rejects a set. Sorting also
# makes the row order of the subset deterministic for a given sample.
sub_df = data_df.loc[sorted(sample_ids_union)].reset_index(drop=True)

print('New number of remaining seqs:', len(sub_df))

In [None]:
# Rebind the original names to the downsampled table so that later cells,
# which still refer to `data_df` and `toehold_seqs`, work on the sampled subset.
data_df = sub_df
toehold_seqs = data_df['switch_sequence']

# Part 2. Transform Data. One-hot encode sequences and extract target on and off values.

In [None]:
# Nucleotide alphabet; `alph` holds the same letters as a sorted list and is
# only used below for its length.
alph_letters = 'ATCG'
alph = sorted(alph_letters)

# One-hot encoder from pysster (fast and simple); code adapted from Luis to
# get the format needed for our nucleotide sequences.
one = One_Hot_Encoder(alph_letters)


def _get_one_hot_encoding(seq):
    """Return the pysster one-hot encoding of a single sequence."""
    return one.encode(seq)


# Encode every toehold sequence and stack the results into one float32 array.
input_col_name = 'switch_sequence'#'switch'
encoded_seqs = [_get_one_hot_encoding(s) for s in toehold_seqs]
X = np.stack(encoded_seqs).astype(np.float32)
print('input shape: ', X.shape)

# Reformat for the CNN: (number of sequences, sequence length, alphabet size).
alph_len = len(alph)
seq_len = len(data_df[input_col_name][0])
num_seqs = X.shape[0]
X = X.reshape(num_seqs, seq_len, alph_len).astype('float32')
print('modified shape: ', X.shape)

# Regression targets as float32 vectors.
y_on = np.array(data_df['on_value'].astype(np.float32))
y_off = np.array(data_df['off_value'].astype(np.float32))

# Combine both targets into one array with one row per sequence:
# column 0 = ON value, column 1 = OFF value.
y = np.stack([y_on, y_off], axis=1)
print('target shape: ', y.shape)

# Part 3. Set-up framework for model. Ensure needed parameters can be varied.

In [None]:
from keras import optimizers
def twoheaded_conv1d(conv_layer_parameters, hidden_layers, dropout_rate = 0.2, reg_coeff = 0.0001,learning_rate=0.001, num_features = 59, num_channels = 4): 
    """Build and compile a 1-D CNN with separate ON and OFF regression outputs.

    Args:
        conv_layer_parameters: iterable of (kernel_width, num_filters) pairs,
            one per Conv1D layer, applied in order.
        hidden_layers: iterable of dense-layer widths; each dense layer is
            preceded by a Dropout layer.
        dropout_rate: rate passed to every Dropout layer.
        reg_coeff: L2 kernel-regularisation coefficient of the hidden dense
            layers.
        learning_rate: learning rate of the Adam optimizer.
        num_features: sequence length (second dimension of the input).
        num_channels: alphabet size, i.e. number of nucleotides (last
            dimension of the input).

    Returns:
        A compiled Keras Model with one input and two linear outputs named
        'on_output' and 'off_output', each trained with an MSE loss.
    """
    X_in = Input(shape=(num_features, num_channels), dtype='float32')

    # Stack the convolutions; 'same' padding keeps the sequence length, so each
    # filter acts like a k-mer detector at every position. No activation is
    # passed to Conv1D, so the Keras default applies.
    prior_layer = X_in
    for kernel_width, num_filters in conv_layer_parameters:
        prior_layer = Conv1D(filters=num_filters, kernel_size=kernel_width, padding='same')(prior_layer)

    # Fully connected trunk shared by both outputs.
    H = Flatten()(prior_layer)
    for h in hidden_layers:
        H = Dropout(dropout_rate)(H)
        H = Dense(h, activation='relu', kernel_regularizer=l2(reg_coeff))(H)

    # Two regression heads on top of the shared trunk.
    out_on = Dense(1, activation="linear", name='on_output')(H)
    out_off = Dense(1, activation='linear', name='off_output')(H)
    model = Model(inputs=[X_in], outputs=[out_on, out_off])

    # Use the canonical class name: the lowercase `optimizers.adam` alias is
    # not available in newer Keras releases, while `Adam` exists in all of them.
    # NOTE(review): the `lr` keyword matches the old Keras API this file
    # imports; newer releases call it `learning_rate` -- confirm on upgrade.
    opt = optimizers.Adam(lr=learning_rate)
    # Regression targets, so mean squared error (cross-entropy is for probabilities).
    model.compile(loss={'on_output': 'mse', 'off_output': 'mse'}, optimizer=opt, metrics=['mse'])
    return model


# Part 4. Enter hyperparameter list from the google doc. 

In [None]:
# ENTER your selected list here: REPLACE the nested list below with your own.
# Each inner list is unpacked by the grid-search loop as
# [kernel_width1, kernel_width2, num_filt1, num_filt2].
# RUN7
rand_param_combos = [
    [5, 3, 10, 5],
    [10, 10, 10, 5],
    [15, 5, 10, 5],
    [5, 15, 10, 5],
    [20, 20, 10, 5],
    [10, 3, 10, 5],
    [5, 20, 10, 5],
]

# Earlier runs, kept for reference. These have 7 values per combination
# (presumably the extra three are dropout, L2 coefficient and learning rate --
# confirm against the google doc); the grid-search loop as written unpacks
# exactly 4 values, so they cannot be pasted in without changing the loop.

# RUN8 #rand_param_combos = [[5, 3, 5, 5, 0.1, 0.0, 0.0005], [3, 3, 10, 10, 0.3, 0.0001, 0.0005], [7, 5, 5, 5, 0.3, 0.0001, 0.0005], [5, 7, 5, 5, 0.3, 0.0001, 0.0005], [3, 3, 10, 10, 0.1, 0.0, 0.001]]

# RUN11 #rand_param_combos =  [[7, 3, 5, 5, 0.1, 0.0001, 0.001], [3, 5, 10, 5, 0.3, 0.0, 0.0005], [3, 5, 10, 10, 0.3, 0.0, 0.001], [5, 5, 10, 5, 0.1, 0.0001, 0.001], [5, 5, 15, 5, 0.1, 0.0001, 0.001]]

# RUN2 #rand_param_combos = [[3, 3, 10, 5, 0.1, 0.0001, 0.001], [7, 5, 15, 5, 0.1, 0.0, 0.0005], [7, 3, 10, 5, 0.1, 0.0001, 0.001], [3, 3, 10, 5, 0.1, 0.0, 0.0005], [3, 7, 5, 5, 0.3, 0.0, 0.001]]

# RUN12 #rand_param_combos =  [[7, 3, 5, 10, 0.1, 0.0, 0.0005], [5, 7, 5, 5, 0.1, 0.0, 0.001], [5, 5, 5, 5, 0.3, 0.0001, 0.001], [3, 7, 10, 10, 0.1, 0.0001, 0.001], [7, 3, 15, 5, 0.3, 0.0, 0.001]]

# RUN10 #rand_param_combos = [[7, 5, 5, 10, 0.3, 0.0001, 0.001], [5, 5, 15, 10, 0.1, 0.0, 0.001], [3, 3, 10, 10, 0.3, 0.0001, 0.001], [5, 3, 10, 5, 0.1, 0.0001, 0.0005], [3, 7, 10, 5, 0.3, 0.0, 0.0005]]

# CHANGE the file tag for every run.
# Each grid-search run writes one ON and one OFF .csv of averaged metrics whose
# names start with this tag. Upload the results to github after each run (and
# cross the tried parameters off the google doc).
# Ideally use name_run#, where run# counts the searches you have run.
saving_file_tag = 'jackie_additional_runs'

# Also, please manually check that the seed printed out is different per run. Thanks! 

# Part 5. Run Grid Search. Use K-Fold CV to ensure reliability of results.

In [None]:
# Training schedule: upper bound on epochs, and early-stopping patience set to
# 10% of that bound.
num_epochs = 150
patience = int(num_epochs * .1)

# K-fold cross-validation splitter (rows are shuffled before splitting).
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True)

# Widths of the dense layers, looked up by the number of filters in the second
# convolution (the flattened CNN output has seq_len * num_filters values).
# The widths were chosen by hand and are fairly arbitrary.
hidden_layer_choices = {
    5: (150, 60, 15),
    10: (300, 100, 30),
    15: (400, 150, 30),
}

In [None]:
# function to save data as we go 
def _metrics_frame(parameters, avg_metrics, std_metrics):
    """Build one results table: a row per parameter combination, with mean and std of each metric."""
    return pd.DataFrame({
        'Params': parameters,
        'R2': avg_metrics[:, 0],
        'Pearson': avg_metrics[:, 1],
        'Spearman': avg_metrics[:, 2],
        'R2 (std)': std_metrics[:, 0],
        'Pearson (std)': std_metrics[:, 1],
        'Spearman (std)': std_metrics[:, 2],
    })


def save_per_combo(parameters, avg_metric_folds_on,avg_metric_folds_off,std_metric_folds_on,std_metric_folds_off, out_dir='grid_search_additional'): 
    """Write the running ON and OFF grid-search summaries to CSV.

    Args:
        parameters: one label per parameter combination evaluated so far.
        avg_metric_folds_on, avg_metric_folds_off: 2-D arrays with one row per
            combination; columns 0, 1, 2 are R2, Pearson and Spearman averaged
            over the folds.
        std_metric_folds_on, std_metric_folds_off: arrays of the same layout
            holding the standard deviations over the folds.
        out_dir: directory the two CSV files are written to. It is created if
            it does not exist, so a missing directory cannot cost the results
            of a long run.

    The files are overwritten on every call and are named
    'on_2headed_cnn_running_reg_results.csv' and
    'off_2headed_cnn_running_reg_results.csv'.
    """
    os.makedirs(out_dir, exist_ok=True)

    on_df = _metrics_frame(parameters, avg_metric_folds_on, std_metric_folds_on)
    on_df.to_csv(os.path.join(out_dir, 'on_2headed_cnn_running_reg_results.csv'))

    off_df = _metrics_frame(parameters, avg_metric_folds_off, std_metric_folds_off)
    off_df.to_csv(os.path.join(out_dir, 'off_2headed_cnn_running_reg_results.csv'))

In [None]:
# run grid search 
parameters = [str(params) for params in rand_param_combos]
avg_metric_folds_on= []
std_metric_folds_on= [] 
avg_metric_folds_off= []
std_metric_folds_off= [] 
final_seeds = [] 
for param_combo_count, param_combo in enumerate(rand_param_combos): 
    
    print('On combination #', param_combo_count)
    # unpack the parameter combination 
    (kernel_width1, kernel_width2, num_filt1, num_filt2) = param_combo
    dr = 0.1
    lreg = 0.0001
    lr = 0.0005 
    hidden_layers = hidden_layer_choices[num_filt2] # MLP architecture dependent on filters 
    conv_layer_parameters = [(kernel_width1,num_filt1),(kernel_width2,num_filt2)]
    print('Param combo: ', param_combo)
    
    cv_scores_on = []
    cv_scores_off=[]
    fold_count = 0

    for train, test in kfold.split(X, y): 
        print('Beginning fold #', fold_count)
        # create model
        model = twoheaded_conv1d(conv_layer_parameters= conv_layer_parameters, hidden_layers=hidden_layers, dropout_rate=dr, reg_coeff=lreg, learning_rate=lr)

        # split data again for validation set (to be used w/ early stopping)
        X_val, X_test, y_val, y_test = train_test_split(X[test], y[test], train_size = 0.5, test_size = 0.5)

        # train the model
        early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=patience, verbose=0, mode='auto')
        model.fit(X[train], [y[train][:,0], y[train][:,1]],epochs=num_epochs, batch_size=128,verbose=2, validation_data=(X_val, [y_val[:,0], y_val[:,1]]), callbacks=[early_stopping])

        # evaluate the model (for ON and OFF separately)
        
        def r2(preds_y, true_y):
            return pearsonr(preds_y, true_y)[0] ** 2

        def compute_metrics(preds_y, true_y): 
            r2_score = r2(preds_y, true_y)[0]
            pearson_corr = pearsonr(preds_y, true_y)[0][0]
            spearman_corr = spearmanr(preds_y, true_y)[0]
            print('R2: ', r2_score)
            print('Pearson: ', pearson_corr)
            print('Spearman: ', spearman_corr)
            return [r2_score, pearson_corr, spearman_corr]

        y_preds = np.array(model.predict(X_test))
        # reshape output properly
        #predictions = np.reshape(y_preds, [np.shape(y_preds)[1], np.shape(y_preds)[0]])

        # get on and off metrics separately
        print('--- ON Metrics ---')
        on_metrics = compute_metrics(y_preds[0],np.expand_dims(y_test[:,0], 1))
        print('--- OFF Metrics ---')
        off_metrics = compute_metrics(y_preds[1],np.expand_dims(y_test[:,1], 1))
        
        cv_scores_on.append(on_metrics)
        cv_scores_off.append(off_metrics)
        
        fold_count += 1

    # compute summary of metrics per parameter combo
    avg_metric_folds_on.append(np.mean(cv_scores_on, axis = 0)) # avg over columns 
    std_metric_folds_on.append(np.std(cv_scores_on, axis = 0)) # st dev over columns
    avg_metric_folds_off.append(np.mean(cv_scores_off, axis = 0)) # avg over columns 
    std_metric_folds_off.append(np.std(cv_scores_off, axis = 0)) # st dev over columns 
    
    final_seeds.append(np.random.get_state()[1][0])
    print('param seed:', np.random.get_state()[1][0])
    
    # save as what we have so far
    parameters_so_far = parameters[:param_combo_count+1]
    save_per_combo(parameters_so_far, np.array(avg_metric_folds_on),np.array(avg_metric_folds_off),np.array(std_metric_folds_on),np.array(std_metric_folds_off))
    
    param_combo_count+=1

# Part 6. Save Metrics. 
NOTE: saved in many different files in different formats for fear of saving anything incorrectly. Bad code, but am slightly paranoid. 

In [None]:
# # save everything individually (fear of saving incorrectly....)
# np.savetxt('mean_metrics_2headed_cnn_reg_on.csv', avg_metric_folds_on, delimiter=",")
# np.savetxt('std_metrics_2headed_cnn_reg_on.csv', std_metric_folds_on, delimiter=",")
# np.savetxt('mean_metrics_2headed_cnn_reg_off.csv', avg_metric_folds_off, delimiter=",")
# np.savetxt('std_metrics_2headed_cnn_reg_off.csv', std_metric_folds_off, delimiter=",")
# np.savetxt('param_combos_2headed_cnn_reg.csv', parameters, fmt='%s', delimiter=",")
# np.savetxt('sampling_of_seeds_reg.csv', final_seeds)#,delimeter=',')

In [None]:
# Turn the per-combination lists of metric rows into 2-D numpy arrays
# (one row per parameter combination) so they can be sliced by column below.
avg_metric_folds_on, avg_metric_folds_off, std_metric_folds_on, std_metric_folds_off = (
    np.array(fold_summaries)
    for fold_summaries in (
        avg_metric_folds_on,
        avg_metric_folds_off,
        std_metric_folds_on,
        std_metric_folds_off,
    )
)

In [None]:
# save as dataframes - these are the most reliable and complete

# Make sure the output directory exists so that the results of a finished run
# cannot be lost to a missing folder.
os.makedirs('grid_search_additional', exist_ok=True)

# NOTE: the tag is joined to the file name with no separator, e.g.
# '<tag>on_2headed_cnn_reg_results.csv'.
on_df = pd.DataFrame({'Params':parameters, 'R2': avg_metric_folds_on[:,0], 'Pearson':avg_metric_folds_on[:,1], 'Spearman': avg_metric_folds_on[:,2],
                           'R2 (std)': std_metric_folds_on[:,0], 'Pearson (std)':std_metric_folds_on[:,1], 'Spearman (std)': std_metric_folds_on[:,2],
                     })

on_df.to_csv('grid_search_additional/' + saving_file_tag + 'on_2headed_cnn_reg_results.csv')

off_df =pd.DataFrame({'Params':parameters, 'R2': avg_metric_folds_off[:,0], 'Pearson':avg_metric_folds_off[:,1], 'Spearman': avg_metric_folds_off[:,2],
                           'R2 (std)': std_metric_folds_off[:,0], 'Pearson (std)':std_metric_folds_off[:,1], 'Spearman (std)': std_metric_folds_off[:,2],
                     })
off_df.to_csv('grid_search_additional/' + saving_file_tag + 'off_2headed_cnn_reg_results.csv')
