This is the model trained on the Angenent-Mari et al. 2020 data. It is revised from figures 2 and 3 to have just a single output node, the ON/OFF ratio, rather than separate ON and OFF values, so as to be compatible with the Green et al. normalized data.

In [32]:
# import statements 

import os

import platform
import random
import shutil
import sys

import math
import itertools
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
from tqdm import tqdm_notebook as tqdm
import keras

# some visualization imports
from keras import activations

# various imports for the keras model
from keras.layers.core import Permute
from keras import backend as K
from keras.engine.topology import Layer
import keras as keras
from keras.callbacks import TensorBoard
from keras import metrics as metrics
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input, Conv1D, Concatenate
from keras.optimizers import SGD
from keras.regularizers import l2

# evaluate performance w/ on and off regression separately 
from scipy.stats import pearsonr, spearmanr 

# imports for the grid search and kfold CV
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score

# data one-hot encoding imports
from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical


# Part 1: Load in data. Filter and sample to avoid bias from experimental errors. 

In [33]:
# Location of the QC-annotated toehold dataset, relative to this notebook.
data_dir = '../../../data/'
file_name = 'newQC_toehold_data.csv'

# Read the comma-separated table into a DataFrame.
data_df = pd.read_csv(
    data_dir + file_name,
    sep=',',
)

# Show the first three rows as the cell output.
data_df.head(3)

Unnamed: 0,off_id,on_id,source_sequence,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,...,stem2,linker,post_linker,on_value,off_value,onoff_value,on_qc,off_qc,onoff_qc,switch_sequence
0,AACCAAACACACAAACGCACAAAAAAAAAAAAAAAAAATGGAAAAC...,AACTGTTTTCCATTTTTTTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2626,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,AACTGTTTTCCATTTTTTTTTTTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAAAAAAAAAAATGGAAAACAGTT,AACAGAGGAGA,...,CCATTTTTT,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAA,,0.333333,,,,,AAAAAAAAAAAAAAAAAATGGAAAACAGTTAACAGAGGAGAAACTG...
1,AACCAAACACACAAACGCACAAAAAAAAAAAAATGGAAAACAGTTA...,TTAGTAACTGTTTTCCATTTTTTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_2625,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TTAGTAACTGTTTTCCATTTTTTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAAAAAATGGAAAACAGTTACTAA,AACAGAGGAGA,...,GTTTTCCAT,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAA,,,,,,,AAAAAAAAAAAAATGGAAAACAGTTACTAAAACAGAGGAGATTAGT...
2,AACCAAACACACAAACGCACAAAAAAAAATTACTACTATTGTTAAT...,CTAAATTAACAATAGTAGTAATTTTTTTTTAACCAAACACACAAAC...,smallpox,smallpox_tile_4951,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CTAAATTAACAATAGTAGTAATTTTTTTTT,AACCAAACACACAAACGCAC,AAAAAAAAATTACTACTATTGTTAATTTAG,AACAGAGGAGA,...,CAATAGTAG,AACCTGGCGGCAGCGCAAAAGATGCG,TAAAGGAGAA,0.068295,0.0,0.068295,2.0,1.1,1.1,AAAAAAAAATTACTACTATTGTTAATTTAGAACAGAGGAGACTAAA...


In [34]:
# Keep only the switches whose ON and OFF measurements both reach the QC cutoff.
# Both conditions are applied with a single mask and the index is rebuilt once
# with drop=True: the old index is discarded instead of being stored as extra
# 'index' / 'level_0' columns, which also makes this cell safe to re-run
# (a second pass of bare reset_index() calls would fail on those columns).
qc_cutoff = 1.1
passes_qc = (data_df['on_qc'] >= qc_cutoff) & (data_df['off_qc'] >= qc_cutoff)
data_df = data_df[passes_qc].reset_index(drop=True)

toehold_seqs = data_df['switch_sequence']
# Label 0 is guaranteed to exist because the index was just rebuilt.
seq_len = len(toehold_seqs[0])
print('Number of remaining sequences: ', len(data_df))

Number of remaining sequences:  91534


Now downsample data to avoid bias.

In [38]:
# Cut the ON and OFF measurements into 1000 equal-width bins each, labelled
# 0..999, and record the floored mean number of sequences per bin.

# --- ON values ---
on_value_bin_labels = np.arange(1000)
on_value_bins = pd.cut(
    data_df['on_value'],
    bins=1000,
    labels=on_value_bin_labels,
)
bin_floor_on = math.floor(
    data_df['on_value'].value_counts(bins=1000).mean()
)

# --- OFF values ---
off_value_bin_labels = np.arange(1000)
off_value_bins = pd.cut(
    data_df['off_value'],
    bins=1000,
    labels=off_value_bin_labels,
)
bin_floor_off = math.floor(
    data_df['off_value'].value_counts(bins=1000).mean()
)

In [39]:
# Go through the 1000 bins and allow no more than the mean bin count in each:
# bins holding more sequences than the cap are randomly subsampled (without
# replacement) down to the cap, smaller bins are kept whole. The kept row
# indices are collected as one list per bin, separately for ON and OFF values.

# Dedicated, fixed-seed generator so the downsampled dataset is the same on
# every Restart & Run All.
downsample_rng = np.random.RandomState(0)


def _sample_capped_bin_indices(value_bins, bin_labels, bin_cap, rng):
    """Collect the row indices to keep for every bin, at most `bin_cap` per bin.

    Args:
        value_bins: Series of bin labels, one per row (as produced by pd.cut).
        bin_labels: iterable of the bin labels to visit, in order.
        bin_cap: maximum number of rows kept from a single bin.
        rng: np.random.RandomState used for the subsampling.

    Returns:
        A list with one entry per bin label; each entry is a list of the row
        indices kept for that bin.
    """
    kept_per_bin = []
    for bin_label in bin_labels:
        bin_indices = value_bins[value_bins == bin_label].index
        if bin_indices.size > bin_cap:
            kept = rng.choice(bin_indices, size=bin_cap, replace=False)
        else:
            kept = bin_indices
        kept_per_bin.append(kept.tolist())
    return kept_per_bin


sample_ids_on = _sample_capped_bin_indices(
    on_value_bins, on_value_bin_labels, bin_floor_on, downsample_rng)
sample_ids_off = _sample_capped_bin_indices(
    off_value_bins, off_value_bin_labels, bin_floor_off, downsample_rng)

In [40]:
# Flatten each list of per-bin lists into a single stream of row indices
sample_on = itertools.chain.from_iterable(sample_ids_on)
sample_off = itertools.chain.from_iterable(sample_ids_off)

# Take the UNION of the ON and OFF samples: a row is kept when it was
# selected for either measurement.
sample_ids_union = set(sample_on).union(sample_off)

# .loc expects a list-like indexer (newer pandas versions reject a set), and
# sorting the ids gives the downsampled frame a deterministic row order.
sub_df = data_df.loc[sorted(sample_ids_union)].reset_index(drop=True)

print('New number of remaining seqs:', len(sub_df))

New number of remaining seqs: 81073


In [41]:
# Rebind the original names to the downsampled frame so that the cells
# below keep working without modification.
toehold_seqs = sub_df['switch_sequence']
data_df = sub_df

# Part 2. Transform Data. One-hot encode sequences and extract target on and off values.

In [44]:
# Nucleotide alphabet in sorted order (A, C, G, T), plus an independent copy
alph_letters = sorted('ATCG')
alph = alph_letters[:]

# encoder object used for the one-hot encoding
one = One_Hot_Encoder(alph_letters)
def _get_one_hot_encoding(seq):
    """Encode one sequence with the module-level One_Hot_Encoder `one`."""
    return one.encode(seq)

# Encode every toehold sequence and stack the results into one float32 array
input_col_name = 'switch_sequence'  # alternative input column: 'switch'
X = np.stack(
    list(map(_get_one_hot_encoding, toehold_seqs))
).astype(np.float32)

# Force the (num_sequences, seq_len, alphabet_size) layout used by the CNN
print('input shape: ', X.shape)
alph_len = len(alph)
seq_len = len(data_df[input_col_name][0])
X = X.reshape((X.shape[0], seq_len, alph_len)).astype('float32')
print('modified shape: ', X.shape)

# Targets: the ON/OFF values as a float32 column vector
y = np.array(data_df['onoff_value'].astype(np.float32))
y = np.transpose(np.array([y]))
print('target shape: ', y.shape)

input shape:  (81073, 59, 4)
modified shape:  (81073, 59, 4)
target shape:  (81073, 1)


# Part 3. Set-up framework for model. Ensure needed parameters can be varied.

In [45]:
from keras import optimizers
def twoheaded_conv1d(conv_layer_parameters, hidden_layers, dropout_rate = 0.2, reg_coeff = 0.0001,learning_rate=0.001, num_features = 59, num_channels = 4): 
    """Build and compile the 1D-convolutional regression model.

    Despite the historical name, the model has a single output node. The
    output layer is still called 'on_output'; in this notebook it is trained
    on the ON/OFF value.

    Args:
        conv_layer_parameters: iterable of (kernel_width, num_filters) pairs,
            one per Conv1D layer, applied in order.
        hidden_layers: iterable with the number of units of each Dense layer
            that follows the flattened convolution output.
        dropout_rate: dropout rate applied before every Dense hidden layer.
        reg_coeff: L2 coefficient for the kernels of the Dense hidden layers.
        learning_rate: learning rate of the Adam optimizer.
        num_features: sequence length (first input dimension).
        num_channels: alphabet size, i.e. number of nucleotides (second
            input dimension).

    Returns:
        A compiled keras Model with 'mse' as both loss and reported metric.
    """
    # num_features = seq length, num_channels = alphabet size (i.e. # nucleotides)
    X_in = Input(shape=(num_features,num_channels),dtype='float32')
    prior_layer = X_in 
    for idx, (kernel_width, num_filters) in enumerate(conv_layer_parameters):
        # mimic a kmer; no activation is passed, so the layer uses Conv1D's default
        conv_layer = Conv1D(filters=num_filters, kernel_size=kernel_width, padding='same', name='conv_'+str(idx))(prior_layer)
        prior_layer = conv_layer
    H = Flatten()(prior_layer)
    for idx, h in enumerate(hidden_layers): 
        H = Dropout(dropout_rate)(H)
        H = Dense(h, activation='relu', kernel_regularizer=l2(reg_coeff),name='dense_'+str(idx))(H)
    out_on = Dense(1,activation="linear",name='on_output')(H)
    model = Model(inputs=[X_in], outputs=[out_on])
    # Use the public Adam class rather than the lowercase `adam` alias, which
    # is not part of the documented optimizer API.
    opt = optimizers.Adam(lr = learning_rate)
    model.compile(loss={'on_output': 'mse'},optimizer=opt,metrics=['mse'])
    return model


# Part 4. Define desired model features. Build sample model to view architecture.

In [46]:
# Convolution stack, one (kernel_width, num_filters) pair per layer:
# [(kernel_width_layer1, #filters_layer1), (kernel_width_layer2, #filters_layer2), ...]
conv_layer_parameters = [(5, 10), (3, 5)]

# Dense-layer sizes, keyed by the # filters in the final convolutional layer before the MLP
hidden_layer_choices = {5: (150, 60, 15)}
hidden_layers = hidden_layer_choices[5]

# Regularisation and optimiser settings
dropout_rate = 0.1
l2_reg_coeff = 0.0001
learning_rate = 0.0005

# Build a throw-away instance of the master model (the real one is trained later)
sample_model = twoheaded_conv1d(
    conv_layer_parameters,
    hidden_layers,
    dropout_rate=dropout_rate,
    reg_coeff=l2_reg_coeff,
    learning_rate=learning_rate,
)

# Print the model architecture
sample_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 59, 4)             0         
_________________________________________________________________
conv_0 (Conv1D)              (None, 59, 10)            210       
_________________________________________________________________
conv_1 (Conv1D)              (None, 59, 5)             155       
_________________________________________________________________
flatten_11 (Flatten)         (None, 295)               0         
_________________________________________________________________
dropout_31 (Dropout)         (None, 295)               0         
_________________________________________________________________
dense_0 (Dense)              (None, 150)               44400     
_________________________________________________________________
dropout_32 (Dropout)         (None, 150)               0         
__________

# Part 5. Run K-Fold CV to ensure reliability of performance metrics for the ON/OFF values. 

In [47]:
# define kfold object 
num_folds = 10
seed = 0 # set for reproducibility

# Seed NumPy's global generator as well as Python's `random`: scikit-learn
# falls back to NumPy's global state whenever random_state is not given, and
# random.seed() alone does not affect it.
random.seed(seed)
np.random.seed(seed)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# define parameters for training 
num_epochs = 150
patience = int(num_epochs * .1)  # early-stopping patience: 10% of the epoch budget

In [48]:
# functions to evaluate the model

def r2(preds_y, true_y):
    """Return the squared Pearson correlation between predictions and targets."""
    return pearsonr(preds_y, true_y)[0] ** 2

def compute_metrics(preds_y, true_y): 
    """Print and return R2, Pearson and Spearman for one set of predictions.

    Both inputs are flattened to 1-D before the statistics are computed, so
    column vectors of shape (n, 1), flat arrays of shape (n,) and plain lists
    are all accepted, and the result does not depend on how scipy treats
    2-D input.

    Args:
        preds_y: predicted values, array-like with n elements.
        true_y: measured values, array-like with n elements.

    Returns:
        [r2_score, pearson_corr, spearman_corr] as plain floats.
    """
    preds_flat = np.asarray(preds_y).ravel()
    true_flat = np.asarray(true_y).ravel()

    r2_score = float(r2(preds_flat, true_flat))
    pearson_corr = float(pearsonr(preds_flat, true_flat)[0])
    spearman_corr = float(spearmanr(preds_flat, true_flat)[0])
    print('R2: ', r2_score)
    print('Pearson: ', pearson_corr)
    print('Spearman: ', spearman_corr)
    return [r2_score, pearson_corr, spearman_corr]

def print_summary_results(avg_metrics, std_metrics): 
    """Print the R2 / Pearson / Spearman entries of the two metric vectors.

    `avg_metrics` is reported under 'Average:' and `std_metrics` under
    'Standard deviation:'; positions 0, 1 and 2 of each are read.
    """
    sections = (
        ('Average:', avg_metrics),
        ('Standard deviation:', std_metrics),
    )
    for heading, values in sections:
        print(heading)
        print('\tR2:', values[0], '\n\tPearson:', values[1], '\n\tSpearman:', values[2])
    

In [49]:
# run kfold 
cv_scores_on=[]
preds_on = []
true_on = []
fold_count=0
for train, test in kfold.split(X, y): 
    print('Beginning fold #', fold_count)
    # create model w/ parameters as defined
    # NOTE: create a model from scratch each time to ensure no weights are carried over per fold  
    kfold_model = twoheaded_conv1d(conv_layer_parameters=conv_layer_parameters, hidden_layers= hidden_layers, 
                             dropout_rate=dropout_rate, reg_coeff=l2_reg_coeff, 
                             learning_rate= learning_rate)
    
    # split the held-out fold in half: one half is the validation set used for
    # early stopping, the other half is the test set. random_state pins the
    # partition so the reported metrics can be reproduced.
    X_val, X_test, y_val, y_test = train_test_split(X[test], y[test], train_size = 0.5, test_size = 0.5,
                                                    random_state = seed)
    
    # train the model
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=patience, verbose=0, mode='auto')
    kfold_model.fit(X[train], [y[train][:,0]],epochs=num_epochs, batch_size=128,verbose=0, validation_data=(X_val, [y_val[:,0]]), callbacks=[early_stopping])

    # evaluate the model
    y_preds = np.array(kfold_model.predict(X_test))
    
    # get ON/OFF metrics (mark as on_metrics for simplicity)
    print('--- ON/OFF Metrics ---')
    on_metrics = compute_metrics(y_preds,np.expand_dims(y_test[:,0], 1))
    cv_scores_on.append(on_metrics)  
    preds_on.append(np.squeeze(y_preds))
    true_on.append(y_test[:,0])
    
    # delete model to ensure no weights are carried over 
    del kfold_model

    fold_count += 1

# Write the per-fold predictions and true values, one row per fold.
# NOTE(review): np.savetxt needs a rectangular array, so this relies on every
# fold producing the same number of test samples - confirm if the data size
# or the number of folds changes.
out_dir = 'metrics/'  
os.makedirs(out_dir, exist_ok=True)  # make sure the output folder exists before writing
np.savetxt(out_dir + 'original_revised_onoff_preds.csv', preds_on, delimiter=",")
np.savetxt(out_dir + 'original_revised_onoff_true.csv', true_on, delimiter=",")

Beginning fold # 0
--- ON/OFF Metrics ---
R2:  0.3969207
Pearson:  0.63001645
Spearman:  0.6418700564162919
Beginning fold # 1
--- ON/OFF Metrics ---
R2:  0.37557307
Pearson:  0.6128402
Spearman:  0.6319624560342898
Beginning fold # 2
--- ON/OFF Metrics ---
R2:  0.38244346
Pearson:  0.6184201
Spearman:  0.6356469118124106
Beginning fold # 3
--- ON/OFF Metrics ---
R2:  0.37436205
Pearson:  0.61185133
Spearman:  0.6253720155837936
Beginning fold # 4
--- ON/OFF Metrics ---
R2:  0.3861642
Pearson:  0.6214211
Spearman:  0.6362803860570859
Beginning fold # 5
--- ON/OFF Metrics ---
R2:  0.3669557
Pearson:  0.6057687
Spearman:  0.6194198543089242
Beginning fold # 6
--- ON/OFF Metrics ---
R2:  0.37086862
Pearson:  0.60898983
Spearman:  0.628269196287876
Beginning fold # 7
--- ON/OFF Metrics ---
R2:  0.37558025
Pearson:  0.612846
Spearman:  0.6248741766782452
Beginning fold # 8
--- ON/OFF Metrics ---
R2:  0.39025554
Pearson:  0.62470436
Spearman:  0.6387467160485212
Beginning fold # 9
--- ON/OFF

# Part 6. Compute average metrics.


In [50]:
# Column-wise statistics over the per-fold metric rows
# (each row of cv_scores_on is one fold's [R2, Pearson, Spearman]).
avg_metric_folds_on = np.asarray(cv_scores_on).mean(axis=0)  # mean per metric
std_metric_folds_on = np.asarray(cv_scores_on).std(axis=0)   # st dev per metric

In [51]:
# Report the cross-validated mean and spread of each metric
print('--- ON/OFF Metrics ---')
print_summary_results(
    avg_metrics=avg_metric_folds_on,
    std_metrics=std_metric_folds_on,
)

--- ON/OFF Metrics ---
Average:
	R2: 0.38060279488563536 
	Pearson: 0.6168874442577362 
	Spearman: 0.631401830613214
Standard deviation:
	R2: 0.008966298325177488 
	Pearson: 0.0072578161278113 
	Spearman: 0.006618512094393616


# Part 7. Train model and save for future use.


In [52]:
# train on more of the data (no testing - use metrics from kfold as final metrics)
# have small held-out data for early stopping

# split data again for validation set (to be used w/ early stopping);
# random_state pins the partition so the saved model can be reproduced
train_size = 0.851
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = train_size, test_size = 1-train_size,
                                                  random_state = seed)

# define parameters for training 
num_epochs = 150
patience = int(num_epochs * .1)  # early-stopping patience: 10% of the epoch budget

In [53]:
# Instantiate the final model with the hyperparameters chosen above
model = twoheaded_conv1d(
    conv_layer_parameters=conv_layer_parameters,
    hidden_layers=hidden_layers,
    dropout_rate=dropout_rate,
    reg_coeff=l2_reg_coeff,
    learning_rate=learning_rate,
)

In [54]:
# Fit the final model; training stops early once val_loss has not improved
# for `patience` epochs.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0,
    patience=patience,
    verbose=2,
    mode='auto',
)
model.fit(
    x=X_train,
    y=[y_train[:, 0]],
    epochs=num_epochs,
    batch_size=128,
    verbose=2,
    validation_data=(X_val, [y_val[:, 0]]),
    callbacks=[early_stopping],
)


Train on 68993 samples, validate on 12080 samples
Epoch 1/150
 - 6s - loss: 0.0925 - mean_squared_error: 0.0663 - val_loss: 0.0794 - val_mean_squared_error: 0.0574
Epoch 2/150
 - 3s - loss: 0.0766 - mean_squared_error: 0.0579 - val_loss: 0.0719 - val_mean_squared_error: 0.0562
Epoch 3/150
 - 3s - loss: 0.0689 - mean_squared_error: 0.0552 - val_loss: 0.0665 - val_mean_squared_error: 0.0546
Epoch 4/150
 - 3s - loss: 0.0642 - mean_squared_error: 0.0536 - val_loss: 0.0597 - val_mean_squared_error: 0.0504
Epoch 5/150
 - 3s - loss: 0.0602 - mean_squared_error: 0.0517 - val_loss: 0.0571 - val_mean_squared_error: 0.0494
Epoch 6/150
 - 3s - loss: 0.0572 - mean_squared_error: 0.0501 - val_loss: 0.0557 - val_mean_squared_error: 0.0492
Epoch 7/150
 - 3s - loss: 0.0550 - mean_squared_error: 0.0489 - val_loss: 0.0522 - val_mean_squared_error: 0.0465
Epoch 8/150
 - 3s - loss: 0.0532 - mean_squared_error: 0.0478 - val_loss: 0.0521 - val_mean_squared_error: 0.0470
Epoch 9/150
 - 3s - loss: 0.0524 - mea

<keras.callbacks.History at 0x15b5de278>

In [55]:
# save model (architecture + weights) 
out_dir = '../models/'
os.makedirs(out_dir, exist_ok=True)  # create the target folder if it is missing so the save calls can write into it
model.save(out_dir + 'onoff_original_model.h5')
model.save_weights(out_dir + 'onoff_original_model_weights.h5')