In [1]:
# import statements 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
%matplotlib inline

#import keras as keras
from tensorflow.python import keras
from keras.models import load_model

from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical

from keras import backend as K 
from scipy.stats import pearsonr, spearmanr 

Using TensorFlow backend.


# Part 1. Load in sequence data.
## Change `data_dir` and `sequence_file` here to point at your own data!

In [2]:
# Read the toehold table from disk and preview it.
data_dir = 'make_tf_learning_models/'
sequence_file = 'Green2014_clean.csv'
csv_path = data_dir + sequence_file
sequences = pd.read_csv(csv_path)  # comma is read_csv's default separator
print(sequences.head())  # head() shows the first 5 rows by default

   Unnamed: 0  Toehold ID                                   Toehold sequence  \
0          11          68  AATGTATGTAATAGTTCGTCGAGGTGTCCAAGCAGAGGAGATGGAC...   
1         117         110  ATGATAATGTAGAGGTGCGGAGTGATTGTAAACAGAGGAGATACAA...   
2         108         100  CGAAGTATTGTAAGGTGTAGTGTGCGTTGAGACAGAGGAGATCAAC...   
3         122         116  TAAGTAAATGAAAGTGTATGTATGTTGCTGGACAGAGGAGACAGCA...   
4          17         117  TCAATAAGGCGGAGTTCGTCGAGGTGCCTGAGCAGAGGAGACAGGC...   

                    Switch region                         Trigger  Avg ONOFF  \
0  AATGTATGTAATAGTTCGTCGAGGTGTCCA  TGGACACCTCGACGAACTATTACATACATT       24.8   
1  ATGATAATGTAGAGGTGCGGAGTGATTGTA  TACAATCACTCCGCACCTCTACATTATCAT        9.7   
2  CGAAGTATTGTAAGGTGTAGTGTGCGTTGA  TCAACGCACACTACACCTTACAATACTTCG       13.6   
3  TAAGTAAATGAAAGTGTATGTATGTTGCTG  CAGCAACATACATACACTTTCATTTACTTA        8.7   
4  TCAATAAGGCGGAGTTCGTCGAGGTGCCTG  CAGGCACCTCGACGAACTCCGCCTTATTGA        8.5   

   sdev ONOFF Toehold Rating  
0      

In [3]:
# Pull out the model input (full toehold sequence) and the measured target.
seqs, onoff_vals = sequences['Toehold sequence'], sequences['Avg ONOFF']

# Part 2. Transform data. One-hot encode the sequences and extract the target ON/OFF values.

In [4]:
# One_Hot_Encoder is already imported in the notebook's import cell, so it is
# not imported a second time here.
alph_letters = 'ATCG'
alph = list(alph_letters)

# one-hot encode
# modified code from Luis to get correct format for TPOT w/ our nt seq
# use pysster (very fast and simple encoding)
one = One_Hot_Encoder(alph_letters)
def _get_one_hot_encoding(seq):
    """Return the one-hot encoding of `seq` produced by the shared encoder `one`."""
    return one.encode(seq)

# Encode every sequence and stack the per-sequence matrices into one float32 array.
X = np.stack(
    [_get_one_hot_encoding(s) for s in seqs]).astype(np.float32)
nsamples, nx, ny = X.shape  # the three axes of the stacked array

print('input shape: ', X.shape)
# reformat for CNN: (number of sequences, sequence length, alphabet size)
alph_len = len(alph)
# Positional lookup: seqs[0] is label-based and raises KeyError when the
# Series index does not contain the label 0 (e.g. after filtering rows).
seq_len = len(seqs.iloc[0])
# X is already float32, so no second dtype conversion is needed after the reshape.
X = X.reshape(X.shape[0], seq_len, alph_len)
print('modified shape: ', X.shape)

input shape:  (168, 59, 4)
modified shape:  (168, 59, 4)


In [5]:
# Turn the target values into a column vector with one row per sequence.
y = np.array(onoff_vals).reshape(-1, 1)
print('target shape: ', y.shape)

target shape:  (168, 1)


# Part 3. Load in original KC onoff model. 

In [6]:
# Locations of the saved architecture and of the separately stored weights.
model_dir = 'models/'
final_model_path, final_weights_path = (
    model_dir + 'onoff_original_model.h5',
    model_dir + 'onoff_original_model_weights.h5',
)

# Restore the saved model, then load the weights file on top of it.
model = load_model(final_model_path)
model.load_weights(final_weights_path)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.











In [7]:
# Score every encoded sequence with the loaded model.
predictions = model.predict(X)
on_preds = predictions

# Store the predictions as a flat column, one value per input sequence.
n_sequences = len(seqs)
sequences['simple_onoff_original_kc_preds'] = np.reshape(on_preds, (n_sequences,))

# Clear the Keras backend session now that predictions are stored.
K.clear_session()




# Part 4. Look at results

In [8]:
# Preview the first rows, now including the prediction column.
sequences.head(n=5)

Unnamed: 0.1,Unnamed: 0,Toehold ID,Toehold sequence,Switch region,Trigger,Avg ONOFF,sdev ONOFF,Toehold Rating,simple_onoff_original_kc_preds
0,11,68,AATGTATGTAATAGTTCGTCGAGGTGTCCAAGCAGAGGAGATGGAC...,AATGTATGTAATAGTTCGTCGAGGTGTCCA,TGGACACCTCGACGAACTATTACATACATT,24.8,2.2,Bad,0.241781
1,117,110,ATGATAATGTAGAGGTGCGGAGTGATTGTAAACAGAGGAGATACAA...,ATGATAATGTAGAGGTGCGGAGTGATTGTA,TACAATCACTCCGCACCTCTACATTATCAT,9.7,1.1,Bad,0.105242
2,108,100,CGAAGTATTGTAAGGTGTAGTGTGCGTTGAGACAGAGGAGATCAAC...,CGAAGTATTGTAAGGTGTAGTGTGCGTTGA,TCAACGCACACTACACCTTACAATACTTCG,13.6,1.5,Bad,0.49488
3,122,116,TAAGTAAATGAAAGTGTATGTATGTTGCTGGACAGAGGAGACAGCA...,TAAGTAAATGAAAGTGTATGTATGTTGCTG,CAGCAACATACATACACTTTCATTTACTTA,8.7,1.1,Bad,0.404838
4,17,117,TCAATAAGGCGGAGTTCGTCGAGGTGCCTGAGCAGAGGAGACAGGC...,TCAATAAGGCGGAGTTCGTCGAGGTGCCTG,CAGGCACCTCGACGAACTCCGCCTTATTGA,8.5,1.4,Bad,0.695428


In [9]:
def r2(preds_y, true_y):
    """Return the squared Pearson correlation between predictions and targets."""
    correlation, _p_value = pearsonr(preds_y, true_y)
    return correlation ** 2

def compute_metrics(preds_y, true_y):
    """Print and return R2, Pearson and Spearman correlation of two samples.

    Parameters
    ----------
    preds_y, true_y : array-like
        Predicted and observed values with the same number of elements.
        Both are flattened to 1-D first, so column vectors of shape (n, 1)
        are accepted as well as plain 1-D sequences.

    Returns
    -------
    list
        ``[r2_score, pearson_corr, spearman_corr]`` where ``r2_score`` is the
        squared Pearson correlation.
    """
    # Flatten so (n, 1) inputs are treated as a single variable with n samples.
    preds_flat = np.ravel(preds_y)
    true_flat = np.ravel(true_y)

    # Compute Pearson once and derive R2 from it instead of calling pearsonr twice.
    pearson_corr = pearsonr(preds_flat, true_flat)[0]
    r2_score = pearson_corr ** 2
    spearman_corr = spearmanr(preds_flat, true_flat)[0]

    print('R2: ', r2_score)
    print('Pearson: ', pearson_corr)
    print('Spearman: ', spearman_corr)
    return [r2_score, pearson_corr, spearman_corr]

In [10]:
# Rescale the measured ON/OFF ratios to [0, 1] and flatten back to 1-D.
onoff_column = np.array(sequences['Avg ONOFF']).reshape(-1, 1)
onoff_vals = preprocessing.MinMaxScaler().fit_transform(onoff_column).flatten()

# Compare the stored model predictions against the rescaled measurements.
onoff_original_metrics = compute_metrics(
    sequences['simple_onoff_original_kc_preds'],
    onoff_vals,
)

R2:  0.0001870542615070489
Pearson:  -0.013676778184464677
Spearman:  -0.03460180027775038


In [11]:
# One-row summary table: the three metrics plus a label for the model.
tf_matrix = pd.DataFrame(
    [onoff_original_metrics],
    columns=['r2', 'pearson', 'spearman'],
)
tf_matrix['model'] = ['onoff_original']
tf_matrix

Unnamed: 0,r2,pearson,spearman,model
0,0.000187,-0.013677,-0.034602,onoff_original
