In [22]:
# import statements 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
%matplotlib inline

import keras as keras
from keras.models import load_model

from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical

from keras import backend as K 
from scipy.stats import pearsonr, spearmanr 

# Part 1: Load in sequence data. 

In [23]:
# Read the cleaned toehold table (data from Green et al 2014) into a DataFrame.
data_dir = 'make_tf_learning_models/'
sequence_file = 'Green2014_clean.csv'
sequences = pd.read_csv(
    data_dir + sequence_file,
    sep=',',
)

# Plain-text preview of the first five rows.
print(sequences.head(5))

   Unnamed: 0  Toehold ID                                   Toehold sequence  \
0          11          68  AATGTATGTAATAGTTCGTCGAGGTGTCCAAGCAGAGGAGATGGAC...   
1         117         110  ATGATAATGTAGAGGTGCGGAGTGATTGTAAACAGAGGAGATACAA...   
2         108         100  CGAAGTATTGTAAGGTGTAGTGTGCGTTGAGACAGAGGAGATCAAC...   
3         122         116  TAAGTAAATGAAAGTGTATGTATGTTGCTGGACAGAGGAGACAGCA...   
4          17         117  TCAATAAGGCGGAGTTCGTCGAGGTGCCTGAGCAGAGGAGACAGGC...   

                    Switch region                         Trigger  Avg ONOFF  \
0  AATGTATGTAATAGTTCGTCGAGGTGTCCA  TGGACACCTCGACGAACTATTACATACATT       24.8   
1  ATGATAATGTAGAGGTGCGGAGTGATTGTA  TACAATCACTCCGCACCTCTACATTATCAT        9.7   
2  CGAAGTATTGTAAGGTGTAGTGTGCGTTGA  TCAACGCACACTACACCTTACAATACTTCG       13.6   
3  TAAGTAAATGAAAGTGTATGTATGTTGCTG  CAGCAACATACATACACTTTCATTTACTTA        8.7   
4  TCAATAAGGCGGAGTTCGTCGAGGTGCCTG  CAGGCACCTCGACGAACTCCGCCTTATTGA        8.5   

   sdev ONOFF Toehold Rating  
0      

In [24]:
# Model inputs are the full toehold sequences; the target column is 'Avg ONOFF'
# (per the original note: the normalized ON value from Green et al 2014).
seqs, onoff_vals = sequences['Toehold sequence'], sequences['Avg ONOFF']

# Part 2. Transform Data. One-hot encode sequences and extract target on and off values.

In [25]:
from pysster.One_Hot_Encoder import One_Hot_Encoder
# Nucleotide alphabet: the encoder receives the string form, while `alph`
# keeps the same letters as a list.
alph_letters = 'ATCG'
alph = [letter for letter in alph_letters]

# one-hot encoder instance used by the encoding helper
one = One_Hot_Encoder(alph_letters)
def _get_one_hot_encoding(seq):
    """Return the one-hot encoding of ``seq`` from the module-level encoder ``one``."""
    return one.encode(seq)

# Encode every sequence and stack the per-sequence arrays into one float32 array.
X = np.stack(
    [_get_one_hot_encoding(s) for s in seqs]).astype(np.float32)
# The stacked array is 3-D; nothing is flattened here. Per the printed shape the
# axes look like (sequences, positions, alphabet letters) -- TODO confirm.
nsamples, nx, ny = X.shape

# reformat for CNN if needed
print('input shape: ', X.shape)
alph_len = len(alph)
# Positional access: `seqs[0]` is a label lookup on the pandas Series and raises
# KeyError whenever the index has no label 0 (e.g. after filtering rows).
seq_len = len(seqs.iloc[0])
# X is already float32 from the cast above, so only the reshape is needed.
X = X.reshape(nsamples, seq_len, alph_len)
print('modified shape: ', X.shape)

input shape:  (168, 59, 4)
modified shape:  (168, 59, 4)


In [26]:
# Turn the target values into a single-column 2-D array (one row per value).
y = np.array(onoff_vals)[:, np.newaxis]
print('target shape: ', y.shape)

target shape:  (168, 1)


# Part 3. Load in original CNN model trained on Angenent-Mari et al. 2020 data.

In [27]:
# Saved model file and the separately saved weights file for the original CNN.
model_dir = 'models/'
final_model_path, final_weights_path = (
    model_dir + 'onoff_original_model.h5',
    model_dir + 'onoff_original_model_weights.h5',
)

# Rebuild the network from the model file, then load the weights file into it.
model = load_model(final_model_path)
model.load_weights(final_weights_path)

In [28]:
# Score every encoded sequence and store the predictions as one flat column.
on_preds = predictions = model.predict(X)
sequences['simple_onoff_original_kc_preds'] = np.reshape(predictions, (len(seqs),))

# Reset the Keras backend state before the next model is loaded.
K.clear_session()

# Part 4. Load in model trained on only Green et al. 2014 data.

In [29]:
# Saved model / weights files for the network trained only on the Green data.
final_model_path, final_weights_path = (
    model_dir + 'only_green_trained_model.h5',
    model_dir + 'only_green_trained_model_weights.h5',
)

# Rebuild the network from the model file, then load the weights file into it.
model = load_model(final_model_path)
model.load_weights(final_weights_path)

In [30]:
# Score every encoded sequence and store the predictions as one flat column.
on_preds = predictions = model.predict(X)
sequences['green_onoff_preds'] = np.reshape(predictions, (len(seqs),))

# Reset the Keras backend state before the next model is loaded.
K.clear_session()

# Part 5. Load in transfer learning model initialized with Angenent-Mari weights.

In [31]:
# Saved model / weights files for the weight-initialization transfer model.
final_model_path, final_weights_path = (
    model_dir + 'initialization_weights_tf_onoff_model.h5',
    model_dir + 'initialization_weights_tf_onoff_model_weights.h5',
)

# Rebuild the network from the model file, then load the weights file into it.
model = load_model(final_model_path)
model.load_weights(final_weights_path)

In [32]:
# Score every encoded sequence and store the predictions as one flat column.
on_preds = predictions = model.predict(X)
sequences['weight_initialization_onoff_preds'] = np.reshape(predictions, (len(seqs),))

# Reset the Keras backend state before the next model is loaded.
K.clear_session()

# Part 6. Load in transfer learning model with Angenent-Mari weights frozen in the CNN layers.

In [33]:
# Saved model / weights files for the frozen-layer transfer model.
final_model_path, final_weights_path = (
    model_dir + 'freeze_weights_tf_onoff_model.h5',
    model_dir + 'freeze_weights_tf_onoff_model_weights.h5',
)

# Rebuild the network from the model file, then load the weights file into it.
model = load_model(final_model_path)
model.load_weights(final_weights_path)

In [34]:
# Score every encoded sequence with the currently loaded model and store the
# predictions as one flat column alongside the other models' predictions.
predictions = model.predict(X)
on_preds = predictions
sequences['layer_freeze_onoff_preds'] = np.reshape(on_preds, [len(seqs),])
# Reset the Keras backend state here too, matching the three earlier prediction
# cells; `model` is not used again in the remaining cells.
K.clear_session()

# Part 7. Look at results

In [35]:
# First five rows, now including the four prediction columns added above.
sequences.head(n=5)

Unnamed: 0.1,Unnamed: 0,Toehold ID,Toehold sequence,Switch region,Trigger,Avg ONOFF,sdev ONOFF,Toehold Rating,simple_onoff_original_kc_preds,green_onoff_preds,weight_initialization_onoff_preds,layer_freeze_onoff_preds
0,11,68,AATGTATGTAATAGTTCGTCGAGGTGTCCAAGCAGAGGAGATGGAC...,AATGTATGTAATAGTTCGTCGAGGTGTCCA,TGGACACCTCGACGAACTATTACATACATT,24.8,2.2,Bad,0.241781,0.018677,0.103607,0.04951
1,117,110,ATGATAATGTAGAGGTGCGGAGTGATTGTAAACAGAGGAGATACAA...,ATGATAATGTAGAGGTGCGGAGTGATTGTA,TACAATCACTCCGCACCTCTACATTATCAT,9.7,1.1,Bad,0.105242,0.060443,0.144504,0.092114
2,108,100,CGAAGTATTGTAAGGTGTAGTGTGCGTTGAGACAGAGGAGATCAAC...,CGAAGTATTGTAAGGTGTAGTGTGCGTTGA,TCAACGCACACTACACCTTACAATACTTCG,13.6,1.5,Bad,0.49488,-0.024859,0.099984,0.068302
3,122,116,TAAGTAAATGAAAGTGTATGTATGTTGCTGGACAGAGGAGACAGCA...,TAAGTAAATGAAAGTGTATGTATGTTGCTG,CAGCAACATACATACACTTTCATTTACTTA,8.7,1.1,Bad,0.404838,-0.003045,0.109763,0.033609
4,17,117,TCAATAAGGCGGAGTTCGTCGAGGTGCCTGAGCAGAGGAGACAGGC...,TCAATAAGGCGGAGTTCGTCGAGGTGCCTG,CAGGCACCTCGACGAACTCCGCCTTATTGA,8.5,1.4,Bad,0.695428,-0.005056,0.065643,0.02986


In [36]:
def r2(preds_y, true_y):
    """Return the squared Pearson correlation between ``preds_y`` and ``true_y``."""
    pearson_r, _p_value = pearsonr(preds_y, true_y)
    return pearson_r ** 2

def compute_metrics(preds_y, true_y):
    """Print and return correlation metrics between predictions and targets.

    Parameters
    ----------
    preds_y : 1-D array-like
        Predicted values.
    true_y : 1-D array-like
        Reference values, same length as ``preds_y``.

    Returns
    -------
    list
        ``[r2, pearson, spearman]``. ``r2`` is the squared Pearson correlation,
        not the regression coefficient of determination.
    """
    # Pearson is evaluated once and squared for R2 instead of calling
    # pearsonr a second time.
    pearson_corr = pearsonr(preds_y, true_y)[0]
    r2_score = pearson_corr ** 2
    spearman_corr = spearmanr(preds_y, true_y)[0]

    print('R2: ', r2_score)
    print('Pearson: ', pearson_corr)
    print('Spearman: ', spearman_corr)
    return [r2_score, pearson_corr, spearman_corr]

In [40]:
# Min-max scale the measured 'Avg ONOFF' column (MinMaxScaler defaults) and
# flatten the scaled column back into a 1-D vector.
onoff_vals = (
    preprocessing.MinMaxScaler()
    .fit_transform(np.array(sequences['Avg ONOFF']).reshape(-1, 1))
    .flatten()
)

# Evaluate each model's prediction column against the scaled measurements,
# in the same order as the models were loaded.
(onoff_original_metrics,
 green_metrics,
 tf_init_metrics,
 tf_weight_mettrics) = [
    compute_metrics(sequences[pred_col], onoff_vals)
    for pred_col in (
        'simple_onoff_original_kc_preds',
        'green_onoff_preds',
        'weight_initialization_onoff_preds',
        'layer_freeze_onoff_preds',
    )
]

R2:  0.00018705497816400357
Pearson:  -0.013676804384212109
Spearman:  -0.03460180027775038
R2:  0.7275583582304104
Pearson:  0.8529703149760901
Spearman:  0.6877042949764046
R2:  0.3606370845251086
Pearson:  0.6005306690961825
Spearman:  0.4979948599020488
R2:  0.594702915739662
Pearson:  0.7711698358595609
Spearman:  0.6386292916932552


In [38]:
# One row per model: the three metrics plus a label naming the model.
tf_matrix = pd.DataFrame(
    [onoff_original_metrics, green_metrics, tf_init_metrics, tf_weight_mettrics],
    columns=['r2', 'pearson', 'spearman'],
).assign(
    model=['onoff_original', 'just_green', 'tf_initialization', 'tf_freeze_weights'],
)
tf_matrix

Unnamed: 0,r2,pearson,spearman,model
0,0.000187,-0.013677,-0.034602,onoff_original
1,0.727558,0.85297,0.687704,just_green
2,0.360637,0.600531,0.497995,tf_initialization
3,0.594703,0.77117,0.638629,tf_freeze_weights


In [41]:
# Write the metrics table to disk with to_csv defaults (the row index is
# written as the first column).
out_dir = 'data/'
tf_matrix.to_csv(path_or_buf=out_dir + '4b_tf_learning_metrics.csv')