In [1]:
# import statements 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
%matplotlib inline

#import keras as keras
from tensorflow.python import keras
from keras.models import load_model

from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical

from keras import backend as K 
from scipy.stats import pearsonr, spearmanr 

Using TensorFlow backend.


# Part 1: Load in sequence data. 
## Change `sequence_file` here!

In [2]:
# Read the sensor table from the Excel workbook and preview its first rows.
data_dir = 'make_tf_learning_models/'
sequence_file = 'Pardee2016_clean.xlsx'
sequences = pd.read_excel(io=data_dir + sequence_file)
print(sequences.head(n=5))

  Sensor Name                                    Sensor sequence  Rank
0          1A  UCUUCAGCCUCCAUGUGUCAUUCUUCUCACUCUCAAGUUAUAGUUA...     6
1          2A  AAAUUCCCCUUGUUUCUUUUCUCUUUUUCCCAUCAUGUUAUAGUUA...    14
2          3A  UUUCGCUCUAUUCUCAUCAGUUUCAUGUCCUGUGUCGUUAUAGUUA...     8
3          4A  GCUCCCCUUCUACUGAUCUCCACAUGAUGUUUUCCAGUUAUAGUUA...     5
4          5A  AACUUCUUUAUUAUUUCCAUAGCCUCUUUUUUCCCCGUUAUAGUUA...    23


In [3]:
# Pull out the two columns used downstream: the sensor sequences and their rank.
seqs = sequences.loc[:, 'Sensor sequence']
ranks = sequences.loc[:, 'Rank']

In [4]:
# Trim off the extraneous bits: keep characters 18..76 (59 positions) of each
# full sensor sequence. The slice is taken from the untouched DataFrame column
# rather than from `seqs` itself, so re-running this cell gives the same result
# instead of trimming already-trimmed sequences a second time.
seqs = [x[18:77] for x in sequences['Sensor sequence']]

# Part 2. Transform Data. One-hot encode sequences and extract target on and off values.

In [5]:
from pysster.One_Hot_Encoder import One_Hot_Encoder

# Alphabet used for the encoding, as a string and as a list of single letters.
alph_letters = 'AUCG'
alph = [letter for letter in alph_letters]

# One-hot encoding setup: code adapted from Luis to get the correct format for
# TPOT with our nucleotide sequences. pysster is used because its encoder is
# very fast and simple.
one = One_Hot_Encoder(alph_letters)
def _get_one_hot_encoding(seq):
    """Return the encoding of ``seq`` produced by the module-level ``one`` encoder."""
    return one.encode(seq)

# Encode every trimmed sequence and stack the results into one float32 array.
X = np.stack(list(map(_get_one_hot_encoding, seqs))).astype(np.float32)
# Unpack the three dimensions of the stacked array.
nsamples, nx, ny = X.shape

print('input shape: ', X.shape)
# Reformat for the CNN: (number of sequences, sequence length, alphabet size).
alph_len = len(alph)
seq_len = len(seqs[0])
X = X.reshape((nsamples, seq_len, alph_len)).astype(np.float32)
print('modified shape: ', X.shape)

input shape:  (24, 59, 4)
modified shape:  (24, 59, 4)


In [6]:
#y = np.array(onoff_vals)

# reshape
#y = np.transpose(np.array([y]))
#print('target shape: ', y.shape)

# Part 3. Load in original KC onoff model. 

In [7]:
# Load the saved original KC on/off model, then load the weights stored in the
# companion weights file into it.
# NOTE(review): if the .h5 model file already contains weights, the separate
# load_weights call may be redundant - confirm.
model_dir = 'models/'
final_model_path = model_dir + 'onoff_original_model.h5'
final_weights_path = model_dir + 'onoff_original_model_weights.h5'
model = load_model(filepath=final_model_path)
model.load_weights(filepath=final_weights_path)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.











In [8]:
# Run the currently loaded model on the encoded sequences and store one
# prediction per sequence as a new column of the results table.
predictions = on_preds = model.predict(X)
sequences['simple_onoff_original_kc_preds'] = np.asarray(on_preds).reshape(len(seqs))
# Clear the Keras backend session before the next model is loaded.
K.clear_session()




# Part 4. Load in model from purely Green seqs

In [9]:
# Load the model trained only on the Green sequences, then load the weights
# stored in its companion weights file.
final_model_path = model_dir + 'only_green_trained_model.h5'
final_weights_path = model_dir + 'only_green_trained_model_weights.h5'
model = load_model(filepath=final_model_path)
model.load_weights(filepath=final_weights_path)

In [10]:
# Run the currently loaded model on the encoded sequences and store one
# prediction per sequence as a new column of the results table.
predictions = on_preds = model.predict(X)
sequences['green_onoff_preds'] = np.asarray(on_preds).reshape(len(seqs))
# Clear the Keras backend session before the next model is loaded.
K.clear_session()

# Part 5. Load in tf model with initialization

In [11]:
# Load the weight-initialization tf model, then load the weights stored in its
# companion weights file.
final_model_path = model_dir + 'initialization_weights_tf_onoff_model.h5'
final_weights_path = model_dir + 'initialization_weights_tf_onoff_model_weights.h5'
model = load_model(filepath=final_model_path)
model.load_weights(filepath=final_weights_path)

In [12]:
# Run the currently loaded model on the encoded sequences and store one
# prediction per sequence as a new column of the results table.
predictions = on_preds = model.predict(X)
sequences['weight_initialization_onoff_preds'] = np.asarray(on_preds).reshape(len(seqs))
# Clear the Keras backend session before the next model is loaded.
K.clear_session()

# Part 6. Load in tf model with weight freezing

In [13]:
# Load the weight-freezing tf model, then load the weights stored in its
# companion weights file.
final_model_path = model_dir + 'freeze_weights_tf_onoff_model.h5'
final_weights_path = model_dir + 'freeze_weights_tf_onoff_model_weights.h5'
model = load_model(filepath=final_model_path)
model.load_weights(filepath=final_weights_path)

In [14]:
# Run the currently loaded model on the encoded sequences and store one
# prediction per sequence as a new column of the results table.
predictions = on_preds = model.predict(X)
sequences['layer_freeze_onoff_preds'] = np.asarray(on_preds).reshape(len(seqs))

# Part 7. Look at results

In [15]:
# Preview the first five rows, now including the four prediction columns.
sequences.head(n=5)

Unnamed: 0,Sensor Name,Sensor sequence,Rank,simple_onoff_original_kc_preds,green_onoff_preds,weight_initialization_onoff_preds,layer_freeze_onoff_preds
0,1A,UCUUCAGCCUCCAUGUGUCAUUCUUCUCACUCUCAAGUUAUAGUUA...,6,0.423123,0.048257,0.142335,0.139655
1,2A,AAAUUCCCCUUGUUUCUUUUCUCUUUUUCCCAUCAUGUUAUAGUUA...,14,0.556059,0.218116,0.129653,0.05784
2,3A,UUUCGCUCUAUUCUCAUCAGUUUCAUGUCCUGUGUCGUUAUAGUUA...,8,0.401909,0.2036,0.151441,0.080758
3,4A,GCUCCCCUUCUACUGAUCUCCACAUGAUGUUUUCCAGUUAUAGUUA...,5,0.440283,0.187361,0.150465,0.082119
4,5A,AACUUCUUUAUUAUUUCCAUAGCCUCUUUUUUCCCCGUUAUAGUUA...,23,0.14528,0.133098,0.173248,0.036084


In [16]:
def r2(preds_y, true_y):
    """Return the squared Pearson correlation between predictions and targets.

    Args:
        preds_y: 1-D sequence of predicted values.
        true_y: 1-D sequence of target values, same length as ``preds_y``.

    Returns:
        The Pearson correlation coefficient of the two inputs, squared.
    """
    correlation = pearsonr(preds_y, true_y)[0]
    return correlation ** 2

def compute_metrics(preds_y, true_y):
    """Return the Spearman rank correlation between predictions and targets.

    The previous implementation ranked only ``preds_y`` and then took the
    Pearson correlation against the raw ``true_y``. That equals Spearman only
    when ``true_y`` already holds tie-free ranks, and it required ``preds_y``
    to be a pandas object with a ``.rank()`` method. Both inputs are now
    ranked, so any 1-D array-like works. Results are unchanged for the
    rank-valued targets used in this notebook.

    Args:
        preds_y: 1-D array-like of predicted values (Series, ndarray or list).
        true_y: 1-D array-like of target values, same length as ``preds_y``.

    Returns:
        The Spearman correlation coefficient as a float.
    """
    # spearmanr ranks both inputs (ties share their average rank) and
    # correlates the ranks; element 0 of the result is the coefficient.
    return spearmanr(preds_y, true_y)[0]

In [17]:
# Invert the published ranks (25 - rank) before comparing them with predictions.
# NOTE(review): presumably this makes a larger value mean a better sensor, and
# 25 is the number of sensors (24) plus one - confirm.
# The inversion reads the untouched 'Rank' column rather than `ranks` itself, so
# re-running this cell does not flip the values back to the original ranks.
ranks = [(25 - x) for x in sequences['Rank']]
onoff_vals = np.array(ranks)

# Rank correlation of each model's predictions against the inverted ranks.
onoff_original_metrics = compute_metrics(sequences['simple_onoff_original_kc_preds'], onoff_vals)
print(onoff_original_metrics)

green_metrics = compute_metrics(sequences['green_onoff_preds'], onoff_vals)
print(green_metrics)

tf_init_metrics = compute_metrics(sequences['weight_initialization_onoff_preds'], onoff_vals)
print(tf_init_metrics)

# The (misspelled) name is kept because the summary-table cell refers to it.
tf_weight_mettrics = compute_metrics(sequences['layer_freeze_onoff_preds'], onoff_vals)
print(tf_weight_mettrics)


-0.01826086956521739
-0.0608695652173913
0.021739130434782605
0.20173913043478262


In [18]:
# Collect the four correlation values into a summary table, one row per model.
tf_matrix = pd.DataFrame(
    [onoff_original_metrics, green_metrics, tf_init_metrics, tf_weight_mettrics],
    columns=['spearman'],
)
tf_matrix['model'] = ['onoff_original', 'just_green', 'tf_initialization', 'tf_freeze_weights']
tf_matrix

Unnamed: 0,spearman,model
0,-0.018261,onoff_original
1,-0.06087,just_green
2,0.021739,tf_initialization
3,0.201739,tf_freeze_weights


In [19]:
# Write the summary table to a CSV file in the output directory.
out_dir = 'cleaned_csvs/'
tf_matrix.to_csv(path_or_buf=out_dir + '4b_zika_tf_learning_metrics.csv')

# Part 8: Make sure no overlap between these zika seqs and the ones used by the model

In [20]:
# Load the original data and keep its switch sequences as a plain list.
# Note: `data_dir` is re-pointed here to the shared data folder.
data_dir = '../../data/'
file_name = 'newQC_toehold_data.csv'
data_df = pd.read_csv(data_dir + file_name)
toeholds = data_df['switch_sequence'].tolist()

In [21]:
# Report, for every trimmed sequence, whether the identical string appears in
# the original model data.
# NOTE(review): this is an exact string match. `seqs` use the 'AUCG' alphabet
# and are trimmed to 59 characters; if `switch_sequence` is stored with a
# different alphabet (e.g. T instead of U) or length, this check can never be
# True - confirm the two formats agree.
for seq in seqs:
    test = (seq in toeholds)
    print('seq is in original model data: ' + str(test))

seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
seq is in original model data: False
