# Goal: Optimize the toehold sequences with gradient ascent to improve the ON/OFF ratio.

### Instructions: Please change the file_name in the second code block to the file containing the sequences you are interested in redesigning. The file should be a .csv with at least three columns: a switch_sequence column with the original DNA sequence of the toehold; an on_value column with the ON value of the switch (can be predicted if in silico); and an off_value column with the OFF value of the switch (again, can be predicted).

In [1]:
# import statements 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
%matplotlib inline

import keras as keras
from keras.models import load_model
from keras.regularizers import l2

from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical

import isolearn.keras as iso
from seqprop import *
#import seqprop.visualization
from seqprop.generator import *
from seqprop.predictor import *
from seqprop.optimizer import *


Using TensorFlow backend.


# Part 1: Load in sequence data. 
## Change file_name here!

In [2]:
# Input table: a .csv file holding the toehold sequences to redesign
data_dir = 'gradient_ascent_sequences/'
file_name = 'worst_toehold_sequences.csv' # CHANGE FILENAME!
data_df = pd.read_csv(filepath_or_buffer=''.join([data_dir, file_name]), sep=',')
# show the first rows as a quick sanity check of the loaded columns
data_df.head(3)

Unnamed: 0,switch_sequence,on_value,off_value,onoff_value,on_preds,off_preds
0,ACAAAAAAACAATAAAAAATAGAGAAAAAGAACAGAGGAGACTTTT...,0.42827,0.818291,-0.390021,0.521815,0.815901
1,ATAAACAAAATGGATATTATAGACAAAAAAAACAGAGGAGATTTTT...,0.570486,0.934635,-0.36415,0.70009,0.864703
2,GATGTTACAAACGATAATATAGACAAAAATAACAGAGGAGAATTTT...,0.64221,1.0,-0.35779,0.718297,0.850942


In [3]:
# Pull out the sequence column and report the basic dimensions of the data set.
toehold_seqs = data_df['switch_sequence']
# Positional lookup: the label-based `toehold_seqs[0]` raises KeyError whenever
# the frame's index has no label 0 (e.g. after filtering or re-indexing).
seq_len = len(toehold_seqs.iloc[0])
print('Toehold length: ', seq_len)
num_seqs = len(data_df)
print('Number of sequences: ', num_seqs)

Toehold length:  59
Number of sequences:  100


# Part 2: Extract toeholds to optimize.
### Note: Optimizing 100 sequences takes ~2 hours (depending on compute power), so we simplify to just the first 10 sequences here.

In [4]:
# Keep only the first 10 switches so the optimization finishes in reasonable time.
# .copy() makes data_df an independent frame rather than a slice of the loaded
# table, so adding result columns to it later does not raise pandas'
# SettingWithCopyWarning.
data_df = data_df.iloc[0:10].copy()
toehold_seqs = data_df['switch_sequence']
print('Number of sequences: ', len(toehold_seqs))

Number of sequences:  10


# Part 3. Transform Data. One-hot encode sequences and extract target on and off values.

In [5]:
# DNA alphabet in sorted order (A, C, G, T). For RNA toeholds, change the
# string on the next line to 'AUCG'.
alph_letters = sorted('ATCG')
alph = list(alph_letters)

# pysster's encoder performs the actual one-hot conversion
one = One_Hot_Encoder(alph_letters)
def _get_one_hot_encoding(seq):
    """Return the pysster one-hot encoding of a single sequence string."""
    return one.encode(seq)

# encode every toehold and stack the encodings into one float32 array
input_col_name = 'switch_sequence'
X = np.stack(list(map(_get_one_hot_encoding, toehold_seqs))).astype(np.float32)
print('input shape: ', X.shape)

# targets: one row per sequence, columns are [on_value, off_value]
y_on = np.asarray(data_df['on_value'], dtype=np.float32)
y_off = np.asarray(data_df['off_value'], dtype=np.float32)
y = np.stack([y_on, y_off], axis=1)
print('target shape: ', y.shape)

input shape:  (10, 59, 4)
target shape:  (10, 2)


# Part 4. Load in final model. 

In [6]:
# The trained predictor is stored as a full model file plus a separate weights
# file; load the model first, then load the weights file into it.
model_dir = 'trained_model/'
final_model_path = '{}final_trained_model.h5'.format(model_dir)
final_weights_path = '{}final_trained_model_weights.h5'.format(model_dir)
model = load_model(final_model_path)
model.load_weights(final_weights_path)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [7]:
# Visually inspect the architecture: print the layer-by-layer summary of the
# loaded model (layer names, output shapes, parameter counts).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 59, 4)        0                                            
__________________________________________________________________________________________________
conv_0 (Conv1D)                 (None, 59, 10)       210         input_2[0][0]                    
__________________________________________________________________________________________________
conv_1 (Conv1D)                 (None, 59, 5)        155         conv_0[0][0]                     
__________________________________________________________________________________________________
flatten_2 (Flatten)             (None, 295)          0           conv_1[0][0]                     
__________________________________________________________________________________________________
dropout_4 

# Part 5. Build model specific for seqprop.

In [8]:
# adapted from: https://github.com/876lkj/seqprop 

# need to re-create EXACT SAME layers as final trained model
# fix weights of layers so only input layer is modified
def load_saved_predictor(model_path) :
    """Create a predictor-building function for seqprop from a saved Keras model.

    The saved model is loaded once. The returned function rebuilds the same
    layer stack on top of a seqprop sequence tensor, and the accompanying
    initializer copies the saved weights into it and freezes them.

    Parameters
    ----------
    model_path : str
        Path of the saved model file, passed to `load_model`.

    Returns
    -------
    callable
        `_load_predictor_func(sequence_input)`, returning the tuple
        `(predictor_inputs, predictor_outputs, _initialize_predictor_weights)`.
    """

    saved_model = load_model(model_path)

    # Every weight-carrying layer; the names must exist in both the saved model
    # and the rebuilt predictor.
    weight_layer_names = ('conv_0', 'conv_1', 'dense_0', 'dense_1', 'dense_2',
                          'on_output', 'off_output')

    def _initialize_predictor_weights(predictor_model, saved_model=saved_model) :
        """Copy the pre-trained weights into `predictor_model` and freeze them."""
        for layer_name in weight_layer_names:
            layer = predictor_model.get_layer(layer_name)
            layer.set_weights(saved_model.get_layer(layer_name).get_weights())
            # frozen so that only the input sequence is modified
            layer.trainable = False

    def _load_predictor_func(sequence_input) :
        """Rebuild the trained architecture on top of `sequence_input`."""
        # Input space parameters. These are fixed by the trained model, so they
        # are defined here instead of being read from the notebook-level
        # `seq_len` global.
        seq_length = 59
        num_letters = 4 # num nt

        # model definition (same architecture as the trained model except for
        # the modified input)
        dropout_rate = 0.1
        reg_coeff = 0.0001
        hidden_layer_choices = {5: (150, 60, 15), 10: (300, 100, 30), 15: (400,150, 30),}
        conv_layer_parameters = [(5,10), (3,5),] # (kernel_width, num_filters)
        hidden_layers = hidden_layer_choices[5]

        # The original author notes that seqprop is built for 2D inputs (an
        # expanded shape with a trailing axis of size 1); reshape to
        # (seq_length, num_letters) before the 1D convolutions.
        reshaped_input = Reshape(target_shape=(seq_length, num_letters), name='reshaped_input')(sequence_input)
        prior_layer = reshaped_input
        for idx, (kernel_width, num_filters) in enumerate(conv_layer_parameters):
            # each convolution mimics a k-mer scan
            prior_layer = Conv1D(filters=num_filters, kernel_size=kernel_width, padding='same', name='conv_'+str(idx))(prior_layer)
        H = Flatten(name='flatten')(prior_layer)
        for idx, h in enumerate(hidden_layers):
            H = Dropout(dropout_rate, name='dropout_'+str(idx))(H)
            H = Dense(h, activation='relu', kernel_regularizer=l2(reg_coeff), name='dense_'+str(idx))(H)
        out_on = Dense(1, activation="linear", name='on_output')(H)
        out_off = Dense(1, activation='linear', name='off_output')(H)
        # single output tensor with the ON value first and the OFF value second
        on_off_out = Concatenate(name='on_of_output')([out_on, out_off])

        predictor_inputs = []
        predictor_outputs = [on_off_out]

        return predictor_inputs, predictor_outputs, _initialize_predictor_weights

    return _load_predictor_func

# Part 6. Set-up gradient ascent workflow. Convert to callable function.


In [9]:
# Constants for the optimization

# number of sequences handled per optimization run
num_samples = 1

# Template specifying which positions to modify and which to keep (biological
# constraints): the RBS and the start codon are spelled out, every other
# position is 'N'.
switch = 'N' * 30
rbs = 'AACAGAGGAGA'
start_codon = 'ATG'
stem1 = 'N' * 6   # alternative noted by the author: 'XXXXXX'
stem2 = 'N' * 9   # alternative noted by the author: 'XXXXXXXXX'

bio_constraints = ''.join([switch, rbs, stem1, start_codon, stem2])

# target [on, off] values the optimized sequence should be predicted to have
target_on = 0.99
target_off = 0.001
target = [[target_on, target_off]]

In [10]:
# build loss function
# ensure biological constraints are satisfied per sequence

def stem_base_pairing(pwm): 
    """Penalty for switch positions that are not paired with their stem partner.

    For every (switch position, stem position) pair, the one-hot vector at the
    switch position is multiplied with the complemented vector at the stem
    position, which contributes 1 when the two bases are complementary. The
    total is subtracted from the number of pairs (6 for stem 1, 9 for stem 2).

    Parameters
    ----------
    pwm : tensor
        Indexed as pwm[:, position, :, 0]; axis 2 is the nucleotide axis.
        Assumes the channel order A, C, G, T (the notebook's sorted alphabet) —
        TODO confirm that seqprop uses the same order.

    Returns
    -------
    tensor
        Scalar 10 * stem1_score + 10 * stem2_score (summed over the batch).
    """
    # With the order A, C, G, T, reversing the nucleotide axis swaps A<->T and
    # C<->G, i.e. maps every base onto its complement.
    complemented = K.reverse(pwm, axes=2)

    def _paired_mass(switch_start, stem_end, n_pairs):
        # A hairpin pairs antiparallel (reverse complement): switch position
        # switch_start + i faces stem position stem_end - i. The previous
        # implementation paired the positions in parallel order (24 with 41,
        # 25 with 42, ...), which rewards the plain complement instead.
        matched = pwm[:, switch_start, :, 0] * complemented[:, stem_end, :, 0]
        for offset in range(1, n_pairs):
            matched = matched + pwm[:, switch_start + offset, :, 0] * complemented[:, stem_end - offset, :, 0]
        return K.sum(matched)

    # stem 1: switch 24..29 against stem positions 46..41 (the example rows of
    # the input table show positions 41+ as the reverse complement of 24..29)
    stem1_score = 6 - _paired_mass(24, 46, 6)
    # stem 2: switch 12..20 against stem positions 58..50
    # NOTE(review): derived from the same hairpin geometry; the example rows
    # are truncated before position 50, so confirm against full sequences.
    stem2_score = 9 - _paired_mass(12, 58, 9)
    return 10*stem1_score + 10*stem2_score

def loss_func(predictor_outputs) :
    """Loss for the sequence optimization.

    Combines the squared error between the predicted and the target [on, off]
    values with the base-pairing penalty from `stem_base_pairing`.

    Parameters
    ----------
    predictor_outputs : sequence
        Unpacked as (pwm_logits, pwm, sampled_pwm, predicted_out).

    Returns
    -------
    tensor
        Mean over the last axis of (target cost + base-pairing cost).
    """
    pwm_logits, pwm, sampled_pwm, predicted_out = predictor_outputs

    # Repeat the notebook-level `target` once per row of sampled_pwm so that it
    # lines up with the predictions; the prediction for the modified input
    # should end up close to the target.
    target_out = K.tile(K.constant(target), (K.shape(sampled_pwm)[0], 1))
    target_cost = (target_out - predicted_out)**2
    # biological constraint: switch and stem regions must stay paired
    base_pairing_cost = stem_base_pairing(sampled_pwm)
    return K.mean(target_cost + base_pairing_cost, axis=-1)

In [11]:
def run_gradient_ascent(input_toehold_seq, original_out, num_epochs=50, steps_per_epoch=1000, learning_rate=0.001):
    """Optimize one toehold sequence with seqprop and return the result.

    Parameters
    ----------
    input_toehold_seq : str
        Sequence used to initialise the generator.
    original_out
        Original [on, off] values of the sequence; only printed for comparison.
    num_epochs : int, optional
        Maximum number of epochs passed to `fit` (default 50).
    steps_per_epoch : int, optional
        Number of optimizer steps per epoch (default 1000).
    learning_rate : float, optional
        Learning rate of the Adam optimizer (default 0.001).

    Returns
    -------
    tuple
        (optimized_pwm, optimized_onehot, predicted_out) as returned by the
        seqprop predictor's `predict`.
    """
    # build generator network, seeded with the input sequence and restricted by
    # the notebook-level template `bio_constraints`
    _, seqprop_generator = build_generator(seq_length=seq_len, n_sequences=num_samples, batch_normalize_pwm=True,
                                           init_sequences=[input_toehold_seq],
                                           sequence_templates=bio_constraints)

    # build predictor network and hook it on the generator PWM output tensor
    _, seqprop_predictor = build_predictor(seqprop_generator, load_saved_predictor(final_model_path), n_sequences=num_samples, eval_mode='pwm')

    # build loss model (In: Generator seed, Out: Loss function)
    # NOTE(review): the recorded run of this notebook stopped here with
    # "NameError: name 'build_loss_model' is not defined" — confirm that the
    # installed seqprop exports this name through the wildcard imports.
    _, loss_model = build_loss_model(seqprop_predictor, loss_func, )

    # specify optimizer to use
    opt = keras.optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999)

    # compile loss model; the model output already is the loss, so the Keras
    # loss simply passes the prediction through
    loss_model.compile(loss=lambda true, pred: pred, optimizer=opt)

    # stop early once the loss has not improved by at least 0.001 for 5 epochs
    callbacks = [
        EarlyStopping(monitor='loss', min_delta=0.001, patience=5, verbose=0, mode='auto'),
    ]

    # fit loss model
    train_history = loss_model.fit([], np.ones((1, 1)), epochs=num_epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks)

    # retrieve optimized PWMs and predicted (optimized) target
    _, optimized_pwm, optimized_onehot, predicted_out = seqprop_predictor.predict(x=None, steps=1)
    print('Original [on, off]:', original_out)
    print('Predicted [on, off]: ', predicted_out)

    return optimized_pwm, optimized_onehot, predicted_out

# Part 7. Run gradient ascent on the specified seed inputs. 

In [12]:
def invert_onehot(oh_seq): 
    """Convert a one-hot (or probability) matrix back into a sequence string.

    For every row of `oh_seq`, the column with the largest value is looked up
    in the notebook-level alphabet `alph`, and the letters are joined.
    """
    best_columns = np.argmax(oh_seq, axis=1)
    letters = [alph[column] for column in best_columns]
    return ''.join(letters)

In [13]:
optimized_pwms = [] # store the probabilities
optimized_seqs = [] # store the converted sequences to be tested 
predicted_targets = [] # store the predicted target values 

# Shape of one sequence matrix, taken from the data and the alphabet instead of
# a hard-coded [59, 4], so it stays consistent with seq_len.
pwm_shape = (seq_len, len(alph))

# optimize every seed sequence in turn, paired with its original [on, off] values
for toehold_seq, original_out in zip(toehold_seqs, y): 
    optimized_pwm, optimized_onehot, predicted_out = run_gradient_ascent(toehold_seq, original_out)
    optimized_pwms.append(np.reshape(optimized_pwm, pwm_shape))
    predicted_targets.append(predicted_out)
    new_seq = invert_onehot(np.reshape(optimized_onehot, pwm_shape))
    optimized_seqs.append(new_seq)

Instructions for updating:
Use tf.random.categorical instead.


NameError: name 'build_loss_model' is not defined

# Part 8. Save modified toeholds.

In [None]:
# Attach the optimization results as new columns. assign() builds a new frame
# instead of writing into data_df in place; data_df was obtained by slicing the
# loaded table, where in-place column writes raise SettingWithCopyWarning.
data_df = data_df.assign(
    new_switch=optimized_seqs,
    predicted_onoff=predicted_targets,
    optimized_pwm=optimized_pwms,
)

In [None]:
# Write the table with the redesigned switches next to the input data.
# NOTE(review): the row index is written as well, so it will presumably come
# back as an 'Unnamed: 0' column when the file is read again — confirm whether
# index=False is wanted here.
data_df.to_csv(data_dir + 'optimized_toeholds_gradascent.csv')

In [None]:
# list the installed package versions of this environment for reproducibility
pip freeze