### Generate the sequences to be tested experimentally. Based off the 4 'predicted bad' sequences.

In [1]:
# import statements 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
%matplotlib inline

import keras as keras
from keras.models import load_model
from keras.regularizers import l2

from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical

import isolearn.keras as iso
from seqprop import *
from seqprop.generator import *
from seqprop.predictor import *
from seqprop.optimizer import *


Using TensorFlow backend.


# Part 1: Load in sequence data. 

In [39]:
# manually input data since there are only four data points
toehold_seqs = ['TGTATAAACCCACAAATGTAAGTGAAAAAAAACAGAGGAGATTTTTTATGTTACATTTG', 'AATGTCCACACCCAAATTATTGAGTATTTTAACAGAGGAGAAAAATAATGAATAATTTG', 'GTTGTTTAATCCTTTAATAAAGTATAAATAAACAGAGGAGATATTTAATGTTTATTAAA', 'ATCAAAGTGTCCCTTATTTACAACATTAAAAACAGAGGAGATTTAATATGGTAAATAAG']
storm_pred_onoff_vals = [-0.0041365, -0.0105771, -0.0318257, 0.00174482]
seq_len = len(toehold_seqs[0])
print('Toehold length: ', seq_len)
num_seqs = len(toehold_seqs)
print('Number of sequences: ', num_seqs)

Toehold length:  59
Number of sequences:  4


# Part 2. Transform Data. One-hot encode sequences and extact target on and off values.

In [41]:
# create DNA alphabet- may need to change if you have RNA toeholds. Just change to 'AUCG' in the first line
alph_letters = sorted('ACGT')
alph = list(alph_letters)

# one-hot encode with pysster (very fast and simple encoding)  
one = One_Hot_Encoder(alph_letters)
def _get_one_hot_encoding(seq):
    one_hot_seq = one.encode(seq)                         
    return one_hot_seq

# now convert the data into one_hot_encoding 
input_col_name = 'switch_sequence'
X = np.stack([_get_one_hot_encoding(s) for s in toehold_seqs]).astype(np.float32)
print('input shape: ', X.shape)

# now set y as the on and off values
y = np.array(storm_pred_onoff_vals).astype(np.float32)
print('target shape: ', y.shape)

input shape:  (4, 59, 4)
target shape:  (4,)


# Part 3. Load in final model. 

In [42]:
model_dir = ''
final_model_path = model_dir + 'freeze_weights_tf_onoff_model.h5'
final_weights_path = model_dir + 'freeze_weights_tf_onoff_model_weights.h5'
model = load_model(final_model_path)
model.load_weights(final_weights_path)

In [43]:
# visually inspect architecture
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_21 (InputLayer)        (None, 59, 4)             0         
_________________________________________________________________
conv_0 (Conv1D)              (None, 59, 10)            210       
_________________________________________________________________
conv_1 (Conv1D)              (None, 59, 5)             155       
_________________________________________________________________
flatten_21 (Flatten)         (None, 295)               0         
_________________________________________________________________
dropout_61 (Dropout)         (None, 295)               0         
_________________________________________________________________
dense_0 (Dense)              (None, 150)               44400     
_________________________________________________________________
dropout_62 (Dropout)         (None, 150)               0         
__________

# Part 4. Build model specific for seqprop.

In [44]:
# adapted from: https://github.com/876lkj/seqprop 

# need to re-create EXACT SAME layers as final trained model
# fix weights of layers so only input layer is modified
def load_saved_predictor(model_path) :

    saved_model = load_model(model_path)

    def _initialize_predictor_weights(predictor_model, saved_model=saved_model) :
        #Load pre-trained model
        predictor_model.get_layer('conv_0').set_weights(saved_model.get_layer('conv_0').get_weights())
        predictor_model.get_layer('conv_0').trainable = False

        predictor_model.get_layer('conv_1').set_weights(saved_model.get_layer('conv_1').get_weights())
        predictor_model.get_layer('conv_1').trainable = False

        predictor_model.get_layer('dense_0').set_weights(saved_model.get_layer('dense_0').get_weights())
        predictor_model.get_layer('dense_0').trainable = False

        predictor_model.get_layer('dense_1').set_weights(saved_model.get_layer('dense_1').get_weights())
        predictor_model.get_layer('dense_1').trainable = False

        predictor_model.get_layer('dense_2').set_weights(saved_model.get_layer('dense_2').get_weights())
        predictor_model.get_layer('dense_2').trainable = False

        predictor_model.get_layer('on_output').set_weights(saved_model.get_layer('on_output').get_weights())
        predictor_model.get_layer('on_output').trainable = False

    def _load_predictor_func(sequence_input) :
        # input space parameters 
        seq_length = 59
        num_letters = 4 # num nt 
        # expanded version b/c seqprop built for 2d 
        seq_input_shape = (seq_len, num_letters, 1) # modified

        #define new model definition (same architecture except modified input)
        dropout_rate=0.1
        reg_coeff= 0.0001
        hidden_layer_choices = {5: (150, 60, 15),}
        conv_layer_parameters = [(5,10), (3,5),]
        hidden_layers = hidden_layer_choices[5]
        
        reshaped_input = Reshape(target_shape=(seq_len, num_letters),name='reshaped_input')(sequence_input)
        prior_layer = reshaped_input 
        for idx, (kernel_width, num_filters) in enumerate(conv_layer_parameters):
            conv_layer = Conv1D(filters=num_filters, kernel_size=kernel_width, padding='same', name='conv_'+str(idx))(prior_layer) # mimic a kmer
            prior_layer = conv_layer
        H = Flatten(name='flatten')(prior_layer)
        for idx,h in enumerate(hidden_layers): 
            H = Dropout(dropout_rate, name='dropout_'+str(idx))(H)
            H = Dense(h, activation='relu', kernel_regularizer=l2(reg_coeff), name='dense_'+str(idx))(H)
        out_onoff = Dense(1,activation="linear",name='on_output')(H)
        
        predictor_inputs = []
        predictor_outputs = [out_onoff]

        return predictor_inputs, predictor_outputs, _initialize_predictor_weights

    return _load_predictor_func

# Part 6. Set-up gradient ascent workflow. Convert to callable function.


In [45]:
# define constants 

# get seed input which we will modify 
num_samples = 1

# template specifying what to modify and what not (biological constaints)
switch = 'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'
rbs = 'AACAGAGGAGA'
start_codon = 'ATG'
stem1 = 'NNNNNN'#'XXXXXX'
stem2 = 'NNNNNNNNN'#'XXXXXXXXX'

bio_constraints = switch + rbs + stem1 + start_codon + stem2 

# define target on/off values 
target_onoff = 1
target = [[target_onoff], ] # keep in this format in case you want to adapt for separate on and off predictions

In [46]:
# build loss function
# ensure biological constraints are satisfied per sequence

def stem_base_pairing(pwm): 
    # ensure that location of 1s in switch region matches reverse complement of stem
    
    def reverse_complement(base_index): 
        # ACGT = alphabett
        if base_index == 0: return 3
        elif base_index == 1: return 2 
        elif base_index == 2: return 1 
        elif base_index == 3: return 0
    
    # reverse complement is reverse over axis of one-hot encoded nt 
    nt_reversed = K.reverse(pwm, axes=2)
    stem1_score = 6 - K.sum(pwm[:, 24, :, 0]*nt_reversed[:, 41,:, 0] + pwm[:, 25, :, 0]*nt_reversed[:, 42, :, 0]+ pwm[:,26, :, 0]*nt_reversed[:, 43, :, 0] + pwm[:, 27, :, 0]*nt_reversed[:, 44, :, 0] + pwm[:, 28, :, 0]*nt_reversed[:, 45, :, 0]+ pwm[:, 29, :, 0]*nt_reversed[:, 46, :, 0])
    stem2_score = 9 - K.sum(pwm[:, 12, :, 0]*nt_reversed[:, 50, :, 0] + pwm[:, 13, :, 0]*nt_reversed[:, 51, :, 0]+ pwm[:, 14, :, 0]*nt_reversed[:, 52, :, 0]+ pwm[:, 15, :, 0]*nt_reversed[:, 53, :, 0] + pwm[:, 16, :, 0]*nt_reversed[:, 54, :, 0] + pwm[:, 17, :, 0]*nt_reversed[:,55, :, 0]+ pwm[:, 18,:, 0]*nt_reversed[:, 56, :, 0] + pwm[:, 19, :, 0]*nt_reversed[:,57, :, 0] + pwm[:, 20, :, 0]*nt_reversed[:, 58, :, 0])
    return 10*stem1_score + 10*stem2_score

def loss_func(predictor_outputs) :
    pwm_logits, pwm, sampled_pwm, predicted_out = predictor_outputs
  
    #Create target constant -- want predicted value for modified input to be close to target input 
    target_out = K.tile(K.constant(target), (K.shape(sampled_pwm)[0], 1))
    target_cost = (target_out - predicted_out)**2
    print(target_out, target_cost, predicted_out)
    base_pairing_cost = stem_base_pairing(sampled_pwm)
    print(base_pairing_cost)
    print(K.mean(target_cost + base_pairing_cost, axis=-1))
    
    ## use this return statement to include the basepairing cost
    #return K.mean(target_cost + base_pairing_cost, axis=-1)
    
    ## use this return statement to ignore the basepairing cost
    # modifying so we don't have the base pairing constraint- will make sure complementary after
    return K.mean(target_cost, axis=-1)

In [47]:
def run_gradient_ascent(input_toehold_seq, original_out):

    # build generator network
    _, seqprop_generator = build_generator(seq_length=seq_len, n_sequences=num_samples, batch_normalize_pwm=True,init_sequences = [input_toehold_seq],
                                          sequence_templates=bio_constraints)# batch_normalize_pwm=True)
    
    # build predictor network and hook it on the generator PWM output tensor
    _, seqprop_predictor = build_predictor(seqprop_generator, load_saved_predictor(final_model_path), n_sequences=num_samples, eval_mode='pwm')

    #Build Loss Model (In: Generator seed, Out: Loss function)
    _, loss_model = build_loss_model(seqprop_predictor, loss_func, )

    #Specify Optimizer to use
    opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999)

    #Compile Loss Model (Minimize self)
    loss_model.compile(loss=lambda true, pred: pred, optimizer=opt)

    #Fit Loss Model
    #seed_input = np.reshape([X[0]], [1,59,4,1]) # any input toehold to be modified

    callbacks =[
                EarlyStopping(monitor='loss', min_delta=0.001, patience=5, verbose=0, mode='auto'),
                #SeqPropMonitor(predictor=seqprop_predictor)#, plot_every_epoch=True, track_every_step=True, )#cse_start_pos=70, isoform_start=target_cut, isoform_end=target_cut+1, pwm_start=70-40, pwm_end=76+50, sequence_template=sequence_template, plot_pwm_indices=[0])
            ]

    num_epochs=50
    train_history = loss_model.fit([], np.ones((1, 1)), epochs=num_epochs, steps_per_epoch=1000, callbacks=callbacks)

    #Retrieve optimized PWMs and predicted (optimized) target
    _, optimized_pwm, optimized_onehot, predicted_out = seqprop_predictor.predict(x=None, steps=1)
    print('Original ON/OFF:', original_out)
    print('Predicted ON/OFF: ', predicted_out)
    
    return optimized_pwm, optimized_onehot, predicted_out

# Part 7. Run gradient ascent on the specified seed inputs. 

In [48]:
def invert_onehot(oh_seq): 
    return ''.join(alph[idx] for idx in np.argmax(oh_seq,axis=1))

In [49]:
optimized_pwms = [] # store the probabilities
optimized_seqs = [] # store the converted sequences to be tested 
predicted_targets = [] # store the original and predicted target values 

# run 5 optimization rounds for each sequence- part of STORM algorithm
num_of_optimization_rounds = 5
for i in range(0, num_of_optimization_rounds):
    for idx, (toehold_seq, original_out) in enumerate(zip(toehold_seqs, y)): 
        optimized_pwm, optimized_onehot, predicted_out = run_gradient_ascent(toehold_seq, original_out)
        optimized_pwms.append(np.reshape(optimized_pwm, [59, 4]))
        predicted_targets.append(predicted_out)
        new_seq = invert_onehot(np.reshape(optimized_onehot, [59,4]))
        optimized_seqs.append(new_seq)

Tensor("lambda_5/Tile:0", shape=(1, 1), dtype=float32) Tensor("lambda_5/pow:0", shape=(1, 1), dtype=float32) Tensor("on_output_12/BiasAdd:0", shape=(1, 1), dtype=float32)
Tensor("lambda_5/add_13:0", shape=(), dtype=float32)
Tensor("lambda_5/Mean:0", shape=(1,), dtype=float32)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Original ON/OFF: -0.0041365
Predicted ON/OFF:  [[0.61054814]]
Tensor("lambda_6/Tile:0", shape=(1, 1), dtype=float32) Tensor("lambda_6/pow:0", shape=(1, 1), dtype=float32) Tensor("on_output_14/BiasAdd:0", shape=(1, 1), dtype=float32)
Tensor("lambda_6/add_13:0", shape=(), dtype=float32)
Tensor("lambda_6/Mean:0", shape=(1,), dtype=float32)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Original ON/OFF: -0.0105771
Predicted ON/OFF:  [[0.9805699]]
Tensor("lambda_7/Tile:0", shape=(1, 1), dtype=float32) Tensor("la

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Original ON/OFF: -0.0105771
Predicted ON/OFF:  [[0.97903997]]
Tensor("lambda_11/Tile:0", shape=(1, 1), dtype=float32) Tensor("lambda_11/pow:0", shape=(1, 1), dtype=float32) Tensor("on_output_24/BiasAdd:0", shape=(1, 1), dtype=float32)
Tensor("lambda_11/add_13:0", shape=(), dtype=float32)
Tensor("lambda_11/Mean:0", shape=(1,), dtype=float32)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Original ON/OFF: -0.0318257
Predicted ON/OFF:  [[0.66565937]]
Tensor("lambda_12/Tile:0", shape=(1, 1), dtype=float32) Tensor("lambda_12/pow:0", shape=(1, 1), dtype=float32) Tensor("on_output_26/BiasAdd:0", s

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Original ON/OFF: 0.00174482
Predicted ON/OFF:  [[0.9804386]]
Tensor("lambda_21/Tile:0", shape=(1, 1), dtype=float32) Tensor("lambda_21/pow:0", shape=(1, 1), dtype=float32) Tensor("on_output_44/BiasAdd:0", shape=(1, 1), dtype=float32)
Tensor("lambda_21/add_13:0", shape=(), dtype=float32)
Tensor("lambda_21/Mean:0", shape=(1,), dtype=float32)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Original ON/OFF: -0.0041365
Predicted ON/OFF:  [[0.6102339]]
Tensor("lambda_22/Tile:0", shape=(1, 1), dtype=float32) Tensor("lambda_22/pow:0", shape=(1, 1), dtype=float32) Tensor("on_output_46/BiasAdd:0", shape=(1, 1), dtype=float32)
Tensor("lambda_22/add_13:0", shape=(), dtype=float32)
Tensor("lambda_22/Mean:0", s

# Part 8. Change toeholds to adhere to basepairing and toehold structure- post processing

In [53]:
data_df = pd.DataFrame()
data_df['old_switches'] = toehold_seqs*num_of_optimization_rounds
data_df['old_onoff'] = storm_pred_onoff_vals*num_of_optimization_rounds
data_df['new_switch'] = optimized_seqs
data_df['predicted_onoff'] = predicted_targets
data_df['optimized_pwm'] = optimized_pwms

In [55]:
rbs = 'AACAGAGGAGA'
start_codon = 'ATG'

# Make function to generate reverse compliment of the DNA strand
def make_rev_complement(string):
    new_str = ''
    for s in string:
        char = ''
        if s == 'A':
            char = 'T'
        elif s == 'T':
            char = 'A'
        elif s == 'C':
            char = 'G'
        elif s == 'G':
            char = 'C'
        else:
            print('UH OH! Character not A, T, C, or G')
        new_str += char
    new_str = new_str[::-1]
    return new_str

# Make function to check for stop codons
def check_for_stop(toehold): 
    stop_codons = ['TAG', 'TAA', 'TGA']
    location_of_start = 47
    search1 = toehold.find(stop_codons[0]) == location_of_start
    search2 = toehold.find(stop_codons[1]) == location_of_start
    search3 = toehold.find(stop_codons[2]) == location_of_start
    return (search1 | search2  | search3)

# Make function to actually turn trigger into toehold
def turn_switch_to_toehold(switch):
    stem1 = make_rev_complement(switch[24:30])
    stem2 = make_rev_complement(switch[12:21])
    toehold = switch + rbs + stem1 + start_codon + stem2
    return toehold

In [56]:
# check rev comp
def check_rev_comp(full_59nt):
    stem1 = make_rev_complement(full_59nt[24:30])
    stem2 = make_rev_complement(full_59nt[12:21])
    stem1_comp = full_59nt[41:47]
    stem2_comp = full_59nt[50:59]
    
    return((stem1 == stem1_comp) and (stem2 == stem2_comp))

In [57]:
# check rbs and start codon are unchanged
def check_rbs_and_start(full_59nt):
    rbs_exists = (full_59nt[30:41] == rbs)
    start_exists = (full_59nt[47:50] == start_codon)
    return(rbs_exists and start_exists)

In [58]:
# convert new switches to bp complementarity / toehold structure
new_fixed_switches = []
for toehold in data_df['new_switch']:
    base_30nt = toehold[0:30]
    print('checking for rev comp: ', check_rev_comp(toehold))
    print('checking for rbs and start codon: ', check_rbs_and_start(toehold))
    new_toehold = turn_switch_to_toehold(base_30nt)
    print(new_toehold)
    print('checking for rev comp: ', check_rev_comp(new_toehold))
    print('checking for rbs and start codon: ', check_rbs_and_start(new_toehold))
    new_fixed_switches.append(new_toehold)

checking for rev comp:  False
checking for rbs and start codon:  True
TCCTTCGGCATCTACATCTATATAAAACGAAACAGAGGAGATCGTTTATGATAGATGTA
checking for rev comp:  True
checking for rbs and start codon:  True
checking for rev comp:  False
checking for rbs and start codon:  True
TTCATTATTATCTGCTGCTCTTCCCCTCCAAACAGAGGAGATGGAGGATGAGAGCAGCA
checking for rev comp:  True
checking for rbs and start codon:  True
checking for rev comp:  False
checking for rbs and start codon:  True
CGCAATATTATCTGCTGGCCTACCCCTCCAAACAGAGGAGATGGAGGATGAGGCCAGCA
checking for rev comp:  True
checking for rbs and start codon:  True
checking for rev comp:  False
checking for rbs and start codon:  True
TTCATTATTATCTGCTGCTCCTCCCCTCCAAACAGAGGAGATGGAGGATGGGAGCAGCA
checking for rev comp:  True
checking for rbs and start codon:  True
checking for rev comp:  False
checking for rbs and start codon:  True
TCCTTCGGCATCTACCTCTATATAAAACGAAACAGAGGAGATCGTTTATGATAGAGGTA
checking for rev comp:  True
checking for rbs and start codon:  True
check

In [59]:
data_df['NEW_fixed_switch'] = new_fixed_switches

In [61]:
X = np.stack([_get_one_hot_encoding(s) for s in new_fixed_switches]).astype(np.float32)

predictions = model.predict(X)
#print(predictions)

data_df['NEW_onoff_preds'] = np.reshape(predictions, [num_seqs*num_of_optimization_rounds,])

[[0.5579137 ]
 [0.54170436]
 [0.3156554 ]
 [0.38759553]
 [0.51075286]
 [0.5078366 ]
 [0.5785158 ]
 [0.54170436]
 [0.5231471 ]
 [0.4552125 ]
 [0.57521087]
 [0.38759553]
 [0.5308206 ]
 [0.4570665 ]
 [0.1490827 ]
 [0.38759553]
 [0.51075286]
 [0.50698686]
 [0.61387956]
 [0.54170436]]


In [63]:
out_dir = 'data/'
data_df.to_csv(out_dir + '4h_final_test_storm_optimized.csv') # select top sequence from each round of 5 afterwards