In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import tensorflow as tf
import time

In [232]:
# helper for setting up the bidirectional, multilayer lstm
def bidirectional_lstm(input_data, num_layers, rnn_size, keep_prob, lengths=None):
    output = input_data
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer),reuse=tf.AUTO_REUSE):

            cell_fw = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.truncated_normal_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob = keep_prob)

            cell_bw = tf.contrib.rnn.LSTMCell(rnn_size, initializer=tf.truncated_normal_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob = keep_prob)

            outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw, 
                                                              cell_bw, 
                                                              output,
                                                              dtype=tf.float32,
                                                              sequence_length=lengths)
            output = tf.concat(outputs,2)

    return output

In [6]:
import data_loader as dl
import data_transformer as dt
# change this path to match the path of where you have the
# training_50 and validation files
txt_data_path = '/home/mikey/Data/ProteinNet/casp7_txt/'

In [3]:
file_name = 'training_50'
data_lim = 5000
prim_train, evo_train, dih_train = (dl.parse_primary_from_file(txt_data_path + file_name, data_lim), 
                                    dl.parse_evolutionary_from_file(txt_data_path + file_name, data_lim),
                                    dl.load_file('./'+file_name+'_dih.joblib'))
dih_train = dih_train[:data_lim]
len(prim_train), prim_train[0].shape, len(evo_train), evo_train[0].shape, len(dih_train), dih_train[0].shape

Loaded data and filtered line endings
Extracted primary data
Encoded primary sequences
Loaded data and filtered line endings
Extracted evolutionary data


100%|██████████| 5000/5000 [00:00<00:00, 49078.34it/s]


Grouped 21's together


(5000, (70, 20), 5000, (70, 21), 5000, (70, 3))

In [5]:
file_name = 'validation'
prim_valid, evo_valid, dih_valid = (dl.parse_primary_from_file(txt_data_path + file_name), 
                                    dl.parse_evolutionary_from_file(txt_data_path + file_name),
                                    dl.load_file('./'+ file_name + '_dih.joblib'))
len(prim_valid), prim_valid[0].shape, len(evo_valid), evo_valid[0].shape, len(dih_valid), dih_valid[0].shape

Loaded data and filtered line endings
Extracted primary data
Encoded primary sequences
Loaded data and filtered line endings
Extracted evolutionary data


100%|██████████| 224/224 [00:00<00:00, 75890.48it/s]

Grouped 21's together





(224, (269, 20), 224, (269, 21), 224, (269, 3))

In [188]:
max_len = 200
prim_, evo_, dih_ = dt.limit_length_and_pad(prim_train, evo_train, dih_train, max_len)
x_train, y_train = np.concatenate([prim_, evo_], axis=2), dih_
prim_v, evo_v, dih_v = dt.limit_length_and_pad(prim_valid, evo_valid, dih_valid, max_len)
x_valid, y_valid = np.concatenate([prim_v, evo_v], axis=2), dih_v

x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

((3053, 200, 41), (3053, 200, 3), (127, 200, 41), (127, 200, 3))

In [206]:
from sklearn.cluster import KMeans
from keras.utils import to_categorical

n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters-1, random_state=0, verbose=0).fit(y_train[:500].reshape(-1,3))

In [207]:
kmeans.cluster_centers_

array([[-1.2905366e+00, -6.7637688e-01, -3.1049449e+00],
       [-1.5044749e-02,  1.5398264e-03, -7.5548887e-06],
       [-2.2788348e+00,  2.4490602e+00,  3.0649731e+00],
       [-1.2594174e+00, -6.4054614e-01,  3.1042385e+00],
       [-1.7908230e+00,  2.2621212e+00, -3.0957763e+00],
       [ 1.4414744e+00,  2.4293438e-01,  3.0961897e+00],
       [-1.2828548e+00,  2.3996851e+00,  3.0832396e+00],
       [ 1.4389775e+00,  8.1029749e-01, -3.0936613e+00],
       [ 1.5655324e+00, -2.3716803e+00, -3.0398128e+00]], dtype=float32)

In [209]:
y_train_labels_ = kmeans.predict(y_train.reshape(-1,3)).reshape(y_train.shape[0], y_train.shape[1])
y_train_labels_[y_train[:,:,0]==0] = n_clusters-1
y_valid_labels_ = kmeans.predict(y_valid.reshape(-1,3)).reshape(y_valid.shape[0], y_valid.shape[1])
y_valid_labels_[y_valid[:,:,0]==0] = n_clusters-1

# ohe = OneHotEncoder(n_values = n_clusters).fit(y_train_labels)
y_train_labels = to_categorical(y_train_labels_)
y_valid_labels = to_categorical(y_valid_labels_)

y_train_labels.shape, y_valid_labels.shape

((3053, 200, 10), (127, 200, 10))

In [236]:
tf.reset_default_graph()

X = tf.placeholder(tf.float32, [None, max_len, 41], name="X")
y = tf.placeholder(tf.float32, [None, max_len, n_clusters], name='y')

mask = tf.not_equal(tf.argmax(y, 2), n_clusters-1)

conv1 = tf.layers.conv1d(X, 32, 5, activation=tf.nn.relu, padding='same')
conv2 = tf.layers.conv1d(conv1, 64, 5, activation=tf.nn.relu, padding='same')
conv3 = tf.layers.conv1d(conv2, 128, 5, activation=tf.nn.relu, padding='same')

lstm = bidirectional_lstm(X, 1, 32, keep_prob=0.05)

logits = tf.layers.dense(lstm, n_clusters, activation='relu')
y_pred = tf.nn.softmax(logits)

y_masked = tf.boolean_mask(y, mask)
shape_after_mask = tf.shape(y_masked)
logits_masked = tf.boolean_mask(logits, mask)
y_pred_masked = tf.boolean_mask(y_pred, mask)
# y_masked = y
# logits_masked = logits
# y_pred_masked = y_pred

# Difine the loss function
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_masked, logits=logits_masked))

# Define the optimizer operation
learning_rate = tf.placeholder(tf.float32)

#optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(loss)
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss)

# Variables for prediction and accuracy
prediction = tf.argmax(y_pred_masked, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, tf.argmax(y_masked, 1)), tf.float32))

# Initialize the variables (they are assigned default values)
init = tf.global_variables_initializer()

n_parameters = np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])
print("Number of parameters:", n_parameters)

Number of parameters: 77578


In [238]:
batch_size = 32
n_epochs = 300

# Start as session
init_learning_rate = 0.0001
with tf.Session() as session:

    # Run the initializer
    session.run(init)

    # Training cycle
    for epoch in range(n_epochs):
        print("Epoch:", epoch)
        for b in range(0, x_train.shape[0], batch_size):
            _, loss_value = session.run([optimizer, loss], feed_dict={X: x_train[b:b+batch_size],
                                                                      y: y_train_labels[b:b+batch_size],
                                                                      learning_rate: init_learning_rate})
            
        validation_accuracy = session.run(accuracy, feed_dict={X: x_valid, y: y_valid_labels})
        print("loss[epoch=%04d] = %f, val_acc = %f" % (epoch, loss_value, validation_accuracy))        
    
    if epoch % 10 == 0:
        init_learning_rate = init_learning_rate / 5.
    print("Optimization done")

    # Calculate training accuracy
    train_accuracy_value, pred_train, shape_after_mask_ = session.run([accuracy, prediction, shape_after_mask], feed_dict={X: x_train, y: y_train_labels})
    print("Train accuracy:", train_accuracy_value)
    print("Shape after mask:", shape_after_mask_)
    
    # Calculate test accuracy
    test_accuracy_value, pred_test, shape_after_mask_ = session.run([accuracy, prediction, shape_after_mask], feed_dict={X: x_valid, y: y_valid_labels})
    print("Test accuracy:", test_accuracy_value)
    print("Shape after mask:", shape_after_mask_)

Epoch: 0
loss[epoch=0000] = 2.248126, val_acc = 0.236433
Epoch: 1
loss[epoch=0001] = 2.028672, val_acc = 0.253407
Epoch: 2
loss[epoch=0002] = 1.758819, val_acc = 0.275400
Epoch: 3
loss[epoch=0003] = 1.755347, val_acc = 0.276994
Epoch: 4
loss[epoch=0004] = 1.745063, val_acc = 0.275799
Epoch: 5
loss[epoch=0005] = 1.740462, val_acc = 0.275480
Epoch: 6
loss[epoch=0006] = 1.734956, val_acc = 0.278349
Epoch: 7
loss[epoch=0007] = 1.724093, val_acc = 0.277472
Epoch: 8
loss[epoch=0008] = 1.730413, val_acc = 0.278349
Epoch: 9
loss[epoch=0009] = 1.716775, val_acc = 0.280979
Epoch: 10
loss[epoch=0010] = 1.710742, val_acc = 0.280500
Epoch: 11
loss[epoch=0011] = 1.708093, val_acc = 0.284963
Epoch: 12
loss[epoch=0012] = 1.717878, val_acc = 0.282014
Epoch: 13
loss[epoch=0013] = 1.701837, val_acc = 0.282572
Epoch: 14
loss[epoch=0014] = 1.700846, val_acc = 0.280022
Epoch: 15
loss[epoch=0015] = 1.703632, val_acc = 0.280102
Epoch: 16
loss[epoch=0016] = 1.697209, val_acc = 0.284405
Epoch: 17
loss[epoch=001

loss[epoch=0141] = 1.679277, val_acc = 0.294366
Epoch: 142
loss[epoch=0142] = 1.671144, val_acc = 0.302335
Epoch: 143
loss[epoch=0143] = 1.642289, val_acc = 0.297075
Epoch: 144
loss[epoch=0144] = 1.631769, val_acc = 0.299466
Epoch: 145
loss[epoch=0145] = 1.675978, val_acc = 0.298590
Epoch: 146
loss[epoch=0146] = 1.657535, val_acc = 0.299785
Epoch: 147
loss[epoch=0147] = 1.648376, val_acc = 0.299546
Epoch: 148
loss[epoch=0148] = 1.629224, val_acc = 0.302335
Epoch: 149
loss[epoch=0149] = 1.667367, val_acc = 0.299944
Epoch: 150
loss[epoch=0150] = 1.669453, val_acc = 0.298191
Epoch: 151
loss[epoch=0151] = 1.658844, val_acc = 0.299466
Epoch: 152
loss[epoch=0152] = 1.653977, val_acc = 0.297554
Epoch: 153


KeyboardInterrupt: 

In [235]:
pred_train.shape

(321481,)