In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import tensorflow as tf
import time
import scipy

from final.model import Model
from final.helpers import Helpers
from final.datahandler import DataHandler
from final.dihedralcalculator import DihedralCalculator

import txt_data_utils.data_transformer as dt

In [13]:
### DATA AND QUEUE CONFIGURATION ###
# Root folder handed to DataHandler, and the CASP sets used for each split.
data_path = "/home/mikey/Data/ProteinNet/"
training_casps = ['casp11']
validation_casps = ['casp11']
training_percentages = [30, 50, 70]
max_len = None # max len of the protein taken into account

# Input-queue settings for the training pipeline.
num_epochs, batch_size, capacity = 500, 32, 1000

### MODEL CONFIGURATION ###
# When True, the evolutionary features are concatenated to the one-hot primary sequence.
include_evo = True

model_type = 'cnn_big'
dropout_rate = 0.1

mode_a = 'regression' # regression or alphabet
mode_b = 'vectors' # angles or vectors

# Combined mode string, e.g. 'regression_vectors'.
prediction_mode = '_'.join((mode_a, mode_b))
n_clusters = 50 # only needed when prediction mode == alphabet

# 'cos' angularization is used in the regression_angles mode only, None otherwise.
angularization_mode = 'cos' if prediction_mode == 'regression_angles' else None

# True or False, only works in the regression_vectors mode; None in every other mode.
regularize_vectors = True if prediction_mode == 'regression_vectors' else None

loss_mode = 'mae'
# Number of dihedral angles kept from the last axis of the targets.
n_angles = 2

In [14]:
# Start from an empty default graph before building the validation input pipeline.
tf.reset_default_graph()

# Queue the validation records for exactly one pass (num_epochs=1).
data_handler = DataHandler(
    data_path=data_path,
    casps=validation_casps,
    num_epochs=1,
    mode='validation',
)

# One protein per batch; the fourth returned tensor is discarded.
(ids, one_hot_primary, evolutionary, _,
 tertiary, ter_mask, pri_length, keep) = data_handler.generate_batches(
    batch_size=1,
    capacity=1000,
    max_protein_len=max_len,
)

# Derive the dihedral angles from the tertiary coordinates; the protein length
# is read from axis 1 of the one-hot primary tensor.
dihedral_calculator = DihedralCalculator()
true_dihedrals = dihedral_calculator.dihedral_pipeline(
    tertiary,
    protein_length=tf.shape(one_hot_primary)[1],
)

with tf.Session() as sess:
    sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()))

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    try:
        # One accumulator list per quantity collected from the queue.
        ids_v = []
        ohp_v = []
        evo_v = []
        tert_v = []
        ter_mask_v = []
        dih_v = []

        fetches = [ids, one_hot_primary, evolutionary, tertiary,
                   ter_mask, pri_length, keep, true_dihedrals]

        # Drain the queue one protein at a time; the single-epoch queue raises
        # OutOfRangeError once it is exhausted, which ends the loop.
        while not coord.should_stop():
            (ids_, one_hot_primary_, evolutionary_, tertiary_,
             ter_mask_, pri_length_, keep_, true_dihedrals_) = sess.run(fetches)

            # np.squeeze drops every singleton axis of each fetched batch.
            for bucket, batch_value in ((ids_v, ids_),
                                        (ohp_v, one_hot_primary_),
                                        (evo_v, evolutionary_),
                                        (tert_v, tertiary_),
                                        (ter_mask_v, ter_mask_),
                                        (dih_v, true_dihedrals_)):
                bucket.append(np.squeeze(batch_value))

    except tf.errors.OutOfRangeError:
        print('Done')
    finally:
        # Ask the queue-runner threads to stop, then wait for them.
        coord.request_stop()
        coord.join(threads)
        sess.close()
        
# Turn the per-protein lists into arrays via the project helper (it prints "padded";
# presumably every protein is padded to a common length - confirm in data_transformer).
# NOTE: evo_v and dih_v are rebound from lists to the helper's outputs here.
prim_v, evo_v, dih_v, mask_v = dt.limit_length_and_pad(ohp_v, evo_v, dih_v, ter_mask_v, max_length=None)

# Features: primary and evolutionary inputs stacked along axis 2; targets: dihedrals.
x_valid = np.concatenate([prim_v, evo_v], axis=2)
y_valid = dih_v
x_valid.shape, y_valid.shape

['/home/mikey/Data/ProteinNet/casp11/validation/*']
Training samples available 224
Done
padded


((224, 698, 41), (224, 698, 3))

In [15]:
# Build the training graph: input queue -> dihedral targets -> feedable placeholders
# -> model -> loss -> optimizer. Any graph built before this point is discarded by the reset.
tf.reset_default_graph()

# define the training data paths and how many epochs they should be queued for
# by instantiating the DataHandler object that takes care of parsing
data_handler = DataHandler(data_path=data_path,
                           casps=training_casps,
                           percentages=training_percentages, 
                           num_epochs=num_epochs,
                           mode='training')

# use DataHandler to generate batches of specific size 
# and optional limit on protein length
# secondary structure is missing from the ProteinNet, thus the underscore
ids, one_hot_primary, evolutionary, _, tertiary, ter_mask, pri_length, keep =\
    data_handler.generate_batches(batch_size = batch_size,
                                  capacity = capacity,
                                  max_protein_len = max_len)

# convert euclidean coordinates to dihedral angles
dihedral_calculator = DihedralCalculator()
true_dihedrals = dihedral_calculator.dihedral_pipeline(tertiary, protein_length = tf.shape(one_hot_primary)[1])
# keep only the first n_angles entries of the last axis
true_dihedrals = true_dihedrals[:,:,:n_angles]

# set up placeholders with batch_size=None to be able to feed them with validation data
# they fall onto default coming from the queue if nothing is fed through feed_dict
# NOTE: the names below are rebound from the queue tensors to the placeholders, so
# everything built afterwards (model, loss) reads from the placeholders.
true_dihedrals = tf.placeholder_with_default(true_dihedrals, shape=(None, None, n_angles))
# derived from the placeholder, so feeding true_dihedrals also determines true_vectors
true_vectors = Helpers.ang_to_vec_tf(true_dihedrals)
one_hot_primary = tf.placeholder_with_default(one_hot_primary, shape=(None, None, 20))
evolutionary = tf.placeholder_with_default(evolutionary, shape=(None, None, 21))
ter_mask = tf.placeholder_with_default(ter_mask, shape=(None, None))

# build a model and get predicted output
model = Model(n_angles=n_angles, 
              n_clusters=n_clusters,
              output_mask=ter_mask,
              model_type=model_type, 
              prediction_mode=prediction_mode,
              dropout_rate=dropout_rate,
              ang_mode=angularization_mode,
              regularize_vectors=regularize_vectors,
              loss_mode=loss_mode
            )

# with include_evo the model input has 20 + 21 = 41 features on the last axis,
# otherwise only the 20 one-hot primary features
if include_evo:
    input_data = tf.concat([one_hot_primary, evolutionary], axis=2)
else:
    input_data = one_hot_primary

# predictions in angle (radian) and vector form
# NOTE(review): the "_masked" suffix suggests output_mask is already applied - confirm in final.model
rad_pred_masked, vec_pred_masked = model.build_model(input_data)

# presumably applies the same output mask to the targets - confirm in final.model
true_dihedrals_masked, true_vectors_masked = model.mask_other([true_dihedrals, true_vectors])

# `loss` is the tensor minimized below; `loss_vec` is a second output that the
# training loop fetches and prints alongside it
loss, loss_vec = model.calculate_loss(true_dihedrals_masked, rad_pred_masked,
                                      true_vectors_masked, vec_pred_masked)

# in-graph Pearson correlation between predictions and targets
# (the training loop shown in this notebook uses Helpers.pearson_numpy instead)
pcc = Helpers.pearson_tf(rad_pred_masked, true_dihedrals_masked)

# learning rate placeholder for adaptive learning rate
learning_rate = tf.placeholder(tf.float32, name='learning_rate')

# choose an optimizer to minimize the loss
# global_step is non-trainable and is incremented by minimize() on every train_op run
global_step = tf.Variable(0, name='global_step', trainable=False)
train_op = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss, global_step=global_step)

# Single op that initializes both the global and the local variables of the graph.
init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

# Report the model size: the product of each trainable variable's static shape, summed.
# Only the failures this computation can produce are caught (an unknown-rank shape makes
# as_list() raise ValueError, a None dimension makes np.prod raise TypeError), so that
# KeyboardInterrupt and unrelated bugs are no longer silently swallowed.
try:
    n_parameters = np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])
    print("Parameters:", n_parameters)
except (TypeError, ValueError) as err:
    print("Couldn't calculate the number of parameters", "(%s)" % err)

['/home/mikey/Data/ProteinNet/casp11/training/30/*', '/home/mikey/Data/ProteinNet/casp11/training/50/*', '/home/mikey/Data/ProteinNet/casp11/training/70/*']
Training samples available 88285
Parameters: 775130


Training loop

In [None]:
# Optimisation schedule used by the training loop:
#   init_learning_rate   - learning rate fed to the optimizer (decayed in place every epoch)
#   learning_rate_decay  - multiplicative decay applied at every epoch boundary
#   steps_to_print_after - number of steps between train/validation reports
init_learning_rate = 0.001
learning_rate_decay = 0.99
steps_to_print_after = 200

# Hook for LSTM-specific overrides; the values currently equal the defaults above.
if 'lstm' in model_type:
    init_learning_rate, steps_to_print_after = 0.001, 200
    
    
# Train until the input queue is exhausted (num_epochs passes over the data), printing
# the running train loss and a full validation pass every `steps_to_print_after` steps.
with tf.Session() as sess:
    # important to call both of these, because 
    # otherwise can't specify num_epochs in string_input_producer
    sess.run(init)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    try:
        # we can't access information from the queue
        # to know when an epoch ends, so we define our
        # own step counter and calculate a validation loss every n steps.
        # `step` counts batches within the current epoch and is reset at each epoch
        # boundary; `total_steps` is never reset and is what the final summary reports
        # (reporting `step` there gave only the position inside the last epoch).
        step = 1
        total_steps = 0

        losses = []
        loss_vecs = []
        avg_losses = []

        while not coord.should_stop():

            # Nothing but the learning rate is fed, so the placeholders fall back to
            # their queue defaults and this runs one optimizer step on a training batch.
            _, true_dihedrals_, rad_pred_masked_, loss_, loss_vec_ = sess.run(
                [train_op, true_dihedrals, rad_pred_masked, loss, loss_vec],
                feed_dict={learning_rate: init_learning_rate})
            total_steps += 1

            losses.append(loss_)
            loss_vecs.append(loss_vec_)

            if step % steps_to_print_after == 0:
                # Average over the steps accumulated since the previous report.
                avg_loss, avg_loss_vec = np.mean(losses), np.mean(np.array(loss_vecs), axis=0)
                avg_losses.append(avg_loss)
                print("Train loss:", avg_loss, avg_loss_vec)

                losses = []
                loss_vecs = []

                # Validation pass: all four placeholders are fed with the padded validation
                # arrays and train_op is not fetched, so no weights are updated here.
                # NOTE(review): no training/eval switch is fed - confirm whether Model
                # applies dropout during this pass.
                (true_dihedrals_masked_v, rad_pred_masked_v,
                 loss_, loss_vec_) = sess.run([true_dihedrals_masked, rad_pred_masked,
                                               loss, loss_vec],
                                              feed_dict={
                                                  one_hot_primary: prim_v,
                                                  evolutionary: evo_v,
                                                  true_dihedrals: dih_v[:,:,:n_angles],
                                                  ter_mask: mask_v
                                              })
                print("Validation loss:", loss_, loss_vec_)
                print("Validation PCC:", Helpers.pearson_numpy(np.squeeze(true_dihedrals_masked_v)[:,:n_angles],
                                                               np.array(rad_pred_masked_v)))

            # Epoch boundary: more samples consumed than the training set holds.
            # The learning rate is decayed in place and the per-epoch counter restarts.
            if step * batch_size > data_handler.training_samples:
                step = 0
                init_learning_rate = init_learning_rate * learning_rate_decay
                print("EPOCH. New learning rate:", init_learning_rate)

            step += 1

    except tf.errors.OutOfRangeError:
        # Raised by the input queue once all num_epochs passes have been consumed.
        print('Done training for %d epochs, %d steps.' % (num_epochs, total_steps))
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()

        # Wait for threads to finish. The session itself is closed by the
        # enclosing `with` block, so no explicit sess.close() is needed.
        coord.join(threads)

Train loss: 0.40042 [[0.36059526 0.22162743]
 [0.5692388  0.4502187 ]]
Validation loss: 0.33133882 [[0.319742   0.20129289]
 [0.45140687 0.35291362]]
Validation PCC: [0.46040076, 0.5032341]
Train loss: 0.3113002 [[0.30007192 0.2010552 ]
 [0.41446477 0.3296089 ]]
Validation loss: 0.29613787 [[0.28904602 0.19997355]
 [0.3851328  0.31039906]]
Validation PCC: [0.5687059, 0.5833979]
Train loss: 0.29035965 [[0.28551248 0.19652791]
 [0.37348288 0.30591542]]
Validation loss: 0.28172362 [[0.2817525  0.1809705 ]
 [0.36720082 0.29697064]]
Validation PCC: [0.56943446, 0.6145757]
Train loss: 0.27552235 [[0.27859658 0.16986962]
 [0.35866624 0.29495707]]
Validation loss: 0.27157524 [[0.27470902 0.16747361]
 [0.35571575 0.28840253]]
Validation PCC: [0.59819824, 0.6256365]
Train loss: 0.26667032 [[0.2718317  0.16326924]
 [0.34436148 0.2872188 ]]
Validation loss: 0.26461837 [[0.26865056 0.16417548]
 [0.3445286  0.2811188 ]]
Validation PCC: [0.6169168, 0.6450863]
Train loss: 0.25998354 [[0.26605296 0.157

Plot the avg losses over time

In [None]:
# avg_losses holds one mean training loss per reporting interval of the training loop.
plt.plot(avg_losses)
plt.xlabel('reporting interval (every %d training steps)' % steps_to_print_after)
plt.ylabel('mean training loss')
plt.title('Average training loss over time')
plt.show()

Fetch just the dihedral angles to check whether they resemble what a Ramachandran plot should look like

In [None]:
# Pull a single batch out of the input queue to inspect the raw dihedral angles.
# Nothing is fed, so the placeholders fall back to their queue defaults.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    try:
        true_dihedrals_, ohp, tert = sess.run([true_dihedrals, one_hot_primary, tertiary])
    finally:
        # Stop the queue-runner threads even if the fetch fails, and wait for them
        # to exit before the session is closed (same shutdown as the other session cells).
        coord.request_stop()
        coord.join(threads)

In [None]:
# Mean absolute difference between true and predicted dihedrals, displayed next to the
# most recently fetched loss value.
# NOTE(review): `true_dihedrals_v` and `pred_v` are not assigned in any cell visible in
# this notebook - this cell relies on hidden kernel state; confirm where they come from.
np.mean(np.abs(true_dihedrals_v - pred_v)), loss_

In [None]:
# NOTE(review): in the visible notebook `phi` is first assigned by the np.split calls of
# the scatter-plot cell further down, so this cell only works after that cell has run.
phi.shape

In [None]:
# Index used by the histogram cell to select an entry of `omega`.
protein_n = 0

plt.figure(figsize=(12,8))

# Scatter phi against psi for the predictions first and the ground truth second.
# After the loop phi/psi/omega hold the split of true_dihedrals_v, exactly as before,
# which is what the histogram cell reads.
for series_label, angles in (('pred', pred_v), ('true', true_dihedrals_v)):
    # Split the last axis into three equal parts: phi, psi, omega.
    phi, psi, omega = np.split(angles, 3, -1)
    # Only the first 500 entries along axis 0 are drawn.
    plt.scatter(phi[:500], psi[:500], s=1, label=series_label)

plt.xlabel('phi')
plt.ylabel('psi')
plt.title('Predicted vs. true dihedral angles')
plt.legend()

In [None]:
# Histogram of the `omega` entry selected by protein_n; `omega` is whatever the
# scatter-plot cell assigned last (the split of true_dihedrals_v).
plt.hist(omega[protein_n], bins=100)
plt.xlabel('omega')
plt.ylabel('count')
plt.show()

In [None]:
# Disabled debugging snippet, kept for reference: prints the name, shape and value of
# every trainable variable. It needs an open session bound to `sess`; the indentation
# suggests it was lifted from inside the training loop - TODO confirm, or delete the cell.
#                 # access trainable variables to see if they're training
#                 variables_names = [v.name for v in tf.trainable_variables()]
#                 values = sess.run(variables_names)
#                 for k, v in zip(variables_names, values):
#                     print("Variable: ", k)
#                     print("Shape: ", v.shape)
#                     print(v)

In [None]:
# Disabled sketch of a coordinate-based loss (dihedrals -> 3D coordinates -> dRMSD).
# NOTE(review): dihedral_to_point, point_to_coordinate and drmsd are not imported in the
# visible import cell, so this cannot be re-enabled as is - TODO wire up or delete.
# def _coordinates(config, dihedrals):
#     """ Converts dihedrals into full 3D structures. """

#     # converts dihedrals to points ready for reconstruction.
#     points = dihedral_to_point(dihedrals) # [NUM_STEPS x NUM_DIHEDRALS, BATCH_SIZE, NUM_DIMENSIONS]
             
#     # converts points to final 3D coordinates.
#     coordinates = point_to_coordinate(points, num_fragments=config['num_reconstruction_fragments'], 
#                                               parallel_iterations=config['num_reconstruction_parallel_iters']) 
#                   # [NUM_STEPS x NUM_DIHEDRALS, BATCH_SIZE, NUM_DIMENSIONS]

#     return coordinates

# def _drmsds(config, coordinates, targets, weights):
#     """ Computes reduced weighted dRMSD loss (as specified by weights) 
#         between predicted tertiary structures and targets. """
                  
#     # compute per structure dRMSDs
#     drmsds = drmsd(coordinates, targets, weights, name='drmsds') # [BATCH_SIZE]

#     # add to relevant collections for summaries, etc.
#     if config['log_model_summaries']: tf.add_to_collection(config['name'] + '_drmsdss', drmsds)

#     return drmsds