In [9]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import tensorflow as tf
import time
import scipy

from final.model import Model
from final.helpers import Helpers
from final.datahandler import DataHandler
from final.dihedralcalculator import DihedralCalculator

In [10]:
import data_loader as dl
import data_transformer as dt
# Point this at the directory that holds the ProteinNet text records
# (the training_50 and validation files).
txt_data_path = '/home/mikey/Data/ProteinNet/casp7_txt/'

# Forwarded to limit_length_and_pad as max_length (max len of the protein taken into account).
max_len = None
# Number of dihedral channels kept in y_valid: 3 for all angles, 2 for phi and psi only.
n_angles = 3

file_name = 'validation'
validation_records = txt_data_path + file_name

# Load the validation features one after another, in the same order as before:
# primary sequence, evolutionary profile, stored dihedrals, mask.
prim_valid = dl.parse_primary_from_file(validation_records)
evo_valid = dl.parse_evolutionary_from_file(validation_records)
dih_valid = dl.load_file('./' + file_name + '_dih.joblib')
mask_valid = dl.parse_mask_from_file(validation_records)

prim_v, evo_v, dih_v, mask_v = dt.limit_length_and_pad(
    prim_valid, evo_valid, dih_valid, mask_valid, max_length=max_len
)

# Inputs: primary and evolutionary features joined along axis 2.
x_valid = np.concatenate([prim_v, evo_v], axis=2)
# Targets: the first n_angles dihedral channels.
y_valid = dih_v[:, :, :n_angles]

x_valid.shape, y_valid.shape

Loaded data and filtered line endings
Extracted primary data
Encoded primary sequences
Loaded data and filtered line endings
Extracted evolutionary data


100%|██████████| 224/224 [00:00<00:00, 63874.10it/s]

Grouped 21's together
Loaded data and filtered line endings
Extracted mask data
padded
(224,) (269, 20)
padded
(224,) (269, 21)
padded
(224,) (269, 3)
padded
(224,) (269, 1)





((224, 696, 41), (224, 696, 3))

In [46]:
def _coordinates(config, dihedrals):
    """Reconstruct full 3D structures from dihedral angles.

    Chains ``dihedral_to_point`` and ``point_to_coordinate``. Neither helper is
    defined or imported in the visible notebook cells, so both must already be
    present in the namespace when this function is called.

    Args:
        config: mapping read for 'num_reconstruction_fragments' and
            'num_reconstruction_parallel_iters'.
        dihedrals: dihedral angles, handed straight to ``dihedral_to_point``.

    Returns:
        The result of ``point_to_coordinate``; the original author's note gives
        its layout as [NUM_STEPS x NUM_DIHEDRALS, BATCH_SIZE, NUM_DIMENSIONS].
    """
    # Stage 1: dihedrals -> points ready for reconstruction
    # (same layout note as above, per the original author).
    reconstruction_points = dihedral_to_point(dihedrals)

    # Stage 2: points -> final 3D coordinates.
    n_fragments = config['num_reconstruction_fragments']
    n_parallel = config['num_reconstruction_parallel_iters']
    return point_to_coordinate(reconstruction_points,
                               num_fragments=n_fragments,
                               parallel_iterations=n_parallel)

def _drmsds(config, coordinates, targets, weights):
    """Weighted dRMSD between predicted tertiary structures and targets.

    Delegates to ``drmsd`` (not defined or imported in the visible notebook
    cells) and, when ``config['log_model_summaries']`` is truthy, also records
    the result in the graph collection named ``config['name'] + '_drmsdss'``.

    Args:
        config: mapping read for 'log_model_summaries' and 'name'.
        coordinates: predicted structures, passed through to ``drmsd``.
        targets: reference structures, passed through to ``drmsd``.
        weights: weighting passed through to ``drmsd``.

    Returns:
        The per-structure dRMSDs from ``drmsd`` ([BATCH_SIZE] according to the
        original author's note).
    """
    per_structure = drmsd(coordinates, targets, weights, name='drmsds')

    # Register for summaries only when requested.
    if config['log_model_summaries']:
        collection_name = config['name'] + '_drmsdss'
        tf.add_to_collection(collection_name, per_structure)

    return per_structure

# Clear the default graph before anything below adds ops to it.
tf.reset_default_graph()

# ---- training data ----
data_path = "/home/mikey/Data/ProteinNet/"
casps = ['casp%d' % edition for edition in range(7, 13)]  # casp7 .. casp12
# casps = ['casp7']
training_percentages = [30, 50, 70]

# ---- batching ----
num_epochs, batch_size, capacity = 100, 32, 1000
# min_after_dequeue=100

# ---- input handling ----
max_protein_len, include_evo, apply_mask = None, True, True

# ---- model ----
model_type = 'cnn_small'
dropout_rate = 0.3
prediction_mode = 'regression_vectors'

# 'cos' is selected only for 'regression_angles'; every other mode passes None.
angularization_mode = 'cos' if prediction_mode == 'regression_angles' else None

# ---- loss ----
regularize_vectors = True
loss_mode = 'angular_mae'
n_angles = 2  # rebinds the n_angles assigned in the validation-loading cell

# The DataHandler is given the training data location (data_path, casps,
# percentages) and the number of epochs the data should be queued for; it
# takes care of the parsing.
data_handler = DataHandler(
    data_path=data_path,
    casps=casps,
    percentages=training_percentages,
    num_epochs=num_epochs,
)

# Batches of a fixed size with an optional limit on protein length.
# ProteinNet has no secondary structure, hence the underscore slot.
batch_tensors = data_handler.generate_batches(
    batch_size=batch_size,
    capacity=capacity,
    max_protein_len=max_protein_len,
)
(ids, one_hot_primary, evolutionary, _,
 tertiary, ter_mask, pri_length, keep) = batch_tensors
# `keep` is only consumed by the disabled masking step kept here for reference:
# if max_protein_len:
#     ids, one_hot_primary, evolutionary, tertiary, ter_mask, pri_length =\
#         helpers.mask_all([ids, one_hot_primary, evolutionary, tertiary, ter_mask, pri_length], keep, axis=0)

# Euclidean coordinates -> dihedral angles, keeping the first n_angles channels.
dihedral_calculator = DihedralCalculator()
sequence_length = tf.shape(one_hot_primary)[1]
true_dihedrals = dihedral_calculator.dihedral_pipeline(
    tertiary, protein_length=sequence_length
)[:, :, :n_angles]

def _feedable(queue_tensor, *feature_dims):
    """Wrap a tensor in a placeholder that defaults to it.

    The first two dimensions are left as None so that validation data can be
    fed through feed_dict; when nothing is fed, the wrapped tensor (coming
    from the queue) is used.
    """
    return tf.placeholder_with_default(queue_tensor, shape=(None, None) + feature_dims)


true_dihedrals = _feedable(true_dihedrals, n_angles)
true_vectors = Helpers.ang_to_vec_tf(true_dihedrals)
one_hot_primary = _feedable(one_hot_primary, 20)
evolutionary = _feedable(evolutionary, 21)
ter_mask = _feedable(ter_mask)

# Model input: primary sequence, optionally joined with the evolutionary
# features along axis 2.
input_data = (tf.concat([one_hot_primary, evolutionary], axis=2)
              if include_evo else one_hot_primary)

# Build the model and obtain its predictions (angles and vectors).
model_options = dict(
    n_angles=n_angles,
    output_mask=ter_mask,
    model_type=model_type,
    prediction_mode=prediction_mode,
    dropout_rate=dropout_rate,
    ang_mode=angularization_mode,
    regularize_vectors=regularize_vectors,
)
model = Model(**model_options)
rad_pred_masked, vec_pred_masked = model.build_model(input_data)

# Mask the targets (the original note says they get reshaped into 2 dimensions).
# Important: with apply_mask = False the validation loss and the training loss
# are not related. This is likely due to padding being done differently on
# validation and training data.
if apply_mask:
    true_dihedrals_masked, true_vectors_masked = Helpers.mask_all(
        [true_dihedrals, true_vectors], ter_mask)
else:
    true_dihedrals_masked, true_vectors_masked = true_dihedrals, true_vectors

# Loss builders for every supported (prediction_mode, loss_mode) pair.
# They are lambdas so that only the selected loss adds ops to the graph.
loss_builders = {
    ('regression_vectors', 'angular_mae'):
        lambda: Helpers.loss360(true_dihedrals_masked, rad_pred_masked),
    ('regression_vectors', 'mae'):
        lambda: Helpers.mae(true_vectors_masked, vec_pred_masked),
    ('regression_vectors', 'angular_mae_and_mae'):
        lambda: tf.add(Helpers.loss360(true_dihedrals_masked, rad_pred_masked),
                       Helpers.mae(true_vectors_masked, vec_pred_masked)),
    ('regression_angles', 'angular_mae'):
        lambda: Helpers.loss360(true_dihedrals_masked, rad_pred_masked),
    ('regression_angles', 'mae'):
        lambda: Helpers.mae(true_dihedrals_masked, rad_pred_masked),
}

# Fail loudly on an unsupported combination. Previously such a combination
# fell through every branch and left `loss` / `loss_vec` unbound (or, in a
# long-lived kernel, silently bound to tensors from an earlier run).
selected_loss = (prediction_mode, loss_mode)
if selected_loss not in loss_builders:
    raise ValueError(
        "Unsupported combination prediction_mode=%r, loss_mode=%r; supported pairs: %s"
        % (prediction_mode, loss_mode, sorted(loss_builders)))

loss_vec = loss_builders[selected_loss]()
loss = tf.reduce_mean(loss_vec)

pcc = Helpers.pearson_tf(rad_pred_masked, true_dihedrals_masked)
# include the regularization loss that keeps vectors on unit circle
loss = tf.add_n([loss] + model.regularization_losses)

# The learning rate is fed at every step so that it can be adapted during training.
learning_rate = tf.placeholder(tf.float32, name='learning_rate')

# Adam minimizes the loss and advances global_step on every update.
global_step = tf.Variable(0, name='global_step', trainable=False)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss, global_step=global_step)

# One op that initializes both the global and the local variables.
init = tf.group(tf.global_variables_initializer(),
                tf.local_variables_initializer())

# Total number of trainable scalars across all trainable variables.
variable_sizes = [np.prod(variable.get_shape().as_list())
                  for variable in tf.trainable_variables()]
n_parameters = np.sum(variable_sizes)

['/home/mikey/Data/ProteinNet/casp7/training/30/*', '/home/mikey/Data/ProteinNet/casp7/training/50/*', '/home/mikey/Data/ProteinNet/casp7/training/70/*', '/home/mikey/Data/ProteinNet/casp8/training/30/*', '/home/mikey/Data/ProteinNet/casp8/training/50/*', '/home/mikey/Data/ProteinNet/casp8/training/70/*', '/home/mikey/Data/ProteinNet/casp9/training/30/*', '/home/mikey/Data/ProteinNet/casp9/training/50/*', '/home/mikey/Data/ProteinNet/casp9/training/70/*', '/home/mikey/Data/ProteinNet/casp10/training/30/*', '/home/mikey/Data/ProteinNet/casp10/training/50/*', '/home/mikey/Data/ProteinNet/casp10/training/70/*', '/home/mikey/Data/ProteinNet/casp11/training/30/*', '/home/mikey/Data/ProteinNet/casp11/training/50/*', '/home/mikey/Data/ProteinNet/casp11/training/70/*', '/home/mikey/Data/ProteinNet/casp12/training/30/*', '/home/mikey/Data/ProteinNet/casp12/training/50/*', '/home/mikey/Data/ProteinNet/casp12/training/70/*']
Training samples available 321692


Training loop

In [48]:
learning_rate_decay = 0.98
steps_to_print_after = 250
init_learning_rate = 0.0001

if 'lstm' in model_type:
    init_learning_rate = 0.001
    steps_to_print_after = 200

with tf.Session() as sess:
    # `init` initializes both global and local variables; both are needed,
    # otherwise num_epochs can't be specified in string_input_producer
    sess.run(init)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # The queue does not tell us when an epoch ends, so `step` is our own
    # within-epoch counter (it is reset at every epoch boundary below).
    # `total_steps` counts every completed training step so the final report
    # is not affected by those resets.
    step = 1
    total_steps = 0

    losses = []
    loss_vecs = []
    avg_losses = []

    try:
        while not coord.should_stop():

            _, true_dihedrals_, rad_pred_masked_, loss_, loss_vec_ = sess.run(
                [train_op, true_dihedrals, rad_pred_masked, loss, loss_vec],
                feed_dict={learning_rate: init_learning_rate})
            total_steps += 1

            losses.append(loss_)
            loss_vecs.append(loss_vec_)

            # every steps_to_print_after steps: report the running training
            # average, then evaluate on the validation arrays
            if step % steps_to_print_after == 0:
                avg_loss, avg_loss_vec = np.mean(losses), np.mean(np.array(loss_vecs), axis=0)
                avg_losses.append(avg_loss)
                print("Train loss:", avg_loss, avg_loss_vec)

                losses = []
                loss_vecs = []

                # feeding the placeholders overrides the queue defaults
                (true_dihedrals_masked_v, rad_pred_masked_v,
                 loss_, loss_vec_) = sess.run(
                    [true_dihedrals_masked, rad_pred_masked, loss, loss_vec],
                    feed_dict={
                        one_hot_primary: prim_v,
                        evolutionary: evo_v,
                        true_dihedrals: dih_v[:, :, :n_angles],
                        ter_mask: mask_v
                    })
                print("Validation loss:", loss_, loss_vec_)
                print("Validation PCC:", Helpers.pearson_numpy(np.squeeze(true_dihedrals_masked_v)[:, :n_angles],
                                                               np.array(rad_pred_masked_v)))

            # epoch boundary: restart the within-epoch counter and decay the
            # learning rate that is fed on the following steps
            if step * batch_size > data_handler.training_samples:
                step = 0
                init_learning_rate = init_learning_rate * learning_rate_decay
                print("EPOCH. New learning rate:", init_learning_rate)

            step += 1

    except tf.errors.OutOfRangeError:
        # raised once the input queue has served num_epochs epochs
        print('Done training for %d epochs, %d steps.' % (num_epochs, total_steps))
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()

        # Wait for threads to finish. The session itself is closed by the
        # enclosing `with` block, so no explicit sess.close() is needed.
        coord.join(threads)

Train loss: 2.4014218 [1.6139904 1.438793 ]
Validation loss: 1.8602278 [0.6995032 1.4618758]
Validation PCC: [-0.10736009, 0.457409]
Train loss: 2.62986 [2.30544   1.5256162]
Validation loss: 2.7191503 [2.3994117 1.5529338]
Validation PCC: [0.14970605, 0.4642586]
Train loss: 2.7000566 [2.373892  1.5611925]
Validation loss: 2.675693 [2.372733  1.5560852]
Validation PCC: [0.17334868, 0.50076085]
Train loss: 2.654337 [2.3415303 1.5574226]
Validation loss: 2.5923758 [2.2988126 1.5381546]
Validation PCC: [0.17947398, 0.5527613]
Train loss: 2.6122828 [2.3220274 1.5512503]
Validation loss: 2.589622 [2.3278732 1.5363407]
Validation PCC: [0.20328878, 0.5514715]
Train loss: 2.586765 [2.311069  1.5344332]
Validation loss: 2.551374 [2.2866392 1.5057542]
Validation PCC: [0.18862274, 0.58277524]
Train loss: 2.4619086 [2.1214828 1.5182723]
Validation loss: 2.331348 [2.3050427 1.3218483]
Validation PCC: [0.1123547, 0.5692802]
Train loss: 1.9133611 [1.7186733 1.3661236]
Validation loss: 1.537757 [1.113

KeyboardInterrupt: 

Plot the average training loss over time

In [None]:
# avg_losses holds one mean training loss per logging interval of the
# training loop, so the x axis counts logging intervals, not single steps.
fig, ax = plt.subplots()
ax.plot(avg_losses)
ax.set_xlabel('Logging interval')
ax.set_ylabel('Mean training loss')
ax.set_title('Average training loss over time');

Fetch just the dihedral angles to check whether they resemble what a Ramachandran plot should look like

In [None]:
with tf.Session() as sess:
    # A fresh session: variables are initialized again and no training op is
    # run here; only one batch of inputs and targets is fetched.
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    try:
        true_dihedrals_, ohp, tert = sess.run([true_dihedrals, one_hot_primary, tertiary])
    finally:
        # Stop the queue-runner threads and wait for them before the session
        # closes, also when the fetch above fails (same shutdown sequence as
        # the training cell).
        coord.request_stop()
        coord.join(threads)

In [None]:
# No visible cell assigns true_dihedrals_v or pred_v, so derive them from the
# arrays of the last validation pass, aligned exactly as the training loop
# aligns them for its PCC report.
true_dihedrals_v = np.squeeze(true_dihedrals_masked_v)[:, :n_angles]
pred_v = np.array(rad_pred_masked_v)

# Plain mean absolute difference (no angular wrap-around) next to the most
# recent value of loss_.
np.mean(np.abs(true_dihedrals_v - pred_v)), loss_

In [None]:
# NOTE(review): `phi` is only assigned by the scatter-plot cell that follows,
# so on a fresh kernel this cell has to be run after that one.
phi.shape

In [None]:
protein_n = 0  # index used by the histogram cell further down

# No visible cell assigns true_dihedrals_v or pred_v, so derive them from the
# arrays of the last validation pass, aligned exactly as the training loop
# aligns them for its PCC report.
true_dihedrals_v = np.squeeze(true_dihedrals_masked_v)[:, :n_angles]
pred_v = np.array(rad_pred_masked_v)


def _split_angles(angles):
    """Split the last axis into single-channel arrays and return (phi, psi, omega).

    The number of sections follows the size of the last axis instead of being
    fixed at 3, so two-channel (phi, psi) arrays work too; omega is None when
    there is no third channel.
    """
    channels = np.split(angles, angles.shape[-1], -1)
    third = channels[2] if len(channels) > 2 else None
    return channels[0], channels[1], third


fig, ax = plt.subplots(figsize=(12, 8))

phi, psi, omega = _split_angles(pred_v)
ax.scatter(phi[:500], psi[:500], s=1, label='pred')

# phi / psi / omega keep the true values after this cell, as before
phi, psi, omega = _split_angles(true_dihedrals_v)
ax.scatter(phi[:500], psi[:500], s=1, label='true')

ax.set_xlabel('phi')
ax.set_ylabel('psi')
ax.legend()

In [None]:
# NOTE(review): `omega` and `protein_n` are assigned by the scatter-plot cell
# above. The graph cell sets n_angles = 2 and slices true_dihedrals to
# n_angles channels, so confirm that a third (omega) channel still exists
# before relying on this histogram.
plt.hist(omega[protein_n], bins=100)

In [None]:
#                 # access trainable variables to see if they're training
#                 variables_names = [v.name for v in tf.trainable_variables()]
#                 values = sess.run(variables_names)
#                 for k, v in zip(variables_names, values):
#                     print("Variable: ", k)
#                     print("Shape: ", v.shape)
#                     print(v)