In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import tensorflow as tf
import time

NUM_AAS = 20
NUM_DIMENSIONS = 3

def masking_matrix(mask, name=None):
    """ Constructs a masking matrix to zero out pairwise distances due to missing residues or padding. 

    Args:
        mask: 0/1 vector indicating whether a position should be masked (0) or not (1)

    Returns:
        A square matrix with all 1s except for rows and cols whose corresponding indices in mask are set to 0.
        [MAX_SEQ_LENGTH, MAX_SEQ_LENGTH]
    """

    with tf.name_scope(name, 'masking_matrix', [mask]) as scope:
        mask = tf.convert_to_tensor(mask, name='mask')

        mask = tf.expand_dims(mask, 0)
        base = tf.ones([tf.size(mask), tf.size(mask)])
        matrix_mask = base * mask * tf.transpose(mask)

        return matrix_mask
        
def read_protein(filename_queue, max_length, num_evo_entries=21, name=None):
    """ Reads and parses a ProteinNet TF Record. 

        Primary sequences are mapped onto 20-dimensional one-hot vectors.
        Evolutionary sequences are mapped onto num_evo_entries-dimensional real-valued vectors.
        Secondary structures are mapped onto ints indicating one of 8 class labels.
        Tertiary coordinates are flattened so that there are 3 times as many coordinates as 
        residues.

        Evolutionary, secondary, and tertiary entries are optional.

    Args:
        filename_queue: TF queue for reading files
        max_length:     Maximum length of sequence (number of residues) [MAX_LENGTH]. Not a 
                        TF tensor and is thus a fixed value.

    Returns:
        id: string identifier of record
        one_hot_primary: AA sequence as one-hot vectors
        evolutionary: PSSM sequence as vectors
        secondary: DSSP sequence as int class labels
        tertiary: 3D coordinates of structure
        matrix_mask: Masking matrix to zero out pairwise distances in the masked regions
        pri_length: Length of amino acid sequence
        keep: True if primary length is less than or equal to max_length
    """

    with tf.name_scope(name, 'read_protein', []) as scope:
        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue)

        context, features = tf.parse_single_sequence_example(serialized_example,
                                context_features={'id': tf.FixedLenFeature((1,), tf.string)},
                                sequence_features={
                                    'primary':      tf.FixedLenSequenceFeature((1,),               tf.int64),
                                    'evolutionary': tf.FixedLenSequenceFeature((num_evo_entries,), tf.float32, allow_missing=True),
                                    'secondary':    tf.FixedLenSequenceFeature((1,),               tf.int64,   allow_missing=True),
                                    'tertiary':     tf.FixedLenSequenceFeature((NUM_DIMENSIONS,),  tf.float32, allow_missing=True),
                                    'mask':         tf.FixedLenSequenceFeature((1,),               tf.float32, allow_missing=True)})
        id_ = context['id'][0]
        primary =   tf.to_int32(features['primary'][:, 0])
        evolutionary =          features['evolutionary']
        secondary = tf.to_int32(features['secondary'][:, 0])
        tertiary =              features['tertiary']
        mask =                  features['mask'][:, 0]

        pri_length = tf.size(primary)
        keep = pri_length <= max_length

        one_hot_primary = tf.one_hot(primary, NUM_AAS)

        # Generate tertiary masking matrix--if mask is missing then assume all residues are present
        mask = tf.cond(tf.not_equal(tf.size(mask), 0), lambda: mask, lambda: tf.ones([pri_length]))
        ter_mask = masking_matrix(mask, name='ter_mask')        

        return id_, one_hot_primary, evolutionary, secondary, tertiary, ter_mask, pri_length, keep

  from ._conv import register_converters as _register_converters


In [54]:
def dihedral(p):
    """Praxeolitic formula
    1 sqrt, 1 cross product"""
    p0 = p[0]
    p1 = p[1]
    p2 = p[2]
    p3 = p[3]

    b0 = -1.0*(p1 - p0)
    b1 = p2 - p1
    b2 = p3 - p2

    # normalize b1 so that it does not influence magnitude of vector
    # rejections that come next
    b1 /= np.linalg.norm(b1)

    # vector rejections
    # v = projection of b0 onto plane perpendicular to b1
    #   = b0 minus component that aligns with b1
    # w = projection of b2 onto plane perpendicular to b1
    #   = b2 minus component that aligns with b1
    v = b0 - np.dot(b0, b1)*b1
    w = b2 - np.dot(b2, b1)*b1

    # angle between v and w in a plane is the torsion angle
    # v and w may not be normalized but that's fine since tan is y/x
    x = np.dot(v, w)
    y = np.dot(np.cross(b1, v), w)
    return np.degrees(np.arctan2(y, x))

def tf_rad2deg(rad):
    pi_on_180 = 0.017453292519943295
    return rad / pi_on_180

def dihedral_tf1(p):
    p0 = tf.gather(p, 0)
    p1 = tf.gather(p, 1)
    p2 = tf.gather(p, 2)
    p3 = tf.gather(p, 3)
    
    b0 = -1.0 * (tf.subtract(p1, p0))
    b1 = tf.subtract(p2, p1)
    b2 = tf.subtract(p3, p2)
    
    b1 = tf.divide(b1, tf.norm(b1))
    
    v = tf.subtract(b0, tf.multiply(tf.tensordot(b0, b1, 1), b1))
    w = tf.subtract(b2, tf.multiply(tf.tensordot(b2, b1, 1), b1))
    
    x = tf.tensordot(v, w, 1)
    y = tf.tensordot(tf.cross(b1, v), w, 1)
    
    return tf_rad2deg(tf.atan2(y,x))

def dihedral_tf2(p):
    p0 = tf.gather(p, 0, axis=1)
    p1 = tf.gather(p, 1, axis=1)
    p2 = tf.gather(p, 2, axis=1)
    p3 = tf.gather(p, 3, axis=1)
    
    b0 = -1.0 * (tf.subtract(p1, p0))
    b1 = tf.subtract(p2, p1)
    b2 = tf.subtract(p3, p2)
    
    
    b1 = tf.divide(b1, tf.norm(b1, axis=0))
    
    v = tf.subtract(b0, tf.multiply(tf.tensordot(b0, b1, 2), b1))
    w = tf.subtract(b2, tf.multiply(tf.tensordot(b2, b1, 2), b1))
    

    x = tf.reduce_sum( tf.multiply( v, w ), 1, keep_dims=True )
    y = tf.reduce_sum( tf.multiply( tf.cross(b1, v), w ), 1, keep_dims=True )

    return tf_rad2deg(tf.atan2(y,x))

def dihedral_tf3(p):
    p0 = tf.gather(p, 0, axis=2)
    p1 = tf.gather(p, 1, axis=2)
    p2 = tf.gather(p, 2, axis=2)
    p3 = tf.gather(p, 3, axis=2)
    
    b0 = -1.0 * (tf.subtract(p1, p0))
    b1 = tf.subtract(p2, p1)
    b2 = tf.subtract(p3, p2)
    
    
    b1 = tf.divide(b1, tf.expand_dims(tf.norm(b1, axis=1), axis=1))
    
    v = tf.subtract(b0, tf.multiply(tf.tensordot(b0, b1, 3), b1))
    w = tf.subtract(b2, tf.multiply(tf.tensordot(b2, b1, 3), b1))
    

    x = tf.reduce_sum( tf.multiply( v, w ), 2, keep_dims=True )
    y = tf.reduce_sum( tf.multiply( tf.cross(b1, v), w ), 2, keep_dims=True )

    return tf_rad2deg(tf.atan2(y,x))

In [55]:
list_ = [1,2,3,4,5,6]
list_[slice(1,3)]

[2, 3]

We need 4 coordinates to calculate an angle but the proteins are organized in 3x3 matricies. Thus how do we calculate an angle. Take 

In [60]:
test1 = test[0]
test_full = test
print(test1.shape)

p1 = np.array([[
                [ 1,           0,         0     ],
                [ 0,           0,         0     ],
                [ 0,           0,         1     ],
                [ 0.999999,    0.000001,  1     ],
                [ 0.999999,    0.000001,  1     ],
                [ 0.999999,    0.000001,  1     ],
                [ 0.999999,    0.000001,  1     ]
            ],[
                [ 1,           0,         0     ],
                [ 0,           0,         0     ],
                [ 0,           0,         1     ],
                [ 0.999999,    0.000001,  1     ],
                [ 0.999999,    0.000001,  1     ],
                [ 0.999999,    0.000001,  1     ],
                [ 0.999999,    0.000001,  1     ]
            ]])
print(p1.shape)
p1_tf = tf.convert_to_tensor(p1[0])
test1 = p1[0]
test_full

r = test1.shape[0]
n = 4
a_list = list(range(r))
the_list1 = np.array([a_list[slice(i, i+n)] for i in range(r - n+1)])
print(the_list1.shape)

r = test_full.shape[1]
n = 4
a_list = list(range(r))
the_list2 = [a_list[slice(i, i+n)] for i in range(r - n+1)]
the_list2 = np.array([the_list2 for _ in range(test_full.shape[0])])
the_list2 = the_list2
print(the_list2.shape)
# for i in range(len(r) - n + 1):
#     r[i: i + n]

p1_tf_stacked = tf.stack(tf.gather(test1, the_list1))

p1_tf_stacked_full = tf.stack(tf.gather(test1, the_list2))


angle1 = dihedral_tf1(p1_tf)
angle2 = dihedral_tf2(p1_tf_stacked)
angle3 = dihedral_tf3(p1_tf_stacked_full)


with tf.Session() as sess:
    p1_tf_stacked_, p1_tf_stacked_full_, angle1_, angle2_, angle3_ = sess.run([p1_tf_stacked, p1_tf_stacked_full, angle1, angle2, angle3])

dihedral(p1), np.array(p1_tf_stacked_).shape, np.array(p1_tf_stacked_full_).shape, angle1_, angle2_.shape, angle3_.shape

(2448, 3)
(7, 3)
(2445, 4)
(2, 2445, 4)


(5.729583680890003e-05,
 (2445, 4, 3),
 (2, 2445, 4, 3),
 5.7295836808900025e-05,
 (2445, 1),
 (2, 2445, 1))

In [7]:
tf.reset_default_graph()

num_epochs = 1
a_path = r'C:\Users\Michal\Desktop\ITU NLP\casp7\training\30\*'
# init_op = tf.group(tf.initialize_all_variables(), tf.initialize_local_variables())
base_names = glob.glob(a_path)
base_tensor = tf.convert_to_tensor(base_names[:1])
file_queue = tf.train.string_input_producer(
    base_tensor,
    num_epochs=num_epochs,
    shuffle=False # Note: must set shuffle to False
)

res = read_protein(file_queue, max_length=1000)
id_, one_hot_primary, evolutionary, secondary, tertiary, ter_mask, pri_length, keep = res
# one_hot_primary = tf.slice(one_hot_primary, 0, 10)

lstm_units = 5
batch_size=128
capacity=1000
min_after_dequeue=100
# ids, data, length = tf.train.shuffle_batch(
#       [id_, one_hot_primary, pri_length], 
#       batch_size=batch_size, 
#       capacity=capacity,
#       min_after_dequeue=min_after_dequeue)

ids, data, labels, length = tf.train.batch(
      [id_, one_hot_primary, tertiary, pri_length], 
      batch_size=batch_size, 
      capacity=capacity, dynamic_pad=True)

tile_test = tf.tile([tf.reduce_max(length)], (batch_size,))

# seq_length = tf.reduce_max(length)
cell = tf.nn.rnn_cell.LSTMCell(num_units=lstm_units, state_is_tuple=True)
outputs, states = tf.nn.bidirectional_dynamic_rnn(
    cell_fw=cell,
    cell_bw=cell,
    dtype=tf.float32,
    sequence_length=tile_test,
    inputs=data)
outputs_conc = tf.concat(outputs, 2)
pred = tf.layers.dense(outputs_conc, 3, activation=tf.nn.tanh, use_bias=False)
# loss = tf.losses.mean_squared_error(labels=labels, predictions=pred)

num_examples = 0
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    coord = tf.train.Coordinator()  
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)
#     batch = tf.train.shuffle_batch([id_], 10, 200, 100)
    try:
        step = 0
        while not coord.should_stop():
            start_time = time.time()
#             ids_, data_, labels_, length_ = sess.run([ids, data, labels, length])
            
#             out = sess.run(outputs)
            test, length_, outputs_, outputs_conc_, pred_ = sess.run([tile_test, length, outputs, outputs_conc, pred])
#             print(outputs_[0].shape, outputs_[1].shape, outputs_conc_.shape, pred_.shape, loss)
            
            labels_ = sess.run([labels])
            print(np.array(labels_).shape, np.array(labels_)[0][:2][:9])
            test = np.array(labels_)[0][:2]
#             print(ids_.shape, data_.shape, length_)
#             print "grabbing"
#             e, l = sess.run([example_batch, label_batch])
#             num_examples = num_examples + ids_.shape[0]
#             print "num_examples = " + str(num_examples)
            duration = time.time() - start_time

    except tf.errors.OutOfRangeError:
        print('Done training for %d epochs, %d steps.' % (num_epochs, step))
#         print(num_examples)
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)
        sess.close()

(1, 128, 2448, 3) [[[    0.      0.      0. ]
  [    0.      0.      0. ]
  [    0.      0.      0. ]
  ...
  [    0.      0.      0. ]
  [    0.      0.      0. ]
  [    0.      0.      0. ]]

 [[ 5359.8 -1225.2   739.9]
  [ 5311.6 -1228.6   880.8]
  [ 5275.1 -1087.3   924.9]
  ...
  [    0.      0.      0. ]
  [    0.      0.      0. ]
  [    0.      0.      0. ]]]
Done training for 1 epochs, 0 steps.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Number of parameters: 8604656
