In [41]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import glob
import tensorflow as tf
import time

NUM_AAS = 20
NUM_DIMENSIONS = 3

def masking_matrix(mask, name=None):
    """ Constructs a masking matrix to zero out pairwise distances due to missing residues or padding. 

    Args:
        mask: 0/1 vector indicating whether a position should be masked (0) or not (1)

    Returns:
        A square matrix with all 1s except for rows and cols whose corresponding indices in mask are set to 0.
        [MAX_SEQ_LENGTH, MAX_SEQ_LENGTH]
    """

    with tf.name_scope(name, 'masking_matrix', [mask]) as scope:
        mask = tf.convert_to_tensor(mask, name='mask')

        mask = tf.expand_dims(mask, 0)
        base = tf.ones([tf.size(mask), tf.size(mask)])
        matrix_mask = base * mask * tf.transpose(mask)

        return matrix_mask
        
def read_protein(filename_queue, max_length, num_evo_entries=21, name=None):
    """ Reads and parses a ProteinNet TF Record. 

        Primary sequences are mapped onto 20-dimensional one-hot vectors.
        Evolutionary sequences are mapped onto num_evo_entries-dimensional real-valued vectors.
        Secondary structures are mapped onto ints indicating one of 8 class labels.
        Tertiary coordinates are flattened so that there are 3 times as many coordinates as 
        residues.

        Evolutionary, secondary, and tertiary entries are optional.

    Args:
        filename_queue: TF queue for reading files
        max_length:     Maximum length of sequence (number of residues) [MAX_LENGTH]. Not a 
                        TF tensor and is thus a fixed value.

    Returns:
        id: string identifier of record
        one_hot_primary: AA sequence as one-hot vectors
        evolutionary: PSSM sequence as vectors
        secondary: DSSP sequence as int class labels
        tertiary: 3D coordinates of structure
        matrix_mask: Masking matrix to zero out pairwise distances in the masked regions
        pri_length: Length of amino acid sequence
        keep: True if primary length is less than or equal to max_length
    """

    with tf.name_scope(name, 'read_protein', []) as scope:
        reader = tf.TFRecordReader()
        _, serialized_example = reader.read(filename_queue)

        context, features = tf.parse_single_sequence_example(serialized_example,
                                context_features={'id': tf.FixedLenFeature((1,), tf.string)},
                                sequence_features={
                                    'primary':      tf.FixedLenSequenceFeature((1,),               tf.int64),
                                    'evolutionary': tf.FixedLenSequenceFeature((num_evo_entries,), tf.float32, allow_missing=True),
                                    'secondary':    tf.FixedLenSequenceFeature((1,),               tf.int64,   allow_missing=True),
                                    'tertiary':     tf.FixedLenSequenceFeature((NUM_DIMENSIONS,),  tf.float32, allow_missing=True),
                                    'mask':         tf.FixedLenSequenceFeature((1,),               tf.float32, allow_missing=True)})
        id_ = context['id'][0]
        primary =   tf.to_int32(features['primary'][:, 0])
        evolutionary =          features['evolutionary']
        secondary = tf.to_int32(features['secondary'][:, 0])
        tertiary =              features['tertiary']
        mask =                  features['mask'][:, 0]

        pri_length = tf.size(primary)
        keep = pri_length <= max_length

        one_hot_primary = tf.one_hot(primary, NUM_AAS)

        # Generate tertiary masking matrix--if mask is missing then assume all residues are present
        mask = tf.cond(tf.not_equal(tf.size(mask), 0), lambda: mask, lambda: tf.ones([pri_length]))
        ter_mask = masking_matrix(mask, name='ter_mask')        

        return id_, one_hot_primary, evolutionary, secondary, tertiary, ter_mask, pri_length, keep

['C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\1', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\10', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\11', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\12', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\13', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\14', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\15', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\16', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\17', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\18', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\19', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\2', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\20', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\21', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\casp7\\training\\30\\22', 'C:\\Users\\Michal\\Desktop\\ITU NLP\\cas

In [29]:
id_, one_hot_primary, evolutionary, secondary, tertiary, ter_mask, pri_length, keep = res

In [65]:
tf.reset_default_graph()

num_epochs = 2
a_path = r'C:\Users\Michal\Desktop\ITU NLP\casp7\training\30\*'
# init_op = tf.group(tf.initialize_all_variables(), tf.initialize_local_variables())
base_names = glob.glob(a_path)
base_tensor = tf.convert_to_tensor(base_names[:1])
file_queue = tf.train.string_input_producer(
    base_tensor,
    num_epochs=num_epochs,
    shuffle=False # Note: must set shuffle to False
)

res = read_protein(file_queue, max_length=100)
id_, one_hot_primary, evolutionary, secondary, tertiary, ter_mask, pri_length, keep = res
# one_hot_primary = tf.slice(one_hot_primary, 0, 10)

batch_size=32
capacity=1000
min_after_dequeue=100
# ids, data, length = tf.train.shuffle_batch(
#       [id_, one_hot_primary, pri_length], 
#       batch_size=batch_size, 
#       capacity=capacity,
#       min_after_dequeue=min_after_dequeue)

ids, data, length = tf.train.batch(
      [id_, one_hot_primary, pri_length], 
      batch_size=batch_size, 
      capacity=capacity, dynamic_pad=True)

init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

num_examples = 0
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    coord = tf.train.Coordinator()  
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)
#     batch = tf.train.shuffle_batch([id_], 10, 200, 100)
    try:
        step = 0
        while not coord.should_stop():
            start_time = time.time()
            ids_, data_, length_ = sess.run([ids, data, length])
            print(ids_.shape, data_.shape)
#             print "grabbing"
#             e, l = sess.run([example_batch, label_batch])
            num_examples = num_examples + e.shape[0]
#             print "num_examples = " + str(num_examples)
            duration = time.time() - start_time

    except tf.errors.OutOfRangeError:
        print('Done training for %d epochs, %d steps.' % (num_epochs, step))
        print(num_examples)
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)
        sess.close()

(32,) (32, 389, 20)
(32,) (32, 858, 20)
(32,) (32, 541, 20)
(32,) (32, 608, 20)
(32,) (32, 495, 20)
(32,) (32, 511, 20)
(32,) (32, 520, 20)
(32,) (32, 816, 20)
(32,) (32, 389, 20)
(32,) (32, 858, 20)
(32,) (32, 541, 20)
(32,) (32, 608, 20)
(32,) (32, 495, 20)
(32,) (32, 511, 20)
(32,) (32, 520, 20)
(32,) (32, 816, 20)
Done training for 2 epochs, 0 steps.
160
