In [1]:
%matplotlib inline
import tensorflow as tf
import os
import utils

In [2]:
# Pipeline configuration shared by the cells below.

# Length of the dense multi-hot label vector built for every example.
num_classes = 527

# Width of one frame vector for each entry of `feature_names` (same order).
feature_sizes = [128]

# Keys of the per-frame sequence features parsed from each serialized example.
feature_names = ["audio_embedding"]

# Every feature matrix is truncated or zero-padded to this many rows.
max_frames = 300

In [3]:
def read_and_decode(filename_queue):
    """Reads one serialized example from a TFRecord queue and batches its labels.

    Args:
      filename_queue: queue of TFRecord file names, handed to
        `tf.TFRecordReader.read`.
    Returns:
      The shuffled label batch produced by `tf.train.shuffle_batch` for the
      single `labels` input (batch_size=10).
    """
    reader = tf.TFRecordReader()

    _, serialized_example = reader.read(filename_queue)

    features = tf.parse_single_example(
        serialized_example,
        # No defaults are given for the fixed-length keys; only 'labels' is
        # declared variable-length.
        features={
            'video_id': tf.FixedLenFeature([], tf.string),
            'start_time_seconds': tf.FixedLenFeature([], tf.float32),
            'end_time_seconds': tf.FixedLenFeature([], tf.float32),
            'labels': tf.VarLenFeature(tf.int64),
            'audio_embedding': tf.FixedLenFeature([], tf.string)
        }
    )

    # 'labels' is parsed as a variable-length int64 feature; cast its values
    # to int32. The other parsed features are currently not decoded.
    labels = tf.cast(features['labels'], tf.int32)

    labels_batch = tf.train.shuffle_batch(
        [labels],
        batch_size=10,
        capacity=30,
        num_threads=10,
        min_after_dequeue=10
    )

    # Return the batched labels. Returning the unbatched `labels` would leave
    # the shuffle queue built above unused.
    return labels_batch



In [4]:
def resize_axis(tensor, axis, new_size, fill_value=0):
    """Truncates or pads a tensor to new_size on a given axis.

    After the call, tensor.shape[axis] == new_size. Entries beyond new_size
    are dropped; if the tensor is shorter, fill_value is appended at the end
    of the axis.

    Args:
      tensor: The tensor to be resized.
      axis: An integer representing the dimension to be sliced.
      new_size: An integer or 0d tensor representing the new value for
        tensor.shape[axis].
      fill_value: Value to use to fill any new entries in the tensor. Will be
        cast to the type of tensor.
    Returns:
      The resized tensor.
    """
    tensor = tf.convert_to_tensor(tensor)
    dynamic_dims = tf.unstack(tf.shape(tensor))

    # Extent of the padding block: identical to the input on every axis except
    # `axis`, where it covers whatever is missing (never negative).
    padding_dims = list(dynamic_dims)
    padding_dims[axis] = tf.maximum(0, new_size - dynamic_dims[axis])

    # Extent of the part of the input that survives truncation.
    kept_dims = list(dynamic_dims)
    kept_dims[axis] = tf.minimum(dynamic_dims[axis], new_size)
    kept_extent = tf.stack(kept_dims)

    kept_part = tf.slice(tensor, tf.zeros_like(kept_extent), kept_extent)
    fill_part = tf.fill(tf.stack(padding_dims),
                        tf.cast(fill_value, tensor.dtype))
    resized = tf.concat([kept_part, fill_part], axis)

    # Record the now-known size of `axis` in the static shape; as_list()
    # returns a fresh list, so the input's static shape is left untouched.
    static_dims = tensor.get_shape().as_list()
    static_dims[axis] = new_size
    resized.set_shape(static_dims)
    return resized

In [5]:
def get_video_matrix(features,
                      feature_size,
                      smax_frames,
                      max_quantized_value,
                      min_quantized_value):
    """Decodes features from an input string and dequantizes them.

    Args:
      features: raw feature values (byte strings decoded as uint8)
      feature_size: length of each frame feature vector
      smax_frames: number of frames (rows) in the output feature_matrix.
        The name is a misspelling of `max_frames`; it is kept so the
        signature stays unchanged for existing callers.
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.
    Returns:
      feature_matrix: matrix of all frame-features, resized to smax_frames
        rows
      num_frames: number of frames in the sequence, capped at smax_frames
    """
    decoded_features = tf.reshape(
        tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
        [-1, feature_size])

    # Use the caller-supplied frame limit rather than a module-level global,
    # so the argument actually controls the output size.
    num_frames = tf.minimum(tf.shape(decoded_features)[0], smax_frames)
    # NOTE(review): utils.Dequantize is defined outside this file; presumably
    # it maps the uint8 range onto [min_quantized_value, max_quantized_value].
    feature_matrix = utils.Dequantize(decoded_features,
                                      max_quantized_value,
                                      min_quantized_value)
    feature_matrix = resize_axis(feature_matrix, 0, smax_frames)
    return feature_matrix, num_frames

In [11]:
def prepare_serialized_examples(serialized_example,
    max_quantized_value=2, min_quantized_value=-2):
    """Parses one serialized SequenceExample and returns shuffled batches.

    Reads the module-level `feature_names`, `feature_sizes`, `num_classes`
    and `max_frames` settings.

    Args:
      serialized_example: scalar string tensor holding one serialized
        SequenceExample.
      max_quantized_value: the maximum of the quantized value, forwarded to
        get_video_matrix.
      min_quantized_value: the minimum of the quantized value, forwarded to
        get_video_matrix.
    Returns:
      A tuple (video_id_batch, video_matrix_batch, labels_batch,
      num_frames_batch) produced by `tf.train.shuffle_batch` with
      batch_size=10.
    """
    contexts, features = tf.parse_single_sequence_example(
        serialized_example,
        context_features={
            "video_id": tf.FixedLenFeature([], tf.string),
            "labels": tf.VarLenFeature(tf.int64),
        },
        sequence_features={
            feature_name: tf.FixedLenSequenceFeature([], dtype=tf.string)
            for feature_name in feature_names
        })

    # read ground truth labels: multi-hot boolean vector of length num_classes
    labels = tf.cast(
        tf.sparse_to_dense(contexts["labels"].values, (num_classes,), 1,
                           validate_indices=False),
        tf.bool)

    # loads (potentially) different types of features and concatenates them
    num_features = len(feature_names)
    assert num_features > 0, "No feature selected: feature_names is empty!"

    assert len(feature_names) == len(feature_sizes), \
        "length of feature_names (={}) != length of feature_sizes (={})".format(
            len(feature_names), len(feature_sizes))

    # None marks "no feature processed yet"; a Python sentinel avoids
    # comparing a Tensor against an int with `==`.
    num_frames = None  # the number of frames in the video
    feature_matrices = [None] * num_features  # an array of different features
    for feature_index in range(num_features):
        feature_matrix, num_frames_in_this_feature = get_video_matrix(
            features[feature_names[feature_index]],
            feature_sizes[feature_index],
            max_frames,
            max_quantized_value,
            min_quantized_value)

        if num_frames is None:
            num_frames = num_frames_in_this_feature
        else:
            # Attach the assertion as a control dependency; an assert op that
            # nothing depends on is never executed in graph mode.
            frames_match = tf.assert_equal(num_frames,
                                           num_frames_in_this_feature)
            with tf.control_dependencies([frames_match]):
                num_frames = tf.identity(num_frames)

        # Must happen for every feature, otherwise tf.concat below receives
        # None entries when more than one feature is configured.
        feature_matrices[feature_index] = feature_matrix

    # cap the number of frames at max_frames
    num_frames = tf.minimum(num_frames, max_frames)

    # concatenate different features along the feature axis
    video_matrix = tf.concat(feature_matrices, 1)

    # convert to batch format.
    video_id = contexts["video_id"]

    # TODO: Do proper batch reads to remove the IO bottleneck.
    video_id_batch, video_matrix_batch, labels_batch, num_frames_batch = tf.train.shuffle_batch(
        [video_id, video_matrix, labels, num_frames],
        batch_size=10,
        capacity=30,
        num_threads=5,
        min_after_dequeue=10
    )

    return video_id_batch, video_matrix_batch, labels_batch, num_frames_batch

In [13]:
# Import data
bal_train_dir = '../data/audioset_v1_embeddings/bal_train/'

# Collect every .tfrecord file in the directory. Sorting makes the list
# independent of the order os.listdir happens to return.
tfrecord_files = sorted(
    os.path.join(bal_train_dir, filename)
    for filename in os.listdir(bal_train_dir)
    if filename.endswith(".tfrecord")
)

filename_queue = tf.train.string_input_producer(
    tfrecord_files,
    num_epochs=10
)

reader = tf.TFRecordReader()

_, serialized_example = reader.read(filename_queue)

video_id, video_matrix, labels, num_frames = prepare_serialized_examples(
    serialized_example
)

print(video_id)
print(video_matrix)
print(labels)
print(num_frames)


# The op for initializing the variables. The local initializer is included
# because string_input_producer is given num_epochs.
init_op = tf.group(tf.global_variables_initializer(),
                   tf.local_variables_initializer())

with tf.Session() as sess:

    sess.run(init_op)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        # Let's read off 3 batches just for example
        for i in range(3):

            vid, vidmat, lab = sess.run([video_id, video_matrix, labels])
            print(vid.shape)
            print(vid)
            print(lab.shape)
            print(lab)
            print(vidmat.shape)
            print(vidmat[0])

            print('\n current batch')
    finally:
        # Stop and join the queue-runner threads before the session closes, so
        # no thread is left enqueueing into a closed session.
        coord.request_stop()
        coord.join(threads)

Tensor("shuffle_batch_3:0", shape=(10,), dtype=string)
Tensor("shuffle_batch_3:1", shape=(10, 300, 128), dtype=float32)
Tensor("shuffle_batch_3:2", shape=(10, 527), dtype=bool)
Tensor("shuffle_batch_3:3", shape=(10,), dtype=int32)
(10,)
[b'kWjt5M1_xbY' b'MKwnxr3ypB4' b'MKM3Rs5Kg8c' b'MbOu1rlgL3U' b'MK1NzQm9XaQ'
 b'UlqBFf63wXE' b'9332PjiYkhI' b'MKqXe4X6g-8' b'kWTuTNmyers' b'93b0scnqSOw']
(10, 527)
[[False False False ..., False False False]
 [ True False False ..., False False False]
 [False False False ..., False False False]
 ..., 
 [ True False False ..., False False False]
 [False False False ..., False False False]
 [False False False ..., False False False]]
(10, 300, 128)
[[-0.26669717  0.45487142  0.98820472 ..., -1.9921875  -0.09414816
  -0.32944226]
 [-0.4235599   0.23526359  0.0470283  ..., -1.9921875   2.0078125
  -0.03140306]
 [-0.98826587  0.81565571 -0.07846189 ..., -0.9725796   2.0078125
   0.48624396]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.    