In [1]:
import itertools
import numpy as np
import pandas as pd
import tensorflow as tf

class FrameConverter:
    def __init__(self, X_transforms=[], y_transforms=[], repeat_count=1, n_parallel=1):
        self.filename_base = '/home/data/full/frame/{}{}.tfrecord'
        self.X_transforms = X_transforms
        self.y_transforms = y_transforms
        self.repeat_count = repeat_count
        self.n_parallel = n_parallel
        
        self.keys_to_features = {
            'rgb': tf.FixedLenSequenceFeature([], tf.string, allow_missing=True),
            'audio': tf.FixedLenSequenceFeature([], tf.string, allow_missing=True),
        }
        self.key_to_label = {
            'labels': tf.VarLenFeature(tf.int64)
        }
        
    def get_train_data(self, filename):
        y, X = tf.parse_single_sequence_example(filename,
                                                self.key_to_label,
                                                self.keys_to_features)
        # X is still bytes; convert to float
        X['audio'] = tf.cast(tf.decode_raw(X['audio'], tf.uint8), tf.float32)
        X['rgb'] = tf.cast(tf.decode_raw(X['rgb'], tf.uint8), tf.float32)

        # now apply custom transformations
        for transform in self.X_transforms:
            X = transform(X)

        y = tf.sparse_to_dense(y['labels'].values, [3862], 1)
        for transform in self.y_transforms:
            y = transform(y)
        return X, y

    def get_test_data(self, filename):
        X = tf.parse_single_sequence_example(filename, None, self.keys_to_features)[1]
        
        # X is still bytes; convert to float
        X['audio'] = tf.cast(tf.decode_raw(X['audio'], tf.uint8), tf.float32)
        X['rgb'] = tf.cast(tf.decode_raw(X['rgb'], tf.uint8), tf.float32)

        # now apply custom transformations
        for transform in self.X_transforms:
            X = transform(X)
        return X
    
    def make_spec(self, subset, record_indices):
        filenames = [self.filename_base.format(subset, index) for index in record_indices]
        
        dataset = tf.data.TFRecordDataset(filenames)
        
        if subset == 'train':
            dataset = dataset.map(self.get_train_data,
                                  num_parallel_calls=self.n_parallel)
            dataset = dataset.repeat(self.repeat_count)
            dataset = dataset.shuffle(buffer_size=256)
        else:
            dataset = dataset.map(self.get_test_data,
                                  num_parallel_calls=self.n_parallel)
        dataset = dataset.batch(1)
        dataset = dataset.prefetch(1)
        iterator = dataset.make_one_shot_iterator()
        return iterator
    
frame_converter = FrameConverter()

train_iterator = frame_converter.make_spec('train', [2500])
train_sess = tf.Session()
train_next_sample = train_iterator.get_next()
train_sample = train_sess.run(train_next_sample)
print(train_sample[0])
print(train_sample[0]['rgb'].shape)
print(train_sample[0]['audio'].shape)

test_iterator = frame_converter.make_spec('test', [2000])
test_sess = tf.Session()
test_next_sample = test_iterator.get_next()
test_sample = test_sess.run(test_next_sample)
print(test_sample)
print(test_sample['rgb'].shape)
print(test_sample['audio'].shape)

{'audio': array([[[140., 105., 169., ..., 140., 199.,  78.],
        [182., 180., 131., ..., 255.,  70.,   0.],
        [202., 134., 182., ..., 112., 189.,   0.],
        ...,
        [173.,  27., 126., ..., 133., 130., 187.],
        [173.,  27., 126., ..., 133., 130., 187.],
        [173.,  27., 126., ..., 133., 130., 187.]]], dtype=float32), 'rgb': array([[[  0.,  88., 144., ..., 169., 122., 121.],
        [137., 106., 148., ..., 203., 100., 158.],
        [ 68., 152., 164., ..., 244.,  67., 121.],
        ...,
        [ 14., 140., 255., ...,  60., 201.,  78.],
        [  3., 152., 255., ...,  63., 129., 143.],
        [ 31., 133., 255., ..., 133., 168., 155.]]], dtype=float32)}
(1, 300, 1024)
(1, 300, 128)
{'audio': array([[[106.,  73., 193., ...,  81.,  82., 255.],
        [108.,  73., 233., ..., 169.,  19.,  70.],
        [101.,  68., 213., ...,  70.,  49., 159.],
        ...,
        [ 78.,  82., 202., ..., 188., 140., 255.],
        [ 84.,  73., 177., ...,   0., 193., 255.],
  

In [2]:
from tensorflow.python.keras.layers import Input, Dense, GRU, Flatten, Add
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.estimator import model_to_estimator

n_classes = 3862
rgb_in = Input((None, 1024), name='rgb')
audio_in = Input((None, 128), name='audio')
rgb_mid = GRU(64, activation='relu')(rgb_in)
audio_mid = GRU(64, activation='relu')(audio_in)
model = Model([rgb_in, audio_in], audio_mid)
combined_mid = Add()([rgb_mid, audio_mid])
out = Dense(32, activation='relu')(combined_mid)
out = Dense(n_classes, activation='softmax')(out)
model = Model([rgb_in, audio_in], out)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])
estimator = model_to_estimator(keras_model=model,
                               model_dir='/home/models')

INFO:tensorflow:Using the Keras model provided.
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/models', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f018f8a4400>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [4]:
import numpy as np
model.predict([np.random.random((10, 300, 1024)), np.random.random((10, 300, 128))]).shape

(10, 3862)

In [7]:
train_spec = tf.estimator.TrainSpec(input_fn=lambda: train_next_sample,
                                    max_steps=5)
eval_spec = tf.estimator.EvalSpec(input_fn=lambda: test_next_sample)

In [11]:
train_sess.run(train_next_sample)[0]['rgb'].shape

(1, 235, 1024)

In [13]:
estimator.train(lambda: train_next_sample)

INFO:tensorflow:Calling model_fn.


ValueError: Shape of a new variable (gru_1/gru_cell/kernel) must be fully defined, but instead was (?, 192).

In [11]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
rgb (InputLayer)                (None, None, 1024)   0                                            
__________________________________________________________________________________________________
audio (InputLayer)              (None, None, 128)    0                                            
__________________________________________________________________________________________________
gru_1 (GRU)                     (None, 64)           209088      rgb[0][0]                        
__________________________________________________________________________________________________
gru_2 (GRU)                     (None, 64)           37056       audio[0][0]                      
__________________________________________________________________________________________________
add_1 (Add