In [1]:
import itertools
import numpy as np
import pandas as pd
import tensorflow as tf

class FrameConverter:
    def __init__(self, X_transforms=[], y_transforms=[], repeat_count=1, n_parallel=1):
        self.filename_base = '/home/data/full/frame/{}{}.tfrecord'
        self.X_transforms = X_transforms
        self.y_transforms = y_transforms
        self.repeat_count = repeat_count
        self.n_parallel = n_parallel
        
        self.keys_to_features = {
            'rgb': tf.FixedLenSequenceFeature([], tf.string, allow_missing=True),
            'audio': tf.FixedLenSequenceFeature([], tf.string, allow_missing=True),
        }
        self.key_to_label = {
            'labels': tf.VarLenFeature(tf.int64)
        }
        
    def get_train_data(self, filename):
        y, X = tf.parse_single_sequence_example(filename,
                                                self.key_to_label,
                                                self.keys_to_features)
        # X is still bytes; convert to float
        X['audio'] = tf.cast(tf.decode_raw(X['audio'], tf.uint8), tf.float32)
        X['rgb'] = tf.cast(tf.decode_raw(X['rgb'], tf.uint8), tf.float32)

        # now apply custom transformations
        for transform in self.X_transforms:
            X = transform(X)

        y = tf.sparse_to_dense(y['labels'].values, [3862], 1)
        for transform in self.y_transforms:
            y = transform(y)
        return X, y

    def get_test_data(self, filename):
        X = tf.parse_single_sequence_example(filename, None, self.keys_to_features)[1]
        
        # X is still bytes; convert to float
        X['audio'] = tf.cast(tf.decode_raw(X['audio'], tf.uint8), tf.float32)
        X['rgb'] = tf.cast(tf.decode_raw(X['rgb'], tf.uint8), tf.float32)

        # now apply custom transformations
        for transform in self.X_transforms:
            X = transform(X)
        return X
    
    def make_provider(self, subset, record_indices):
        filenames = [self.filename_base.format(subset, index) for index in record_indices]
        
        dataset = tf.data.TFRecordDataset(filenames)
        
        if subset == 'train':
            dataset = dataset.map(self.get_train_data,
                                  num_parallel_calls=self.n_parallel)
            dataset = dataset.repeat(self.repeat_count)
            dataset = dataset.shuffle(buffer_size=256)
        else:
            dataset = dataset.map(self.get_test_data,
                                  num_parallel_calls=self.n_parallel)
        dataset = dataset.batch(1)
        dataset = dataset.prefetch(1)
        iterator = dataset.make_one_shot_iterator()
        return iterator
    
    def make_generator(self, subset, record_indices):
        provider = self.make_provider(subset, record_indices)
        sess = tf.Session()
        next_sample = provider.get_next()
        while True:
            try:
                yield sess.run(next_sample)
            except tf.errors.OutOfRangeError:
                print("Iterations exhausted")
                break
    
frame_converter = FrameConverter()
train_generator = frame_converter.make_generator('train', [2500])
valid_generator = frame_converter.make_generator('validate', [2000])

In [9]:
next(train_generator)

({'audio': array([[[173.,  27., 126., ..., 133., 130., 187.],
          [139.,  36., 100., ..., 107.,  98.,   0.],
          [138.,  65., 101., ...,  84.,  77., 231.],
          ...,
          [158.,  35., 123., ..., 103.,  87., 223.],
          [173.,  27., 126., ..., 133., 130., 187.],
          [173.,  27., 126., ..., 133., 130., 187.]]], dtype=float32),
  'rgb': array([[[ 72.,  96., 129., ..., 128.,  85., 190.],
          [ 72., 102., 130., ..., 127.,  84., 186.],
          [  0.,  59., 184., ..., 197., 105., 119.],
          ...,
          [176., 115., 117., ...,  26., 189., 166.],
          [ 77.,  96., 131., ..., 124.,  85., 173.],
          [ 76.,  96., 131., ..., 126.,  88., 176.]]], dtype=float32)},
 array([[0, 0, 1, ..., 0, 0, 0]], dtype=int32))

In [2]:
from tensorflow.python.keras.layers import Input, Dense, GRU, Flatten, Add
from tensorflow.python.keras.models import Model

n_classes = 3862
rgb_in = Input((None, 1024), name='rgb')
audio_in = Input((None, 128), name='audio')
rgb_mid = GRU(64, activation='relu')(rgb_in)
audio_mid = GRU(64, activation='relu')(audio_in)
combined_mid = Add()([rgb_mid, audio_mid])
out = Dense(32, activation='relu')(combined_mid)
out = Dense(n_classes, activation='softmax')(out)
model = Model([rgb_in, audio_in], out)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [4]:
model.fit_generator(train_generator, steps_per_epoch=30, epochs=1,
                    validation_data=valid_generator,
                    validation_steps=20)

Epoch 1/1

AttributeError: 'str' object has no attribute 'ndim'

In [10]:
import numpy as np
model.predict([np.random.random((10, 300, 1024)), np.random.random((10, 300, 128))]).shape

(10, 3862)