Trying to train Hyperface model using VGG-16 as main model, as you can see it has like 34M of parameters... it will throw OOM.

**Important**. First you need to generate the dataset, see `aflw-dataset` notebook.

In [1]:
from keras import backend as K
from keras.models import Model 
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras import applications
import time
import numpy as np
import h5py

Using TensorFlow backend.


In [2]:
# el paper usa una entrada de 227x227
img_width, img_height=224,224

In [3]:
if K.image_data_format() == 'channels_first':
    input_shape = (3, img_width, img_height)
else:
    input_shape = (img_width, img_height, 3)

# https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
vgg16 = applications.VGG16(weights='imagenet', include_top=False, input_shape=input_shape)
vgg16.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [4]:
# hyperface features
conv1a_input = vgg16.get_layer('block1_conv2').output
conv3a_input = vgg16.get_layer('block3_conv3').output
conv5a_input = vgg16.get_layer('block5_conv3').output
print(conv1a_input.shape,conv3a_input.shape,conv5a_input.shape)

(?, 224, 224, 64) (?, 56, 56, 256) (?, 14, 14, 512)


<img src="AlexNet-architecture.jpg"/>
<img src="HyperFace-architecture.small.png"/>
<img src="VGG16-architecture.png"/>

In [5]:
from keras.layers import Input, Convolution2D, MaxPooling2D, Activation, Dropout, GlobalAveragePooling2D, merge
from keras.layers.merge import concatenate

# usamos block2_pool (56x56x128) porque se asemeja a paper (51x51x96)
conv1a_input = vgg16.get_layer('block2_pool').output
# con kernel=8x8/8 llegamos a 7x7x256 en paper es 6x6x256
conv1a = Convolution2D(256, (8,8), strides=(8,8), activation='relu', padding='valid', name='conv1a')(conv1a_input)
print(conv1a_input.shape, '->', conv1a.shape)

# block3_pool (28x28x256) paper (13x13x384)
conv3a_input = vgg16.get_layer('block3_pool').output
conv3a = Convolution2D(256, (4,4), strides=(4,4), activation='relu', padding='valid', name='conv3a')(conv3a_input)
print(conv3a_input.shape, '->', conv3a.shape)

# no tiene nombre así que le nombramos conv5a
# block5_pool (7x7x512) paper (13x13x256)
conv5a_input = vgg16.get_layer('block5_pool').output
conv5a = Convolution2D(256, (1,1), strides=(1,1), activation='relu', padding='valid', name='conv5a')(conv5a_input)
print(conv5a_input.shape, '->', conv5a.shape)

# combinación: 7x7x768
concat = concatenate([conv1a, conv3a, conv5a], axis=-1, name='concat')
print(concat.shape)

# reducción de dimensión: 7x7x192
conv_all = Convolution2D(192, (1, 1), activation='relu', padding='valid', name='conv_all')(concat)
print(conv_all.shape)

# completamente conectadas
fc_full = Flatten(input_shape=conv_all.shape, name='fc_full')(conv_all)
print(fc_full.shape)

(?, 56, 56, 128) -> (?, 7, 7, 256)
(?, 28, 28, 256) -> (?, 7, 7, 256)
(?, 7, 7, 512) -> (?, 7, 7, 256)
(?, 7, 7, 768)
(?, 7, 7, 192)
(?, ?)


<img src="HyperFace-architecture.small.png"/>

In [6]:
from keras.regularizers import l2

# face/non-face
fc_detecton = Dense(256, name='fc_detecton', activation='relu', kernel_initializer='he_normal', kernel_regularizer=l2(0.00))(fc_full)
face_nonface = Dense(2, name='face_nonface', activation='sigmoid', kernel_initializer='he_normal', kernel_regularizer=l2(0.00))(fc_detecton)
# print(fc_detecton.shape, face_nonface.shape)

fc_landmarks = Dense(512, name='fc_landmarks', activation='relu', kernel_initializer='he_normal', kernel_regularizer=l2(0.00))(fc_full)
landmarks = Dense(42, name='landmarks', activation='sigmoid', kernel_initializer='he_normal', kernel_regularizer=l2(0.00))(fc_landmarks)
# print(fc_landmarks.shape, landmarks.shape)

fc_visibility = Dense(512, name='fc_visibility', activation='relu', kernel_initializer='he_normal', kernel_regularizer=l2(0.00))(fc_full)
visibility = Dense(21, name='visibility', activation='sigmoid', kernel_initializer='he_normal', kernel_regularizer=l2(0.00))(fc_visibility)
# print(fc_visibility.shape, visibility.shape)

fc_pose = Dense(512, name='fc_pose', activation='relu', kernel_initializer='he_normal', kernel_regularizer=l2(0.00))(fc_full)
roll_pitch_yaw = Dense(3, name='roll_pitch_yaw', activation='sigmoid', kernel_initializer='he_normal', kernel_regularizer=l2(0.00))(fc_pose)
# print(fc_pose.shape, roll_pitch_yaw.shape)

fc_gender = Dense(256, name='fc_gender', activation='relu', kernel_initializer='he_normal', kernel_regularizer=l2(0.00))(fc_full)
male_female = Dense(2, name='male_female', activation='sigmoid', kernel_initializer='he_normal', kernel_regularizer=l2(0.00))(fc_gender)
# print(fc_gender.shape, male_female.shape)

hyperface = Model(inputs=vgg16.input, outputs=[face_nonface, landmarks, visibility, roll_pitch_yaw, male_female])
hyperface.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 224, 224, 3)   0                                            
____________________________________________________________________________________________________
block1_conv1 (Conv2D)            (None, 224, 224, 64)  1792        input_1[0][0]                    
____________________________________________________________________________________________________
block1_conv2 (Conv2D)            (None, 224, 224, 64)  36928       block1_conv1[0][0]               
____________________________________________________________________________________________________
block1_pool (MaxPooling2D)       (None, 112, 112, 64)  0           block1_conv2[0][0]               
___________________________________________________________________________________________

The loss functions for the five networks are:
- Detection: BCE (binary cross-entropy). Detected bounding boxes that have an overlap >=0.5 with an annotated face are considered positive samples, bounding boxes with overlap <0.35 are considered negative samples, everything in between is ignored.
- Landmark localization: Roughly MSE (mean squared error), with some weighting for visibility. Only bounding boxes with overlap >0.35 are considered. Coordinates are normalized with respect to the bounding boxes center, width and height.
- Landmark visibility: MSE (predicted visibility factor vs. expected visibility factor). Only for bounding boxes with overlap >0.35.
- Pose estimation: MSE.
- Gender estimation: BCE.

In [7]:
import tensorflow as tf
import keras.backend as kb
import keras.losses as losses
from keras.optimizers import Adam, RMSprop

# loss from hyperface-with-squeezenet
def custom_mse_lm(y_true,y_pred):
    return kb.sign(kb.sum(kb.abs(y_true),axis=-1))*kb.sum(kb.square(tf.multiply((kb.sign(y_true)+1)*0.5, y_true-y_pred)),axis=-1)/kb.sum((kb.sign(y_true)+1)*0.5,axis=-1)

def custom_mse_pose(y_true,y_pred):
    return kb.sign(kb.sum(kb.abs(y_true),axis=-1))*losses.mean_squared_error(y_true,y_pred)

optimizer = Adam(lr=0.0001)
hyperface.compile(optimizer=optimizer,
                  loss={
                      'face_nonface': 'categorical_crossentropy',
                      'landmarks': custom_mse_lm,
                      'visibility': custom_mse_lm,
                      'roll_pitch_yaw': custom_mse_pose,
                      'male_female': 'categorical_crossentropy'},
                  loss_weights={
                      'face_nonface': 1,
                      'landmarks': 1,
                      'visibility': 1,
                      'roll_pitch_yaw': 1,
                      'male_female': 1})
print("hyperface model compiled")

hyperface model compiled


OutOfMemoryError....

In [8]:
from keras.callbacks import ModelCheckpoint
import hf

print("Building Train Data Generator...")
train_data = hf.ImageDataGeneratorV2()

json_dir = '/home/lmiguel/Projects/deep-learning/hyperface'
train_data_flow = train_data.flow_from_directory(json_dir,
                                                 'positives5k-test.json', 
                                                 'negatives5k-test.json', 
                                                 output_type='hyperface', 
                                                 pos_batch_size=128, 
                                                 neg_batch_size=128)

# checkpoint
filepath="weights-{epoch:02d}-{loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='train_loss', verbose=1, save_best_only=False, mode='min', period=30)
callbacks_list = [checkpoint]

print("Start Training...")
output_file_name = 'hyperface.h5'
hyperface.fit_generator(train_data_flow, steps_per_epoch=100, epochs=300, callbacks=callbacks_list)
hyperface.save(output_file_name)

Building Train Data Generator...
Found 116 positive samples and 4550 negative samples.
Start Training...
Epoch 1/300




ResourceExhaustedError: OOM when allocating tensor with shape[244,224,224,64]
	 [[Node: block1_conv1/convolution = Conv2D[T=DT_FLOAT, data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_input_1_0_2/_151, block1_conv1/kernel/read)]]
	 [[Node: loss/face_nonface_loss/Mean_2/_253 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_4558_loss/face_nonface_loss/Mean_2", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op 'block1_conv1/convolution', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/lmiguel/.local/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/lmiguel/.local/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/lmiguel/.local/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/lmiguel/.local/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/lmiguel/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/lmiguel/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-639b7bc8a558>", line 7, in <module>
    vgg16 = applications.VGG16(weights='imagenet', include_top=False, input_shape=input_shape)
  File "/usr/local/lib/python3.5/dist-packages/keras/applications/vgg16.py", line 112, in VGG16
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='block1_conv1')(img_input)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/topology.py", line 602, in __call__
    output = self.call(inputs, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/keras/layers/convolutional.py", line 164, in call
    dilation_rate=self.dilation_rate)
  File "/usr/local/lib/python3.5/dist-packages/keras/backend/tensorflow_backend.py", line 3164, in conv2d
    data_format='NHWC')
  File "/home/lmiguel/.local/lib/python3.5/site-packages/tensorflow/python/ops/nn_ops.py", line 672, in convolution
    op=op)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/tensorflow/python/ops/nn_ops.py", line 338, in with_space_to_batch
    return op(input, num_spatial_dims, padding)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/tensorflow/python/ops/nn_ops.py", line 664, in op
    name=name)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/tensorflow/python/ops/nn_ops.py", line 131, in _non_atrous_convolution
    name=name)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 397, in conv2d
    data_format=data_format, name=name)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/lmiguel/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[244,224,224,64]
	 [[Node: block1_conv1/convolution = Conv2D[T=DT_FLOAT, data_format="NHWC", padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_input_1_0_2/_151, block1_conv1/kernel/read)]]
	 [[Node: loss/face_nonface_loss/Mean_2/_253 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_4558_loss/face_nonface_loss/Mean_2", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]


Other custom costs

In [None]:
# # from takiyu hyperface
# loss_detection = F.softmax_cross_entropy(h_detection, t_detection)
# loss_landmark = F.mean_squared_error(h_landmark, t_landmark)
# loss_visibility = F.mean_squared_error(h_visibility, t_visibility)
# loss_pose = F.mean_squared_error(h_pose, t_pose)
# loss_gender = F.softmax_cross_entropy(h_gender, t_gender)

# # from realtime hyperface
# self.lambda_face = 1
# self.lambda_pose = 5
# self.lambda_gender = 1
# self.lambda_viz = 0.5
# self.lambda_landmark = 10
# self.lambda_regr = 1

# classifier_loss = [losses.class_loss_face(),losses.class_loss_pose(),losses.class_loss_gender(), losses.class_loss_viz(),losses.class_loss_landmark(),losses.class_loss_regr()]
# classifier_loss_weight = [C.lambda_face,C.lambda_pose,C.lambda_gender,C.lambda_viz,C.lambda_landmark, C.lambda_regr]

# # from realtime hyperface
# def custom_loss_face_nonface(y_true, y_pred):
#     face_true = y_true[0,:,0]
#     num_face = K.sum(face_true)
#     ll =  K.sum( categorical_crossentropy(y_true[0, :, :], y_pred[0, :, :]) )/32
#     return ll

# def custom_loss_landmarks(coord_true, coord_pred):
#     viz_true = coord_true[0,:,1:22]

#     x_true_coord = coord_true[0,:,22:43]
#     y_true_coord = coord_true[0,:,43:64]

#     x_pred_coord = coord_pred[0,:,0:21]
#     y_pred_coord = coord_pred[0,:,21:42]

#     num_viz_feature = K.sum(viz_true)
#     return K.sum(viz_true * (K.square(x_true_coord - x_pred_coord) + K.square(y_true_coord - y_pred_coord)), axis=-1)/(32*42) #(num_viz_feature+0.01)
#     #return K.sum(viz_true[0,:,:] * (K.square(y_true_x[0,:,:] - y_pred_x[0,:,:]) + K.square(y_true_y[0,:,:] - y_pred_y[0,:,:])), axis=-1)/(num_viz_feature+0.01)

# def custom_loss_visibility(y_true, y_pred):
#     face_true = y_true[0,:,0]
#     num_face = K.sum(face_true)
#     return K.sum( face_true * mean_squared_error(y_true[0,:,1:] , y_pred[0,:,:]))/(32*42)

# def custom_loss_roll_pitch_yaw(y_true, y_pred):
#     # bp()
#     face_true = y_true[0,:,0]
#     num_face = K.sum(face_true)
#     return K.sum( face_true * mean_squared_error(y_true[0, :,1:], y_pred[0, :, :]))/32

# def custom_loss_male_female(y_true, y_pred):
#     face_true = y_true[0,:,0]
#     num_face = K.sum(face_true)
#     return K.sum( face_true * categorical_crossentropy(y_true[0, :, 1:], y_pred[0, :, :]))/32
