Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Nitin Kumar Bansal
committed
Oct 26, 2018
1 parent
b7c0563
commit 05d47ab
Showing
1 changed file
with
392 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,392 @@ | ||
"""Trains a ResNet on the CIFAR100 dataset. | ||
ResNet v1 | ||
[a] Deep Residual Learning for Image Recognition | ||
https://arxiv.org/pdf/1512.03385.pdf | ||
ResNet v2 | ||
[b] Identity Mappings in Deep Residual Networks | ||
https://arxiv.org/pdf/1603.05027.pdf | ||
""" | ||
|
||
from __future__ import print_function | ||
import keras | ||
from keras.layers import Dense, Conv2D, BatchNormalization, Activation | ||
from keras.layers import AveragePooling2D, Input, Flatten | ||
from keras.optimizers import Adam | ||
from keras.callbacks import ModelCheckpoint, LearningRateScheduler | ||
from keras.callbacks import ReduceLROnPlateau | ||
from keras.preprocessing.image import ImageDataGenerator | ||
from keras.regularizers import l2 | ||
from keras import backend as K | ||
from keras.models import Model | ||
from keras.datasets import cifar100 | ||
import numpy as np | ||
import tensorflow as tf | ||
import os | ||
|
||
# Coefficients of the orthogonality regularizer (a penalty on T(W) * W - I).
# They are backend variables rather than plain floats so that their values
# can be changed with K.set_value while training is running.

d_rate = K.variable(value=0.01)  # scales the orthogonality term
w_rate = K.variable(value=1e-4)  # scales the plain weight-norm term
|
||
##Function to implement the decay rate annealing | ||
def decay_schedule(epoch):
    """Decay Rate Schedule

    The decay rate starts at 0.01, is scaled down once the epoch index
    passes 20, 50 and 70, and is switched off (0.0) past epoch 120.
    The selected value is printed before it is returned.

    # Arguments
        epoch (int): index of the epoch

    # Returns
        d_r (float): decay rate for that epoch
    """
    base_rate = 0.01
    # (epoch threshold, rate) pairs, highest threshold first; first hit wins.
    steps = (
        (120, 0.0),
        (70, 1e-6 * base_rate),
        (50, 1e-4 * base_rate),
        (20, 1e-3 * base_rate),
    )
    d_r = next((rate for threshold, rate in steps if epoch > threshold),
               base_rate)

    print('Decay rate:', d_r)
    return d_r
|
||
|
||
# Regularization function for maintaining the orthogonality of the kernels. | ||
# It penalizes a norm of the residual T(W) * W - I, plus a weight-norm term. | ||
|
||
def l2_reg(weights):
    """Orthogonality regularizer for a layer kernel.

    The kernel is flattened to a 2-D matrix W whose columns follow the last
    axis of `weights` (all leading axes are merged into the rows), and the
    residual R = T(W) * W - I is formed. Rather than a full matrix norm, the
    size of R is estimated with one power-iteration-style step from a random
    start vector v:

        penalty = d_rate * ||R (R v / ||R v||)|| + w_rate * ||W||

    `d_rate` and `w_rate` are the module-level backend variables, so changing
    them with K.set_value changes the strength of both terms.

    # Arguments
        weights (tensor): kernel to regularize. Its static shape must be
            fully known. Any rank >= 2 is accepted; for a 4-D kernel the
            flattening is identical to merging the first three axes.

    # Returns
        Scalar tensor with the regularization penalty.
    """
    def _l2_norm(tensor):
        # Euclidean norm over all elements of the tensor.
        return K.sum(K.square(tensor)) ** 0.5

    shape = K.int_shape(weights)
    col_dims = shape[-1]
    # Merge every leading axis into the row dimension so the function is not
    # tied to 4-D kernels.
    row_dims = int(np.prod(shape[:-1]))
    w = K.reshape(weights, (row_dims, col_dims))

    # Residual T(W) * W - I, a (col_dims x col_dims) matrix.
    residual = K.dot(K.transpose(w), w) - np.eye(col_dims)

    # Random start vector, sized from the static column count and stored in
    # a backend variable created when the regularizer is applied.
    v = K.variable(value=np.random.rand(col_dims, 1))

    v1 = K.dot(residual, v)
    v2 = tf.divide(v1, _l2_norm(v1))
    v3 = K.dot(residual, v2)

    return d_rate * _l2_norm(v3) + w_rate * _l2_norm(w)
|
||
#CallBack Class for Ortho Decay rate | ||
class DecayRate_Controller(keras.callbacks.Callback):
    """Callback that updates the module-level `d_rate` variable each epoch.

    # Arguments
        controller (callable): called with the epoch index; its return
            value is written into `d_rate`
    """

    def __init__(self, controller):
        super().__init__()
        self.controller = controller

    def on_epoch_begin(self, epoch, logs=None):
        """Ask the controller for this epoch's rate and store it in d_rate."""
        K.set_value(d_rate, self.controller(epoch))
|
||
|
||
# Training parameters
batch_size = 32  # orig paper trained all networks with batch_size=128
epochs = 200
data_augmentation = True
num_classes = 100

# Subtracting pixel mean improves accuracy
subtract_pixel_mean = True

# Model parameter n: the depth formulas below are 6n+2 (v1) and 9n+2 (v2).
n = 12

# Model version
# Orig paper: version = 1 (ResNet v1), Improved ResNet: version = 2 (ResNet v2)
version = 2

# Computed depth from supplied model parameter n
if version == 1:
    depth = n * 6 + 2
elif version == 2:
    depth = n * 9 + 2
else:
    # Fail here with a clear message instead of a NameError on `depth` below.
    raise ValueError('version should be 1 or 2, got %r' % (version,))

# Model name, depth and version
model_type = 'ResNet%dv%d' % (depth, version)
|
||
# Fetch the CIFAR100 train/test splits.
(x_train, y_train), (x_test, y_test) = cifar100.load_data()

# Shape of a single input sample (everything after the leading sample axis).
input_shape = x_train.shape[1:]

# Convert the images to float32 and divide by 255.
x_train, x_test = (images.astype('float32') / 255
                   for images in (x_train, x_test))

# Optionally centre both splits on the per-position mean of the training set.
if subtract_pixel_mean:
    x_train_mean = x_train.mean(axis=0)
    x_train -= x_train_mean
    x_test -= x_train_mean

print('x_train shape:', x_train.shape)
print(len(x_train), 'train samples')
print(len(x_test), 'test samples')
print('y_train shape:', y_train.shape)

# One-hot encode the class labels.
y_train, y_test = (keras.utils.to_categorical(labels, num_classes)
                   for labels in (y_train, y_test))
|
||
def lr_schedule(epoch):
    """Learning Rate Schedule

    Starts from 1e-3 and applies a single reduction factor chosen by how far
    training has progressed: the rate drops once the epoch index passes
    80, 120, 160 and 180. The selected value is printed before it is
    returned.

    # Arguments
        epoch (int): index of the epoch

    # Returns
        lr (float): learning rate for that epoch
    """
    # (epoch threshold, multiplier) pairs, highest threshold first.
    reductions = (
        (180, 0.5e-3),
        (160, 1e-3),
        (120, 1e-2),
        (80, 1e-1),
    )
    lr = 1e-3
    for threshold, factor in reductions:
        if epoch > threshold:
            lr *= factor
            break
    print('Learning rate: ', lr)
    return lr
|
||
def resnet_layer(inputs,
                 num_filters=16,
                 kernel_size=3,
                 strides=1,
                 activation='relu',
                 batch_normalization=True,
                 conv_first=True):
    """Build one Conv2D / BatchNormalization / Activation stack.

    The convolution uses 'same' padding, he_normal initialization and the
    `l2_reg` kernel regularizer.

    # Arguments
        inputs (tensor): input tensor from input image or previous layer
        num_filters (int): Conv2D number of filters
        kernel_size (int): Conv2D square kernel dimensions
        strides (int): Conv2D square stride dimensions
        activation (string): activation name, or None to skip the activation
        batch_normalization (bool): whether to include batch normalization
        conv_first (bool): conv-bn-activation (True) or
            bn-activation-conv (False)

    # Returns
        x (tensor): tensor as input to the next layer
    """
    conv = Conv2D(num_filters,
                  kernel_size=kernel_size,
                  strides=strides,
                  padding='same',
                  kernel_initializer='he_normal',
                  kernel_regularizer=l2_reg)

    def _bn_act(tensor):
        # Optional batch normalization followed by optional activation.
        if batch_normalization:
            tensor = BatchNormalization()(tensor)
        if activation is not None:
            tensor = Activation(activation)(tensor)
        return tensor

    if conv_first:
        return _bn_act(conv(inputs))
    return conv(_bn_act(inputs))
|
||
|
||
def resnet_v2(input_shape, depth, num_classes=100):
    """ResNet Version 2 Model builder [b]

    Stacks of (1 x 1)-(3 x 3)-(1 x 1) BN-ReLU-Conv2D, also known as
    bottleneck layers.
    First shortcut connection per stage is 1 x 1 Conv2D.
    Second and onwards shortcut connection is identity.
    At the beginning of each stage after the first, the feature map size is
    halved (downsampled) by a convolutional layer with strides=2, while the
    number of filter maps is doubled. Within each stage, the layers have the
    same number of filters and the same filter map sizes.
    Feature map sizes (for 32x32 inputs):
    conv1  : 32x32,  16
    stage 0: 32x32,  64
    stage 1: 16x16, 128
    stage 2:  8x8,  256

    # Arguments
        input_shape (tuple): shape of the input image tensor
        depth (int): number of core convolutional layers, 9n+2 with n >= 1
        num_classes (int): number of classes (CIFAR100 has 100)

    # Returns
        model (Model): Keras model instance

    # Raises
        ValueError: if depth is not of the form 9n+2 with n >= 1
    """
    if (depth - 2) % 9 != 0:
        raise ValueError('depth should be 9n+2 (eg 56 or 110 in [b])')
    num_res_blocks = int((depth - 2) // 9)
    if num_res_blocks < 1:
        # Without at least one residual block the stage loop below never
        # assigns num_filters_out and the builder would crash with a
        # NameError; reject such depths up front.
        raise ValueError('depth should be 9n+2 with n >= 1, got %r' % (depth,))

    # Start model definition.
    num_filters_in = 16

    inputs = Input(shape=input_shape)
    # v2 performs Conv2D with BN-ReLU on input before splitting into 2 paths
    x = resnet_layer(inputs=inputs,
                     num_filters=num_filters_in,
                     conv_first=True)

    # Instantiate the stack of residual units
    for stage in range(3):
        for res_block in range(num_res_blocks):
            activation = 'relu'
            batch_normalization = True
            strides = 1
            if stage == 0:
                num_filters_out = num_filters_in * 4
                if res_block == 0:  # first layer and first stage
                    # The input already went through BN-ReLU in conv1.
                    activation = None
                    batch_normalization = False
            else:
                num_filters_out = num_filters_in * 2
                if res_block == 0:  # first layer but not first stage
                    strides = 2  # downsample

            # bottleneck residual unit
            y = resnet_layer(inputs=x,
                             num_filters=num_filters_in,
                             kernel_size=1,
                             strides=strides,
                             activation=activation,
                             batch_normalization=batch_normalization,
                             conv_first=False)
            y = resnet_layer(inputs=y,
                             num_filters=num_filters_in,
                             conv_first=False)
            y = resnet_layer(inputs=y,
                             num_filters=num_filters_out,
                             kernel_size=1,
                             conv_first=False)
            if res_block == 0:
                # linear projection residual shortcut connection to match
                # changed dims
                x = resnet_layer(inputs=x,
                                 num_filters=num_filters_out,
                                 kernel_size=1,
                                 strides=strides,
                                 activation=None,
                                 batch_normalization=False)
            x = keras.layers.add([x, y])

        # The next stage takes this stage's output width as its input width.
        num_filters_in = num_filters_out

    # Add classifier on top.
    # v2 has BN-ReLU before Pooling
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    # pool_size=8 corresponds to the 8x8 maps listed for stage 2 above.
    x = AveragePooling2D(pool_size=8)(x)
    y = Flatten()(x)
    outputs = Dense(num_classes,
                    activation='softmax',
                    kernel_initializer='he_normal')(y)

    # Instantiate model.
    model = Model(inputs=inputs, outputs=outputs)
    return model
|
||
# Build the network. This script only builds the v2 network, so any other
# version is rejected here instead of failing later with a NameError on
# `model`.
if version == 2:
    model = resnet_v2(input_shape=input_shape, depth=depth)
else:
    raise ValueError('unsupported version %r: only ResNet v2 can be built '
                     'by this script' % (version,))

# The initial learning rate is taken from the schedule's value for epoch 0
# (note that lr_schedule prints the rate as a side effect).
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=lr_schedule(0)),
              metrics=['accuracy'])
model.summary()
print(model_type)
|
||
# Make sure the directory for saved models exists under the current
# working directory.
save_dir = os.path.join(os.getcwd(), 'saved_models')
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)

# Checkpoint file name; the {epoch:03d} placeholder is left in the path
# handed to ModelCheckpoint.
model_name = 'cifar100_%s_model.{epoch:03d}.h5' % model_type
filepath = os.path.join(save_dir, model_name)

# Callbacks for model saving, learning rate adjustment and decay rate
# annealing.
# NOTE(review): 'val_acc' must match the metric key logged by the installed
# Keras version (some releases log 'val_accuracy' instead) - confirm.
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
)
lr_scheduler = LearningRateScheduler(lr_schedule)
lr_reducer = ReduceLROnPlateau(
    factor=np.sqrt(0.1),
    cooldown=0,
    patience=5,
    min_lr=0.5e-6,
)
dr_schedule = DecayRate_Controller(decay_schedule)

callbacks = [checkpoint, lr_reducer, lr_scheduler, dr_schedule]
|
||
# Train the model, either on augmented batches or on the raw arrays.
if data_augmentation:
    print('Using real-time data augmentation.')
    # Options for preprocessing and real-time data augmentation.
    augmentation_options = dict(
        # --- normalization (all disabled) ---
        featurewise_center=False,             # dataset mean to 0
        samplewise_center=False,              # per-sample mean to 0
        featurewise_std_normalization=False,  # divide by dataset std
        samplewise_std_normalization=False,   # divide each sample by its std
        zca_whitening=False,                  # ZCA whitening
        zca_epsilon=1e-06,                    # epsilon for ZCA whitening
        # --- geometric transforms ---
        rotation_range=0,                     # random rotation range, degrees
        width_shift_range=0.1,                # random horizontal shift
        height_shift_range=0.1,               # random vertical shift
        shear_range=0.,                       # random shear range
        zoom_range=0.,                        # random zoom range
        channel_shift_range=0.,               # random channel shift range
        fill_mode='nearest',                  # fill for points outside input
        cval=0.,                              # used when fill_mode="constant"
        horizontal_flip=True,                 # random horizontal flips
        vertical_flip=False,                  # random vertical flips
        # --- misc ---
        rescale=None,                         # rescaling factor
        preprocessing_function=None,          # function applied to each input
        data_format=None,                     # "channels_first"/"channels_last"
        validation_split=0.0,                 # fraction held out for validation
    )
    datagen = ImageDataGenerator(**augmentation_options)

    # Compute quantities required for featurewise normalization
    # (std, mean, and principal components if ZCA whitening is applied).
    datagen.fit(x_train)

    # Fit the model on the batches generated by datagen.flow().
    train_batches = datagen.flow(x_train, y_train, batch_size=batch_size)
    model.fit_generator(train_batches,
                        validation_data=(x_test, y_test),
                        epochs=epochs,
                        verbose=1,
                        workers=4,
                        callbacks=callbacks)
else:
    print('Not using data augmentation.')
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test),
              shuffle=True,
              callbacks=callbacks)

# Evaluate the trained model on the test split and report both scores.
scores = model.evaluate(x_test, y_test, verbose=1)
test_loss, test_accuracy = scores[0], scores[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)