In [1]:
import fastai
import fastai.utils
from fastai.fautils import *

from keras_tqdm import TQDMNotebookCallback

Using TensorFlow backend.
Using cuDNN version 6021 on context None
Mapped name None to device cuda: Quadro P6000 (0000:00:05.0)


In [2]:
%matplotlib inline
import matplotlib as mlp
import matplotlib.pyplot as plt

In [3]:
# Mini-batch size reused by the batch generators and training calls below.
batch_size = 64

# Expand both dataset roots in one pass; the trailing slash is kept so that
# later code can append sub-directory names by plain concatenation.
full_data_path, samp_data_path = (
    os.path.expanduser(root)
    for root in ('~/data/state-farm/', '~/data/sample-state-farm/')
)

# Active dataset root (presumably swap in samp_data_path for quicker runs).
data_path = full_data_path

In this notebook, we experiment with splitting VGG into its convolutional and fully connected layers to accelerate the fine-tuning process.  When fine-tuning, we generally don't alter the convolutional layers.  They've been learned from very large datasets and tend to be tuned to the distribution of image content (to a greater or lesser degree depending on the distribution of data and labels -- à la the ImageNet tendency towards dog faces).

The reason for this approach is that most of the computational work is in the convolutional layers.  Since they tend not to change in most experiments (the computational cost of training them anew to convergence is too high), we can precompute them.  Then we can construct a second network that takes the output of the convolution-only network as input and can iterate much more rapidly on that.

We're going to:

- construct a conventional VGG model
- split it at the division between the convolutional layers and the FC layers
- we'll take our entire dataset and run it through the convolutional layers, and store the result
- we'll then construct a secondary model that takes the convolutional output for input, and outputs the desired prediction
- we'll then train that network on the stored convolutional output

This way we can reduce the training time from 10+ minutes per epoch to about 22 seconds.

# create batches

In [4]:
# Augmentation settings for the training-time image generator.
augment_kwargs = dict(
    rotation_range=15,
    shear_range=0.15,
    width_shift_range=0.05,
    height_shift_range=0.15,
    channel_shift_range=30,
)
gen_t = image.ImageDataGenerator(**augment_kwargs)

train_dir = data_path + 'train'
valid_dir = data_path + 'valid'

# Unshuffled batches (presumably so that precomputed features stay aligned
# with the label arrays returned by get_classes -- TODO confirm).
t_batches = get_batches(train_dir, batch_size=batch_size, shuffle=False)
v_batches = get_batches(valid_dir, batch_size=2 * batch_size, shuffle=False)
# Augmented training batches driven by gen_t.
a_batches = get_batches(train_dir, gen_t, batch_size=batch_size)

# Class ids, label arrays and file names for the train/valid/test splits.
(val_classes, trn_classes,
 val_labels, trn_labels,
 val_filenames, filenames,
 test_filename) = get_classes(data_path)

Found 20315 images belonging to 10 classes.
Found 2109 images belonging to 10 classes.
Found 20315 images belonging to 10 classes.
Found 20315 images belonging to 10 classes.
Found 2109 images belonging to 10 classes.
Found 79726 images belonging to 1 classes.


# construct initial VGG network, then split

In [5]:
from fastai import vgg162

# The directory scan reports 10 classes; vgg_ft presumably sizes the new
# output layer from this argument -- TODO confirm against fastai.fautils.
n_classes = 10
vgg = vgg_ft(n_classes)
# Underlying Keras model, used below to split conv and FC layers.
model = vgg.model

In [6]:
# Hyper-parameters for compiling the full VGG model.
lr = 1e-3
epochs = 25.
# Decay handed to Adam: the learning rate divided by the epoch count.
decay_rate = lr / epochs

model.compile(
    # Pass `lr` itself instead of repeating the literal 1e-3, so the optimizer
    # and the decay derived from `lr` cannot drift apart when `lr` is tuned.
    optimizer=Adam(lr=lr, decay=decay_rate),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Print the layer table; the conv/FC split below is based on these layers.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda_1_input (InputLayer)  (None, 3, 224, 224)       0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 3, 224, 224)       0         
_________________________________________________________________
zero_padding2d_1 (ZeroPaddin (None, 3, 226, 226)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 224, 224)      1792      
_________________________________________________________________
zero_padding2d_2 (ZeroPaddin (None, 64, 226, 226)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 224, 224)      36928     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 64, 112, 112)      0         
__________

In [7]:
layers = model.layers

# Collect the position of every layer whose exact type is Conv2D, then keep
# the final one: that is where the network gets split.
conv_positions = []
for position, layer in enumerate(layers):
    if type(layer) is Conv2D:
        conv_positions.append(position)
last_conv_idx = conv_positions[-1]

print(last_conv_idx)
# Show the layer object itself as the cell output.
layers[last_conv_idx]

31


<keras.layers.convolutional.Conv2D at 0x7f14c560b860>

In [8]:
# Everything up to and including the last Conv2D layer forms the
# convolution-only model used for feature precomputation.
conv_end = last_conv_idx + 1
conv_layers = layers[:conv_end]
conv_model = Sequential(conv_layers)

In [9]:
# Layers after the last Conv2D layer make up the remaining half of the split.
first_fc_idx = last_conv_idx + 1
fc_layers = layers[first_fc_idx:]
fc_model = Sequential(fc_layers)

# precompute, store, reload

In [10]:
def _precompute_conv_features(batches, cache_path):
    """Run `batches` through `conv_model` and store the output at `cache_path`.

    The work is skipped when `cache_path` already exists, so the notebook can
    be re-run from a fresh kernel without repeating the expensive forward
    pass, and without depending on a cache produced by a cell that has since
    been commented out.  Delete the cache to force a recomputation (for
    example after an interrupted run or a change to the conv layers).

    batches    -- batch iterator exposing `.samples` and `.batch_size`
    cache_path -- location handed to `save_array`
    """
    if os.path.exists(cache_path):
        return
    # Round up so the final, partially filled batch is also predicted.
    steps = math.ceil(batches.samples / batches.batch_size)
    features = conv_model.predict_generator(batches, steps, verbose=1)
    save_array(cache_path, features)
    # `features` goes out of scope here, releasing the array before the
    # next split is processed.


_precompute_conv_features(t_batches, data_path + 'train_convlayer_features.bc')
print('done train')

_precompute_conv_features(v_batches, data_path + 'valid_convlayer_features.bc')
print('done valid')

done valid


In [11]:
# Reload the cached convolutional outputs for both splits.
v_features = load_array(data_path + 'valid_convlayer_features.bc')
t_features = load_array(data_path + 'train_convlayer_features.bc')

# Report shapes, training split first.
for cached in (t_features, v_features):
    print(cached.shape)

(20315, 512, 14, 14)
(2109, 512, 14, 14)


# create second network and train

**NOTE:** I had hoped to just use the 2nd half of the split network above directly, but that didn't work.  I needed to recreate it with the same configuration, as I did here.  Or I could have chosen another configuration entirely.

Regardless, I think you need to manually recreate the new network, as opposed to borrowing the previously-split one.  That's something to be validated.

The most important part is the first layer.  I'm not sure why the tutorial said to include the max pooling layer here, as opposed to leaving it in the convolutional precomputation.  But it is important that the `input_shape` matches the dimensions shown in `t_features.shape` above -- except for the first dimension, which is the number of examples.  We exclude that.

In [18]:
# Hyper-parameters for the head trained on precomputed features.
lr = 1e-5
epochs = 25.
decay_rate = lr / epochs

# Assemble the head layer by layer.  The first layer's input_shape is the
# per-example shape of the cached conv features (batch dimension excluded).
model = Sequential()
model.add(MaxPooling2D((2, 2), strides=(2, 2), input_shape=(512, 14, 14)))
model.add(Flatten())
# Two identical dense + dropout stages.
for _ in range(2):
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(.5))
# One softmax unit per class.
model.add(Dense(10, activation='softmax'))

head_optimizer = Adam(lr=lr, decay=decay_rate)
model.compile(
    optimizer=head_optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [19]:
# Learning-rate plan as (learning rate, number of epochs) stages.
rates = [
    (1e-5, 2),
    (1e-4, 8),
    (1e-5, 12),
]
# Train for exactly as many epochs as the plan covers.
total_epochs = sum(stage_epochs for _, stage_epochs in rates)

# Callbacks are created and listed in this order:
# TensorBoard logging, notebook progress bars, learning-rate schedule.
tb_callback = keras.callbacks.TensorBoard(log_dir=data_path, batch_size=batch_size)
progress_callback = TQDMNotebookCallback()
schedule_callback = keras.callbacks.LearningRateScheduler(
    fastai.utils.list_rate_schedule(rates)
)
callbacks = [tb_callback, progress_callback, schedule_callback]

# Fit on the precomputed features; `h` keeps the returned history object.
h = model.fit(
    t_features,
    trn_labels,
    validation_data=(v_features, val_labels),
    epochs=total_epochs,
    batch_size=t_batches.batch_size,
    callbacks=callbacks,
)

Train on 20315 samples, validate on 2109 samples


A Jupyter Widget

A Jupyter Widget

Learning rate: 1e-05
Epoch 1/22


A Jupyter Widget

Epoch 2/22


A Jupyter Widget

Learning rate: 0.0001
Epoch 3/22


A Jupyter Widget

Epoch 4/22


A Jupyter Widget

Epoch 5/22


A Jupyter Widget

Epoch 6/22


A Jupyter Widget

Epoch 7/22


A Jupyter Widget

Epoch 8/22


A Jupyter Widget

Epoch 9/22


A Jupyter Widget

Epoch 10/22


A Jupyter Widget

Learning rate: 1e-05
Epoch 11/22


A Jupyter Widget

Epoch 12/22


A Jupyter Widget

Epoch 13/22


A Jupyter Widget

Epoch 14/22


A Jupyter Widget

Epoch 15/22


A Jupyter Widget

Epoch 16/22


A Jupyter Widget

Epoch 17/22


A Jupyter Widget

Epoch 18/22


A Jupyter Widget

Epoch 19/22


A Jupyter Widget

Epoch 20/22


A Jupyter Widget

Epoch 21/22


A Jupyter Widget

Epoch 22/22


In [None]:
# Plot training vs. validation accuracy per epoch.  Labels, a title and a
# legend are set so the two curves can be told apart and the figure stands
# on its own.
fig, ax = plt.subplots()
ax.plot(h.history['acc'], label='train')
ax.plot(h.history['val_acc'], label='validation')
ax.set_ylim([0, 1])
ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
ax.set_title('Accuracy on precomputed conv features')
# Trailing semicolon suppresses the legend object's repr in the cell output.
ax.legend();