In [1]:
from __future__ import print_function, division

import fastai
import fastai.utils
from fastai.fautils import *

import pandas as pd

Using TensorFlow backend.
Using cuDNN version 6021 on context None
Mapped name None to device cuda: Tesla K80 (0000:00:1E.0)


In [2]:
%matplotlib inline
import matplotlib as mlp
import matplotlib.pyplot as plt

In [3]:
full_data_path = os.path.expanduser('~/data/state-farm/')
samp_data_path = os.path.expanduser('~/data/sample-state-farm/')
data_path = samp_data_path
batch_size = 64

# overview

This notebook shows a lot of exploratory work with the State Farm kaggle challenge.  The biggest takeaways are:

1. Find the smallest sample size that produces consistent results
1. Start with very small models and quickly work up in complexity, till you're overfitting
1. Selecting the initial training rate, and adjusting it through training is really important
1. Get familiar with data augmentation, but remember you can't precompute your convolutional layers
1. Dropout is super important, but the value is dependent on your training set size, so you need to relearn it after you finish with the sample set

# create batches

In [4]:
t_batches = get_batches(data_path + 'train', batch_size=batch_size)
v_batches = get_batches(data_path + 'valid', batch_size=2*batch_size, shuffle=False)

# (
#     val_classes, trn_classes, 
#     val_labels, trn_labels, 
#     val_filenames, filenames,
#     test_filename
# ) = get_classes(data_path)

Found 1500 images belonging to 10 classes.
Found 1000 images belonging to 10 classes.


# linear model

In [14]:
def get_lin_model():

    # starting with BatchNormalization saves us from having to normalize our input manually
    model = Sequential([
        BatchNormalization(axis=1, input_shape=(224, 224, 3)),
        Flatten(),
        Dense(10, activation='softmax')
    ])
    model.compile(
        Adam(), 
        loss='categorical_crossentropy', 
        metrics=['accuracy']
    )
    return model

In [25]:
lm = get_lin_model()
lm.summary()

lm.fit_generator(
    t_batches, 
    steps_per_epoch=t_batches.samples//t_batches.batch_size,
    validation_data=v_batches, 
    validation_steps=v_batches.samples//v_batches.batch_size,
    epochs=1
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_10 (Batc (None, 224, 224, 3)       896       
_________________________________________________________________
flatten_10 (Flatten)         (None, 150528)            0         
_________________________________________________________________
dense_10 (Dense)             (None, 10)                1505290   
Total params: 1,506,186
Trainable params: 1,505,738
Non-trainable params: 448
_________________________________________________________________
Epoch 1/1


<keras.callbacks.History at 0x7f6ba6f303c8>

In [26]:
np.round(lm.predict_generator(t_batches, t_batches.samples//t_batches.batch_size)[:10],2)

array([[ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ],
       [ 0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ],
       [ 0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ,  0.78,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.22],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ]], dtype=float32)

Model is entirely predicting 2 of the classes.  Not very useful.  Lower the learning rate and try again.

In [27]:
lm = get_lin_model()

lm.optimizer.lr.assign(1e-5)
lm.fit_generator(
    t_batches, 
    t_batches.samples//t_batches.batch_size,
    validation_data=v_batches, 
    validation_steps=v_batches.samples//v_batches.batch_size,
    epochs=2, 
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f6ba67fb9b0>

In [29]:
lm.optimizer.lr.assign(1e-3)
lm.fit_generator(
    t_batches, 
    t_batches.samples//t_batches.batch_size,
    validation_data=v_batches, 
    validation_steps=v_batches.samples//v_batches.batch_size,
    epochs=4,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f6ba61ec4e0>

Validation accuracy of 0.43 -- almost too good to be true, and definitely better than chance.

Now lets make sure validation set is large enough to ensure a stable accuracy metric across runs, so we aren't making the wrong generalization about the results we're observing.

In [31]:
r_batches = get_batches(data_path+'valid', batch_size=2*batch_size)
val_res = [lm.evaluate_generator(r_batches, r_batches.samples//r_batches.batch_size) for i in range(10)]
np.round(val_res, 2)

Found 1000 images belonging to 10 classes.


array([[ 14.19,   0.11],
       [ 14.28,   0.11],
       [ 14.35,   0.1 ],
       [ 14.19,   0.11],
       [ 14.31,   0.11],
       [ 14.3 ,   0.11],
       [ 14.19,   0.11],
       [ 14.23,   0.11],
       [ 14.24,   0.11],
       [ 14.18,   0.11]])

Consisent results.  Accuracy increases larger than 2% can't be attributed to chance.

In [32]:
lm2 = get_lin_model()

rates = [
    (1e-5, 2),
    (1e-4, 4)
]

fastai.utils.fit_generator(lm2, t_batches, rates, val_batches=v_batches)

# lrsched = keras.callbacks.LearningRateScheduler(
#     fastai.utils.list_rate_schedule([
#         (1e-5, 2),
#         (1e-4, 4)
#     ],
#     output=True
#     )
# )

# lm2.fit_generator(
#     t_batches, 
#     t_batches.nb_sample, 
#     nb_epoch=6, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample,
#     callbacks=[lrsched]
# )

Learning rate: 1e-05
Epoch 1/6
Epoch 2/6
Learning rate: 0.0001
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f6ba601eb70>

In [35]:
%run ../scripts/state-farm/linear-model.py

Found 1500 images belonging to 10 classes.
Found 1000 images belonging to 10 classes.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_15 (Batc (None, 224, 224, 3)       896       
_________________________________________________________________
flatten_15 (Flatten)         (None, 150528)            0         
_________________________________________________________________
dense_15 (Dense)             (None, 10)                1505290   
Total params: 1,506,186
Trainable params: 1,505,738
Non-trainable params: 448
_________________________________________________________________
Learning rate: 1e-05
Epoch 1/6
Epoch 2/6
Learning rate: 0.0001
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Found 1000 images belonging to 10 classes.
[[ 1.72  0.58]
 [ 1.74  0.57]
 [ 1.75  0.58]
 [ 1.76  0.57]
 [ 1.76  0.57]
 [ 1.73  0.58]
 [ 1.8   0.57]
 [ 1.8   0.56]
 [ 1.77  0.57]
 [ 1.75  0.57]]


# regularized linear model

In [38]:
def get_reglin_model():

    model = Sequential([
        BatchNormalization(axis=1, input_shape=(224, 224, 3)),
        Flatten(),
        Dense(10, activation='softmax', kernel_regularizer=l2(0.01))
    ])
    model.compile(
        Adam(), 
        loss='categorical_crossentropy', 
        metrics=['accuracy']
    )
    return model

In [39]:
rlm = get_reglin_model()

rates = [
    (1e-5, 2),
    (1e-4, 4)
]

fastai.utils.fit_generator(rlm, t_batches, rates, val_batches=v_batches)

# rlm.optimizer.lr.set_value(1e-5)
# rlm.fit_generator(
#     t_batches, 
#     t_batches.nb_sample, 
#     nb_epoch=2, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )

# rlm.optimizer.lr.set_value(1e-4)
# rlm.fit_generator(
#     t_batches, 
#     t_batches.nb_sample, 
#     nb_epoch=4, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )

Learning rate: 1e-05
Epoch 1/6
Epoch 2/6
Learning rate: 0.0001
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f6b9f849eb8>

Single linear model with regularization is getting 45% accuracy.  Dipped at the end which means it's continuing to overfit.

# single dense layer

In [40]:
def get_fc_model():

    model = Sequential([
        BatchNormalization(axis=1, input_shape=(224, 224, 3)),
        Flatten(),
        Dense(100, activation='relu'),
        BatchNormalization(),
        Dense(10, activation='softmax')
    ])
    model.compile(
        Adam(), 
        loss='categorical_crossentropy', 
        metrics=['accuracy']
    )
    return model

In [41]:
fc = get_fc_model()
fc.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_18 (Batc (None, 224, 224, 3)       896       
_________________________________________________________________
flatten_18 (Flatten)         (None, 150528)            0         
_________________________________________________________________
dense_18 (Dense)             (None, 100)               15052900  
_________________________________________________________________
batch_normalization_19 (Batc (None, 100)               400       
_________________________________________________________________
dense_19 (Dense)             (None, 10)                1010      
Total params: 15,055,206
Trainable params: 15,054,558
Non-trainable params: 648
_________________________________________________________________


In [42]:
rates = [
    (1e-5, 2),
    (1e-2, 5)
]

fastai.utils.fit_generator(fc, t_batches, rates, val_batches=v_batches)

# fc.optimizer.lr.set_value(1e-5)
# fc.fit_generator(
#     t_batches, 
#     t_batches.nb_sample, 
#     nb_epoch=2, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )

# fc.optimizer.lr.set_value(0.01)
# fc.fit_generator(
#     t_batches, 
#     t_batches.nb_sample, 
#     nb_epoch=5, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )

Learning rate: 1e-05
Epoch 1/7
Epoch 2/7
Learning rate: 0.01
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f6b9e921208>

Beginning to learn the training set, but failing at validation.

# simple conv layers

In [46]:
def get_conv_model(t_batches=t_batches, v_batches=v_batches, train=True):

    model = Sequential([
        BatchNormalization(axis=1, input_shape=(224, 224, 3)),
        Conv2D(32, (3,3), activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D((3,3)),
        Conv2D(64, (3,3), activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D((3,3)),
        Flatten(),
        Dense(200, activation='relu'),
        BatchNormalization(),
        Dense(10, activation='softmax')
    ])
    model.compile(
        Adam(), 
        loss='categorical_crossentropy', 
        metrics=['accuracy']
    )
    
    if not train:
        return model
    
    rates = [
        (1e-4, 2),
        (1e-3, 4)
    ]

    fastai.utils.fit_generator(model, t_batches, rates, val_batches=v_batches)
    
#     model.optimizer.lr.set_value(1e-4)
#     h = model.fit_generator(
#         t_batches, 
#         t_batches.nb_sample, 
#         nb_epoch=2, 
#         validation_data=v_batches, 
#         nb_val_samples=v_batches.nb_sample
#     )

#     model.optimizer.lr.set_value(1e-3)
#     h = model.fit_generator(
#         t_batches, 
#         t_batches.nb_sample, 
#         nb_epoch=4, 
#         validation_data=v_batches, 
#         nb_val_samples=v_batches.nb_sample
#     )
    
    return model

In [None]:
cm = get_conv_model()
# cm.summary()

Learning rate: 0.0001
Epoch 1/6
Epoch 2/6
Learning rate: 0.001
Epoch 3/6
Epoch 4/6
Epoch 5/6

Very rapidly learning the training data and failing to generalize to the validation dataset.

# data augmentation

In [None]:
gen_t = image.ImageDataGenerator(width_shift_range=0.1)
batches = get_batches(data_path+'train', gen_t, batch_size=batch_size)

model = get_conv_model(batches)

In [None]:
gen_t = image.ImageDataGenerator(height_shift_range=0.05)
batches = get_batches(data_path+'train', gen_t, batch_size=batch_size)

model = get_conv_model(batches)

In [None]:
gen_t = image.ImageDataGenerator(shear_range=0.1)
batches = get_batches(data_path+'train', gen_t, batch_size=batch_size)

model = get_conv_model(batches)

In [None]:
gen_t = image.ImageDataGenerator(rotation_range=15)
batches = get_batches(data_path+'train', gen_t, batch_size=batch_size)

model = get_conv_model(batches)

In [None]:
gen_t = image.ImageDataGenerator(channel_shift_range=20)
batches = get_batches(data_path+'train', gen_t, batch_size=batch_size)

model = get_conv_model(batches)

# all together

In [None]:
gen_t = image.ImageDataGenerator(
    width_shift_range=0.1,
    height_shift_range=0.05,
    shear_range=0.1,
    rotation_range=15,
    channel_shift_range=20
)
batches = get_batches(data_path + 'train', gen_t, batch_size=batch_size)

model = get_conv_model(batches)

In [None]:
rates = [
    (1e-4, 30),
]

fastai.utils.fit_generator(model, batches, rates, val_batches=v_batches)

# model.optimizer.lr.set_value(0.0001)
# model.fit_generator(
#     batches, batches.nb_sample, 
#     nb_epoch=5, 
#     validation_data=v_batches, nb_val_samples=v_batches.nb_sample)
# model.fit_generator(
#     batches, batches.nb_sample, 
#     nb_epoch=25, 
#     validation_data=v_batches, nb_val_samples=v_batches.nb_sample)

In [None]:
vf_batches = get_batches(full_data_path + 'valid', batch_size=2*batch_size, shuffle=False)
model.evaluate_generator(vf_batches, vf_batches.nb_sample)

In [None]:
model.summary()

In [None]:
model.save(data_path + 'state-farm-cnn.h5')

# reload model

# full training plot

In [None]:
# history = []

# gen_t = image.ImageDataGenerator(
#     width_shift_range=0.1,
#     height_shift_range=0.05,
#     shear_range=0.1,
#     rotation_range=15,
#     channel_shift_range=20
# )
# batches = get_batches(data_path + 'train', gen_t, batch_size=batch_size)

# model = get_conv_model(train=False)
# model.optimizer.lr.set_value(1e-4)
# h = model.fit_generator(
#     batches, 
#     batches.nb_sample, 
#     nb_epoch=2, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )
# history.append(h)

# model.optimizer.lr.set_value(1e-3)
# h = model.fit_generator(
#     batches, 
#     batches.nb_sample, 
#     nb_epoch=4, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )
# history.append(h)

In [None]:
# model.optimizer.lr.set_value(0.0001)
# h = model.fit_generator(
#     batches, batches.nb_sample, 
#     nb_epoch=5, 
#     validation_data=v_batches, nb_val_samples=v_batches.nb_sample)
# history.append(h)
# h = model.fit_generator(
#     batches, batches.nb_sample, 
#     nb_epoch=25, 
#     validation_data=v_batches, nb_val_samples=v_batches.nb_sample)
# history.append(h)

In [None]:
# acc = []
# val_acc = []
# for h in history:
#     acc += h.history['acc']
#     val_acc += h.history['val_acc']
    
# plt.plot(acc)
# plt.plot(val_acc)

# submit

Compute test set output and actually submit.

In [None]:
from fastai import kaggle

def submission_df(preds, test_batches, classes):
    # construct dataframe of the submission
    index = pd.Series(
        [f.split('/')[-1] for f in test_batches.filenames],
        name='img'
    )

    df = pd.DataFrame(
        preds,
        index=index,
        columns=classes
    )

    return df.sort_index()

In [None]:
test_batches = get_batches(
    full_data_path + 'test', shuffle=False, batch_size=batch_size * 2,
    class_mode=None)
train_batches = get_batches(
    full_data_path + 'train', shuffle=False, batch_size=batch_size,
    class_mode=None)
classes = sorted(train_batches.class_indices)

In [None]:
model = keras.models.load_model(samp_data_path + 'state-farm-cnn.h5')

In [None]:
# predict
preds = model.predict_generator(test_batches, test_batches.samples // test_batches.batch_size)

In [None]:
df = submission_df(preds, test_batches, classes)
df = df.clip(0.05, 0.95)
df.to_csv(full_data_path + 'submission.csv', index=True)

In [None]:
import subprocess

cmd = [
    'kg',
    'submit',
    '-u', os.environ['KAGGLE_USERNAME'],
    '-p', os.environ['KAGGLE_PASSWORD'],
    '-c', 'state-farm-distracted-driver-detection',
    full_data_path + 'submission.csv'
]

subprocess.call(cmd)