In [1]:
from __future__ import print_function, division

import fastai
import fastai.utils
from fastai.fautils import *

import pandas as pd

Using cuDNN version 5110 on context None
Mapped name None to device cuda: Tesla K80 (0000:00:1E.0)
Using Theano backend.


In [2]:
%matplotlib inline
import matplotlib as mlp
import matplotlib.pyplot as plt

In [3]:
# Dataset roots. Both end with '/' because later cells append the
# sub-directory names ('train', 'valid', 'test') by plain string concatenation.
full_data_path = os.path.expanduser('~/data/state-farm/')
samp_data_path = os.path.expanduser('~/data/sample-state-farm/')
# Active dataset for the experiments below: the sample set.
data_path = samp_data_path
# Training batch size; the validation/test generators below use twice this.
batch_size = 64

# overview

This notebook documents exploratory work on the State Farm Distracted Driver Detection Kaggle challenge. The biggest takeaways are:

1. Find the smallest sample size that produces consistent results
1. Start with very small models and quickly work up in complexity, till you're overfitting
1. Selecting the initial learning rate, and adjusting it throughout training, is really important
1. Get familiar with data augmentation, but remember you can't precompute your convolutional layers
1. Dropout is super important, but the value is dependent on your training set size, so you need to relearn it after you finish with the sample set

# create batches

In [4]:
# Batch generators for the active dataset: training batches use batch_size,
# validation batches use twice that and are read with shuffling disabled.
t_batches = get_batches(data_path + 'train', batch_size=batch_size)
v_batches = get_batches(
    data_path + 'valid',
    shuffle=False,
    batch_size=2 * batch_size
)

# get_classes returns seven values for the dataset under data_path; unpack
# them into the notebook namespace in the order the helper yields them.
(val_classes, trn_classes,
 val_labels, trn_labels,
 val_filenames, filenames,
 test_filename) = get_classes(data_path)

Found 1500 images belonging to 10 classes.
Found 1000 images belonging to 10 classes.
Found 1500 images belonging to 10 classes.
Found 1000 images belonging to 10 classes.
Found 0 images belonging to 0 classes.


# linear model

In [5]:
def get_lin_model():
    """Build and compile a linear softmax classifier over raw pixels.

    The network is: batch normalization on the (3, 224, 224) input, a
    flatten, and a single 10-unit softmax Dense layer. It is compiled with
    a default-configured Adam optimizer, categorical cross-entropy loss and
    an accuracy metric.

    Returns:
        The compiled (untrained) Sequential model.
    """
    layers = [
        # Leading batch norm stands in for normalizing the input by hand.
        BatchNormalization(axis=1, input_shape=(3, 224, 224)),
        Flatten(),
        Dense(10, activation='softmax')
    ]
    linear = Sequential(layers)
    linear.compile(
        optimizer=Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return linear

In [6]:
# Smoke-test the linear model: print its layer summary, then train for a
# single epoch with the optimizer left at its default settings.
lm = get_lin_model()
lm.summary()

lm.fit_generator(
    t_batches,
    samples_per_epoch=t_batches.nb_sample,
    nb_epoch=1,
    validation_data=v_batches,
    nb_val_samples=v_batches.nb_sample
)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
batchnormalization_1 (BatchNorma (None, 3, 224, 224)   12          batchnormalization_input_1[0][0] 
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 150528)        0           batchnormalization_1[0][0]       
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 10)            1505290     flatten_1[0][0]                  
Total params: 1,505,302
Trainable params: 1,505,296
Non-trainable params: 6
____________________________________________________________________________________________________
Epoch 1/1


<keras.callbacks.History at 0x7fef672de510>

In [7]:
# Run the model over the training generator and show the first ten
# prediction rows, rounded to two decimals.
np.round(
    lm.predict_generator(t_batches, val_samples=t_batches.nb_sample)[:10],
    decimals=2
)

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.]], dtype=float32)

Model is entirely predicting 2 of the classes.  Not very useful.  Lower the learning rate and try again.

In [8]:
# Start over with a fresh linear model and a much smaller learning rate
# (1e-5) for the first two epochs.
lm = get_lin_model()
lm.optimizer.lr.set_value(1e-5)

lm.fit_generator(
    t_batches,
    samples_per_epoch=t_batches.nb_sample,
    nb_epoch=2,
    validation_data=v_batches,
    nb_val_samples=v_batches.nb_sample
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fef64ee3310>

In [9]:
# Keep training the same model, now with the learning rate raised to 1e-3,
# for four more epochs.
lm.optimizer.lr.set_value(1e-3)

lm.fit_generator(
    t_batches,
    samples_per_epoch=t_batches.nb_sample,
    nb_epoch=4,
    validation_data=v_batches,
    nb_val_samples=v_batches.nb_sample
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fef64ee3d90>

Validation accuracy of 0.43 -- almost too good to be true, and definitely better than chance.

Now let's make sure the validation set is large enough to give a stable accuracy metric across runs, so we aren't drawing the wrong conclusions from the results we're observing.

In [10]:
# Separate validation generator for the repeatability check (shuffle is not
# overridden here, so get_batches' default ordering applies).
r_batches = get_batches(data_path + 'valid', batch_size=2 * batch_size)

# Evaluate the same model ten times; each entry is [loss, accuracy].
val_res = [
    lm.evaluate_generator(r_batches, val_samples=r_batches.nb_sample)
    for i in range(10)
]
np.round(val_res, decimals=2)

Found 1000 images belonging to 10 classes.


array([[ 9.19,  0.41],
       [ 9.26,  0.41],
       [ 9.46,  0.39],
       [ 9.08,  0.41],
       [ 9.23,  0.41],
       [ 9.24,  0.4 ],
       [ 9.41,  0.4 ],
       [ 9.04,  0.42],
       [ 9.19,  0.41],
       [ 8.98,  0.42]])

Consistent results.  Accuracy increases larger than 2% can't be attributed to chance.

In [16]:
# Fresh linear model, trained with a staged learning-rate schedule.
lm2 = get_lin_model()

# Each tuple is (learning rate, number of epochs to run at that rate);
# the training log shows 2 epochs at 1e-5 followed by 4 epochs at 1e-4.
rates = [
    (1e-5, 2),
    (1e-4, 4)
]

fastai.utils.fit_generator(lm2, t_batches, rates, val_batches=v_batches)

# Callback-based variant of the same schedule, kept for reference:
# lrsched = keras.callbacks.LearningRateScheduler(
#     fastai.utils.list_rate_schedule([
#         (1e-5, 2),
#         (1e-4, 4)
#     ],
#     output=True
#     )
# )

# lm2.fit_generator(
#     t_batches, 
#     t_batches.nb_sample, 
#     nb_epoch=6, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample,
#     callbacks=[lrsched]
# )

Learning rate: 1e-05
Epoch 1/6
Epoch 2/6
Learning rate: 0.0001
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f5197c20ed0>

In [11]:
# %run ../fastai/state_farm.py reg-lin-model

# regularized linear model

In [12]:
def get_reglin_model():
    """Build and compile the linear softmax classifier with L2 weight decay.

    Same stack as the plain linear model (input batch normalization,
    flatten, 10-unit softmax Dense layer), except that the Dense layer's
    weights carry an l2(0.01) regularizer. Compiled with a default Adam
    optimizer, categorical cross-entropy loss and an accuracy metric.

    Returns:
        The compiled (untrained) Sequential model.
    """
    layers = [
        BatchNormalization(axis=1, input_shape=(3, 224, 224)),
        Flatten(),
        # The L2 penalty on the weights is the only regularization here.
        Dense(10, activation='softmax', W_regularizer=l2(0.01))
    ]
    reg_linear = Sequential(layers)
    reg_linear.compile(
        optimizer=Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return reg_linear

In [14]:
# Train the regularized linear model with the staged schedule.
rlm = get_reglin_model()

# Each tuple is (learning rate, number of epochs to run at that rate).
rates = [
    (1e-5, 2),
    (1e-4, 4)
]

fastai.utils.fit_generator(rlm, t_batches, rates, val_batches=v_batches)

# Hand-written form of the same two training stages, kept for reference:
# rlm.optimizer.lr.set_value(1e-5)
# rlm.fit_generator(
#     t_batches, 
#     t_batches.nb_sample, 
#     nb_epoch=2, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )

# rlm.optimizer.lr.set_value(1e-4)
# rlm.fit_generator(
#     t_batches, 
#     t_batches.nb_sample, 
#     nb_epoch=4, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )

Epoch 1/2
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fef5eb68350>

A single linear layer with regularization is getting 45% accuracy.  It dipped at the end, which suggests it is still overfitting.

# single dense layer

In [15]:
def get_fc_model():
    """Build and compile a network with one hidden fully connected layer.

    Layers, in order: input batch normalization on (3, 224, 224), flatten,
    Dense(100, relu), batch normalization, Dense(10, softmax). Compiled
    with a default Adam optimizer, categorical cross-entropy loss and an
    accuracy metric.

    Returns:
        The compiled (untrained) Sequential model.
    """
    net = Sequential()
    net.add(BatchNormalization(axis=1, input_shape=(3, 224, 224)))
    net.add(Flatten())
    # Hidden layer followed by its own batch normalization.
    net.add(Dense(100, activation='relu'))
    net.add(BatchNormalization())
    net.add(Dense(10, activation='softmax'))

    net.compile(
        optimizer=Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return net

In [16]:
# Build the single-hidden-layer model and print its layer summary.
fc = get_fc_model()
fc.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
batchnormalization_5 (BatchNorma (None, 3, 224, 224)   12          batchnormalization_input_5[0][0] 
____________________________________________________________________________________________________
flatten_5 (Flatten)              (None, 150528)        0           batchnormalization_5[0][0]       
____________________________________________________________________________________________________
dense_5 (Dense)                  (None, 100)           15052900    flatten_5[0][0]                  
____________________________________________________________________________________________________
batchnormalization_6 (BatchNorma (None, 100)           400         dense_5[0][0]                    
___________________________________________________________________________________________

In [17]:
# Schedule for the dense model; each tuple is (learning rate, number of
# epochs): two warm-up epochs at 1e-5, then five epochs at 1e-2.
rates = [
    (1e-5, 2),
    (1e-2, 5)
]

fastai.utils.fit_generator(fc, t_batches, rates, val_batches=v_batches)

# Hand-written form of the same two training stages, kept for reference:
# fc.optimizer.lr.set_value(1e-5)
# fc.fit_generator(
#     t_batches, 
#     t_batches.nb_sample, 
#     nb_epoch=2, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )

# fc.optimizer.lr.set_value(0.01)
# fc.fit_generator(
#     t_batches, 
#     t_batches.nb_sample, 
#     nb_epoch=5, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )

Epoch 1/2
Epoch 2/2
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fef5dc68290>

Beginning to learn the training set, but failing at validation.

# simple conv layers

In [17]:
def get_conv_model(t_batches=None, v_batches=None, train=True):
    """Build, compile and optionally train a small convolutional network.

    Layers, in order: input batch normalization on (3, 224, 224); a 3x3
    convolution with 32 filters, batch normalization and 3x3 max pooling;
    a 3x3 convolution with 64 filters, batch normalization and 3x3 max
    pooling; flatten; Dense(200, relu); batch normalization;
    Dense(10, softmax). Compiled with a default Adam optimizer, categorical
    cross-entropy loss and an accuracy metric.

    Args:
        t_batches: training batch generator. When None, the notebook-level
            ``t_batches`` is used, looked up at call time.
        v_batches: validation batch generator. When None, the
            notebook-level ``v_batches`` is used, looked up at call time.
        train: when False, return the compiled model without training it.

    Returns:
        The compiled model; when ``train`` is true it has been trained for
        2 epochs at learning rate 1e-4 followed by 4 epochs at 1e-3.
    """
    model = Sequential([
        BatchNormalization(axis=1, input_shape=(3, 224, 224)),
        Convolution2D(32, 3, 3, activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D((3, 3)),
        Convolution2D(64, 3, 3, activation='relu'),
        BatchNormalization(axis=1),
        MaxPooling2D((3, 3)),
        Flatten(),
        Dense(200, activation='relu'),
        BatchNormalization(),
        Dense(10, activation='softmax')
    ])
    model.compile(
        Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    if not train:
        return model

    # Resolve the default generators when the function is called rather than
    # when it is defined: default argument values are evaluated once, at
    # `def` time, so binding the globals in the signature would keep using
    # stale generators after the batches cell is re-run (e.g. for another
    # data_path).
    if t_batches is None:
        t_batches = globals()['t_batches']
    if v_batches is None:
        v_batches = globals()['v_batches']

    # Each tuple is (learning rate, number of epochs at that rate).
    rates = [
        (1e-4, 2),
        (1e-3, 4)
    ]

    fastai.utils.fit_generator(model, t_batches, rates, val_batches=v_batches)

    return model

In [20]:
# Build and train the conv net on the default (un-augmented) training and
# validation batches.
cm = get_conv_model()
# cm.summary()

Epoch 1/2
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


Very rapidly learning the training data and failing to generalize to the validation dataset.

# data augmentation

In [21]:
# Augmentation experiment 1: horizontal shifts only (width_shift_range=0.1).
gen_t = image.ImageDataGenerator(width_shift_range=0.1)
batches = get_batches(data_path + 'train', gen_t, batch_size=batch_size)

# Train a fresh conv net on the augmented training generator.
model = get_conv_model(t_batches=batches)

Found 1500 images belonging to 10 classes.
Epoch 1/2
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [22]:
# Augmentation experiment 2: vertical shifts only (height_shift_range=0.05).
gen_t = image.ImageDataGenerator(height_shift_range=0.05)
batches = get_batches(data_path + 'train', gen_t, batch_size=batch_size)

# Train a fresh conv net on the augmented training generator.
model = get_conv_model(t_batches=batches)

Found 1500 images belonging to 10 classes.
Epoch 1/2
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [23]:
# Augmentation experiment 3: shear only (shear_range=0.1).
gen_t = image.ImageDataGenerator(shear_range=0.1)
batches = get_batches(data_path + 'train', gen_t, batch_size=batch_size)

# Train a fresh conv net on the augmented training generator.
model = get_conv_model(t_batches=batches)

Found 1500 images belonging to 10 classes.
Epoch 1/2
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [24]:
# Augmentation experiment 4: rotation only (rotation_range=15).
gen_t = image.ImageDataGenerator(rotation_range=15)
batches = get_batches(data_path + 'train', gen_t, batch_size=batch_size)

# Train a fresh conv net on the augmented training generator.
model = get_conv_model(t_batches=batches)

Found 1500 images belonging to 10 classes.
Epoch 1/2
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [25]:
# Augmentation experiment 5: channel shifts only (channel_shift_range=20).
gen_t = image.ImageDataGenerator(channel_shift_range=20)
batches = get_batches(data_path + 'train', gen_t, batch_size=batch_size)

# Train a fresh conv net on the augmented training generator.
model = get_conv_model(t_batches=batches)

Found 1500 images belonging to 10 classes.
Epoch 1/2
Epoch 2/2
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


# all together

In [18]:
# Combine every augmentation tried individually above, with the same values.
gen_t = image.ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.05,
    shear_range=0.1,
    channel_shift_range=20
)
batches = get_batches(data_path + 'train', gen_t, batch_size=batch_size)

# Train a fresh conv net on the fully augmented training generator.
model = get_conv_model(t_batches=batches)

Found 1500 images belonging to 10 classes.


KeyboardInterrupt: 

In [None]:
# Long fine-tuning run at a low learning rate.
# Each tuple is (learning rate, number of epochs at that rate).
rates = [
    (1e-4, 30),
]

# Train on the augmented `batches` generator created in the cell above, not
# on the plain `t_batches`: this section is about training with all the
# augmentations combined, and feeding the un-augmented generator here would
# drop them for the bulk of the training.
fastai.utils.fit_generator(model, batches, rates, val_batches=v_batches)

In [31]:
# Score the model on the validation split of the *full* dataset, read in a
# fixed order with the doubled batch size; the result is [loss, accuracy].
vf_batches = get_batches(
    full_data_path + 'valid',
    shuffle=False,
    batch_size=2 * batch_size
)
model.evaluate_generator(vf_batches, val_samples=vf_batches.nb_sample)

Found 2109 images belonging to 10 classes.


[1.0532610309016597, 0.71692745405217939]

In [29]:
# Print the layer-by-layer summary of the current conv model.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
batchnormalization_31 (BatchNorm (None, 3, 224, 224)   12          batchnormalization_input_12[0][0]
____________________________________________________________________________________________________
convolution2d_13 (Convolution2D) (None, 32, 222, 222)  896         batchnormalization_31[0][0]      
____________________________________________________________________________________________________
batchnormalization_32 (BatchNorm (None, 32, 222, 222)  128         convolution2d_13[0][0]           
____________________________________________________________________________________________________
maxpooling2d_13 (MaxPooling2D)   (None, 32, 74, 74)    0           batchnormalization_32[0][0]      
___________________________________________________________________________________________

In [30]:
# Persist the trained model as an HDF5 file under the active dataset directory.
model.save(data_path + 'state-farm-cnn.h5')

# reload model

# full training plot

In [32]:
# history = []

# gen_t = image.ImageDataGenerator(
#     width_shift_range=0.1,
#     height_shift_range=0.05,
#     shear_range=0.1,
#     rotation_range=15,
#     channel_shift_range=20
# )
# batches = get_batches(data_path + 'train', gen_t, batch_size=batch_size)

# model = get_conv_model(train=False)
# model.optimizer.lr.set_value(1e-4)
# h = model.fit_generator(
#     batches, 
#     batches.nb_sample, 
#     nb_epoch=2, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )
# history.append(h)

# model.optimizer.lr.set_value(1e-3)
# h = model.fit_generator(
#     batches, 
#     batches.nb_sample, 
#     nb_epoch=4, 
#     validation_data=v_batches, 
#     nb_val_samples=v_batches.nb_sample
# )
# history.append(h)

In [33]:
# model.optimizer.lr.set_value(0.0001)
# h = model.fit_generator(
#     batches, batches.nb_sample, 
#     nb_epoch=5, 
#     validation_data=v_batches, nb_val_samples=v_batches.nb_sample)
# history.append(h)
# h = model.fit_generator(
#     batches, batches.nb_sample, 
#     nb_epoch=25, 
#     validation_data=v_batches, nb_val_samples=v_batches.nb_sample)
# history.append(h)

In [34]:
# acc = []
# val_acc = []
# for h in history:
#     acc += h.history['acc']
#     val_acc += h.history['val_acc']
    
# plt.plot(acc)
# plt.plot(val_acc)

# submit

Compute test set output and actually submit.

In [4]:
from fastai import kaggle

def submission_df(preds, test_batches, classes):
    """Assemble the submission table from test-set predictions.

    Args:
        preds: 2-D predictions, one row per test image and one column per
            class, in the same order as ``test_batches.filenames``.
        test_batches: object exposing ``filenames``, the relative paths of
            the test images in prediction order.
        classes: column labels, one per prediction column.

    Returns:
        A DataFrame indexed by image file name (index named ``'img'``),
        with one column per class, sorted by file name.
    """
    import os

    # Keep only the file name of each path. os.path.basename follows the
    # platform's path rules instead of assuming '/' is the separator.
    index = pd.Index(
        [os.path.basename(path) for path in test_batches.filenames],
        name='img'
    )

    df = pd.DataFrame(
        preds,
        index=index,
        columns=classes
    )

    return df.sort_index()

In [5]:
# Generators over the full dataset, read in a fixed order and without labels
# (class_mode=None). The training generator is only created to recover the
# class names.
test_batches = get_batches(
    full_data_path + 'test', shuffle=False, batch_size=batch_size * 2,
    class_mode=None)
train_batches = get_batches(
    full_data_path + 'train', shuffle=False, batch_size=batch_size,
    class_mode=None)
# Order the class names by their class index, so the submission columns are
# tied to the index assigned to each class rather than relying on
# alphabetical order happening to match it.
classes = sorted(train_batches.class_indices, key=train_batches.class_indices.get)

Found 79726 images belonging to 1 classes.
Found 20315 images belonging to 10 classes.


In [None]:
# Reload the CNN saved earlier in this notebook.
# NOTE(review): the save cell writes to data_path while this reads from
# samp_data_path; the two only coincide while data_path == samp_data_path.
# Confirm this is intended before switching data_path to the full dataset.
model = keras.models.load_model(samp_data_path + 'state-farm-cnn.h5')

In [10]:
# predict
# Predict class probabilities for every image in the test generator.
preds = model.predict_generator(
    test_batches,
    val_samples=test_batches.nb_sample
)

KeyboardInterrupt: 

In [None]:
# Build the submission table and clamp every probability into [0.05, 0.95]
# (presumably to limit the penalty for confident wrong predictions), then
# write it out with the image name as the first column.
df = submission_df(preds, test_batches, classes).clip(lower=0.05, upper=0.95)
df.to_csv(full_data_path + 'submission.csv', index=True)

In [None]:
import subprocess

# Assemble the `kg submit` command line piece by piece. Credentials come from
# the environment rather than being hard-coded in the notebook.
# NOTE(review): the password is still passed as a command-line argument, so
# it may be visible in the process list while kg runs - check whether the
# CLI offers a safer way to supply it.
cmd = ['kg', 'submit']
cmd += ['-u', os.environ['KAGGLE_USERNAME']]
cmd += ['-p', os.environ['KAGGLE_PASSWORD']]
cmd += ['-c', 'state-farm-distracted-driver-detection']
cmd += [full_data_path + 'submission.csv']

# The command's exit status is the cell's displayed result.
subprocess.call(cmd)