# State Farm Competition

In [None]:
# Print the current working directory; the relative paths used below are resolved from it.
%pwd

In [None]:
#Import the required modules. Vgg16 (used below) is not imported by name; it is expected to come from the utils star import.
%matplotlib inline
from __future__ import print_function, division
# Root directory of the competition data; 'train', 'valid', 'test' and 'results/...' are appended to it below.
path = "data/state/"
import utils; reload(utils)
from utils import *
from IPython.display import FileLink
from shutil import copyfile

In [None]:
# Mini-batch size passed to get_batches for the train and test generators
# (the validation generator is created with twice this value).
batch_size = 4

## Create Validation Data
### **Run only the first time, to set up the validation dataset**

In [None]:
# Move into the training image directory so the glob patterns in the next cells resolve relative to it.
%cd data/state/train

In [None]:
# Create one validation directory for every directory matching 'c?' in train/.
# os.makedirs also creates '../valid' itself when it does not exist yet (os.mkdir
# would raise in that case), and the isdir guard lets the cell be re-run without
# failing on directories that are already there.
for d in glob('c?'):
    valid_dir = '../valid/' + d
    if not os.path.isdir(valid_dir):
        os.makedirs(valid_dir)

In [None]:
# Pick 2000 training images at random and move each one to the same
# relative location ('c?/<file>.jpg') under ../valid/.
g = glob('c?/*.jpg')
shuf = np.random.permutation(g)
for i in range(2000):
    picked = shuf[i]
    os.rename(picked, '../valid/' + picked)

In [None]:
# Go back up three levels, undoing the earlier change into data/state/train.
%cd ../../..

## Create batches

In [None]:
# Generators over the train and valid directories; the validation generator
# is created with twice the training batch size.
batches = get_batches(path+'train', batch_size=batch_size)
val_batches = get_batches(path+'valid', batch_size=batch_size*2)

In [None]:
# Unpack the seven values returned by get_classes for the data directory.
# NOTE(review): get_classes is not defined in this notebook (presumably it comes from the
# utils star import); the names suggest class ids, labels and file names for the valid/train
# sets plus the test file names — confirm against its definition.
(val_classes, trn_classes, val_labels, trn_labels, val_filenames, filenames, test_filenames) = get_classes(path)

## Using Imagenet Features

In [None]:
# Instantiate Vgg16 and keep a handle on its underlying model,
# which the following cells modify in place (pop / add / compile).
vgg = Vgg16()
model=vgg.model

In [None]:
# Drop the last seven layers of the VGG model; per the original note this
# strips everything above the convolutional part of the network.
for _ in range(7):
    model.pop()

In [None]:
# Freeze every layer first; the last convolutional layer is marked trainable again further down in this cell.
for layer in model.layers: layer.trainable=False

def find():
    """Return the position in ``model.layers`` of the last Convolution2D layer.

    Reads the notebook-level ``model``. Only layers whose exact type is
    ``Convolution2D`` are considered. Raises ``IndexError`` when no layer
    matches, because the list of candidate positions is then empty.
    """
    conv_positions = []
    for idx, layer in enumerate(model.layers):
        if type(layer) is Convolution2D:
            conv_positions.append(idx)
    return conv_positions[-1]
        
# Look up the last Convolution2D layer and mark it as trainable again.
last_conv_idx = find()
model.layers[last_conv_idx].trainable = True

In [None]:
# New classifier head, appended in this order: batchnorm over axis 1, max-pooling,
# flatten, then two (dropout -> 256-unit relu dense -> batchnorm) stages and a
# final dropout -> 10-unit softmax output.
new_head = [
    BatchNormalization(axis=1),
    MaxPooling2D(),
    Flatten(),
    Dropout(0.5),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(10, activation='softmax'),
]
for head_layer in new_head:
    model.add(head_layer)

In [None]:
# Compile the modified model: Adam optimizer with lr=1e-5, categorical
# cross-entropy loss, and accuracy reported as a metric.
model.compile(optimizer=Adam(lr=0.00001), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# First training pass: 3 epochs over the training generator (one epoch = batches.nb_sample
# samples), with val_batches supplied as validation data.
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Raise the learning rate to 1e-4 and train for 3 more epochs.
# Assigning a float to model.optimizer.lr only rebinds the Python attribute: the training
# function Keras built during the previous fit keeps using the optimizer's original
# learning-rate variable, so the new value would be ignored. Recompiling with a fresh Adam
# makes the new rate take effect (layer weights and trainable flags are kept; Adam's
# running moment estimates start over).
model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

## Data Augmentation

Applying the augmentations one at a time seems to work better than applying them all at once; the reason is not yet understood. The model learns much better this way, and overfitting is reduced substantially.

In [None]:
# Rebuild the training generator with a single augmentation: width_shift_range=0.1.
gen_t = image.ImageDataGenerator(width_shift_range=0.1)
batches = get_batches(path+'train', gen_t, batch_size=batch_size)

In [None]:
# Train 3 epochs on the current training generator at a learning rate of 1e-4.
# A plain `model.optimizer.lr = ...` assignment is ignored once the training function
# has been built by an earlier fit, so the model is recompiled with a fresh Adam to make
# the rate take effect (weights and trainable flags are kept; Adam's moment estimates reset).
model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Rebuild the training generator with a single augmentation: height_shift_range=0.05.
gen_t = image.ImageDataGenerator(height_shift_range=0.05)
batches = get_batches(path+'train', gen_t, batch_size=batch_size)

In [None]:
# Train 3 epochs on the current training generator at a learning rate of 1e-4.
# A plain `model.optimizer.lr = ...` assignment is ignored once the training function
# has been built by an earlier fit, so the model is recompiled with a fresh Adam to make
# the rate take effect (weights and trainable flags are kept; Adam's moment estimates reset).
model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Rebuild the training generator with a single augmentation: shear_range=0.1.
gen_t = image.ImageDataGenerator(shear_range=0.1)
batches = get_batches(path+'train', gen_t, batch_size=batch_size)

In [None]:
# Train 3 epochs on the current training generator at a learning rate of 1e-4.
# A plain `model.optimizer.lr = ...` assignment is ignored once the training function
# has been built by an earlier fit, so the model is recompiled with a fresh Adam to make
# the rate take effect (weights and trainable flags are kept; Adam's moment estimates reset).
model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Rebuild the training generator with a single augmentation: channel_shift_range=20.
gen_t = image.ImageDataGenerator(channel_shift_range=20)
batches = get_batches(path+'train', gen_t, batch_size=batch_size)

In [None]:
# Train 3 epochs on the current training generator at a learning rate of 1e-4.
# A plain `model.optimizer.lr = ...` assignment is ignored once the training function
# has been built by an earlier fit, so the model is recompiled with a fresh Adam to make
# the rate take effect (weights and trainable flags are kept; Adam's moment estimates reset).
model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Training generator with all four augmentations combined.
# The generator is asked to shuffle, like the other training generators in this notebook.
# With shuffle=False the files are presumably served in directory order, so consecutive
# mini-batches would each contain images of a single class.
gen_t = image.ImageDataGenerator(height_shift_range=0.05, 
                shear_range=0.1, channel_shift_range=20, width_shift_range=0.1)
batches = get_batches(path+'train', gen_t, batch_size=batch_size, shuffle=True)

In [None]:
# Train 3 epochs on the current training generator at a learning rate of 1e-3.
# A plain `model.optimizer.lr = ...` assignment is ignored once the training function
# has been built by an earlier fit, so the model is recompiled with a fresh Adam to make
# the rate take effect (weights and trainable flags are kept; Adam's moment estimates reset).
model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Final, longer run: 15 epochs on the current training generator at a learning rate of 1e-4.
# A plain `model.optimizer.lr = ...` assignment is ignored once the training function
# has been built by an earlier fit, so the model is recompiled with a fresh Adam to make
# the rate take effect (weights and trainable flags are kept; Adam's moment estimates reset).
model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 15, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

### Submission

In [None]:
def do_clip(arr, mx):
    """Clip every value of *arr* into the range [(1 - mx) / 9, mx].

    arr: array-like of values to clip.
    mx:  upper bound; the lower bound is derived from it as (1 - mx) / 9.
    Returns the clipped array produced by ``np.clip``.
    """
    floor = (1 - mx) / 9
    return np.clip(arr, floor, mx)

In [None]:
# Predict class probabilities for every test image with the model trained above
# (the original referenced `bn_model`, which is never defined in this notebook).
# shuffle=False keeps the prediction rows in the generator's file order so they can be
# paired with test_filenames when the submission is built.
# NOTE(review): assumes test_filenames from get_classes is in that same order — confirm.
test_batches = get_batches(path+'test', batch_size=batch_size, shuffle=False)
test_feat = model.predict_generator(test_batches, test_batches.nb_sample)

In [None]:
# Clip the test predictions into [(1 - 0.93) / 9, 0.93] before building the submission.
subm = do_clip(test_feat,0.93)

In [None]:
# Destination of the submission file (written gzip-compressed further down).
subm_name = path+'results/subm.gz'

In [None]:
# Class names ordered by the index stored for each in batches.class_indices;
# presumably this matches the column order of the predictions — verify.
classes = sorted(batches.class_indices, key=batches.class_indices.get)

In [None]:
# Build the submission table: an 'img' column first, followed by one column per class.
# a[4:] drops the first four characters of each test file name.
# NOTE(review): that assumes a four-character prefix (e.g. a sub-directory name plus '/')
# in test_filenames — verify against the actual values.
submission = pd.DataFrame(subm, columns=classes)
submission.insert(0, 'img', [a[4:] for a in test_filenames])
submission.head()

In [None]:
# Write the submission as a gzip-compressed CSV, without the DataFrame index.
submission.to_csv(subm_name, index=False, compression='gzip')

In [None]:
# Show a link to the written submission file in the notebook output.
FileLink(subm_name)