# State Farm Competition

In [None]:
# Display the notebook's current working directory; the relative paths used
# below (e.g. "data/state/") are resolved against it.
%pwd

In [None]:
# Import the required modules. Plain VGG16 is not imported here; the batch-norm variant (Vgg16BN) is imported in a later cell.
%matplotlib inline
from __future__ import print_function, division
path = "data/state/"
import utils; reload(utils)
from utils import *
from IPython.display import FileLink
from shutil import copyfile

In [None]:
# Mini-batch size handed to get_batches for the training and test generators;
# the validation generator is built with twice this value.
batch_size = 4

## Create Validation Data
### *Run only the first time, to set up the validation dataset*

In [None]:
# Move into the training-image directory so that the 'c?' glob patterns and the
# relative '../valid/' paths in the next cells resolve against it.
%cd data/state/train

In [None]:
# Create one validation sub-directory per class directory ('c?') found in the
# current (train) directory.
# os.makedirs also creates a missing '../valid' parent, and the isdir guard
# makes re-running the cell harmless; plain os.mkdir raised OSError in both
# of those situations.
for d in glob('c?'):
    valid_dir = '../valid/' + d
    if not os.path.isdir(valid_dir):
        os.makedirs(valid_dir)

In [None]:
# Move a random sample of up to 2000 training images into the matching class
# directory under ../valid/ to form the validation set.
g = glob('c?/*.jpg')
shuf = np.random.permutation(g)
# Slicing (instead of indexing shuf[i] for i in range(2000)) avoids an
# IndexError when fewer than 2000 images are available.
for fname in shuf[:2000]:
    os.rename(fname, '../valid/' + fname)

In [None]:
# Go back up three levels, undoing the earlier `%cd data/state/train`.
%cd ../../..

## Create batches

In [None]:
# Build the batch iterators for the train and valid directories.
# The validation iterator uses double the training batch size.
batches = get_batches(path+'train', batch_size=batch_size)
val_batches = get_batches(path+'valid', batch_size=batch_size*2)

In [None]:
# Unpack the seven values returned by get_classes(path), in tuple order.
# Presumably class ids, labels and file names for the valid/train splits plus
# the test file names — verify against the get_classes helper.
(val_classes, trn_classes, val_labels, trn_labels, val_filenames, filenames, test_filenames) = get_classes(path)

## Using Imagenet Features

In [None]:
from vgg16bn import Vgg16BN
# vgg_ft_bn is not defined in this notebook; presumably it is provided by
# `from utils import *`. The argument 10 is presumably the number of output
# classes (directories c0..c9) — confirm against the helper.
model = vgg_ft_bn(10)

In [None]:
# Initial training run: 3 epochs, with the per-epoch sample count set to the
# size of the training set and validation sized to the whole validation set.
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Second training run, intended to use a learning rate of 0.0001.
# NOTE(review): this rebinds the optimizer's `lr` attribute to a plain float.
# Depending on the Keras version/backend, the training function built during
# the previous fit may keep using the original learning-rate variable, in which
# case this assignment has no effect — confirm. Updating the variable in place
# (e.g. via the backend's set_value) is the usual alternative.
model.optimizer.lr = 0.0001
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Checkpoint the weights of the model trained without augmentation.
# The results/ directory under `path` must already exist.
model.save_weights(path+'results/simple_vgg.h5')

## Data Augmentation

Applying the augmentations one at a time seems to work better than applying them all at once, although the reason is not yet understood. Trained this way, the model learns much better and overfitting is reduced substantially.

In [None]:
# Augmentation step 1: rebuild the training batches with a generator that only
# applies width_shift_range=0.1.
gen_t = image.ImageDataGenerator(width_shift_range=0.1)
batches = get_batches(path+'train', gen_t, batch_size=batch_size)

In [None]:
# Train 3 more epochs on the width-shifted batches.
# NOTE(review): assigning a plain float to optimizer.lr may not change the
# learning rate used by an already-built training function — confirm.
model.optimizer.lr=0.0001
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Augmentation step 2: rebuild the training batches with a generator that only
# applies height_shift_range=0.05.
gen_t = image.ImageDataGenerator(height_shift_range=0.05)
batches = get_batches(path+'train', gen_t, batch_size=batch_size)

In [None]:
# Train 3 more epochs on the height-shifted batches.
# NOTE(review): assigning a plain float to optimizer.lr may not change the
# learning rate used by an already-built training function — confirm.
model.optimizer.lr=0.0001
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Augmentation step 3: rebuild the training batches with a generator that only
# applies shear_range=0.1.
gen_t = image.ImageDataGenerator(shear_range=0.1)
batches = get_batches(path+'train', gen_t, batch_size=batch_size)

In [None]:
# Train 3 more epochs on the sheared batches.
# NOTE(review): assigning a plain float to optimizer.lr may not change the
# learning rate used by an already-built training function — confirm.
model.optimizer.lr=0.0001
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Augmentation step 4: rebuild the training batches with a generator that only
# applies channel_shift_range=20.
gen_t = image.ImageDataGenerator(channel_shift_range=20)
batches = get_batches(path+'train', gen_t, batch_size=batch_size)

In [None]:
# Train 3 more epochs on the channel-shifted batches.
# NOTE(review): assigning a plain float to optimizer.lr may not change the
# learning rate used by an already-built training function — confirm.
model.optimizer.lr=0.0001
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Checkpoint the weights after the four augmentation training passes.
model.save_weights(path+'results/data_aug.h5')

## Pseudo Labeling

In [None]:
# Predict class probabilities for every test image.
# shuffle=False keeps the predictions in the generator's directory order so
# that they line up row-for-row with test_filenames when the results table is
# built; a shuffled generator would pair each file name with another image's
# prediction.
# NOTE(review): assumes get_batches accepts a `shuffle` keyword and shuffles by
# default, as in the fast.ai course utils — confirm against the helper.
test_batches = get_batches(path+'test', shuffle=False, batch_size=batch_size)
test_feat = model.predict_generator(test_batches, test_batches.nb_sample)

In [None]:
# Spreadsheet that will hold the per-image test predictions used for
# pseudo-labelling.
result_name = path+'results/result.xlsx'

In [None]:
def do_clip(arr, mx, n_classes=10):
    """Clip predicted probabilities into a bounded range.

    Every value is limited to at most ``mx`` and at least
    ``(1 - mx) / (n_classes - 1)``, i.e. the share each remaining class gets
    when one class is assigned ``mx`` and the rest is split evenly.
    (Presumably used to soften over-confident predictions before scoring —
    confirm with the competition metric.)

    Args:
        arr: array-like of predicted probabilities.
        mx: upper bound for any single probability.
        n_classes: number of classes; the default of 10 reproduces the
            previously hard-coded divisor of 9.

    Returns:
        numpy array of the same shape as ``arr`` with the values clipped.

    Raises:
        ValueError: if ``n_classes`` is smaller than 2, since the lower bound
            would require a division by zero.
    """
    if n_classes < 2:
        raise ValueError("n_classes must be at least 2")
    # 1.0 forces true division even where `from __future__ import division`
    # is not in effect.
    lower = (1.0 - mx) / (n_classes - 1)
    return np.clip(arr, lower, mx)

In [None]:
# Clip the test predictions with an upper bound of 0.93 (see do_clip for the
# matching lower bound).
subm = do_clip(test_feat,0.93)

In [None]:
# Class names ordered by the integer index stored for each of them in the
# training generator's class_indices mapping.
classes = list(batches.class_indices)
classes.sort(key=batches.class_indices.get)

In [None]:
# Assemble the prediction table: one probability column per class, preceded by
# an 'img' column holding the bare image file name.
submission = pd.DataFrame(subm, columns=classes)
# basename strips whatever sub-directory prefix precedes the file name. The
# previous fixed a[4:] slice only worked for a 4-character prefix, which does
# not match the 'unknown/' test folder used later in this notebook.
submission.insert(0, 'img', [os.path.basename(a) for a in test_filenames])

In [None]:
# Write the prediction table to the spreadsheet, without the DataFrame index
# column, so that column 1 is 'img' and the class columns start at column 2.
submission.to_excel(result_name, index=False)

In [None]:
from openpyxl import *

In [None]:
# Re-open the spreadsheet written by to_excel. result_name is reused so the
# path cannot drift from the one the file was saved under.
wb = load_workbook(result_name)
# Index access replaces the deprecated Workbook.get_sheet_by_name().
# 'Sheet1' is the sheet name pandas uses by default in to_excel.
sheet = wb['Sheet1']

In [None]:
# Pseudo-labelling: for every prediction row in the sheet, find the class
# column with the highest probability and copy the corresponding test image
# into that class's training directory.
#
# Sheet layout (written by to_excel above): row 1 is the header, column 1 is
# the image name, columns 2 .. len(classes)+1 hold one probability per class.
#
# Fixes relative to the previous version of this cell:
#   * rows run to nb_sample + 1, so the last two images are no longer skipped;
#   * columns stop at the last class column instead of running to batches.n
#     (the number of training samples), which scanned thousands of empty cells;
#   * the source file is addressed directly rather than re-globbing and
#     linearly scanning the test directory (and cd-ing in and out) per row;
#   * the inner loop no longer reuses the outer loop variable.
test_dir = path + 'test/unknown/'
first_col = 2
last_col = len(classes) + 1
for row in range(2, test_batches.nb_sample + 2):
    best_col = first_col
    best_val = sheet.cell(row=row, column=first_col).value
    for col in range(first_col + 1, last_col + 1):
        val = sheet.cell(row=row, column=col).value
        # Strict '>' keeps the first column on ties, as before.
        if val > best_val:
            best_val, best_col = val, col
    # basename tolerates an 'img' value that still carries a directory prefix.
    img = os.path.basename(str(sheet.cell(row=row, column=1).value))
    src = test_dir + img
    if os.path.isfile(src):
        # Column 2 corresponds to classes[0], column 3 to classes[1], ...
        copyfile(src, path + 'train/' + classes[best_col - 2] + '/' + img)

In [None]:
# Rebuild the training generator so it picks up the pseudo-labelled test images
# that were copied into train/. The shared batch_size setting is used instead
# of a hard-coded 4 so this cell stays consistent with the other generators.
batches = get_batches(path+'train', batch_size=batch_size)

In [None]:
# Train 3 more epochs on the training set enlarged with pseudo-labelled images.
model.fit_generator(batches, samples_per_epoch = batches.nb_sample, nb_epoch = 3, 
                    validation_data = val_batches, nb_val_samples = val_batches.nb_sample)

In [None]:
# Checkpoint the weights obtained after pseudo-label training.
model.save_weights(path+'results/pseudo_lab.h5')

### Submission

In [None]:
# Restore a saved checkpoint before predicting for the submission.
# NOTE(review): this loads the first checkpoint (simple_vgg.h5), not the later
# data_aug.h5 or pseudo_lab.h5 weights — confirm this is the intended model.
model.load_weights(path+'results/simple_vgg.h5')

In [None]:
# Predict class probabilities for every test image with the restored weights.
# shuffle=False keeps the predictions in the generator's directory order so
# that they line up row-for-row with test_filenames in the submission; a
# shuffled generator would pair each file name with another image's prediction.
# NOTE(review): assumes get_batches accepts a `shuffle` keyword and shuffles by
# default, as in the fast.ai course utils — confirm against the helper.
test_batches = get_batches(path+'test', shuffle=False, batch_size=batch_size)
test_feat = model.predict_generator(test_batches, test_batches.nb_sample)

In [None]:
# Clip the submission predictions with an upper bound of 0.93 (see do_clip for
# the matching lower bound).
subm = do_clip(test_feat,0.93)

In [None]:
# Destination CSV for the competition submission.
subm_name = path+'results/subm.csv'

In [None]:
# Assemble the submission table: one probability column per class, preceded by
# an 'img' column holding the bare image file name.
submission = pd.DataFrame(subm, columns=classes)
# basename strips whatever sub-directory prefix precedes the file name. The
# previous fixed a[4:] slice only worked for a 4-character prefix, which does
# not match the 'unknown/' test folder used in this notebook.
submission.insert(0, 'img', [os.path.basename(a) for a in test_filenames])
# Preview the first rows as the cell output.
submission.head()

In [None]:
# Write the submission table to CSV without the DataFrame index column.
submission.to_csv(subm_name, index=False)

In [None]:
# Render a link to the submission file in the notebook output.
FileLink(subm_name)