In [1]:
# Rather than importing everything manually, we'll make things easy
#   and load them all in utils.py, and just import them from there.
%matplotlib inline
import utils; reload(utils)
from utils import *

Using gpu device 0: Tesla K80 (CNMeM is disabled)
Using Theano backend.


In [2]:
%matplotlib inline
from __future__ import division,print_function
import os, json
from glob import glob
import numpy as np
import scipy
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt
import utils; reload(utils)
from utils import plots, get_batches, plot_confusion_matrix, get_data

In [3]:
from numpy.random import random, permutation
from scipy import misc, ndimage
from scipy.ndimage.interpolation import zoom

import keras
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.models import Sequential
from keras.layers import Input
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, RMSprop
from keras.preprocessing import image

In [4]:
#path = "../data/dogsandcats_small/" # we copied a fraction of the full set for tests
# Root directory of the dataset; swap in the commented-out path above to run on the small sample.
path = "../data/dogsandcats/"
# Directory used by this notebook for saved arrays; created on first run.
model_path = path + "models/"
if not os.path.exists(model_path):
    # makedirs (unlike mkdir) also creates missing parent directories, so a fresh
    # checkout without ../data/dogsandcats/ does not fail with OSError here.
    os.makedirs(model_path)
    print('Done')

In [5]:
from vgg16 import Vgg16

In [16]:
# Number of images per batch requested from the directory iterators in this notebook.
batch_size = 100

In [17]:
def get_batches(dirname, gen=None, shuffle=True,
                batch_size=batch_size, class_mode='categorical'):
    """Return a directory iterator over the images under ``path + dirname``.

    Note: this definition shadows the ``get_batches`` imported from ``utils``.

    Args:
        dirname: sub-directory name, appended to the module-level ``path``.
        gen: generator object providing ``flow_from_directory``; when None a
            default ``image.ImageDataGenerator()`` is created for this call.
        shuffle: forwarded to ``flow_from_directory``.
        batch_size: forwarded to ``flow_from_directory``; the default is the
            value of the module-level ``batch_size`` when this cell was run.
        class_mode: forwarded to ``flow_from_directory``.

    Returns:
        Whatever ``gen.flow_from_directory`` returns, requested with
        ``target_size=(224,224)``.
    """
    if gen is None:
        # Build the generator per call rather than in the signature, where a single
        # instance would be created at definition time and shared by every call.
        gen = image.ImageDataGenerator()
    return gen.flow_from_directory(path+dirname, target_size=(224,224),
                class_mode=class_mode, shuffle=shuffle, batch_size=batch_size)

In [18]:
# Create the validation and training iterators using the notebook-wide batch_size.
# Shuffling is turned off so the iterator order stays fixed while outputs are stored.
val_batches = get_batches('valid', shuffle=False, batch_size=batch_size) # no shuffle as we store conv output
trn_batches = get_batches('train', shuffle=False, batch_size=batch_size) # no shuffle as we store conv output

Found 4000 images belonging to 2 classes.
Found 21000 images belonging to 2 classes.


In [19]:
# Show the first ten validation file names as a quick check of the iterator's ordering.
val_batches.filenames[0:10]

['cat/cat.1262.jpg',
 'cat/cat.9495.jpg',
 'cat/cat.3044.jpg',
 'cat/cat.1424.jpg',
 'cat/cat.8210.jpg',
 'cat/cat.8847.jpg',
 'cat/cat.308.jpg',
 'cat/cat.10802.jpg',
 'cat/cat.5060.jpg',
 'cat/cat.10406.jpg']

In [20]:
# Build label arrays from each iterator's `classes` attribute.
# NOTE(review): `onehot` comes from the `utils` star-import; presumably it one-hot
# encodes the class ids — verify against utils.py.
val_labels = onehot(val_batches.classes)
trn_labels = onehot(trn_batches.classes)

In [39]:
# DONT USE IT FOR NOW
# Disabled experiment (guarded by `if False`, so nothing below runs): it would build
# `conv_model` from the first group of layers returned by split_at on a fresh Vgg16.
# NOTE(review): `split_at` comes from the `utils` star-import — semantics not visible here.
if False:
    realvgg = Vgg16()
    conv_layers, fc_layers = split_at(realvgg.model, Convolution2D)
    conv_model = Sequential(conv_layers)

In [21]:
# Instantiate the Vgg16 wrapper and call pop() twice on its underlying model.
# NOTE(review): presumably this strips the last two layers so the model outputs the
# 4096-wide features consumed by ll_model below — confirm against vgg16.py.
vggbase = Vgg16()
vggbase.model.pop()
vggbase.model.pop()

  .format(self.name, input_shape))


### Computing the validation-set features takes 1 or 2 minutes the first time; later runs load them from disk

In [40]:
# DONT USE IT FOR NOW
# Disabled (guarded by the outer `if False`, so nothing below runs): load-or-compute
# cache for `val_features`, produced by `conv_model` on the validation batches and
# stored at model_path + 'valid_convlayer_features.bc'.
if False:
    try:
        val_features = load_array(model_path+'valid_convlayer_features.bc')
        if False: # force update
            raise
    except:
        print('Missing file')
        val_features = conv_model.predict_generator(val_batches, val_batches.nb_sample)
        save_array(model_path + 'valid_convlayer_features.bc', val_features)

In [22]:
# Load the cached vggbase outputs for the validation set, or compute and save them
# when loading fails.
try:
    val_vggfeatures = load_array(model_path+'valid_vggbase_features.bc')
    if False: # force update: change to True to ignore the cache and recompute
        raise IOError('forced recompute of valid_vggbase_features.bc')
except Exception:
    # Catch Exception instead of using a bare except, so KeyboardInterrupt and
    # SystemExit are not swallowed and silently turned into a long recompute.
    print('Missing file')
    val_vggfeatures = vggbase.model.predict_generator(val_batches, val_batches.nb_sample)
    save_array(model_path + 'valid_vggbase_features.bc', val_vggfeatures)

### Computing the training-set features takes a few minutes (maybe 10) the first time; later runs load them from disk

In [41]:
# DONT USE IT FOR NOW
# Disabled (guarded by the outer `if False`, so nothing below runs): load-or-compute
# cache for `trn_features`, produced by `conv_model` on the training batches and
# stored at model_path + 'train_convlayer_features.bc'.
if False:
    try:
        trn_features = load_array(model_path+'train_convlayer_features.bc')
        if False: # force update
            raise
    except:
        print('Missing file')
        trn_features = conv_model.predict_generator(trn_batches, trn_batches.nb_sample)
        save_array(model_path + 'train_convlayer_features.bc', trn_features)

In [23]:
# Load the cached vggbase outputs for the training set, or compute and save them
# when loading fails.
try:
    trn_vggfeatures = load_array(model_path+'train_vggbase_features.bc')
    if False: # force update: change to True to ignore the cache and recompute
        raise IOError('forced recompute of train_vggbase_features.bc')
except Exception:
    # Catch Exception instead of using a bare except, so KeyboardInterrupt and
    # SystemExit are not swallowed and silently turned into a long recompute.
    print('Missing file')
    trn_vggfeatures = vggbase.model.predict_generator(trn_batches, trn_batches.nb_sample)
    save_array(model_path + 'train_vggbase_features.bc', trn_vggfeatures)

### Ready to train the model

In [24]:
# Small classifier head trained on the 4096-wide cached features:
# batch normalisation, then dropout, then a 2-way softmax layer.
ll_layers = [
    BatchNormalization(input_shape=(4096,)),
    Dropout(0.25),
    Dense(2, activation='softmax'),
]
ll_model = Sequential(ll_layers)
ll_model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

In [31]:
# Set the learning rate through the backend. Plain attribute assignment
# (`optimizer.lr = value`) rebinds the attribute to a Python float instead of
# updating the optimizer's backend variable, so a training function that has
# already been built keeps using the old rate.
K.set_value(ll_model.optimizer.lr, 0.01*1e-5)
# Train on the cached features for 10 epochs, validating on the cached validation features.
ll_model.fit(trn_vggfeatures, trn_labels, validation_data=(val_vggfeatures, val_labels), nb_epoch=10)

Train on 21000 samples, validate on 4000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fca7b32e290>

In [25]:
#ll_model.save_weights(model_path+'llmodel_finetune1.h5')
#ll_model.load_weights(model_path+'llmodel_finetune1.h5')

In [32]:
# Unlabelled test iterator (class_mode=None); order is kept fixed so that the
# file names below line up with the outputs computed from this iterator.
test_batches = get_batches(
    'test',
    class_mode=None,
    shuffle=False,
    batch_size=batch_size,
)
testfiles = test_batches.filenames
testfiles[:10]

Found 12500 images belonging to 1 classes.


['test/10592.jpg',
 'test/7217.jpg',
 'test/3653.jpg',
 'test/4382.jpg',
 'test/2924.jpg',
 'test/10.jpg',
 'test/10916.jpg',
 'test/12374.jpg',
 'test/1871.jpg',
 'test/11645.jpg']

### Computing the test-set features takes a few minutes (maybe 5) the first time; later runs load them from disk

In [33]:
# Load the cached vggbase outputs for the test set, or compute and save them
# when loading fails.
try:
    test_vggfeatures = load_array(model_path+'test_vggbase_features.bc')
    if False: # force update: change to True to ignore the cache and recompute
        raise IOError('forced recompute of test_vggbase_features.bc')
except Exception:
    # Catch Exception instead of using a bare except, so KeyboardInterrupt and
    # SystemExit are not swallowed and silently turned into a long recompute.
    print('Missing file')
    test_vggfeatures = vggbase.model.predict_generator(test_batches, test_batches.nb_sample)
    save_array(model_path + 'test_vggbase_features.bc', test_vggfeatures)

In [34]:
# Run the trained classifier head on the cached test features.
# NOTE(review): predict_on_batch is handed the whole test array at once; consider
# ll_model.predict(test_vggfeatures, batch_size=batch_size) if memory becomes an issue — confirm.
test_preds = ll_model.predict_on_batch(test_vggfeatures)

In [35]:
# Sanity check: number of prediction rows (should match the number of test images found).
len(test_preds)

12500

In [36]:
# Preview the first ten prediction rows (one column per output unit of ll_model).
test_preds[0:10]

array([[  9.9998e-01,   1.8327e-05],
       [  9.9998e-01,   1.5564e-05],
       [  8.9789e-07,   1.0000e+00],
       [  9.6074e-01,   3.9264e-02],
       [  2.0748e-02,   9.7925e-01],
       [  9.9999e-01,   1.4624e-05],
       [  1.9318e-03,   9.9807e-01],
       [  1.0000e+00,   6.9623e-07],
       [  7.3551e-05,   9.9993e-01],
       [  9.5497e-06,   9.9999e-01]], dtype=float32)

In [44]:
# Column of test_preds written out as the label.
# NOTE(review): assumed to be the 'dog' class column — confirm against the training iterator's class indices.
dog_idx = 1
# One row per test file: the numeric id parsed from the file name, and the chosen
# probability rounded to 5 decimals and clipped to [0.0001, 0.9999].
# float() turns the numpy scalar into a plain Python float before rounding, so the
# stored value has the same type on Python 2 and 3.
Z1 = [{'id': int(f.split('/')[-1].split('.')[0]),
       'label': min(max(round(float(p[dog_idx]), 5), 0.0001), 0.9999)}
      for f, p in zip(testfiles, test_preds)]
def comp(x,y):
    """Old-style comparison of two rows by numeric id (negative, zero or positive).

    No longer used for sorting below; kept so any code that still calls it keeps working.
    """
    return int(x['id']) - int(y['id'])
# sorted() accepts a positional comparison function only on Python 2; a key function
# yields the same ascending-by-id order and works on both Python 2 and 3.
Z1 = sorted(Z1, key=lambda row: row['id'])
Z1[0:18]

[{'id': 1, 'label': 0.99986},
 {'id': 2, 'label': 0.9999},
 {'id': 3, 'label': 0.9999},
 {'id': 4, 'label': 0.99985},
 {'id': 5, 'label': 0.0001},
 {'id': 6, 'label': 0.00019},
 {'id': 7, 'label': 0.0001},
 {'id': 8, 'label': 0.0001},
 {'id': 9, 'label': 0.00055},
 {'id': 10, 'label': 0.0001},
 {'id': 11, 'label': 0.0001},
 {'id': 12, 'label': 0.99988},
 {'id': 13, 'label': 0.00523},
 {'id': 14, 'label': 0.00335},
 {'id': 15, 'label': 0.0001},
 {'id': 16, 'label': 0.00025},
 {'id': 17, 'label': 0.9506},
 {'id': 18, 'label': 0.9999}]

In [45]:
import csv

# Write the sorted rows to predictions.csv with an 'id,label' header line.
with open('predictions.csv', 'w') as out_file:
    submission = csv.DictWriter(out_file, fieldnames=['id', 'label'])
    submission.writeheader()
    submission.writerows(Z1)