In [24]:
import numpy as np
import os
from glob import glob
from shutil import copyfile

import csv
from scipy import misc
from matplotlib import pyplot as plt

from keras.models import Model, Sequential
from keras.layers import Lambda, Dense, Conv2D, BatchNormalization, Activation, \
Flatten, MaxPooling2D, Dropout, ZeroPadding2D
from keras import optimizers
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, LearningRateScheduler

import keras.backend as K
K.set_image_data_format('channels_last')

%matplotlib inline

# Set up directory structure


In [2]:
%ls

cats-vs-dogs-redux.ipynb  [34mtest[m[m/                     [34mtrain[m[m/
sample_submission.csv     test.zip                  train.zip


In [3]:
# Create a directory for validation images
%mkdir validation
%cd train

/Users/johnlu/Desktop/deep_learning/kaggle/train


We will move a random sample of 2000 images into the validation set. Additionally, we create a small set of samples (100 images) out of the validation set to be used for testing.

In [4]:
g = glob('*.jpg')
TRAINING_TO_VAL_FRAC = .9  # Train set size / Total number examples
NUM_TR_EX = len(g)
TRAINING_SIZE = int(np.ceil(NUM_TR_EX * TRAINING_TO_VAL_FRAC))
VALIDATION_SIZE = NUM_TR_EX - TRAINING_SIZE

print('num_examples: ' + str(NUM_TR_EX))
perm = np.random.permutation(g)
print('some_shuffled_examples: ' + str(perm[0:3]))

print('Moving %d to validation directory' % VALIDATION_SIZE)
for i in range(VALIDATION_SIZE):
    os.rename(perm[i], '../validation/'+perm[i])

%cd ..

num_examples: 25000
some_shuffled_examples: ['cat.10834.jpg' 'cat.10533.jpg' 'cat.11168.jpg']
Moving 2500 to validation directory
/Users/johnlu/Desktop/deep_learning/kaggle


In [5]:
# Create a sample directory
SAMPLE_SIZE = 150
%cd './validation/'
g = glob('*.jpg')
%cd '..'
%mkdir sample
perm = np.random.permutation(g)
print(perm[0])
for i in range(SAMPLE_SIZE):
    copyfile('./validation/'+perm[i], './sample/'+perm[i])

/Users/johnlu/Desktop/deep_learning/kaggle/validation
/Users/johnlu/Desktop/deep_learning/kaggle
cat.3269.jpg


In [6]:
# For each directory (train/, validation/, and sample/), create separate subdirectories
# for cats and dogs.
%cd ./train/
%mkdir cats dogs
%mv cat.* ./cats
%mv dog.* ./dogs
%cd ..

%cd ./sample/
%mkdir cats dogs
%mv cat.* ./cats
%mv dog.* ./dogs
%cd ..

%cd ./validation/
%mkdir cats dogs
%mv cat.* ./cats
%mv dog.* ./dogs
%cd ..

/Users/johnlu/Desktop/deep_learning/kaggle/train
/Users/johnlu/Desktop/deep_learning/kaggle
/Users/johnlu/Desktop/deep_learning/kaggle/sample
/Users/johnlu/Desktop/deep_learning/kaggle
/Users/johnlu/Desktop/deep_learning/kaggle/validation
/Users/johnlu/Desktop/deep_learning/kaggle


In [14]:
# Move all test examples into a single subdirectory of ./test (required by ImageDataGenerator.flow_from_directory())
%cd ./test/
%mkdir test
%mv * test
%cd ..

/Users/johnlu/Desktop/deep_learning/kaggle/cats-vs-dogs-redux/test
mv: rename test to test/test: Invalid argument
/Users/johnlu/Desktop/deep_learning/kaggle/cats-vs-dogs-redux


# Model Parameters

In [2]:
# Directory Structure
TRAIN_DIR = './train/'
VAL_DIR = './validation/'
TEST_DIR = './test/'
SAMPLES_DIR = './sample/'

In [93]:
# Number of examples
tr_size = len(glob(TRAIN_DIR+'cats/*.jpg')) + len(glob(TRAIN_DIR+'dogs/*.jpg'))
val_size = len(glob(VAL_DIR+'cats/*.jpg')) + len(glob(VAL_DIR+'dogs/*.jpg'))
test_size = len(glob(TEST_DIR+'*.jpg'))
print('Number Training Examples: ' + str(tr_size))
print('Number Validation Examples: ' + str(val_size))
print('Number Testing Examples: ' + str(test_size))
# Image Parameters
IMG_HEIGHT = 224
IMG_WIDTH = 224
N_CHANNELS = 3
input_shape = (IMG_HEIGHT, IMG_WIDTH, N_CHANNELS)

# Model Parameters
BATCH_SIZE = 16
NUM_EPOCHS = 2

# To grab batches of images from directory. We have 3 generators in case we want to augment images
# for, say, the test examples and not the validation examples.
img_gen = ImageDataGenerator()
sample_gen = ImageDataGenerator()
val_data_gen = ImageDataGenerator()


Number Training Examples: 22500
Number Validation Examples: 2500
Number Testing Examples: 0


In [96]:
def get_data(path):
    batches = img_gen.flow_from_directory(path, target_size=(IMG_HEIGHT, IMG_WIDTH), batch_size=1)
    return np.concatenate([batch for batch in batches])

## Save final layer activations
Training would be much faster if we first save the final layer activations from VGG, which should have shape (n_examples, 1000). This last layer would then be fed into whatever model we decide on for finetuning. 

In [None]:
### TO IMPLEMENT ###
# tr_final_act = get_data(TRAIN_DIR)
# ...

Found 22500 images belonging to 2 classes.


# Transfer Learning using VGG-16

In [4]:
vgg_mean = np.array([123.68, 116.779, 103.939]).reshape((1,1,3))

def preprocess_vgg(x):
    x = x - vgg_mean     # subtract mean
    return x[:,:,::-1]    # reverse axis bgr->rgb

In [5]:
def create_vgg_model(load_vgg_weights=True):
    model = Sequential()
    model.add(Lambda(preprocess_vgg, input_shape=input_shape))
    
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(64, (3,3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(64, (3,3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(128, (3,3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(128, (3,3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(256, (3,3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(256, (3,3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(256, (3,3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3,3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3,3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3,3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3,3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3,3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3,3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))
    
    # Load weights
    if (load_vgg_weights):
        model.load_weights('./vgg16_weights_tf_dim_ordering_tf_kernels.h5')

    return model

# Finetuning VGG

In [91]:
vgg_16 = create_vgg_model()

# Remove last layer
vgg_16.pop()
for layer in vgg_16.layers:
    layer.trainable = False

# Add Dense layer
vgg_16.add(Dense(2, activation='softmax'))


## Initial Round Training

In [None]:
# ===================== INITIAL ROUND =========================
# Compile
adamOpt = optimizers.Adam(0.001)
vgg_16.compile(adamOpt, loss='categorical_crossentropy', metrics=['accuracy'])

# img_gen augments data, so we will train on some constant factor 
# more than number training examples available
AUGMENTATION_FACTOR = 1.2
num_batches = tr_size // BATCH_SIZE  # number of batches (rounded down)
steps_per_epoch = num_batches * AUGMENTATION_FACTOR

vgg_16.fit_generator(train_batches, 
                     steps_per_epoch=steps_per_epoch,
                     validation_data=val_batches, 
                     validation_steps=val_size//BATCH_SIZE, 
                     epochs=1)

vgg_16.save_weights('first')

## Round 2 Training

In [9]:
def load_weights(model, filename):
    if os.path.exists(filename):
        print('loading weights from file: ' + filename)
        model.load_weights(filename)
        print('loaded weights')

def lr_schedule(epoch):
    return 10e-4 / (10**epoch)

In [None]:
# ======================== ROUND2 =========================
# Decrease Learning Rate by factor of 10
adamOpt = optimizers.Adam(0.0001)
vgg_16.compile(adamOpt, loss='categorical_crossentropy', metrics=['accuracy'])

# Adjust Model Parameters
BATCH_SIZE = 32
AUGMENTATION_FACTOR = 1.5
num_batches = tr_size // BATCH_SIZE  # number of batches (rounded down)
steps_per_epoch = num_batches * AUGMENTATION_FACTOR

# Load weights from previous round(s)
load_weights(vgg_16, './weights_1.hdf5')
# ------------------- create callbacks (if any) ----------------
callbacks = []
## save_file = 'weights.{val_loss:.2f}.hdf5'
## callbacks.append(ModelCheckpoint(save_file, monitor='val_loss', save_weights_only=True))
callbacks.append(LearningRateScheduler(lr_schedule))
    
vgg_16.fit_generator(train_batches, steps_per_epoch=steps_per_epoch,
                     validation_data=val_batches, validation_steps=val_size//BATCH_SIZE, 
                     epochs=1, callbacks=callbacks)


# Predict

In [76]:
load_weights(vgg_16, './weights_1.hdf5')
test_gen = ImageDataGenerator()
test_batches = test_gen.flow_from_directory(directory=TEST_DIR, 
                                            class_mode=None,
                                            target_size=(IMG_HEIGHT,IMG_WIDTH), 
                                            batch_size=1) 



loading weights from file: ./weights_1.hdf5
loaded weights
Found 12500 images belonging to 1 classes.


In [23]:
preds = vgg_16.predict_generator(test_batches, steps=test_size, verbose=1)



# (Kaggle) Generate csv submission file

In [31]:
with open('eggs.csv', 'w') as csvfile:
    fieldnames = ['first_name', 'last_name']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerow({'first_name': 'Baked', 'last_name': 'Beans'})
    writer.writerow({'first_name': 'Lovely', 'last_name': 'Spam'})
    writer.writerow({'first_name': 'Wonderful', 'last_name': 'Spam'})

In [90]:
clipped_preds = np.clip(preds, a_max=0.975, a_min=0.025)
with open('submission.csv', 'w') as submission:
    fieldnames = ['id', 'label']
    writer = csv.DictWriter(submission, fieldnames=fieldnames)
    writer.writeheader()

    for i, pred_one_hot in enumerate(clipped_preds):
        pred_dog = pred_one_hot[1]
        writer.writerow({'id': i+1, 'label': pred_dog})
    