# Setup

Load all relevant packages. I will not load the util.py, since I'd like to have everything visible for practice.

In [1]:
#### Setup ####

from scipy import misc, ndimage
from scipy.ndimage.interpolation import zoom

# For confusion matrix plot
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix

# Pandas to save csv
import pandas as pd

import keras
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.models import Sequential, Model
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers.normalization import BatchNormalization
from keras.layers import Input
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, RMSprop, Adam
from keras.preprocessing import image

from __future__ import division,print_function

import os, json, importlib
from glob import glob
import numpy as np
np.set_printoptions(precision=4, linewidth=100)

import matplotlib

# Link to file
from IPython.lib.display import FileLink

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [7]:
### Define data folders ####
wdir = "/home/ubuntu/"
train_folder = "data/train/"
val_folder =   "data/validation/"
test_folder = "data/test_stg1/"
sample_folder ="data/sample/"

# Setup directories

In [8]:
#%cd /home/ubuntu/data/fisheries/
os.chdir(wdir)

In [9]:
import os
import shutil
import numpy as np

# Get class folders from train path
folders = os.listdir(wdir + train_folder)

In [11]:
if os.path.isdir(wdir+val_folder+"ALB"): print("Validation data found")
else: 
    # For every class folder, move a random set of observations of size x% of folder to the same folder
    # in the validation path
    for f in folders:
        if not f.startswith("."):
            os.mkdir(wdir+ val_folder + f)
            shuf = np.random.permutation(os.listdir(wdir+train_folder + f))
            select = int(len(shuf)*0.2)
            for i in shuf[:select]:
                shutil.move(wdir+train_folder + f + "/" + i, wdir+val_folder + f + "/" + i)

Validation data found


In [13]:
# Put class observations in a single 'unknown' folder
#%mkdir test_stg1/unknown

for i in os.listdir(test_folder):
    if os.path.isfile(test_folder + i):
        shutil.move(test_folder + i, test_folder + 'unknown/' + i)

In [14]:
# Make a smaller sample training folder for faster processing
if os.path.isdir(wdir+val_folder+"ALB"): print("Sample data found")
else: 
# this takes 10% from the files left in the train folder and copies them to subsample folder
    for f in folders:
        if not f.startswith("."):
            os.mkdir(wdir+ sample_folder + f)
            shuf = np.random.permutation(os.listdir(train_folder + f))
            select = int(len(shuf)*0.1)
            for i in shuf[:select]:
                shutil.copy(train_folder + f + "/" + i, sample_folder + f + "/" + i)

    # how many files in the sample folders?
    for f in folders:
        if not f.startswith("."):
            print(str(f))
            print(len(os.listdir(train_folder + f)))

Sample data found


# Model setup

Step 1: Build the structure of the pre-calculated model.    
Step 2: Load the weights in to that model

In [18]:
def VGG_16():
    model = Sequential() # We build a sequential architecture, with one layer after the other
    
    # The first layer of the model needs the input size information. This is done, among others,
    # by the input_shape argument
    
    # We need to standardize the image input to fit the original model
    # Lambda: Used for evaluating an arbitrary Theano / TensorFlow expression on the output of the previous layer.
    model.add(Lambda(vgg_preprocess, input_shape=(3,224,224)))
    
    # (how many layers of this size, which model, how many filters)
    ConvBlock(2, model, 64)
    ConvBlock(2, model, 128)
    ConvBlock(3, model, 256)
    ConvBlock(3, model, 512)
    ConvBlock(3, model, 512)
    
    # Flatten the output tensor to a vector
    model.add(Flatten())
    # Add fully connected layer still with relu activation 
    # but also dropout of 0.5 to control for overfitting
    FCBlock(model)
    FCBlock(model)
    model.add(Dense(1000, activation='softmax'))
    return model

def vgg_preprocess(x):
    # Mean of each channel as provided by VGG researchers
    vgg_mean = np.array([123.68, 116.779, 103.939]).reshape((3,1,1))
    x = x - vgg_mean     # subtract mean
    return x[:, ::-1]    # reverse axis bgr->rgb
    
def ConvBlock(layers, model, filters):
    for i in range(layers):
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(filters, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))

def FCBlock(model):
    model.add(Dense(4096, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

In [20]:
model = VGG_16()
# Get weights from fast.ai or local directory 
fpath = get_file('vgg16_bn.h5', 'http://www.platform.ai/models/vgg16_bn.h5', cache_subdir='models')
#model.load_weights(wdir+'vgg16_weights/vgg16_bn.h5')
model.load_weights(fpath)

Downloading data from http://www.platform.ai/models/vgg16_bn.h5

The imagenet model is now set. Now to customize it to our problem.
For faster experimenting, split the model into the convolutional and the fully-connected part.

Optional: Step 1: Split the model into the two parts.    
Step 2: Customize the output part

In [21]:
# Copy the weights from the pre-trained model.
# NB: Since we're removing dropout, we want to half the weights
def proc_wgts(layer): return [o/2 for o in layer.get_weights()]

def get_fc_model():
    model = Sequential([
        MaxPooling2D(input_shape=conv_layers[-1].output_shape[1:]),
        Flatten(),
        Dense(4096, activation='relu'),
        # The dropout is changed here, increase/decrease in case of overfitting/underfitting
        Dropout(0.),
        Dense(4096, activation='relu'),
        Dropout(0.),
        Dense(2, activation='softmax')
        ])
    
    # zip combines n lists into one list with n-tuples 
    for l1,l2 in zip(model.layers, fc_layers): l1.set_weights(proc_wgts(l2))

    # Such a finely tuned model needs to be updated very slowly!
    opt = RMSprop(lr=0.00001, rho=0.7)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [22]:
## NOT YET CHANGED TO WORK WITH BATCH NORMALIZATION
#layers = model.layers
#last_conv_idx = [index for index,layer in enumerate(layers) 
#                     if type(layer) is Convolution2D][-1]
#conv_layers = layers[:last_conv_idx+1]
#conv_model = Sequential(conv_layers)
## Dense layers - also known as fully connected or 'FC' layers
#fc_layers = layers[last_conv_idx+1:]
#
#fc_model = get_fc_model()

In [23]:
# Replace last layer and 8 class output layer
model.pop()
for layer in model.layers: layer.trainable=False
model.add(Dense(8, activation='softmax'))

In [24]:
# Compile new model
# categorical_crossentropy is equal to multiclass logloss
model.compile(Adam(lr=0.01), loss='categorical_crossentropy', metrics=['accuracy'])
#model.summary()

# Data preparation

Need to define some process to read in and pre-process the images. We do this with a generator from a directory.

In [25]:
def get_batches(dirname, gen=image.ImageDataGenerator(), shuffle=True, batch_size=4, class_mode='categorical',
                target_size=(224,224)):
    return gen.flow_from_directory(dirname, target_size=target_size,
            class_mode=class_mode, shuffle=shuffle, batch_size=batch_size)

# Rotate and flip the images at random for each batch
generator = image.ImageDataGenerator(rotation_range=15, height_shift_range=0.05, 
                shear_range=0.1, channel_shift_range=20, width_shift_range=0.1, 
                                     horizontal_flip = True, vertical_flip = True)

In [27]:
batch_size = 24
train_batches = get_batches(train_folder, gen = generator, shuffle=True, batch_size=batch_size)
val_batches = get_batches(val_folder, shuffle=False, batch_size=batch_size)

Found 3024 images belonging to 8 classes.
Found 752 images belonging to 8 classes.


# Model finetuning/training

Now that the model structure is set, we'll finetune it.    
First, could get the output of the convolutional layers as input for the fully-connected layers to save time. This means, we won't be able to do image rotation, etc. So I don't do it.  
Second, finetune the last (new) layer to set up weights that fit the original fully-connected layers.     
Third, finetune all fully connected layers.

In [31]:
nb_epoch = 1

In [32]:
# Finetune the output layer first, do get some reasonable weights
# For debbuging:
model.fit_generator(train_batches, samples_per_epoch=100, nb_epoch = nb_epoch,
                    validation_data=val_batches, nb_val_samples=50)

#model.fit_generator(train_batches, samples_per_epoch=train_batches.nb_sample, nb_epoch = nb_epoch,
#                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample)


Epoch 1/1


<keras.callbacks.History at 0x7f742027c890>

In [74]:
# Finetune deeper layers
layers = model.layers
# Get the index of the first dense layer...
first_dense_idx = [index for index,layer in enumerate(layers) if type(layer) is Dense][0]
# ...and set this and all subsequent layers to trainable
for layer in layers[first_dense_idx:]: layer.trainable=True
model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
#model.summary

In [77]:
model.fit_generator(train_batches, samples_per_epoch=train_batches.nb_sample, nb_epoch = nb_epoch,
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample)

Epoch 1/5

KeyboardInterrupt: 

In [14]:
#fish.save_weights(data_path+'fish_finetune_3epochs.h5')
#fish.load_weights(data_path+'fish_finetune2.h5')

# Model analysis

Some functions to analyze the most/least sure images and incorrect predictions as a sanity check

In [78]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    (This function is copied from the scikit docs.)
    """
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [91]:
def get_data(path, target_size=(224,224)):
    batches = get_batches(path, shuffle=False, batch_size=1, class_mode=None, target_size=target_size)
    return np.concatenate([batches.next() for i in range(batches.nb_sample)])

# Get validation classes 
val_classes = val_batches.classes
# Get validation class and prob predictions
val_data = get_data(wdir + val_folder)

Found 752 images belonging to 8 classes.


KeyboardInterrupt: 

In [None]:
preds = model.predict(val_data, batch_size=batch_size)
pred[1:5]

In [None]:
cm = confusion_matrix(val_classes, preds)
plot_confusion_matrix(cm, val_batches.class_indices)

# Make predictions

In [26]:
test_batches = get_batches(test_folder, shuffle=False, batch_size=batch_size, class_mode=None)
predictions = fish.model.predict_generator(test_batches, test_batches.nb_sample)
predictions_clipped = pd.DataFrame(predictions.clip(min=0.05, max=0.95))
predictions_clipped.columns = train_batches.class_indices.keys()

Found 1000 images belonging to 1 classes.


# Make and upload submission

In [28]:
filenames = test_batches.filenames
ids = [f[f.find('/')+1:] for f in filenames]
ids = pd.DataFrame(ids)

predictions_clipped.insert(0, "image",ids)

ValueError: cannot insert image, already exists

In [36]:
prediction_path = wdir + 'sample_submission3.csv'
predictions_clipped.to_csv(prediction_path + str(prediction_name), index=False)

In [43]:
FileLink(prediction_path)
#!kg submit wdir+"submissions/sample_submission3.csv" -u '' -p '' -m "3rd try, standard model" -c "the-nature-conservancy-fisheries-monitoring" 

Starting new HTTPS connection (1): www.kaggle.com

