# Setup

In [1]:
#### Setup ####

from scipy import misc, ndimage
from scipy.ndimage.interpolation import zoom

import pandas as pd

import keras
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.models import Sequential, Model
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers import Input
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, RMSprop
from keras.preprocessing import image

from __future__ import division,print_function

import os, json, importlib
from glob import glob
import numpy as np
np.set_printoptions(precision=4, linewidth=100)

import matplotlib

Using Theano backend.


In [2]:
### Define data folders ####
# Single root of the dataset. Every other folder is derived from it, so moving
# the data only requires changing this one line.
data_path = "/home/ubuntu/data/fisheries/"
train_folder = data_path + "train/"
val_folder = data_path + "validation/"
sample_folder = data_path + "sample/"

# Setup directories

In [8]:
%cd /home/ubuntu/data/fisheries/
%mkdir validation
%mkdir validation/ALB
%mkdir validation/BET
%mkdir validation/NoF
%mkdir validation/YFT
%mkdir validation/OTHER
%mkdir validation/SHARK
%mkdir validation/DOL
%mkdir validation/LAG

import os
import shutil
import numpy as np

# Get class folders from train path
folders = os.listdir(train_folder)

# For every class folder, move a random 20% of its files to the folder of the
# same name in the validation path.
# A dedicated, seeded generator (plus sorted inputs) makes the split
# reproducible without touching numpy's global random state.
split_rng = np.random.RandomState(42)

for f in sorted(folders):
    src_dir = os.path.join(train_folder, f)
    # Skip hidden entries and stray files: only directories are classes.
    if f.startswith(".") or not os.path.isdir(src_dir):
        continue

    dst_dir = os.path.join(val_folder, f)
    if not os.path.isdir(dst_dir):
        os.makedirs(dst_dir)

    # Idempotence guard: if this class already has validation files, the split
    # was done before. Moving again would take a further 20% out of train on
    # every re-run of the cell.
    if os.listdir(dst_dir):
        continue

    shuf = split_rng.permutation(sorted(os.listdir(src_dir)))
    select = int(len(shuf)*0.2)
    for i in shuf[:select]:
        shutil.move(os.path.join(src_dir, i), os.path.join(dst_dir, i))


# Make a smaller sample training folder for faster processing
%mkdir sample
%mkdir sample/ALB
%mkdir sample/BET
%mkdir sample/NoF
%mkdir sample/YFT
%mkdir sample/OTHER
%mkdir sample/SHARK
%mkdir sample/DOL
%mkdir sample/LAG

# Copy a random 10% of the files left in each train class folder into the
# folder of the same name in the sample path (train files stay in place).
# Seeded generator + sorted inputs => the same sample on every fresh run.
sample_rng = np.random.RandomState(42)

for f in sorted(folders):
    src_dir = os.path.join(train_folder, f)
    # Skip hidden entries and stray files: only directories are classes.
    if f.startswith(".") or not os.path.isdir(src_dir):
        continue

    dst_dir = os.path.join(sample_folder, f)
    if not os.path.isdir(dst_dir):
        os.makedirs(dst_dir)

    # Idempotence guard: a populated sample folder means the sample was already
    # drawn. Copying again would add a different 10% and grow the sample on
    # every re-run of the cell.
    if os.listdir(dst_dir):
        continue

    shuf = sample_rng.permutation(sorted(os.listdir(src_dir)))
    select = int(len(shuf)*0.1)
    for i in shuf[:select]:
        shutil.copy(os.path.join(src_dir, i), os.path.join(dst_dir, i))

# how many files in the sample folders?
# (The previous version listed train_folder here, so it reported the size of
# the training set rather than of the sample.)
for f in folders:
    if not f.startswith("."):
        print(str(f))
        print(len(os.listdir(sample_folder + f)))

/home/ubuntu/data/fisheries
mkdir: cannot create directory ‘validation’: File exists
mkdir: cannot create directory ‘validation/ALB’: File exists
mkdir: cannot create directory ‘validation/BET’: File exists
mkdir: cannot create directory ‘validation/NoF’: File exists
mkdir: cannot create directory ‘validation/YFT’: File exists
mkdir: cannot create directory ‘validation/OTHER’: File exists
mkdir: cannot create directory ‘validation/SHARK’: File exists
mkdir: cannot create directory ‘validation/DOL’: File exists
mkdir: cannot create directory ‘validation/LAG’: File exists
/home/ubuntu/data/fisheries
mkdir: cannot create directory ‘sample’: File exists
mkdir: cannot create directory ‘sample/ALB’: File exists
mkdir: cannot create directory ‘sample/BET’: File exists
mkdir: cannot create directory ‘sample/NoF’: File exists
mkdir: cannot create directory ‘sample/YFT’: File exists
mkdir: cannot create directory ‘sample/OTHER’: File exists
mkdir: cannot create directory ‘sample/SHARK’: File exi

# Function definitions

In [6]:
# Per-channel means published by the VGG authors, listed in R, G, B order.
# Shaped (3, 1, 1) so the subtraction broadcasts over a channels-first batch.
vgg_mean = np.array([123.68, 116.779, 103.939])[:, None, None]

def vgg_preprocess(x):
    """Zero-centre a channels-first image batch and flip its channel order.

    The per-channel mean is subtracted first, then axis 1 (the channel axis
    for a batch laid out as (N, 3, H, W)) is reversed. For RGB input this
    yields BGR output, which is the order the original VGG weights expect.
    """
    centred = x - vgg_mean
    return centred[:, ::-1]
    
def ConvBlock(layers, model, filters):
    """Append one convolutional stage to `model` (modified in place).

    The stage is `layers` repetitions of (1-pixel zero padding, 3x3 ReLU
    convolution with `filters` filters), followed by a single 2x2 max-pool
    with stride 2.
    """
    stage = []
    for _ in range(layers):
        stage.append(ZeroPadding2D((1,1)))
        stage.append(Convolution2D(filters, 3, 3, activation='relu'))
    stage.append(MaxPooling2D((2,2), strides=(2,2)))

    for layer in stage:
        model.add(layer)

def FCBlock(model):
    """Append a 4096-unit ReLU dense layer and a 0.5 dropout to `model` in place."""
    for layer in (Dense(4096, activation='relu'), Dropout(0.5)):
        model.add(layer)
    
def VGG_16():
    """Build the VGG16 architecture as an uncompiled Sequential model.

    Input is a channels-first (3, 224, 224) image that first goes through
    `vgg_preprocess`. Five convolutional stages follow, then two
    fully-connected blocks and a 1000-way softmax. No weights are loaded here.
    """
    model = Sequential()
    model.add(Lambda(vgg_preprocess, input_shape=(3,224,224)))

    # (number of conv layers, number of filters) for each stage, in order.
    conv_stages = [(2, 64), (2, 128), (3, 256), (3, 512), (3, 512)]
    for n_layers, n_filters in conv_stages:
        ConvBlock(n_layers, model, n_filters)

    model.add(Flatten())
    for _ in range(2):
        FCBlock(model)
    model.add(Dense(1000, activation='softmax'))
    return model

# Finetuning
def fit_model(model, batches, val_batches, nb_epoch=1):
    """Train `model` from a batch generator, validating after each epoch.

    Args:
        model: object exposing ``fit_generator`` (a Keras model).
        batches: training batch iterator; its ``N`` attribute is used as the
            number of samples per epoch.
        val_batches: validation batch iterator; its ``N`` attribute is used as
            the number of validation samples.
        nb_epoch: number of epochs to train for.

    Returns:
        Whatever ``model.fit_generator`` returns, so the caller can inspect
        the training history. Earlier versions discarded it and returned None.
    """
    return model.fit_generator(batches, samples_per_epoch=batches.N, nb_epoch=nb_epoch,
                               validation_data=val_batches, nb_val_samples=val_batches.N)
    
def pred_batch(imgs, model=None, classes=None):
    """Predict a batch of images and print the most likely class for each.

    Args:
        imgs: batch handed unchanged to ``model.predict``.
        model: object exposing ``predict``. When omitted, the notebook-level
            name ``model`` is used, as in earlier versions of this function.
        classes: sequence mapping a class index to its label. When omitted,
            the notebook-level name ``classes`` is used.

    Raises:
        NameError: if an argument is omitted and no notebook-level variable of
            that name exists.
    """
    # Earlier versions read `model` and `classes` straight from the notebook
    # namespace. Keep that as a fallback, but let callers pass them explicitly
    # so the function does not depend on hidden kernel state.
    scope = globals()
    if model is None:
        if 'model' not in scope:
            raise NameError("name 'model' is not defined; pass model=... explicitly")
        model = scope['model']
    if classes is None:
        if 'classes' not in scope:
            raise NameError("name 'classes' is not defined; pass classes=... explicitly")
        classes = scope['classes']

    preds = model.predict(imgs)
    idxs = np.argmax(preds, axis=1)  # index of the top class for each row

    print('Shape: {}'.format(preds.shape))
    print('Predictions prob/class: ')

    for row, idx in zip(preds, idxs):
        print('  {:.4f}/{}'.format(row[idx], classes[idx]))

# Model setup

In [11]:
# Build the VGG16 architecture and load pretrained weights from a local file.
# The weights can alternatively be fetched from fast.ai with:
#   fpath = get_file('vgg16.h5', 'http://www.platform.ai/models/vgg16.h5', cache_subdir='models')
# NOTE(review): the path below is relative to the kernel's current working
# directory, so it depends on which %cd ran before this cell.
fish = VGG_16()
fish.load_weights('../data/vgg_weights/vgg16.h5')

# Remove the 1000-way output layer and freeze every remaining layer, so that
# only the layers added below are trained.
fish.pop()
for layer in fish.layers:
    layer.trainable = False

# New head: dropout followed by an 8-class softmax output layer.
fish.add(Dropout(0.4))
fish.add(Dense(8, activation='softmax'))

# Compile the new model; categorical_crossentropy is the multiclass log loss.
opt = keras.optimizers.Adam(lr=0.001)
fish.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
fish.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lambda_3 (Lambda)                (None, 3, 224, 224)   0           lambda_input_3[0][0]             
____________________________________________________________________________________________________
zeropadding2d_27 (ZeroPadding2D) (None, 3, 226, 226)   0           lambda_3[0][0]                   
____________________________________________________________________________________________________
convolution2d_27 (Convolution2D) (None, 64, 224, 224)  0           zeropadding2d_27[0][0]           
____________________________________________________________________________________________________
zeropadding2d_28 (ZeroPadding2D) (None, 64, 226, 226)  0           convolution2d_27[0][0]           
___________________________________________________________________________________________

# Finetune last model layer

In [15]:
# Define function to create a batch generator
def get_batches(path, gen=None, shuffle=True, batch_size=8, class_mode='categorical',
                target_size=(224,224)):
    """Return a directory iterator yielding image batches read from `path`.

    Args:
        path: directory handed to ``gen.flow_from_directory``.
        gen: object exposing ``flow_from_directory`` (an ImageDataGenerator).
            When omitted, a fresh default ``image.ImageDataGenerator()`` is
            created per call, rather than one instance built at definition
            time and shared by every call.
        shuffle: forwarded to ``flow_from_directory``.
        batch_size: forwarded to ``flow_from_directory``.
        class_mode: forwarded to ``flow_from_directory``.
        target_size: (height, width) every image is resized to. Defaults to
            the (224, 224) that was previously hard-coded.
    """
    if gen is None:
        gen = image.ImageDataGenerator()
    return gen.flow_from_directory(path, target_size=target_size,
            class_mode=class_mode, shuffle=shuffle, batch_size=batch_size)

# Augmenting generator: small random rotations, horizontal/vertical shifts,
# shear and colour-channel shifts are applied to every image it yields.
generator = image.ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.05,
    shear_range=0.1,
    channel_shift_range=20,
)


In [17]:
batch_size = 32
nb_epoch = 10

# Training images are augmented and shuffled.
train_batches = get_batches(train_folder, gen = generator, shuffle=True, batch_size=batch_size)
# Validation images use a plain generator in a fixed order. Augmenting them
# (as before) makes validation loss/accuracy noisy and not comparable between
# epochs, because the model is scored on different distorted images each time.
val_batches = get_batches(val_folder, gen = image.ImageDataGenerator(), shuffle=False,
                          batch_size=batch_size)

# Finetune the model
# Update the optimizer's learning-rate variable in place. Rebinding
# `optimizer.lr` to a Python float is silently ignored once the training
# function has been built, i.e. on any fit after the first one.
K.set_value(fish.model.optimizer.lr, 0.01)
fish.model.fit_generator(train_batches, samples_per_epoch=train_batches.nb_sample, nb_epoch = nb_epoch,
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample)


Found 1942 images belonging to 8 classes.
Found 1835 images belonging to 8 classes.
Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [None]:
# Persist the fine-tuned weights under the data folder. A previously saved
# set can be restored instead with:
#   fish.load_weights(data_path+'fish_finetune2.h5')
weights_file = 'fish_finetune_10epochs.h5'
fish.save_weights(data_path + weights_file)

# Make predictions

In [None]:
# NOTE(review): `test_path` is not defined in any cell visible here; define it
# next to the other data folders before running this cell.
# shuffle=False keeps predictions aligned with test_batches.filenames.
test_batches = get_batches(test_path, shuffle=False, batch_size=batch_size, class_mode=None)
predictions = fish.model.predict_generator(test_batches, test_batches.nb_sample)
# Clip the probabilities into [0.05, 0.95].
predictions_clipped = pd.DataFrame(predictions.clip(min=0.05, max=0.95))
# Column j of the prediction matrix belongs to the class whose index is j.
# dict.keys() does not guarantee that order, so labels taken from it can end up
# on the wrong columns; order the class names by their index instead.
class_indices = train_batches.class_indices
predictions_clipped.columns = sorted(class_indices, key=class_indices.get)

# Make and upload submission

In [4]:
filenames = test_batches.filenames
# Keep what follows the first '/' in each name (presumably this strips the
# sub-folder prefix; a name with no '/' is kept whole).
ids = [f[f.find('/')+1:] for f in filenames]
ids = pd.DataFrame(ids)

# Guarded so that re-running the cell does not fail with
# "cannot insert image, already exists". The single id column is passed as a
# Series, which `insert` accepts directly.
if "image" not in predictions_clipped.columns:
    predictions_clipped.insert(0, "image", ids[0])

# Create the output directory on first use, and join the path parts properly
# (plain concatenation produced ".../fisheries//submissions/...").
submission_dir = os.path.join(data_path, "submissions")
if not os.path.isdir(submission_dir):
    os.makedirs(submission_dir)
predictions_clipped.to_csv(os.path.join(submission_dir, "sample_submission3.csv"), index=False)

NameError: name 'test_batches' is not defined