Split training data into correct folders
--------------------------------------

In [1]:
%matplotlib inline
from __future__ import division,print_function
import os, json
from glob import glob
import numpy as np
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt

path = 'data/invasive-species-monitoring/'
labels = np.genfromtxt(path + 'train_labels.csv', dtype=None, delimiter=',', names=True)
%mkdir {path + 'train/non-invasive'}
%mkdir {path + 'train/invasive'}

for record in labels:
    folder = 'invasive/' if np.int(record[1]) == 1 else 'non-invasive/'
    %cp {path}train/{record[0]}.jpg {path}train/{folder}{record[0]}.jpg


mkdir: data/invasive-species-monitoring/train/non-invasive: File exists
mkdir: data/invasive-species-monitoring/train/invasive: File exists


Create validation set
---------------------

In [2]:
%mkdir -p {path + 'valid/invasive'}
g = glob(path + 'train/invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(200):
    %mv {shuf[i]} {path + 'valid/invasive/.'}
    
%mkdir -p {path + 'valid/non-invasive'}
g = glob(path + 'train/non-invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(200): 
    %mv {shuf[i]} {path + 'valid/non-invasive/.'}


Create sample dir
------------------

In [3]:
%mkdir -p {path + 'sample/train/invasive'}
g = glob(path + 'train/invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/train/invasive/.'}

%mkdir -p {path + 'sample/train/non-invasive'}
g = glob(path + 'train/non-invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/train/non-invasive/.'}

%mkdir -p {path + 'sample/valid/invasive'}
g = glob(path + 'valid/invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/valid/invasive/.'}

%mkdir -p {path + 'sample/valid/non-invasive'}
g = glob(path + 'valid/non-invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/valid/non-invasive/.'}
    
%mkdir -p {path + '/sample/test/unknown'}
g = glob(path + 'test/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/test/unknown/.'}

In [4]:
# Need to also move test data to a label "unknown"
%mkdir -p {path + '/test/unknown'}
%cp {path + 'test/*.jpg'} {path + 'test/unknown/.'}

Train
-------

In [5]:
# Point `path` at the sample directory built above so the cells that follow run on the small data set.
# NOTE(review): this rebinds the `path` used by the data-preparation cells; re-running those
# cells after this one would make them operate on the sample directory instead of the full data.
path = 'data/invasive-species-monitoring/sample/'
path

'data/invasive-species-monitoring/sample/'

Fine-tune the model
-------------------

In [6]:
from utils import *
from vgg16 import Vgg16
# Training constants; raise no_of_epochs to train for longer.
batch_size = 64
no_of_epochs = 5

# Model wrapper used by every training and prediction cell below.
vgg = Vgg16()

Using Theano backend.


In [25]:
# Build the training and validation batches from the folders under `path`, then
# fine-tune the model on the training batches.
# `path` already ends with '/', so the sub-folder is appended without another
# separator (the same way the test folder is addressed further down).
batches = vgg.get_batches(path + 'train', batch_size=batch_size)
# Validation uses batches twice as large as the training ones.
val_batches = vgg.get_batches(path + 'valid', batch_size=batch_size*2)

vgg.finetune(batches)

Found 40 images belonging to 2 classes.
Found 37 images belonging to 2 classes.


In [30]:
# Set the learning rate used by the following fit() calls.
# The optimizer's existing backend variable is updated in place instead of rebinding
# `optimizer.lr` to a Python float: a training function that has already been built
# keeps referencing the original variable and would silently ignore a rebind.
from keras import backend as K
K.set_value(vgg.model.optimizer.lr, 0.005)

In [31]:
# Train for `no_of_epochs` epochs. The validation batches are passed to fit() so
# every epoch is scored against the validation set, and the weights are saved
# after each epoch.
# Checkpoints live in a dedicated folder; the trailing '/' matters, otherwise the
# file name is glued onto the word 'results' and lands next to the data folders.
results_path = path + 'results/'
if not os.path.isdir(results_path):
    os.makedirs(results_path)

# On a fresh kernel no checkpoint name exists yet, so only resume when an earlier
# run of this cell left one behind and that file is actually on disk.
latest_weights_filename = globals().get('latest_weights_filename')
if latest_weights_filename and os.path.exists(results_path + latest_weights_filename):
    vgg.model.load_weights(results_path + latest_weights_filename)

for epoch in range(no_of_epochs):
    print('Running epoch: %d' % epoch)
    vgg.fit(batches, val_batches, nb_epoch=1)
    latest_weights_filename = 'ft%d.h5' % epoch
    vgg.model.save_weights(results_path + latest_weights_filename)
print("Completed %s fit operations" % no_of_epochs)

Running epoch: 0
Epoch 1/1
Running epoch: 1
Epoch 1/1
Running epoch: 2
Epoch 1/1
Running epoch: 3
Epoch 1/1
Running epoch: 4
Epoch 1/1
Completed 5 fit operations


Create prediction csv
---------------------

In [34]:
# Notebook convenience: switch on IPython's greedy tab completion.
%config IPCompleter.greedy=True 
# Run the model over the images under path + 'test'; the call yields the batch
# object and the prediction array that the cells below inspect.
# NOTE(review): this rebinds `batches`, which previously held the training batches.
batches, preds = vgg.test(path + 'test')

Found 20 images belonging to 1 classes.


In [35]:
# Peek at the first five rows of the prediction array.
preds[:5]

array([[  7.6071e-09,   1.0000e+00],
       [  1.1330e-14,   1.0000e+00],
       [  9.9829e-01,   1.7142e-03],
       [  3.3616e-01,   6.6384e-01],
       [  6.9432e-04,   9.9931e-01]], dtype=float32)

In [36]:
# First five file names known to the test batches.
# NOTE(review): presumably aligned row-for-row with `preds` (the zip below relies on it) -- confirm.
batches.filenames[:5]

['unknown/114.jpg',
 'unknown/1191.jpg',
 'unknown/1342.jpg',
 'unknown/1372.jpg',
 'unknown/1412.jpg']

In [37]:
# Pair every test file name with its prediction row.
# list() materialises the pairs so the slice below (and any later re-use of
# `results`) works whether zip returns a list or a lazy iterator.
results = list(zip(batches.filenames, preds))
results[:5]

[('unknown/114.jpg', array([  7.6071e-09,   1.0000e+00], dtype=float32)),
 ('unknown/1191.jpg', array([  1.1330e-14,   1.0000e+00], dtype=float32)),
 ('unknown/1342.jpg', array([ 0.9983,  0.0017], dtype=float32)),
 ('unknown/1372.jpg', array([ 0.3362,  0.6638], dtype=float32)),
 ('unknown/1412.jpg', array([  6.9432e-04,   9.9931e-01], dtype=float32))]

In [38]:
import re

def format(x):
    """Turn one (filename, prediction) pair into a submission row.

    x[0] is a file name such as 'unknown/114.jpg' and x[1] is the prediction
    vector for that image. Returns a two-item list: the numeric image id as an
    int, and x[1][0] clipped to the range [0.025, 0.975].

    NOTE(review): column 0 is presumably the 'invasive' probability -- confirm
    against the class indices of the training batches.
    """
    # Raw string so the regex escape `\.` is not parsed as a string escape.
    image_id = re.sub(r'unknown/([0-9]+)\.jpg', r'\1', x[0])
    # Built-in int/float replace np.int/np.float, which were plain aliases and
    # no longer exist in current NumPy.
    return [int(image_id), np.clip(float(x[1][0]), 0.025, 0.975)]
    
# Convert every (filename, prediction) pair into an [id, probability] row.
# A list comprehension is used instead of map(): where map() is lazy, np.array()
# would wrap the iterator in a 0-d object array instead of building the 2-D table.
formattedResults = np.array([format(result) for result in results])
formattedResults[:5]

array([[  1.1400e+02,   2.5000e-02],
       [  1.1910e+03,   2.5000e-02],
       [  1.3420e+03,   9.7500e-01],
       [  1.3720e+03,   3.3616e-01],
       [  1.4120e+03,   2.5000e-02]])

In [39]:
# Write the [id, probability] rows as CSV: integer id, probability with five decimals.
# comments='' stops savetxt from prefixing the header line with '# '.
np.savetxt(
    'data/invasive-species-monitoring/jp_invasive.csv',
    formattedResults,
    fmt='%d,%.5f',
    delimiter=',',
    header='name,invasive',
    comments=''
)

In [40]:
from IPython.display import FileLink
# Show a link to the CSV written above in the notebook output.
FileLink('data/invasive-species-monitoring/jp_invasive.csv')