Split training data into correct folders
--------------------------------------

In [14]:
%matplotlib inline
from __future__ import division,print_function
import os, json
from glob import glob
import numpy as np
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt

path = 'data/invasive-species-monitoring/'
labels = np.genfromtxt(path + 'train_labels.csv', dtype=None, delimiter=',', names=True)
%mkdir {path + 'train/non-invasive'}
%mkdir {path + 'train/invasive'}

for record in labels:
    folder = 'invasive/' if np.int(record[1]) == 1 else 'non-invasive/'
    %cp {path}train/{record[0]}.jpg {path}train/{folder}{record[0]}.jpg


mkdir: data/invasive-species-monitoring/train/non-invasive: File exists
mkdir: data/invasive-species-monitoring/train/invasive: File exists


Create validation set
---------------------

In [22]:
%mkdir -p {path + 'valid/invasive'}
g = glob(path + 'train/invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(200):
    %mv {shuf[i]} {path + 'valid/invasive/.'}
    
%mkdir -p {path + 'valid/non-invasive'}
g = glob(path + 'train/non-invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(200): 
    %mv {shuf[i]} {path + 'valid/non-invasive/.'}


Create sample dir
------------------

In [44]:
%mkdir -p {path + 'sample/train/invasive'}
g = glob(path + 'train/invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/train/invasive/.'}

%mkdir -p {path + 'sample/train/non-invasive'}
g = glob(path + 'train/non-invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/train/non-invasive/.'}

%mkdir -p {path + 'sample/valid/invasive'}
g = glob(path + 'valid/invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/valid/invasive/.'}

%mkdir -p {path + 'sample/valid/non-invasive'}
g = glob(path + 'valid/non-invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/valid/non-invasive/.'}
    
%mkdir -p {path + '/sample/test/unknown'}
g = glob(path + 'test/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/test/unknown/.'}

In [45]:
# Need to also move test data to a label "unknown"
%mkdir -p {path + '/test/unknown'}
%cp {path + 'test/*.jpg'} {path + 'test/unknown/.'}

Train
-------

In [33]:
# Repoint `path` at the sample subset; the cells below build their train/,
# valid/ and test locations from this value.
# NOTE(review): this reuses the name that the data-preparation cells above use
# for the dataset root, so re-running those cells after this one would operate
# inside sample/ instead.
path = 'data/invasive-species-monitoring/sample/'
path

'data/invasive-species-monitoring/sample/'

Set up and fine-tune the model
------------------------------

In [34]:
from utils import *
from vgg16 import Vgg16
# Settings used by the cells below; try different no_of_epochs values to see
# whether the model improves.
batch_size, no_of_epochs = 64, 3

# Create the Vgg16 model object used for fine-tuning and prediction below.
vgg = Vgg16()

In [35]:
# Fine-tune the model on the training images.
# `path` already ends with '/', so sub-directories are appended without a
# leading slash (the same way the test directory is built with path + 'test').
batches = vgg.get_batches(path + 'train', batch_size=batch_size)
# Validation batches use twice the training batch size.
val_batches = vgg.get_batches(path + 'valid', batch_size=batch_size*2)
# presumably adapts the model's output to the classes found in `batches` —
# TODO confirm against vgg16.py
vgg.finetune(batches)

# NOTE(review): unclear whether this learning rate carries over to every later
# fit() call — confirm.
vgg.model.optimizer.lr = 0.01

Found 20 images belonging to 2 classes.
Found 20 images belonging to 2 classes.


In [42]:
#Notice we are passing in the validation dataset to the fit() method
#For each epoch we test our model against the validation set
latest_weights_filename = None
for epoch in range(no_of_epochs):
    print('Running epoch: %d' % epoch)
    vgg.fit(batches, val_batches, nb_epoch=1)
    latest_weights_filename = 'ft%d.h5' % epoch
    vgg.model.save_weights(path + '/results' + latest_weights_filename)
print("Completed %s fit operations" % no_of_epochs)

Running epoch: 0
Epoch 1/1
Running epoch: 1
Epoch 1/1
Running epoch: 2
Epoch 1/1
Completed 3 fit operations


Create prediction csv
---------------------

In [46]:
# Enable greedy tab completion for this IPython session (an editing aid,
# unrelated to the prediction call below).
%config IPCompleter.greedy=True 
# Run the model over the images under path + 'test'. Two values come back: a
# batch object (its .filenames is read in a later cell) and the prediction array.
# NOTE(review): this rebinds `batches`, which previously held the training batches.
batches, preds = vgg.test(path + 'test')

Found 10 images belonging to 1 classes.


In [47]:
# Preview the first five prediction rows; each row has two values
# (presumably one probability per class — TODO confirm).
preds[:5]

array([[  1.2748e-04,   9.9987e-01],
       [  2.1495e-02,   9.7851e-01],
       [  1.2417e-03,   9.9876e-01],
       [  9.9948e-01,   5.2212e-04],
       [  4.3444e-01,   5.6556e-01]], dtype=float32)

In [48]:
# Preview the first five test file names; they are paired with the prediction
# rows by position in the next cell.
batches.filenames[:5]

['unknown/114.jpg',
 'unknown/1342.jpg',
 'unknown/1432.jpg',
 'unknown/26.jpg',
 'unknown/282.jpg']

In [49]:
# Pair each test file name with the prediction row at the same position.
# list(...) keeps the result sliceable on Python 3, where zip() returns a lazy
# iterator; on Python 2 it is just a copy of the list zip() already returns.
results = list(zip(batches.filenames, preds))
results[:5]

[('unknown/114.jpg', array([  1.2748e-04,   9.9987e-01], dtype=float32)),
 ('unknown/1342.jpg', array([ 0.0215,  0.9785], dtype=float32)),
 ('unknown/1432.jpg', array([ 0.0012,  0.9988], dtype=float32)),
 ('unknown/26.jpg', array([  9.9948e-01,   5.2212e-04], dtype=float32)),
 ('unknown/282.jpg', array([ 0.4344,  0.5656], dtype=float32))]

In [56]:
import re

def format(x):
    """Convert one (filename, prediction) pair into a submission row.

    x[0] is a relative file name such as 'unknown/114.jpg'; x[1] is the
    prediction row for that image. Returns a two-element list: the numeric
    image id as an int, and the first prediction value rounded to the nearest
    integer.

    NOTE(review): assumes column 0 of the prediction row is the 'invasive'
    class — TODO confirm against the class ordering used for training.
    NOTE(review): the name shadows the builtin format(); it is kept unchanged
    because the following cell calls it by this name.
    """
    # Raw string so '\.' is read as a regex escape, not as an invalid string
    # escape sequence.
    image_id = re.sub(r'unknown/([0-9]+)\.jpg', r'\1', x[0])
    # Builtin int() instead of np.int, a deprecated alias that newer NumPy
    # releases no longer provide.
    return [int(image_id), np.round(x[1][0])]
    
# Build the array of [id, label] rows. A list comprehension is used because on
# Python 3 map() returns an iterator, which np.array would wrap as a single
# object instead of expanding into rows.
formattedResults = np.array([format(pair) for pair in results])
formattedResults[:5]

array([[  1.1400e+02,   0.0000e+00],
       [  1.3420e+03,   0.0000e+00],
       [  1.4320e+03,   0.0000e+00],
       [  2.6000e+01,   1.0000e+00],
       [  2.8200e+02,   0.0000e+00]])

In [57]:
# Write the rows as a two-column integer CSV with an "id,invasive" header line.
# comments='' keeps savetxt from prefixing the header with its default '# '.
np.savetxt(
    'data/invasive-species-monitoring/jp_invasive.csv',
    formattedResults,
    fmt='%d,%d',
    delimiter=',',
    header='id,invasive',
    comments='',
)

In [58]:
from IPython.display import FileLink
# Show a link to the CSV written by the previous cell in the notebook output.
FileLink('data/invasive-species-monitoring/jp_invasive.csv')