Split training data into correct folders
--------------------------------------

In [2]:
%matplotlib inline
from __future__ import division,print_function
import os, json
from glob import glob
import numpy as np
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt

# Root folder of the data set. The trailing slash matters: the cells below build
# file names by plain string concatenation (path + 'train/...').
path = 'data/invasive-species-monitoring/'

# Load the label file as a structured array: names=True takes the field names from
# the CSV header row, dtype=None lets NumPy infer each column's type.
labels = np.genfromtxt(
    path + 'train_labels.csv',
    names=True,
    delimiter=',',
    dtype=None,
)

In [None]:
%mkdir {path + 'train/non-invasive'}
%mkdir {path + 'train/invasive'}

for record in labels:
    folder = 'invasive/' if np.int(record[1]) == 1 else 'non-invasive/'
    %cp {path}train/{record[0]}.jpg {path}train/{folder}{record[0]}.jpg

Create validation set
---------------------

In [2]:
%mkdir -p {path + 'valid/invasive'}
g = glob(path + 'train/invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(200):
    %mv {shuf[i]} {path + 'valid/invasive/.'}
    
%mkdir -p {path + 'valid/non-invasive'}
g = glob(path + 'train/non-invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(200): 
    %mv {shuf[i]} {path + 'valid/non-invasive/.'}


Create sample dir
------------------

In [3]:
%mkdir -p {path + 'sample/train/invasive'}
g = glob(path + 'train/invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/train/invasive/.'}

%mkdir -p {path + 'sample/train/non-invasive'}
g = glob(path + 'train/non-invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/train/non-invasive/.'}

%mkdir -p {path + 'sample/valid/invasive'}
g = glob(path + 'valid/invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/valid/invasive/.'}

%mkdir -p {path + 'sample/valid/non-invasive'}
g = glob(path + 'valid/non-invasive/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/valid/non-invasive/.'}
    
%mkdir -p {path + '/sample/test/unknown'}
g = glob(path + 'test/*.jpg')
shuf = np.random.permutation(g)
for i in range(10):
    %cp {shuf[i]} {path + 'sample/test/unknown/.'}

In [4]:
# Need to also move test data to a label "unknown"
%mkdir -p {path + '/test/unknown'}
%cp {path + 'test/*.jpg'} {path + 'test/unknown/.'}

Train
-------

In [8]:
# Data root for the training cells below; same value as in the data-preparation
# section, repeated so training can start without re-running those cells.
# The trailing slash is required because sub-paths are appended by concatenation.
path = 'data/invasive-species-monitoring/'
path

'data/invasive-species-monitoring/'

Fine-tune the model
-------------------

In [9]:
from utils import *
from vgg16 import Vgg16
# Tunable constants. Experiment with no_of_epochs to improve the model.
batch_size = 64
no_of_epochs = 5

# Model wrapper used by all training and prediction cells below.
vgg = Vgg16()

In [31]:
# Fine-tune the model.
# `path` already ends with '/', so sub-folders are appended without another slash
# (consistent with the later `path + 'test'`).
batches = vgg.get_batches(path + 'train', batch_size=batch_size)
# Validation uses batches twice the training batch size.
val_batches = vgg.get_batches(path + 'valid', batch_size=batch_size*2)

# NOTE(review): presumably adapts the model's output layer to the classes found in
# `batches` — confirm in vgg16.py.
vgg.finetune(batches)

Found 1895 images belonging to 2 classes.
Found 400 images belonging to 2 classes.


In [32]:
# Set the optimizer's learning rate to 0.01 for the training epochs that follow.
# NOTE(review): this rebinds the `lr` attribute to a plain Python float. Whether the
# optimizer actually picks the new value up depends on how and when the backend reads
# `lr` — confirm, and use the backend's set-value helper if the rate appears unchanged.
vgg.model.optimizer.lr = 0.01

In [36]:
# Train for no_of_epochs epochs, one fit() call per epoch, saving the weights after each.
# The validation batches are passed to fit(), so every epoch is scored against the
# validation set.
no_of_epochs = 10

# Resume from the checkpoint written by an earlier run when it is available. On a
# fresh run the file does not exist yet; training then simply continues from the
# weights currently in memory instead of aborting the notebook.
resume_weights = path + 'resultsft4.h5'
if os.path.isfile(resume_weights):
    vgg.model.load_weights(resume_weights)
else:
    print('No checkpoint at %s - continuing from the current weights' % resume_weights)

for epoch in range(no_of_epochs):
    print('Running epoch: %d' % epoch)
    vgg.fit(batches, val_batches, nb_epoch=1)
    # Checkpoints are written next to the data as resultsft<epoch>.h5.
    # NOTE: epoch 4 overwrites the checkpoint loaded above, so re-running this cell
    # resumes from different weights than the previous run did.
    latest_weights_filename = 'ft%d.h5' % epoch
    vgg.model.save_weights(path + 'results' + latest_weights_filename)
print("Completed %s fit operations" % no_of_epochs)

Running epoch: 0
Epoch 1/1
Running epoch: 1
Epoch 1/1
Running epoch: 2
Epoch 1/1
Running epoch: 3
Epoch 1/1
Running epoch: 4
Epoch 1/1
Running epoch: 5
Epoch 1/1
Running epoch: 6
Epoch 1/1
Running epoch: 7
Epoch 1/1
Running epoch: 8
Epoch 1/1
Running epoch: 9
Epoch 1/1
Completed 10 fit operations


Create prediction csv
---------------------

In [37]:
# Turn on IPython's greedy tab completion (an editing convenience; unrelated to the predictions).
%config IPCompleter.greedy=True 
# Run the model over the images under test/. Yields the batch iterator (its
# `.filenames` attribute is used below) and the array of predictions.
# NOTE: this rebinds `batches`, which until now held the training batches.
batches, preds = vgg.test(path + 'test')

Found 1531 images belonging to 1 classes.


In [38]:
# Peek at the first five prediction rows (two values per image).
# NOTE(review): presumably one column per class in alphabetical folder order
# (invasive, non-invasive) — confirm against the training batches' class indices.
preds[:5]

array([[  1.0000e+00,   8.0102e-15],
       [  1.3041e-15,   1.0000e+00],
       [  1.8968e-14,   1.0000e+00],
       [  1.0000e+00,   2.3797e-13],
       [  1.2596e-04,   9.9987e-01]], dtype=float32)

In [39]:
# Peek at the first five file names; they are relative to test/ and therefore
# carry the 'unknown/' class-folder prefix.
batches.filenames[:5]

['unknown/779.jpg',
 'unknown/1261.jpg',
 'unknown/878.jpg',
 'unknown/552.jpg',
 'unknown/400.jpg']

In [40]:
# Pair every test file name with its prediction row.
# list(...) materialises the pairs: on Python 3 zip() returns a one-shot iterator
# that can neither be sliced nor iterated a second time; on Python 2 this is a no-op copy.
results = list(zip(batches.filenames, preds))
results[:5]

[('unknown/779.jpg', array([  1.0000e+00,   8.0102e-15], dtype=float32)),
 ('unknown/1261.jpg', array([  1.3041e-15,   1.0000e+00], dtype=float32)),
 ('unknown/878.jpg', array([  1.8968e-14,   1.0000e+00], dtype=float32)),
 ('unknown/552.jpg', array([  1.0000e+00,   2.3797e-13], dtype=float32)),
 ('unknown/400.jpg', array([  1.2596e-04,   9.9987e-01], dtype=float32))]

In [41]:
import re

def format(x):
    """Turn one (filename, probabilities) pair into a [image_id, probability] row.

    Args:
        x: pair whose first item is a file name of the form 'unknown/<digits>.jpg'
            and whose second item is an indexable sequence of probabilities.

    Returns:
        A two-element list: the numeric image id as an int, and the first
        probability (x[1][0]) as a float.

    Raises:
        ValueError: if the file name does not match the expected pattern, since the
            unchanged name cannot be converted to an int.
    """
    filename, probs = x
    # Raw string so that `\.` reaches the regex engine as an escaped dot.
    image_id = re.sub(r'unknown/([0-9]+)\.jpg', r'\1', filename)
    # Built-in int()/float() replace the np.int/np.float aliases, which newer
    # NumPy releases no longer provide.
    return [int(image_id), float(probs[0])]
    
# Build the result table: one [image_id, probability] row per test image.
# list(...) is required on Python 3, where map() is lazy and np.array(map(...))
# would wrap the map object in a 0-d object array instead of building a 2-D table.
formattedResults = np.array(list(map(format, results)))
formattedResults[:5]

array([[  7.7900e+02,   1.0000e+00],
       [  1.2610e+03,   1.3041e-15],
       [  8.7800e+02,   1.8968e-14],
       [  5.5200e+02,   1.0000e+00],
       [  4.0000e+02,   1.2596e-04]])

In [42]:
# Write the submission file next to the data: integer image id and probability with
# five decimals per row, under a 'name,invasive' header. comments='' stops savetxt
# from prefixing the header line with '# '.
# The file name is derived from `path` rather than repeating the data root literally.
np.savetxt(path + 'jp_invasive.csv', formattedResults, fmt='%d,%.5f', delimiter=',', header='name,invasive', comments='')

In [43]:
from IPython.display import FileLink
# Show a clickable download link to the submission file written under the data root.
FileLink(path + 'jp_invasive.csv')