# Prepare the data

###### Define root directory for data.
This directory should already contain the test.zip and train.zip files from Kaggle.

In [2]:
import os
DATA_PATH = os.environ['DATA'] + '/dog-breed-identification'

'/Users/gzpjpk/dev/data/dog-breed-identification'

In [3]:
from utils import *
from vgg16 import Vgg16
%matplotlib inline

Using TensorFlow backend.


###### Unzip labels

In [9]:
!unzip $DATA_PATH/labels.csv.zip -d $DATA_PATH > /dev/null

Archive:  /Users/gzpjpk/dev/fastai/data/dog-breed-identification/labels.csv.zip
  inflating: /Users/gzpjpk/dev/fastai/data/dog-breed-identification/labels.csv  


###### Unzip data downloaded from Kaggle into `test/` and `train/` folders.

In [10]:
!unzip $DATA_PATH/train.zip -d $DATA_PATH > /dev/null
!unzip $DATA_PATH/test.zip -d $DATA_PATH > /dev/null

In [11]:
!mkdir    $DATA_PATH/valid
!mkdir    $DATA_PATH/results

!mkdir -p $DATA_PATH/sample/train
!mkdir    $DATA_PATH/sample/test
!mkdir    $DATA_PATH/sample/valid
!mkdir    $DATA_PATH/sample/results

###### Create validation dataset

In [50]:
g = glob(DATA_PATH + '/train/*.jpg')
shuf = np.random.permutation(g)
for filepath in shuf[:2000]:
    os.rename(filepath, DATA_PATH+'/valid/' + os.path.basename(filepath))

###### Copy out some sample data

In [118]:
from shutil import copyfile

def copyNFromTo(n, src, dest):
    g = glob(src + '/*.jpg')
    shuf = np.random.permutation(g)
    for i in range(n):
        filepath = shuf[i]
        copyfile(filepath, dest + '/' + os.path.basename(filepath))

        
copyNFromTo(500, DATA_PATH+'/train', DATA_PATH+'/sample/train')
copyNFromTo(100,  DATA_PATH+'/valid', DATA_PATH+'/sample/valid')
copyNFromTo(100,  DATA_PATH+'/test',  DATA_PATH+'/sample/test' )

###### Move  images into separate  directories for labels

In [126]:
import csv
allLabels = { row['breed'] for row in csv.DictReader(open(DATA_PATH + '/labels.csv', 'rb')) }

def get_label_for_image(filepath):
    name, ext = os.path.splitext(os.path.basename(filepath))
    for row in csv.DictReader(open(DATA_PATH+'/labels.csv')):
        if name == row['id']:
            return row['breed']
    raise Exception('Did not find filename match in labels CSV file ' + name)

def separateByLabel(dir):
    for label in allLabels: os.makedirs(dir + '/' + label)
    for filepath in glob(dir + '/*.jpg'):
        label = get_label_for_image(filepath)
        filename = os.path.basename(filepath)
        os.rename(filepath, dir + '/' + label + '/' + filename)

separateByLabel(DATA_PATH + '/train')
separateByLabel(DATA_PATH + '/valid')
separateByLabel(DATA_PATH + '/sample/train')
separateByLabel(DATA_PATH + '/sample/valid')

###### Move test images into unknown folder

In [123]:
def moveToUnknown(base_path):
    os.makedirs(base_path + '/unknown')
    for filepath in glob(base_path+'/*.jpg'):
        filename = os.path.basename(filepath)
        os.rename(filepath, base_path + '/unknown/' + filename)

moveToUnknown(DATA_PATH + '/test')
moveToUnknown(DATA_PATH + '/sample/test')

# Train VGG16 Neural Net

In [7]:
path = DATA_PATH + '/sample' # use sample data
# path = DATA_PATH # use real data

vgg = Vgg16()
batches = vgg.get_batches(path+'/train', batch_size=64)
val_batches = vgg.get_batches(path+'/valid', batch_size=128)
vgg.finetune(batches)

Found 500 images belonging to 120 classes.
Found 100 images belonging to 120 classes.


In [129]:
latest_weights_filename = None

In [130]:
no_of_epochs = 2

for epoch in range(no_of_epochs):
    print "Running epoch: %d" % epoch
    vgg.fit(batches, val_batches, nb_epoch=1)
    latest_weights_filename = 'ft%d.h5' % epoch
    vgg.model.save_weights(DATA_PATH + '/results/' + latest_weights_filename) # saving weights after each epoch
print "Completed %s fit operations" % no_of_epochs

Running epoch: 0
Epoch 1/1
Running epoch: 1
Epoch 1/1
Completed 2 fit operations


# Generate Predictions

In [None]:
batch_size = 64
batches, preds = vgg.test(path+'/test', batch_size = batch_size*2)

Verify the column ordering (appears that cats are column 1 and dogs are column 2) by viewing some images

In [None]:
print preds[:5]
print batches.filenames[:5]

In [None]:
from PIL import Image
Image.open(path + '/test/' + batches.filenames[0])

Save test results arrays

In [None]:
save_array(path + '/results/test_preds.dat', preds)
save_arrayray(path + '/results/filenames.dat', batches.filenames)

# Validate Predictions

In [8]:
vgg.model.load_weights(DATA_PATH + '/results/ft1_full.h5')

In [9]:
val_batches, probs = vgg.test(DATA_PATH + '/valid', batch_size = 64)

Found 2000 images belonging to 120 classes.


In [29]:
expected_labels = val_batches.classes 

ids = np.array([ f[f.find('/')+1:testname.find('.')] for f in val_batches.filenames ])


##Round our predictions to 0/1 to generate labels
## our_predictions = probs[:,0]
## our_labels = np.round(1-our_predictions)

['00ca18751837cd6a22813f8e221f7819' '11b60d8d86f14a601ca290909a17cbc6'
 '19de1db12b3ddc7f2af6d9453c977083' ..., '9afd17ba252823662440863d6c0e'
 'b9b54494a2ed02ea74f0ef26a8cc' 'c2c60183f18666aaa714efeff54a']


In [None]:
#from keras.preprocessing import image
#
##Helper function to plot images by index in the validation set 
##Plots is a helper function in utils.py
#def plots_idx(idx, titles=None):
#    plots([image.load_img(DATA_PATH + '/valid/' + val_batches.filenames[i]) for i in idx], titles=titles)
#    
##Number of images to view for each visualization task
#n_view = 4

In [None]:
# #1. A few correct labels at random
# correct = np.where(our_labels==expected_labels)[0]
# print "Found %d correct labels" % len(correct)
# idx = permutation(correct)[:n_view]
# plots_idx(idx, our_predictions[idx])

# Create Kaggle Submission

In [68]:
#Extract imageIds from the filenames in our test/unknown directory 
filenames = load_array(DATA_PATH + '/results/filenames.dat')
# ids = np.array([ f[f.find('/')+1:testname.find('.')] for f in batches.filenames ])
ids = np.array([ f[f.find('/')+1:f.find('.')] for f in filenames ])

Join the columns into an array of [id, prob1, prob2, ... , prob120]

In [79]:
pred_percents = load_array(DATA_PATH + '/results/test_preds.dat')
reshaped_ids = np.reshape(ids, (-1,1)) # convert from 1d to 2d
subm = np.append(reshaped_ids, pred_percents, axis=1)

f = open(DATA_PATH + '/sample_submission.csv')
header = f.readline().strip()

submission_file_path = DATA_PATH + '/results/submission.csv'
format = ['%s' for _ in range(121)]
print subm.shape
np.savetxt(submission_file_path, subm, fmt=format, delimiter=',', header=header, comments='')
print submission_file_path

(10357, 121)
/Users/gzpjpk/dev/data/dog-breed-identification/results/submission.csv
