# Prepare the data

###### Define root directory for data.
This directory should already contain the test.zip and train.zip files from Kaggle.

In [None]:
# Root data directory; the cells below build their paths from it.
# NOTE: this is a machine-specific absolute path - change it to match your setup.
DATA_PATH = "/home/gzpjpk/data/dogs-vs-cats-redux-kernels-edition"

In [None]:
from utils import *
from vgg16 import Vgg16
%matplotlib inline

###### Unzip data downloaded from Kaggle into `test/` and `train/` folders.

In [None]:
!unzip $DATA_PATH/train.zip -d $DATA_PATH > /dev/null
!unzip $DATA_PATH/test.zip -d $DATA_PATH > /dev/null

In [None]:
!mkdir    $DATA_PATH/valid
!mkdir    $DATA_PATH/results

!mkdir -p $DATA_PATH/sample/train
!mkdir    $DATA_PATH/sample/test
!mkdir    $DATA_PATH/sample/valid
!mkdir    $DATA_PATH/sample/results

###### Create validation dataset

In [None]:
# Hold out 2000 randomly chosen training images as the validation set.
# os.rename MOVES the files, so they leave train/ for good; re-running this
# cell would move a further 2000 images out of train/.
g = glob(DATA_PATH + '/train/*.jpg')
shuf = np.random.permutation(g)
for filepath in shuf[:2000]:
    os.rename(filepath, DATA_PATH+'/valid/' + os.path.basename(filepath))

###### Copy out some sample data

In [None]:
from shutil import copyfile

def copyNFromTo(n, src, dest):
    """Copy up to `n` randomly chosen ``*.jpg`` files from `src` into `dest`.

    The source files are left in place. Only files directly inside `src`
    are considered (no recursion into sub-directories).

    Args:
        n: maximum number of images to copy. If `src` holds fewer than `n`
            images, all of them are copied; zero or a negative value copies
            nothing.
        src: directory to pick the images from.
        dest: existing directory that receives the copies (same file names).
    """
    candidates = glob(src + '/*.jpg')
    # Slice instead of indexing 0..n-1 so that asking for more images than
    # exist copies what is there rather than raising IndexError.
    chosen = np.random.permutation(candidates)[:max(n, 0)]
    for filepath in chosen:
        copyfile(filepath, os.path.join(dest, os.path.basename(filepath)))

        
# Build a small sample copy of each split (200 train / 50 valid / 50 test).
# This must run while the source directories are still flat: copyNFromTo only
# looks at `<src>/*.jpg`, and the following cells move the images into
# sub-directories.
copyNFromTo(200, DATA_PATH+'/train', DATA_PATH+'/sample/train')
copyNFromTo(50,  DATA_PATH+'/valid', DATA_PATH+'/sample/valid')
copyNFromTo(50,  DATA_PATH+'/test',  DATA_PATH+'/sample/test' )

###### Move cat and dog images into separate `dogs/` and `cats/` directories

In [None]:
def separateDogsAndCats(dir):
    """Move the ``*.jpg`` files directly inside `dir` into ``cats/`` and ``dogs/``.

    A file goes to ``cats/`` when its name starts with 'cat' and to ``dogs/``
    when it starts with 'dog'. The two sub-directories are created if they
    do not exist yet, so the function can be re-run on the same directory.

    Args:
        dir: directory holding the labelled images. (The name shadows the
            builtin; it is kept so existing keyword callers keep working.)

    Raises:
        OSError: if a sub-directory cannot be created (for example because
            `dir` itself does not exist).
        Exception: if a ``*.jpg`` file name starts with neither 'cat' nor
            'dog'. Files processed before the offending one stay moved.
    """
    for label in ('cats', 'dogs'):
        label_dir = os.path.join(dir, label)
        try:
            os.mkdir(label_dir)
        except OSError:
            # An already existing directory is fine (re-run); anything else
            # is a real error and must not be silently ignored.
            if not os.path.isdir(label_dir):
                raise

    for filepath in glob(dir + '/*.jpg'):
        filename = os.path.basename(filepath)
        if filename.startswith('cat'):
            os.rename(filepath, os.path.join(dir, 'cats', filename))
        elif filename.startswith('dog'):
            os.rename(filepath, os.path.join(dir, 'dogs', filename))
        else:
            raise Exception('Unexpected file: ' + filepath)
                
    
# Split every labelled directory (train and valid, for both the sample and
# the full dataset) into cats/ and dogs/. The test directories are not
# included here; they are handled by moveToUnknown in the next cell.
separateDogsAndCats(DATA_PATH + '/sample/train')
separateDogsAndCats(DATA_PATH + '/sample/valid')
separateDogsAndCats(DATA_PATH + '/valid')
separateDogsAndCats(DATA_PATH + '/train')

In [None]:
def moveToUnknown(base_path):
    """Move the ``*.jpg`` files directly inside `base_path` into ``unknown/``.

    The ``unknown/`` sub-directory is created if it does not exist yet, so
    the function can be re-run on the same directory. Files that are not
    ``*.jpg`` are left where they are.

    NOTE(review): presumably the batch loader expects images to live inside
    a sub-directory even when they have no label - confirm against
    Vgg16.get_batches / Vgg16.test.

    Args:
        base_path: directory holding the unlabelled images.

    Raises:
        OSError: if ``unknown/`` cannot be created (for example because
            `base_path` itself does not exist).
    """
    unknown_dir = os.path.join(base_path, 'unknown')
    try:
        os.mkdir(unknown_dir)
    except OSError:
        # An already existing directory is fine (re-run); anything else is a
        # real error and must not be silently ignored.
        if not os.path.isdir(unknown_dir):
            raise

    for filepath in glob(base_path + '/*.jpg'):
        os.rename(filepath, os.path.join(unknown_dir, os.path.basename(filepath)))

# Put the test images (full and sample) under an unknown/ sub-directory.
moveToUnknown(DATA_PATH + '/test')
moveToUnknown(DATA_PATH + '/sample/test')

# Train VGG16 Neural Net

In [None]:
# `path` selects the dataset the rest of the notebook works on: swap the two
# assignments below to run on the small sample instead of the full data.
# path = DATA_PATH + '/sample' # use sample data
path = DATA_PATH # use real data

vgg = Vgg16()
# Training batches of 64 images and validation batches of 128 images.
batches = vgg.get_batches(path+'/train', batch_size=64)
val_batches = vgg.get_batches(path+'/valid', batch_size=128)
# NOTE(review): finetune presumably adapts the model's output layer to the
# classes found in `batches` - confirm in vgg16.py.
vgg.finetune(batches)

In [None]:
# Name of the weights file saved most recently by the training loop below;
# stays None until the first epoch has been saved.
latest_weights_filename = None

In [None]:
no_of_epochs = 3

# Fit one epoch at a time so the weights can be checkpointed after each one.
for epoch in range(no_of_epochs):
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Running epoch: %d" % epoch)
    vgg.fit(batches, val_batches, nb_epoch=1)
    latest_weights_filename = 'ft%d.h5' % epoch
    # Save under `path` (not DATA_PATH) so that a run on the sample data
    # writes to sample/results and cannot overwrite full-data checkpoints;
    # this matches where the prediction arrays are saved further down.
    vgg.model.save_weights(path + '/results/' + latest_weights_filename) # saving weights after each epoch
print("Completed %s fit operations" % no_of_epochs)

# Generate Predictions

In [None]:
batch_size = 64
# NOTE: this rebinds `batches`, which held the training batches until now, to
# the test batches; later cells read the test file names from it.
# The test run uses batches twice the size of `batch_size`.
batches, preds = vgg.test(path+'/test', batch_size = batch_size*2)

Verify the column ordering by viewing some images (it appears that cats are the first column, index 0, and dogs are the second column, index 1).

In [None]:
# Show the first five predictions next to the matching file names.
# Single-argument print(...) produces the same output on Python 2 and 3.
print(preds[:5])
print(batches.filenames[:5])

In [None]:
from PIL import Image
# Open the first test image (shown as the cell's output) so it can be compared
# by eye with the first row of `preds`.
Image.open(path + '/test/' + batches.filenames[0])

Save test results arrays

In [None]:
# Persist the predictions and the matching file names under results/ so they
# can be reloaded later (see the load_array cell below).
save_array(path + '/results/test_preds.dat', preds)
save_array(path + '/results/filenames.dat', batches.filenames)

Kaggle requires the following format for new submissions:

```
id,label
1242, .3984
3947, .1000
4539, .9082
2345, .0000
```

Log loss is used to evaluate submissions.

In [None]:
# Reload the arrays written by the save_array cell above; `preds` replaces the
# in-memory predictions and `filenames` holds the saved test file names.
preds = load_array(path + '/results/test_preds.dat')
filenames = load_array(path + '/results/filenames.dat')

Because log loss penalizes a confidently wrong answer far more than it rewards a confidently correct one, clip our predictions so that they are a bit less confident.

In [None]:
#Grab the dog prediction column
# (index 1; the column ordering was only checked by eye above, so treat it as
# an assumption)
original_pred_percent = preds[:,1]
# Clamp every value into [0.05, 0.95] so that no prediction is fully confident.
# NOTE(review): despite the "_percent" names these are presumably values on a
# 0-1 scale, not percentages - confirm against Vgg16.test.
clipped_pred_percent = original_pred_percent.clip(min=0.05, max=0.95)

In [None]:
#Extract imageIds from the filenames in our test/unknown directory
# Read the names from `filenames` (reloaded from results/filenames.dat above)
# rather than `batches.filenames`: `batches` is also used for the training
# batches earlier in the notebook, so its content depends on which cells ran.
# basename/splitext strip the directory prefix and the extension without
# relying on a hard-coded prefix length.
ids = np.array([int(os.path.splitext(os.path.basename(f))[0]) for f in filenames])

Here we join the two columns into an array of [imageId, isDog]

In [None]:
# Pair each image id with its clipped prediction: one [id, prediction] row per
# test image.
subm = np.stack([ids, clipped_pred_percent], axis=1)
# Show the first five rows as a sanity check.
subm[:5]

In [None]:
submission_file_path = path + '/submission.csv'
# '%d,%.5f' writes the id as an integer and the prediction with five decimals;
# comments='' keeps savetxt from prefixing the header line with '# '.
np.savetxt(submission_file_path, subm, fmt='%d,%.5f', header='id,label', comments='')
# Single-argument print(...) produces the same output on Python 2 and 3.
print(submission_file_path)