In [119]:
%matplotlib inline

# bunch of required imports
from __future__ import division, print_function
import os, json
from glob import glob
import numpy as np
from matplotlib import pyplot as plt
import utils; reload(utils)
import md_utils; reload(md_utils)
from md_utils import KaggleClient
from utils import plots
import math

from zipfile import ZipFile

# show 4 digits of precision and wrap printed arrays at 100 characters per line
np.set_printoptions(precision=4, linewidth=100)

In [120]:
# root directory that the Kaggle competition data is downloaded into
base_data_path = 'data/dogscats-submission'
# client from the local `md_utils` module; used below to download the dataset
kaggle_client = KaggleClient()

## Process
#### Data Prep
- fetch the dataset, if it's not already present
- create dirs for training, validation, and test
    - separate dogs/cats into their respective dirs (for training and validation)
    - create an `unknown` directory under `test`: Keras looks for directories of content under a specified path, so this just makes things simpler

#### Model Prep
- load the VGG16 model and ImageNet trained weights
- finetune the model using the training and validation sets
    - prune the last layer of the VGG model
    - add a dense layer to categorize dogs vs. cats

#### Execute Predictions
- run predictions against `test` and format the predictions accordingly

In [121]:
# Here's our current data layout (`-d` restricts the listing to directories)
!tree -d data

data
└── dogscats
    ├── models
    ├── sample
    │   ├── train
    │   │   ├── cats
    │   │   └── dogs
    │   └── valid
    │       ├── cats
    │       └── dogs
    ├── test1
    │   └── unknown
    ├── train
    │   ├── cats
    │   └── dogs
    └── valid
        ├── cats
        └── dogs

17 directories


## Data Prep

In [122]:
# Download and lay out the dataset only when nothing exists under
# `base_data_path`, so re-running this cell never re-shuffles an existing split.
data_exists = os.path.exists(base_data_path)


def _move_into_class_dirs(image_paths, data_root, subset):
    """Move images into per-class sub-directories of `data_root/subset`.

    Assumes filenames look like `<label>.<id>.jpg` (e.g. `cat.0.jpg`). The class
    directory is the pluralised label (`cats`, `dogs`) so that it matches the
    existing `data/dogscats` layout and the `'dogs'` class key that is looked up
    when the predictions are read.

    image_paths -- iterable of paths to the image files to move
    data_root   -- dataset root directory
    subset      -- name of the split directory, e.g. 'train' or 'valid'
    """
    for image_path in image_paths:
        filename = os.path.basename(image_path)
        class_dir = filename.split('.')[0] + 's'
        # os.renames creates any missing intermediate directories for the target
        os.renames(image_path, os.path.join(data_root, subset, class_dir, filename))


if not data_exists:
    kaggle_client.download_dataset('dogs-vs-cats-redux-kernels-edition', base_data_path)

    # segment the `train` dataset into training and validation sets
    # (presumably the download leaves the extracted images under `train/` -- confirm)
    # glob order depends on the filesystem, so sort first and shuffle with a fixed
    # seed to get the same 80/20 split on every fresh run
    training_set = np.random.RandomState(42).permutation(
        sorted(glob(os.path.join(base_data_path, 'train', '*.jpg'))))
    validation_size = int(len(training_set) * 0.2)
    _move_into_class_dirs(training_set[:validation_size], base_data_path, 'valid')
    _move_into_class_dirs(training_set[validation_size:], base_data_path, 'train')

    # move all test files into an `unknown` sub-directory
    # do this by moving the directory into a tmp location, then moving it into the desired path
    os.renames(os.path.join(base_data_path, 'test'), os.path.join(base_data_path, 'tmp', 'unknown'))
    os.renames(os.path.join(base_data_path, 'tmp', 'unknown'), os.path.join(base_data_path, 'test', 'unknown'))

downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/test.zip



test.zip 100% |#####################################| Time: 0:00:50   5.4 MiB/s



downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/train.zip



train.zip 100% |####################################| Time: 0:01:35   5.7 MiB/s



downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/sample_submission.csv



sample_submission.csv 100% |########################| Time: 0:00:00 262.1 KiB/s





In [202]:
# using the sample dir locally
# NOTE: this overrides the `base_data_path` assigned earlier in the notebook, so
# every cell below reads from the sample set rather than the downloaded dataset
base_data_path = 'data/dogscats/sample'

# set a conservative batch size
batch_size = 8
nb_epoch = 1

# alternative settings, left disabled:
# batch_size = 64
# nb_epoch = 5

In [203]:
from vgg16 import Vgg16

# instantiate the project's VGG16 wrapper
vgg = Vgg16()

# set the learning rate
# vgg.model.optimizer.lr = 0.01

# resolve both image directories from the configured data root
train_dir = os.path.join(base_data_path, 'train')
valid_dir = os.path.join(base_data_path, 'valid')

# validation batches are requested at twice the training batch size
train_batches = vgg.get_batches(train_dir, batch_size=batch_size)
validation_batches = vgg.get_batches(valid_dir, batch_size=batch_size * 2)

Found 16 images belonging to 2 classes.
Found 8 images belonging to 2 classes.


In [204]:
# Adapt the model to the training batches first; per the "Model Prep" notes this
# is expected to replace the last VGG layer with a dense dogs-vs-cats layer.
vgg.finetune(train_batches)
# Then train for `nb_epoch` epochs (presumably validating on `validation_batches`).
vgg.fit(train_batches, validation_batches, nb_epoch=nb_epoch)

Epoch 1/1


In [205]:
# Predict on the training images of the configured dataset for now. The directory
# is derived from `base_data_path` instead of being hard-coded, so it cannot drift
# away from the data root used for training.
test_batches, preds = vgg.test(os.path.join(base_data_path, 'train'))
# test_batches, preds = vgg.test(os.path.join(base_data_path, 'test'))

# index of the column in `preds` that corresponds to the 'dogs' class
# NOTE(review): `class_indices` is read from the batches being predicted on; a
# `test` dir that only contains `unknown` would presumably have no 'dogs' key --
# confirm before switching to the commented-out line above.
dog_pred_index = test_batches.class_indices['dogs']

Found 16 images belonging to 2 classes.


In [206]:
# Derive each image id from its filename: drop the extension, then take the last
# dot-separated part of what remains. This yields '123' for labelled files such as
# `cat.123.jpg` and also for unlabelled test files such as `123.jpg`, where
# indexing the second dot-separated field would return the extension instead.
ids = [os.path.splitext(os.path.basename(result))[0].split('.')[-1] for result in test_batches.filenames]
# pair every id with the prediction from the 'dogs' column (one row per image)
results = np.column_stack((ids, preds[:, dog_pred_index]))

In [207]:
from IPython.display import FileLink

# write the id/label rows as CSV, then render a link to the file as the cell output
results_path = 'data/test-results.csv'
np.savetxt(
    results_path,
    results,
    fmt='%s',
    delimiter=',',
    header='id,label',
    comments='',  # empty prefix so the header line is not written as a comment
)
FileLink(results_path)