# Preamble

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for 'retina' (high-resolution) figure output.
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Use seaborn's "white" style for all subsequent plots.
sns.set(style="white")

# Make the interactive shell display the value of every expression statement
# in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
# Print arrays with 4 digits of precision and lines up to 100 characters wide.
np.set_printoptions(precision=4, linewidth=100)

from matplotlib import pyplot as plt

from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input, decode_predictions
# NOTE(review): numpy is already imported above; this second import is redundant.
import numpy as np

# Instantiate VGG16 with the 'imagenet' weights, keeping the top (classifier) layers.
model = VGG16(weights='imagenet', include_top=True)

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [2]:
import os
import shutil
from glob import glob
# Seed NumPy's global RNG; np.random.permutation in the dataset-split helpers draws from it.
np.random.seed(10)

# Every path below is derived from the directory the notebook is run from.
current_dir = os.getcwd()
DATASET_DIR = os.path.join(current_dir, 'dataset')
TRAIN_DIR = os.path.join(DATASET_DIR, 'train')
TEST_DIR = os.path.join(DATASET_DIR, 'test')
CROSSVALID_DIR = os.path.join(DATASET_DIR, 'cross_valid')
SAMPLE_DIR = os.path.join(DATASET_DIR, 'sample')

WEIGHTS_DIR = os.path.join(current_dir, 'weights')

# Kaggle Competition

## Prepare dataset

### Download the dataset

```
kg download -c 'dogs-vs-cats-redux-kernels-edition'
```

### Unzip training and test dataset

In [None]:
# Start from fresh: delete any existing `dataset` directory (relative to the
# current working directory) and recreate it empty.
!rm -rf dataset
!mkdir dataset
# Extract both archives (expected in the current working directory) into
# DATASET_DIR; IPython substitutes $DATASET_DIR with the Python variable's value.
!unzip -q train.zip -d $DATASET_DIR
!unzip -q test.zip -d $DATASET_DIR
# Show the resulting tree, directories only.
!tree -d

# For every directory under `dataset` (at most 5 levels deep), print its path
# followed by the number of entries it contains.
!find dataset -maxdepth 5 -type d -exec sh -c "echo '{}'; ls -1 '{}' | wc -l" \; | xargs -n 2 | awk '{print $1" "$2}'

#### Prepare the training, cross-validation, and sample datasets along with their class labels

### Create the training, validation, sample batch dataset

1. There are 12,500 images in the test set.
1. There are 25,000 images in the train set.
1. We need to create and move 10% of `train` to a cross validation set.
1. We will also create a `sample` set containing 10% of the remaining `train` set, which will be copied (not moved) from `train`. The `sample` set will be used to test the training process of the model before fully training the model on the `train` set.
    1. `sample/train` can contain 200 samples from `train`.
    1. `sample/cross_valid` can contain 50 samples from `train`.

So, the directory structure would be:
```
dataset/train/
dataset/cross_valid/
dataset/sample/train/
dataset/sample/cross_valid/

dataset/test/
```

In [None]:
def create_crossvalidation(perc = 0.1, train_dir=None, crossvalid_dir=None):
    """
    Move a random fraction of the training images into the cross validation dir.

    perc: fraction in [0, 1] of the `*.jpg` files found directly inside the
        training directory; the number moved is int(n_files * perc).
    train_dir: directory holding the training images (defaults to TRAIN_DIR).
    crossvalid_dir: destination directory, created when missing
        (defaults to CROSSVALID_DIR).

    Files are drawn with np.random.permutation, so the selection follows
    NumPy's global random state.

    Raises ValueError when `perc` lies outside [0, 1].
    """
    if not 0 <= perc <= 1:
        raise ValueError("perc must be between 0 and 1, got %r" % (perc,))

    # Resolve the defaults at call time so the module-level constants are only
    # needed when the caller does not pass explicit directories.
    src_dir = TRAIN_DIR if train_dir is None else train_dir
    dst_dir = CROSSVALID_DIR if crossvalid_dir is None else crossvalid_dir

    os.makedirs(dst_dir, exist_ok=True)

    # glob() yields files in arbitrary, filesystem-dependent order. Sorting
    # first makes a seeded shuffle pick the same files on every machine.
    candidates = sorted(glob(os.path.join(src_dir, '*.jpg')))
    shuffled = np.random.permutation(candidates)

    n_move = int(len(candidates) * perc)
    for path in shuffled[:n_move]:
        # shutil.move (unlike os.rename) also works when the two directories
        # live on different filesystems.
        shutil.move(str(path), os.path.join(dst_dir, os.path.basename(path)))

def create_sample(sample_train_size=200, sample_crossvalid_size=50, train_dir=None, sample_dir=None):
    """
    Copy a small random subset of the training images into the sample dir.

    The `*.jpg` files found directly inside the training directory are
    shuffled with np.random.permutation; the first `sample_train_size` files
    are copied to `<sample_dir>/train` and the next `sample_crossvalid_size`
    files to `<sample_dir>/cross_valid`. The two subsets never overlap, and
    the originals stay in place. If fewer files exist than requested, the
    subsets are simply smaller.

    sample_train_size: number of images for the sample training set.
    sample_crossvalid_size: number of images for the sample cross validation set.
    train_dir: directory to draw images from (defaults to TRAIN_DIR).
    sample_dir: root of the sample set (defaults to SAMPLE_DIR).

    Raises ValueError when either size is negative.
    """
    if sample_train_size < 0 or sample_crossvalid_size < 0:
        raise ValueError("sample sizes must be non-negative")

    # Resolve the defaults at call time so the module-level constants are only
    # needed when the caller does not pass explicit directories.
    src_dir = TRAIN_DIR if train_dir is None else train_dir
    dst_root = SAMPLE_DIR if sample_dir is None else sample_dir

    # glob() yields files in arbitrary, filesystem-dependent order. Sorting
    # first makes a seeded shuffle pick the same files on every machine.
    candidates = sorted(glob(os.path.join(src_dir, '*.jpg')))
    shuffled = np.random.permutation(candidates)

    split_end = sample_train_size + sample_crossvalid_size
    subsets = (
        ('train', shuffled[:sample_train_size]),
        ('cross_valid', shuffled[sample_train_size:split_end]),
    )

    for subdir, paths in subsets:
        target_dir = os.path.join(dst_root, subdir)
        os.makedirs(target_dir, exist_ok=True)
        for path in paths:
            shutil.copy(str(path), os.path.join(target_dir, os.path.basename(path)))

def create_labels(abs_directory, labels = ('cat', 'dog')):
    """
    Sort the images of a directory into one subdirectory per label.

    abs_directory: directory whose `*.jpg` files are partitioned in place.
    labels: iterable of label names, a single label string, or None.
        For each label, files named `<label>.*.jpg` are moved into
        `<abs_directory>/<label>/`. With None, every `*.jpg` file is moved
        into `<abs_directory>/unknown/`.

    Label subdirectories are created even when no file matches. Files that
    match no label are left where they are.
    """
    if labels is None:
        # No label information: everything goes into a single bucket.
        groups = [('unknown', '*.jpg')]
    else:
        if isinstance(labels, str):
            # A bare string would otherwise be iterated character by character.
            labels = (labels,)
        groups = [(label, label + '.*.jpg') for label in labels]

    for label, pattern in groups:
        target_dir = os.path.join(abs_directory, label)
        os.makedirs(target_dir, exist_ok=True)
        for path in glob(os.path.join(abs_directory, pattern)):
            shutil.move(path, os.path.join(target_dir, os.path.basename(path)))
    
# Create the cross validation set first: its images are moved out of `train`,
# so the sample drawn afterwards cannot contain an image that ends up in
# `cross_valid`.
create_crossvalidation()

# Create the sample set from what remains in `train`
create_sample()

### Create the labels

In [None]:
# Create labeled directories for each of the sets
create_labels(TRAIN_DIR)
create_labels(CROSSVALID_DIR)
create_labels(os.path.join(SAMPLE_DIR, 'train'))
create_labels(os.path.join(SAMPLE_DIR, 'cross_valid'))
# labels=None files every test image under a single `unknown` subdirectory
create_labels(TEST_DIR, labels=None)

# For every directory under DATASET_DIR (at most 5 levels deep), print its path
# followed by the number of entries it contains.
!find $DATASET_DIR -maxdepth 5 -type d -exec \
   sh -c "echo '{}'; ls -1 '{}' | wc -l" \; | xargs -n 2 | awk '{print $1" "$2}'