# Dogs versus Cats Redux Competition on Kaggle

## Setup

In [1]:
#reset python environment
%reset -f

from pathlib import Path

import numpy as np
import tensorflow as tf
import time
import os

current_dir = os.getcwd()
home_directory = Path(os.getcwd())

dataset_directory = home_directory / 'datasets' / 'dogs-vs-cats-redux-kernels-edition'

training_dataset_dir = dataset_directory / 'train'
validation_dataset_dir = dataset_directory / 'valid'
test_dataset_dir = dataset_directory / 'test1'

sample_dataset_directory = home_directory / 'datasets' / 'dogs-vs-cats-redux-kernels-edition' / 'sample'
sample_training_dataset_dir = sample_dataset_directory / 'train'
sample_validation_dataset_dir = sample_dataset_directory / 'valid'
sample_test_dataset_dir = sample_dataset_directory / 'test1'

dogs_dir = 'dog'
cats_dir = 'cat'

default_device = '/gpu:0'
# default_device = '/cpu:0'
classes = [l.strip() for l in open('synset.txt').readlines()]

## Prepare Data

* Pick random files from training set and use as validation set
* Pick a subset of files for experimentation (`sample`)

In [2]:
from zipfile import ZipFile

# Create base directory
dataset_directory.mkdir(parents=True)
    
zips_directory = Path('zips') / 'dogs-vs-cats-redux-kernels-edition'

with ZipFile(str(zips_directory / 'train.zip')) as train_zip:
    train_zip.extractall(dataset_directory)
    
with ZipFile(str(zips_directory / 'test1.zip')) as test_zip:
    test_zip.extractall(dataset_directory)

In [3]:
import os
import shutil
from glob import glob

valid_percentage = 0.1
sample_percentage = 0.1

def pick_random(files, percentage, target_dir, copy=True):
    shuffled = np.random.permutation(files)
    num_files = int(len(shuffled) * percentage)

    for f in shuffled[:num_files]:
        if copy:
            f.rename(target_dir / f.name)
        else:
            shutil.copy(str(f), str(target_dir / f.name))
                
try:    
    # Create directory for training and validation images
    cats_training_dataset_dir = training_dataset_dir / cats_dir
    dogs_training_dataset_dir = training_dataset_dir / dogs_dir
    
    cats_training_dataset_dir.mkdir()
    dogs_training_dataset_dir.mkdir()
    
    cats_validation_dataset_dir = validation_dataset_dir / cats_dir
    dogs_validation_dataset_dir = validation_dataset_dir / dogs_dir
    
    cats_validation_dataset_dir.mkdir(parents=True)
    dogs_validation_dataset_dir.mkdir(parents=True)
    
    # Move classes to their respective directories
    for f in training_dataset_dir.glob('cat.*.jpg'):
        f.rename(cats_training_dataset_dir / f.name)

    for f in training_dataset_dir.glob('dog.*.jpg'):
        f.rename(dogs_training_dataset_dir / f.name)
    
    # Move randomly picked validation files
    pick_random(
        list(cats_training_dataset_dir.glob('*.jpg')), valid_percentage,
        cats_validation_dataset_dir, copy=False)
    
    pick_random(
        list(dogs_training_dataset_dir.glob('*.jpg')), valid_percentage,
        dogs_validation_dataset_dir, copy=False)
    
    # Create directories for sample data
    cats_sample_training_dataset_dir = (sample_training_dataset_dir / cats_dir)
    dogs_sample_training_dataset_dir = (sample_training_dataset_dir / dogs_dir)
    
    cats_sample_training_dataset_dir.mkdir(parents=True)
    dogs_sample_training_dataset_dir.mkdir(parents=True)
    
    cats_sample_validation_dataset_dir = sample_validation_dataset_dir / cats_dir
    dogs_sample_validation_dataset_dir = sample_validation_dataset_dir / dogs_dir
    
    cats_sample_validation_dataset_dir.mkdir(parents=True)
    dogs_sample_validation_dataset_dir.mkdir(parents=True)
    
    sample_test_dataset_dir.mkdir(parents=True)
    
    # Copy randomly picked training and test files to samples
    pick_random(
        list(cats_training_dataset_dir.glob('*.jpg')), sample_percentage,
        cats_sample_training_dataset_dir, copy=True)
    
    pick_random(
        list(dogs_training_dataset_dir.glob('*.jpg')), sample_percentage,
        dogs_sample_training_dataset_dir, copy=True)
    
    pick_random(
        list(test_dataset_dir.glob('*.jpg')), sample_percentage,
        sample_test_dataset_dir, copy=True)
        
    # Move randomly picked validation files
    pick_random(
        list(cats_sample_training_dataset_dir.glob('*.jpg')), valid_percentage,
        cats_sample_validation_dataset_dir, copy=False)
    
    pick_random(
        list(dogs_sample_training_dataset_dir.glob('*.jpg')), valid_percentage,
        dogs_sample_validation_dataset_dir, copy=False)
    
except FileExistsError as e:
    print("Error: Looks like data has already been prepared. Delete everything except the zip files to recreate.")

# Extract Features

In [4]:
from glob import glob
                    
def filenames_and_labels(path):
    cat_filenames = np.array(glob("{}/cat/*.jpg".format(path)))
    cat_labels = np.zeros_like(cat_filenames, dtype='float')
    dog_filenames = np.array(glob("{}/dog/*.jpg".format(path)))
    dog_labels = np.ones_like(dog_filenames, dtype='float')
    
    return np.concatenate([cat_filenames, dog_filenames]), np.concatenate([cat_labels, dog_labels])

In [5]:
import time
import tensorflow as tf

import tensorflow_image_utils as tiu
from vgg16 import Vgg16Model

batch_size = 32
augmentation_epochs = 4
    
tf.reset_default_graph()

with tf.Session() as sess:
    filenames, labels = filenames_and_labels(sample_training_dataset_dir)
    
    filename_queue, label_queue = tf.train.slice_input_producer(
                        [
                            tf.convert_to_tensor(filenames, dtype=tf.string),
                            tf.convert_to_tensor(labels, dtype=tf.float32)
                        ], num_epochs=augmentation_epochs, shuffle=False)
    
    image = tiu.load_image(filename_queue, size=(224, 224))
    image = tiu.distort_image(image)
    image = tiu.vgg16_preprocess(image, shape=(224, 224, 3))
    
    batched_data = tf.train.batch(
        [image, label_queue, filename_queue],
        batch_size=batch_size,
        num_threads=4,
        enqueue_many=False,
        capacity=3 * batch_size)
    
    inputs = tf.placeholder(tf.float32, shape=(None, 224, 224, 3), name="input")
    model = Vgg16Model()
    model.build(inputs)

    sess.run([
        tf.local_variables_initializer(),
        tf.global_variables_initializer()
    ])
    
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess,coord=coord)
    
    codes = []
    
    num_unique_files = len(filenames)
    num_files_to_process = num_unique_files * augmentation_epochs
    num_batches = num_unique_files // batch_size
    
    if num_unique_files % batch_size != 0:
        num_batches = num_batches + 1
    
    num_processed_files = 0
    
    current_epoch = 0
    current_batch = 0
    
    try:
        while not coord.should_stop():
            current_epoch = num_processed_files // num_unique_files
            current_batch = (num_processed_files - (current_epoch * num_unique_files)) // batch_size
            
            print("\nEpoch {}/{}, Batch {}/{}:".format(
                current_epoch + 1, augmentation_epochs, current_batch + 1, num_batches
            ))
            
            t0 = time.perf_counter()
            batch_images, batch_labels, batch_filenames = sess.run(batched_data)
            t1 = time.perf_counter()
            print("\tFetching batch took {:.3f} seconds".format(t1-t0))
            
            # flatten shape of maxpool5: (7, 7, 512) -> 7 * 7 * 512
            flattened = tf.reshape(model.max_pool5, shape=(-1, 7 * 7 * 512))
            
            features = sess.run(flattened, feed_dict={inputs: batch_images})
            t2 = time.perf_counter()
            print("\tExtracting features took {:.3f} seconds".format(t2-t1))
            
            for i, batch_filename in enumerate(batch_filenames):
                codes.append([batch_labels[i], batch_filename, features[i]])
            
            t3 = time.perf_counter()            
            num_processed_files = num_processed_files + len(batch_filenames)
            print("\tProcessing {} images {:.3f} seconds".format(len(batch_filenames), t3-t0))
    except tf.errors.OutOfRangeError:
            print('Done training -- epoch limit reached')
    finally:
        coord.request_stop()
        coord.join(threads)

np.save('sample_training_codes.npy', np.array(codes, dtype='object'))


Epoch 1/4, Batch 1/79:
	Fetching batch took 0.063 seconds
	Extracting features took 2.148 seconds
	Processing 32 images 2.211 seconds

Epoch 1/4, Batch 2/79:
	Fetching batch took 0.006 seconds
	Extracting features took 0.347 seconds
	Processing 32 images 0.354 seconds

Epoch 1/4, Batch 3/79:
	Fetching batch took 0.006 seconds
	Extracting features took 0.345 seconds
	Processing 32 images 0.353 seconds

Epoch 1/4, Batch 4/79:
	Fetching batch took 0.007 seconds
	Extracting features took 0.352 seconds
	Processing 32 images 0.360 seconds

Epoch 1/4, Batch 5/79:
	Fetching batch took 0.007 seconds
	Extracting features took 0.344 seconds
	Processing 32 images 0.353 seconds

Epoch 1/4, Batch 6/79:
	Fetching batch took 0.025 seconds
	Extracting features took 0.354 seconds
	Processing 32 images 0.378 seconds

Epoch 1/4, Batch 7/79:
	Fetching batch took 0.031 seconds
	Extracting features took 0.355 seconds
	Processing 32 images 0.386 seconds

Epoch 1/4, Batch 8/79:
	Fetching batch took 0.030 seco

	Extracting features took 0.346 seconds
	Processing 32 images 0.375 seconds

Epoch 1/4, Batch 62/79:
	Fetching batch took 0.029 seconds
	Extracting features took 0.358 seconds
	Processing 32 images 0.388 seconds

Epoch 1/4, Batch 63/79:
	Fetching batch took 0.034 seconds
	Extracting features took 0.348 seconds
	Processing 32 images 0.383 seconds

Epoch 1/4, Batch 64/79:
	Fetching batch took 0.027 seconds
	Extracting features took 0.357 seconds
	Processing 32 images 0.385 seconds

Epoch 1/4, Batch 65/79:
	Fetching batch took 0.025 seconds
	Extracting features took 0.359 seconds
	Processing 32 images 0.384 seconds

Epoch 1/4, Batch 66/79:
	Fetching batch took 0.027 seconds
	Extracting features took 0.358 seconds
	Processing 32 images 0.385 seconds

Epoch 1/4, Batch 67/79:
	Fetching batch took 0.025 seconds
	Extracting features took 0.361 seconds
	Processing 32 images 0.386 seconds

Epoch 1/4, Batch 68/79:
	Fetching batch took 0.031 seconds
	Extracting features took 0.366 seconds
	Process

	Extracting features took 0.363 seconds
	Processing 32 images 0.391 seconds

Epoch 2/4, Batch 44/79:
	Fetching batch took 0.025 seconds
	Extracting features took 0.345 seconds
	Processing 32 images 0.371 seconds

Epoch 2/4, Batch 45/79:
	Fetching batch took 0.030 seconds
	Extracting features took 0.350 seconds
	Processing 32 images 0.381 seconds

Epoch 2/4, Batch 46/79:
	Fetching batch took 0.027 seconds
	Extracting features took 0.359 seconds
	Processing 32 images 0.387 seconds

Epoch 2/4, Batch 47/79:
	Fetching batch took 0.023 seconds
	Extracting features took 0.355 seconds
	Processing 32 images 0.379 seconds

Epoch 2/4, Batch 48/79:
	Fetching batch took 0.030 seconds
	Extracting features took 0.350 seconds
	Processing 32 images 0.381 seconds

Epoch 2/4, Batch 49/79:
	Fetching batch took 0.025 seconds
	Extracting features took 0.358 seconds
	Processing 32 images 0.383 seconds

Epoch 2/4, Batch 50/79:
	Fetching batch took 0.024 seconds
	Extracting features took 0.356 seconds
	Process

	Extracting features took 0.359 seconds
	Processing 32 images 0.389 seconds

Epoch 3/4, Batch 27/79:
	Fetching batch took 0.027 seconds
	Extracting features took 0.356 seconds
	Processing 32 images 0.384 seconds

Epoch 3/4, Batch 28/79:
	Fetching batch took 0.026 seconds
	Extracting features took 0.358 seconds
	Processing 32 images 0.384 seconds

Epoch 3/4, Batch 29/79:
	Fetching batch took 0.025 seconds
	Extracting features took 0.359 seconds
	Processing 32 images 0.384 seconds

Epoch 3/4, Batch 30/79:
	Fetching batch took 0.026 seconds
	Extracting features took 0.365 seconds
	Processing 32 images 0.391 seconds

Epoch 3/4, Batch 31/79:
	Fetching batch took 0.030 seconds
	Extracting features took 0.358 seconds
	Processing 32 images 0.389 seconds

Epoch 3/4, Batch 32/79:
	Fetching batch took 0.030 seconds
	Extracting features took 0.357 seconds
	Processing 32 images 0.388 seconds

Epoch 3/4, Batch 33/79:
	Fetching batch took 0.024 seconds
	Extracting features took 0.359 seconds
	Process

	Extracting features took 0.362 seconds
	Processing 32 images 0.386 seconds

Epoch 4/4, Batch 10/79:
	Fetching batch took 0.023 seconds
	Extracting features took 0.368 seconds
	Processing 32 images 0.391 seconds

Epoch 4/4, Batch 11/79:
	Fetching batch took 0.023 seconds
	Extracting features took 0.365 seconds
	Processing 32 images 0.388 seconds

Epoch 4/4, Batch 12/79:
	Fetching batch took 0.026 seconds
	Extracting features took 0.362 seconds
	Processing 32 images 0.388 seconds

Epoch 4/4, Batch 13/79:
	Fetching batch took 0.021 seconds
	Extracting features took 0.344 seconds
	Processing 32 images 0.365 seconds

Epoch 4/4, Batch 14/79:
	Fetching batch took 0.030 seconds
	Extracting features took 0.364 seconds
	Processing 32 images 0.395 seconds

Epoch 4/4, Batch 15/79:
	Fetching batch took 0.027 seconds
	Extracting features took 0.365 seconds
	Processing 32 images 0.392 seconds

Epoch 4/4, Batch 16/79:
	Fetching batch took 0.027 seconds
	Extracting features took 0.369 seconds
	Process

	Extracting features took 0.361 seconds
	Processing 32 images 0.387 seconds

Epoch 4/4, Batch 71/79:
	Fetching batch took 0.023 seconds
	Extracting features took 0.367 seconds
	Processing 32 images 0.391 seconds

Epoch 4/4, Batch 72/79:
	Fetching batch took 0.025 seconds
	Extracting features took 0.367 seconds
	Processing 32 images 0.393 seconds

Epoch 4/4, Batch 73/79:
	Fetching batch took 0.024 seconds
	Extracting features took 0.366 seconds
	Processing 32 images 0.390 seconds

Epoch 4/4, Batch 74/79:
	Fetching batch took 0.033 seconds
	Extracting features took 0.364 seconds
	Processing 32 images 0.398 seconds

Epoch 4/4, Batch 75/79:
	Fetching batch took 0.026 seconds
	Extracting features took 0.365 seconds
	Processing 32 images 0.391 seconds

Epoch 4/4, Batch 76/79:
	Fetching batch took 0.026 seconds
	Extracting features took 0.373 seconds
	Processing 32 images 0.399 seconds

Epoch 4/4, Batch 77/79:
	Fetching batch took 0.025 seconds
	Extracting features took 0.372 seconds
	Process

In [None]:
d = np.load('sample_training_codes.npy')