# Dogs versus Cats Redux Competition on Kaggle

## Setup

In [1]:
#reset python environment
%reset -f

import numpy as np
import tensorflow as tf
import time
import os

# All dataset locations are derived from the directory the notebook is run
# from, so start the kernel in the project root.
current_dir = os.getcwd()
home_directory = current_dir

# Root of the full Kaggle dataset (keeps its trailing slash).
dataset_directory = home_directory + '/datasets/dogs-vs-cats-redux-kernels-edition/'

# NOTE: the sub-directory suffixes start with '/', so the resulting paths
# contain a doubled slash ('...edition//train/'). POSIX path resolution
# collapses it; the values are kept as-is so stored filenames stay stable.
training_dataset_dir = dataset_directory + '/train/'
validation_dataset_dir = dataset_directory + '/valid/'
test_dataset_dir = dataset_directory + '/test/'

# Small subset of the dataset with the same train/valid/test layout.
sample_dataset_directory = dataset_directory + 'sample/'
sample_training_dataset_dir = sample_dataset_directory + '/train/'
sample_validation_dataset_dir = sample_dataset_directory + '/valid/'
sample_test_dataset_dir = sample_dataset_directory + '/test/'

# Device string; switch to '/gpu:0' when a GPU is available.
# default_device = '/gpu:0'
default_device = '/cpu:0'

# One class name per line, read relative to the working directory.
# Presumably the ImageNet synset list that matches the VGG16 outputs -- confirm.
# The context manager closes the file handle as soon as the list is built.
with open('synset.txt') as synset_file:
    classes = [line.strip() for line in synset_file]

## Extract Features

In [2]:
from glob import glob
                    
def filenames_and_labels(path):
    """Collect the cat and dog JPEG paths below ``path`` with binary labels.

    Files are looked up with the patterns ``<path>/cat/*.jpg`` and
    ``<path>/dog/*.jpg``.

    Args:
        path: Directory that contains the ``cat`` and ``dog`` sub-directories.

    Returns:
        A ``(filenames, labels)`` tuple of 1-D NumPy arrays of equal length.
        ``filenames`` is a string array holding every cat path followed by
        every dog path (each group sorted); ``labels`` is a float array with
        ``0.0`` for each cat and ``1.0`` for each dog.
    """
    # glob() yields entries in arbitrary, filesystem-dependent order; sorting
    # makes the returned order (and anything derived from it) reproducible.
    # Forcing a string dtype keeps an empty match from becoming a float64 array.
    cat_filenames = np.array(sorted(glob("{}/cat/*.jpg".format(path))), dtype=str)
    dog_filenames = np.array(sorted(glob("{}/dog/*.jpg".format(path))), dtype=str)

    cat_labels = np.zeros(cat_filenames.shape, dtype='float')
    dog_labels = np.ones(dog_filenames.shape, dtype='float')

    return np.concatenate([cat_filenames, dog_filenames]), np.concatenate([cat_labels, dog_labels])

In [None]:
import time
import tensorflow as tf

import tensorflow_image_utils as tiu
from vgg16 import Vgg16Model

# Extract VGG16 max_pool5 features ("codes") for the sample training set.
# Every image is passed through the input pipeline `augmentation_epochs`
# times and the resulting [label, filename, feature-vector] triples are saved
# to 'sample_training_codes.npy'.
batch_size = 32
augmentation_epochs = 4

tf.reset_default_graph()

with tf.Session() as sess:
    filenames, labels = filenames_and_labels(sample_training_dataset_dir)

    # Queue yielding (filename, label) pairs, in order, for a fixed number of
    # passes; it raises OutOfRangeError once the passes are exhausted.
    filename_queue, label_queue = tf.train.slice_input_producer(
        [
            tf.convert_to_tensor(filenames, dtype=tf.string),
            tf.convert_to_tensor(labels, dtype=tf.float32)
        ], num_epochs=augmentation_epochs, shuffle=False)

    # Project-local image helpers; presumably load+resize, random distortion
    # and VGG16 input normalisation respectively -- confirm in
    # tensorflow_image_utils.
    image = tiu.load_image(filename_queue, size=(224, 224))
    image = tiu.distort_image(image)
    image = tiu.vgg16_preprocess(image, shape=(224, 224, 3))

    # allow_smaller_final_batch=True: without it the last
    # (num_files * epochs) % batch_size examples are silently dropped.
    batched_data = tf.train.batch(
        [image, label_queue, filename_queue],
        batch_size=batch_size,
        num_threads=4,
        enqueue_many=False,
        capacity=3 * batch_size,
        allow_smaller_final_batch=True)

    inputs = tf.placeholder(tf.float32, shape=(None, 224, 224, 3), name="input")
    model = Vgg16Model()
    model.build(inputs)

    # flatten shape of maxpool5: (7, 7, 512) -> 7 * 7 * 512
    # Built once, outside the loop: creating the op per batch would add a new
    # node to the graph on every iteration.
    flattened = tf.reshape(model.max_pool5, shape=(-1, 7 * 7 * 512))

    # Local variables back the num_epochs counter of the input producer.
    sess.run([
        tf.local_variables_initializer(),
        tf.global_variables_initializer()
    ])

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    codes = []

    num_unique_files = len(filenames)
    num_files_to_process = num_unique_files * augmentation_epochs
    # Batches per pass, rounded up.
    num_batches = num_unique_files // batch_size

    if num_unique_files % batch_size != 0:
        num_batches = num_batches + 1

    num_processed_files = 0

    current_epoch = 0
    current_batch = 0

    try:
        while not coord.should_stop():
            # Fetch first, report afterwards: when the queue is exhausted the
            # fetch raises OutOfRangeError, so no header is printed for a
            # batch that does not exist.
            t0 = time.perf_counter()
            batch_images, batch_labels, batch_filenames = sess.run(batched_data)
            t1 = time.perf_counter()

            # Batches are cut from one continuous stream and may straddle a
            # pass boundary, so epoch/batch numbers are progress indicators only.
            current_epoch = num_processed_files // num_unique_files
            current_batch = (num_processed_files - (current_epoch * num_unique_files)) // batch_size

            print("\nEpoch {}/{}, Batch {}/{}:".format(
                current_epoch + 1, augmentation_epochs, current_batch + 1, num_batches
            ))
            print("\tFetching batch took {:.3f} seconds".format(t1-t0))

            features = sess.run(flattened, feed_dict={inputs: batch_images})
            t2 = time.perf_counter()
            print("\tExtracting features took {:.3f} seconds".format(t2-t1))

            for i, batch_filename in enumerate(batch_filenames):
                codes.append([batch_labels[i], batch_filename, features[i]])

            t3 = time.perf_counter()
            num_processed_files = num_processed_files + len(batch_filenames)
            print("\tProcessing {} images {:.3f} seconds".format(len(batch_filenames), t3-t0))
    except tf.errors.OutOfRangeError:
        # Normal termination: the input producer has served all passes.
        print('Done extracting features -- epoch limit reached')
    finally:
        coord.request_stop()
        coord.join(threads)

# Rows mix a float label, a bytes filename and a feature vector, hence the
# object dtype. NumPy pickles object arrays, so reading this file back needs
# np.load(..., allow_pickle=True).
np.save('sample_training_codes.npy', np.array(codes, dtype='object'))


Epoch 1/4, Batch 1/7:
	Fetching batch took 0.342 seconds
	Extracting features took 34.143 seconds
	Processing 32 images 34.487 seconds

Epoch 1/4, Batch 2/7:
	Fetching batch took 0.071 seconds
	Extracting features took 33.090 seconds
	Processing 32 images 33.161 seconds

Epoch 1/4, Batch 3/7:
	Fetching batch took 0.052 seconds
	Extracting features took 27.242 seconds
	Processing 32 images 27.294 seconds

Epoch 1/4, Batch 4/7:
	Fetching batch took 0.055 seconds
	Extracting features took 26.428 seconds
	Processing 32 images 26.483 seconds

Epoch 1/4, Batch 5/7:
	Fetching batch took 0.059 seconds
	Extracting features took 26.924 seconds
	Processing 32 images 26.983 seconds

Epoch 1/4, Batch 6/7:
	Fetching batch took 0.071 seconds
	Extracting features took 32.275 seconds
	Processing 32 images 32.346 seconds

Epoch 1/4, Batch 7/7:
	Fetching batch took 0.073 seconds
	Extracting features took 30.528 seconds
	Processing 32 images 30.601 seconds

Epoch 2/4, Batch 1/7:
	Fetching batch took 0.07

In [None]:
# The codes file is written with dtype='object', so NumPy stores it pickled;
# np.load refuses such files unless allow_pickle=True is passed.
# Unpickling can execute arbitrary code: only load files you created yourself.
d = np.load('sample_training_codes.npy', allow_pickle=True)