In [1]:
%matplotlib inline
import cv2
from matplotlib import pyplot as plt
import os
import PIL
from PIL import Image
import scipy.misc
import scipy.ndimage
import random
import tensorflow as tf
import theano
import theano.tensor as T
import lasagne
import numpy as np
import time
from random import randint


# print things out as we go.
VERBOSE = True

# Adjustable Constants
IMAGE_SIZE = 128
BATCH_SIZE = 100
NUM_EPOCHS = 200

COMPRESSED_NONBIRDS = "https://www.dropbox.com/s/igu1loh921a4z2g/non_birds.tar.gz?dl=1"
COMPRESSED_BIRDS = "https://www.dropbox.com/s/k039odbkard5c3h/raw_birds.tar.gz?dl=1"


TRAIN_PERCENT = .7
VALIDATION_PERCENT = .2
TESTING_PERCENT = .1

np.testing.assert_almost_equal((TRAIN_PERCENT + VALIDATION_PERCENT + TESTING_PERCENT), 1.,
                               err_msg='make your percents better, dummy.')


# store refs to filenames for each image in here. This way I can load them off
# disk by just retreiving N items from this list and hotloading them.
IMAGE_LIST = []
LABEL_LIST = []

Using gpu device 0: GRID K520 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 4007)


In [2]:
DOWNLOAD_DIR = "/mnt/"
BIRD_FILE = "/mnt/raw_birds.tar.gz"
NONBIRD_FILE = "/mnt/non_birds.tar.gz"

INPUT_BIRD_DIRECTORY = "/mnt/raw_birds/"
INPUT_NONBIRD_DIRECTORY = "/mnt/101_ObjectCategories/"
OUTPUT_BIRD_DIRECTORY = "/mnt/raw_birds_small/"
OUTPUT_NONBIRD_DIRECTORY = "/mnt/not_birds_small/"



# We can save the preprocessed images to disk so that we don't always need to do this 
# full preprocess.
FETCH_PHOTOS = len(IMAGE_LIST) == 0



## Data Preprocessing

We will be training our data with two sets of data. One set is going to be full-sized pictures of various species of birds and the other set will be full-size pictures of various other objects/animals that do not contain any birds. It is desirable for all of these images to be of the same dimensions and preferabley square to keep some of the code succinct.

We will then need to split our images into different partitions for training. We will need a set of **training data**, a set of **validation data** and a set of **testing data**. The training data is the data which will be handed to the neural network during training and will be used throughout the training process to modify and learn the internal weights. The validation data will also be handed to the neural network during training, it's purpose will be to reduce overfitting while learning is happening. The testing data will be used outside of training, it will be used to gauge the accuracy of what the neural network has learned.

# Helper Methods

In [3]:
def _to_np_array(X, Y):
    """
    Converts a list of images in the standard numpy image representation to a numpy array
    that is formatted to be handed into our neural network.
    
    Args:
        X: A python list of arrays with a shape (IMAGE_SIZE, IMAGE_SIZE, 3) each elements is an 8bit integer.
        Y: A list of 0 and 1 values corresponding to the classification of the corresponding X element.
    Returns:
        0: 4D Numpy array with dimensions (NUM_IMAGES, 3, IMAGE_SIZE, IMAGE_SIZE) each element is a float32 
           between 0 and 1.
        1: 1D numpy array of 0, 1 values.

    """
    X_np = np.empty((len(X), IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.float32)
    Y_np = np.empty((len(X)), dtype=np.int32)
    
    for i in range(len(X)):
        X_np[i, ...] = X[i]
        Y_np[i, ...] = Y[i]
    
    # Quick Patch:
    # lasagne wants images to be (3, 32, 32) instead of (32, 32, 3)
    red_layer = np.zeros((len(X), IMAGE_SIZE, IMAGE_SIZE))
    blue_layer = np.zeros((len(X), IMAGE_SIZE, IMAGE_SIZE))
    green_layer = np.zeros((len(X), IMAGE_SIZE, IMAGE_SIZE))

    red_layer = X_np[:, :, :, 0]
    blue_layer = X_np[:, :, :, 1]
    green_layer = X_np[:, :, :, 2]
    
    new_X_np = np.zeros((len(X), 3, IMAGE_SIZE, IMAGE_SIZE), dtype=np.float32)
    
    for i in range(len(X)):
        new_X_np[i, 0, ...] = red_layer[i, ...]
        new_X_np[i, 1, ...] = blue_layer[i, ...]
        new_X_np[i, 2, ...] = green_layer[i, ...]

    # normalize values to between 0 and 1
    new_X_np /= np.max(np.abs(new_X_np),axis=0)
    return new_X_np, Y_np

def iterate_minibatches(inputs, targets, batchsize=BATCH_SIZE):
    """
    Iterator which moves through the dataset and returns a batch of images and their corresponding classification. 
    
    I am also randomly flipping/mirroring the images as I hand them out to A) increase the size of my dataset
    B) try to make the trained network invariant to these operations.
    
    Args:
        inputs: List of file handles corresponding to images.
        targets: Classifications for each of those images
        batchsize: The number of images to return on each yield.
    Returns:
        0: A batch of images, loaded into memory and formatted to be handed to the neural network
        1: 1D numpy array of 0, 1 values.

    """
    assert len(inputs) == len(targets)
        
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        # store the loaded images in a list for simplicity.
        tmp_images = []
        
        # but we'll put them in an array of this size before handing them out.
        loaded_inputs = np.zeros((batchsize, 3, IMAGE_SIZE, IMAGE_SIZE))

        excerpt = slice(start_idx, start_idx + batchsize)
        
        # load the images from disk
        for filename in inputs[excerpt]:
            # also randomly flip the image
            ri = randint(0,3)
            if ri == 0:
                tmp_images.append(cv2.imread(filename))
            elif ri == 1:
                tmp_images.append(cv2.flip(cv2.imread(filename),0))
            else:
                tmp_images.append(cv2.flip(cv2.imread(filename),1))


                
            
        # convert them to the nn input format    
        loaded_inputs, loaded_labels = _to_np_array(tmp_images, targets[excerpt])    
        
        yield loaded_inputs, loaded_labels
        
def ensure_dir(f):
    """
    This function will check to see if a directory exists and create it if not.
    
    Args:
        f: the name of the directory to check for / create
    """
    d = os.path.dirname(f)
    if not os.path.exists(d):
        os.makedirs(d)

def _download_datasets():
    """
    Make sure that we have the datasets readily available for us on disk.
    """
    import urllib.request
    
    if VERBOSE:
        print ("Downloading", COMPRESSED_BIRDS)   
    urllib.request.urlretrieve(COMPRESSED_BIRDS, BIRD_FILE)


    if VERBOSE:
        print ("Downloading", COMPRESSED_NONBIRDS)   
    urllib.request.urlretrieve(COMPRESSED_NONBIRDS, NONBIRD_FILE)

     
def _extract_datasets():
    """
    If we've just downloaded the datasets we need to extract them from the tarball
    """
    import tarfile
    
    if VERBOSE:
        print ("Extracting", BIRD_FILE)
    tar = tarfile.open(BIRD_FILE, "r:gz")
    tar.extractall(DOWNLOAD_DIR)
    tar.close()
    
    if VERBOSE:
        print ("Extracting", NONBIRD_FILE)
    tar = tarfile.open(NONBIRD_FILE, "r:gz")
    tar.extractall(DOWNLOAD_DIR)
    tar.close()
    
def _shrink_image(image, name, out_dir, img_size=IMAGE_SIZE):
    """
    Our images can be any size, we want our image to be a standard size to be easier to deal with
    """
    img = Image.open(image)
    img = img.resize((IMAGE_SIZE,IMAGE_SIZE), PIL.Image.ANTIALIAS)
    img.save(out_dir+name)
    return out_dir+name
    
          
def _shuffle(X, Y):
    """
    Take in our image sets and shuffle them. We want to make sure that the correspondence between X[i] and 
    Y[i] is maintained.
    """
    X_shuf = []
    Y_shuf = []
    index_shuf = list(range(len(X)))
    random.shuffle(index_shuf)
    for i in index_shuf:
        X_shuf.append(X[i])
        Y_shuf.append(Y[i])
    return X_shuf, Y_shuf


def _error_rate(predictions, labels):
    """
    Return the error rate based on dense predictions and sparse labels.
    """
    return 100.0 - (
      100.0 *
      np.sum(np.argmax(predictions, 1) == labels) /
      predictions.shape[0])

def preprocess_data():
    """
    Run the entire preprocessing pipeline.
    """
    
    
    # download the files from the provided URL
    if FETCH_PHOTOS:
        _download_datasets()
        _extract_datasets()
        ensure_dir(OUTPUT_BIRD_DIRECTORY)
        ensure_dir(OUTPUT_NONBIRD_DIRECTORY)
        
    positive_count = 0
    negative_count = 0
    # Shrink the pictures of birds.
    if FETCH_PHOTOS:
        if VERBOSE:
            print ("Shrinking the pictures of birds to", IMAGE_SIZE)
        img_count = 0
        for root, subdirs, files in os.walk(INPUT_BIRD_DIRECTORY):
            for subdir in subdirs:
                # get all of the subdirectories in this folder
                for r, s, subfiles in os.walk(os.path.join(INPUT_BIRD_DIRECTORY, subdir)):
                    for f in subfiles:
                        if f != ".DS_Store":
                            # please leave me alone DS_Store
                            try:
                                IMAGE_LIST.append(_shrink_image(os.path.join(INPUT_BIRD_DIRECTORY, subdir, f), "img%d.jpg" % img_count, OUTPUT_BIRD_DIRECTORY))
                                LABEL_LIST.append(1.)
                                positive_count += 1
                                img_count += 1
                                
                            except OSError:
                                # dotfile is probably messing things up. Ignore it
                                pass
    
    # Shrink the pictures of nonbirds.
    if FETCH_PHOTOS:
        if VERBOSE:
            print ("Shrinking the pictures of nonbirds to", IMAGE_SIZE)
        img_count = 0
        for root, subdirs, files in os.walk(INPUT_NONBIRD_DIRECTORY):
            for subdir in subdirs:
                # get all of the subdirectories in this folder
                for r, s, subfiles in os.walk(os.path.join(INPUT_NONBIRD_DIRECTORY, subdir)):
                    for f in subfiles:
                        if f != ".DS_Store":
                            # please leave me alone DS_Store
                            try:
                                IMAGE_LIST.append(_shrink_image(os.path.join(INPUT_NONBIRD_DIRECTORY, subdir, f), "img%d.jpg" % img_count, OUTPUT_NONBIRD_DIRECTORY))
                                LABEL_LIST.append(0.)
                                negative_count += 1
                                img_count += 1
                            except OSError:
                                # A dotfile probably got in the way, just ignore it.
                                pass
    # Turns our images into the numpy arrays that we know they can be.
    
    if not FETCH_PHOTOS:
        # manually count instances of positive and negative labels.
        for i in LABEL_LIST:
            if i == 1.:
                positive_count += 1
            else:
                negative_count += 1
    
    
    X = [] # but first they'll go into a list. 
    Y = [] 

                    
    if VERBOSE:            
        print ("Number of Positive Examples: %d" % positive_count)
        print ("Number of Negative Examples: %d" % negative_count)
        
    # now we want to shuffle this data so that we're not handing the network only one
    # class of data at a time. 
    X, Y = _shuffle(IMAGE_LIST, LABEL_LIST)
    
    # Now we split our data into the the different sets for training, testing and validation. Since
    # we've previously shuffled it shouldn't matter if we fetch each set from particular locations
    
    total_number_images = len(X)
    
    num_training = int(total_number_images * TRAIN_PERCENT)
    num_validation = int(total_number_images * VALIDATION_PERCENT)
    
    training_data = X[:num_training]
    training_labels = Y[:num_training]
    
    validation_data = X[num_training:num_training+num_validation]
    validation_labels = Y[num_training:num_training+num_validation]
    
    testing_data = X[num_training+num_validation:]
    testing_labels = Y[num_training+num_validation:]
    
    if VERBOSE:
        print ("Training set shape:", len(training_data))
        print ("Validation set shape:", len(validation_data))
        print ("Training set shape:", len(testing_data))


    # I'm going to return 6 things from this function. It's not pretty but I would just end up unpacking
    # the anyway.
    
    if VERBOSE:
        print ("Finished preprocessing data")
        
    return training_data, training_labels, validation_data, validation_labels, testing_data, testing_labels

### Let's see how well more normal neural network does.

In [4]:
training_data, training_labels, validation_data, validation_labels, testing_data, testing_labels = preprocess_data()



Downloading https://www.dropbox.com/s/k039odbkard5c3h/raw_birds.tar.gz?dl=1
Downloading https://www.dropbox.com/s/igu1loh921a4z2g/non_birds.tar.gz?dl=1
Extracting /mnt/raw_birds.tar.gz
Extracting /mnt/non_birds.tar.gz
Shrinking the pictures of birds to 128
Shrinking the pictures of nonbirds to 128
Number of Positive Examples: 11788
Number of Negative Examples: 8939
Training set shape: 14508
Validation set shape: 4145
Training set shape: 2074
Finished preprocessing data


## Construct the network

This is where the network is built. Currently the configuration is a 3 convolutional layers each followed by a pooling layer. After the final layer there are two fully connected layers.

In [44]:
def build_cnn(input_var=None):
    
    input1 = lasagne.layers.InputLayer(shape=(None, 3, IMAGE_SIZE, IMAGE_SIZE), input_var=input_var)

    conv1 = lasagne.layers.Conv2DLayer(input1, num_filters=32, filter_size=(9, 9), 
                                     nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform(),)
    
    pool1 = lasagne.layers.MaxPool2DLayer(conv1, pool_size=(2,2))

    conv2 = lasagne.layers.Conv2DLayer(pool1, num_filters=64, filter_size=(5, 5), 
                                     nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform(),)
        
    pool2 = lasagne.layers.MaxPool2DLayer(conv2, pool_size=(2,2))

    conv3 = lasagne.layers.Conv2DLayer(pool2, num_filters=128, filter_size=(3, 3), 
                                     nonlinearity=lasagne.nonlinearities.rectify, W=lasagne.init.GlorotUniform(),)

    pool3 = lasagne.layers.MaxPool2DLayer(conv3, pool_size=(2,2))

    fc1 = lasagne.layers.DenseLayer(lasagne.layers.dropout(pool3, p=.5),
                                   num_units=169,
                                   nonlinearity=lasagne.nonlinearities.rectify)

    fc2 = lasagne.layers.DenseLayer(lasagne.layers.dropout(fc1, p=.5),num_units=2, 
                                        nonlinearity=lasagne.nonlinearities.softmax)

    if VERBOSE: 
        print(lasagne.layers.get_output_shape(input1, input_shapes=None))
        print(lasagne.layers.get_output_shape(conv1, input_shapes=None))
        print(lasagne.layers.get_output_shape(pool1, input_shapes=None))
        print(lasagne.layers.get_output_shape(conv2, input_shapes=None))
        print(lasagne.layers.get_output_shape(pool2, input_shapes=None))
        print(lasagne.layers.get_output_shape(conv3, input_shapes=None))
        print(lasagne.layers.get_output_shape(pool3, input_shapes=None))
        print(lasagne.layers.get_output_shape(fc1, input_shapes=None))
        print(lasagne.layers.get_output_shape(fc2, input_shapes=None))



    return fc2, {'input1': input1, 
                 'conv1': conv1, 
                 'conv2': conv2,
                 'pool1': pool1, 
                 'fc1': fc1, 
                 'fc2':fc2}

## Define the training functions.
The neural network's goal is to minimize the error over a certain function. Below is where we define the functions that we want to be minimizing. 

In [None]:
# variables for inputs and labels
input_var = T.tensor4('inputs')
label_var = T.ivector('targets')

network, layers = build_cnn(input_var)

# create the loss expression that we want to minimize during training
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, label_var)
loss = loss.mean()

# now we define how we want the weights to change after each batch of training
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.nesterov_momentum(loss, params,
                                           learning_rate=0.01,momentum=0.9)

# to monitor progress during training we evaluate the the network on the validation set
test_prediction = lasagne.layers.get_output(network, deterministic=True)
test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, label_var)
test_loss = test_loss.mean()

test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), label_var), dtype=theano.config.floatX)

# now we can compile a function that performs a training step
train_fn = theano.function([input_var, label_var], loss, updates=updates)

# an a function to do validation
val_fn = theano.function([input_var, label_var], [test_loss, test_acc])

## Train the Network
Now we train the network. The network will receive batches of images and check how well it is currently scoring on those images compared to known results. It will update the weights using stochastic gradient descent at each step.

It will train for predefined number of epochs or will exit early based on acheiving a certain accuracy. I have tried different network configurations and have not been able to acheive better than 91% accuracy on the validation set. To make sure that I am not overfitting the data and to speed things a long I have implemented a simple way to make the process exit early and 89% accuracy. This is probably not actually a good idea.

In [45]:
start = time.time()
print ("training error, validation accuracy")

# now I can run training steps
for epoch in range(NUM_EPOCHS):
    training_error = 0
    train_batches = 0
    
    # each epoch has a pass through the training data
    for batch in iterate_minibatches(training_data, training_labels, NUM_EPOCHS):
        inputs, targets = batch
        training_error += train_fn(inputs, targets)
        train_batches += 1
        
    # and it has a pass over the validation set
    validation_err = 0
    validation_acc = 0
    validation_batches = 0
    for batch in iterate_minibatches(validation_data, validation_labels, BATCH_SIZE):
        inputs, targets = batch
        err, acc = val_fn(inputs, targets)
        validation_err += err
        validation_acc += acc
        validation_batches += 1
        
    print (",", validation_acc / validation_batches )
    if (validation_acc / validation_batches) > .89:
        break
total_time = time.time() - start
print ("Time Spent:", total_time)

(None, 3, 128, 128)
(None, 32, 120, 120)
(None, 32, 60, 60)
(None, 64, 56, 56)
(None, 64, 28, 28)
(None, 128, 26, 26)
(None, 128, 13, 13)
(None, 169)
(None, 2)
training error, validation accuracy
, 0.665121955116
, 0.760731708713
, 0.793170732696
, 0.791219512137
, 0.811463412715
, 0.840731706561
, 0.850731708655
, 0.86073170784
, 0.861707322481
, 0.862926830606
, 0.87170731876
, 0.873170727637
, 0.881707313584
, 0.87804877758
, 0.882682926771
, 0.88048780546
, 0.890487806099
Time Spent: 1044.934053659439


In [46]:
# Now we want to save the model. We could run it off of this machine but this machine costs .50 cents an hours
# and we don't need a machine this powerful aside from training. We'll save the model here and then we can
# port it to a less powerful machine.
np.savez('128_4model.npz', *lasagne.layers.get_all_param_values(network))