# A Cat/Dog classifier

Here we will try to train a cat/dog classifier with images of higher resolution than we got from the CIFAR-10 dataset.

In [None]:
# numpy and plotting
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# keras
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras.preprocessing.image import ImageDataGenerator

## Data
First we load some prepared images of cats and dogs and split the data into training and test sets and make sure we have equal numbers of cats and dogs.

In [None]:
DATAFILE="../../data/catsDogsImageNet.npz"

# Read both arrays out of the archive once and reuse them below.
with np.load(DATAFILE) as datafile:
    all_images = datafile["data"]
    all_labels = datafile["label"]

# Label 0 marks a cat, label 1 marks a dog.
cat_idx = np.where(all_labels == 0)[0]
dog_idx = np.where(all_labels == 1)[0]
ncats = cat_idx.size
ndogs = dog_idx.size
neach = min(ncats, ndogs)
print(all_images.shape[0], " images in total")
print(ncats, " cats, ", ndogs, " dogs; using ", neach, " of each")

# Balance the classes: the first `neach` cats followed by the first `neach`
# dogs. Selecting by label keeps images and labels aligned regardless of the
# order in which the archive stores the two classes.
data = np.concatenate((all_images[cat_idx[:neach]], all_images[dog_idx[:neach]]))
del all_images, all_labels
data = data /255.
dataAverage = np.average(data, axis=0)
#data -= dataAverage
labels = np.concatenate((np.zeros(neach), np.ones(neach)))

plt.imshow(data[0])
plt.title("a cat")
plt.show()
# In the balanced array the dogs start right after the `neach` cats
# (index `ncats` refers to the unbalanced archive and may be out of range).
plt.imshow(data[neach])
plt.title("a dog")
plt.show()

# Shuffle, then use 70% of the images for training and the rest for testing.
selection = np.random.permutation(data.shape[0])
split = int(len(selection)*.7)
x_train = data[selection[:split]]
y_train = labels[selection[:split]]
x_test = data[selection[split:]]
y_test = labels[selection[split:]]
del labels

# training data is augmented
train_datagen = ImageDataGenerator(
    shear_range=0.2,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True)

batch_size=32
input_shape=x_train.shape[1:]



In this case we only have ~4500 images, and these contain more potentially useless information as they are larger. To mitigate this to an extent we use data augmentation from the start.

The `.flow()` command below generates batches of randomly transformed images and it will loop indefinitely, so we need to `break` the loop at some point:

In [None]:
# Endless iterator over randomly augmented (images, labels) training batches.
train_generator = train_datagen.flow(
    x_train,
    y_train,
    batch_size=batch_size,
)

In [None]:
%matplotlib inline

# Draw a single augmented batch and show each of its images in its own figure.
for batch in train_generator:
    images = batch[0]
    for figure_number, image in enumerate(images):
        plt.figure(figure_number)
        imgplot = plt.imshow(image)
    # the generator never stops by itself, so leave after the first batch
    break
plt.show()

The next cell defines helper functions which we will use later to display the predictions of our networks.

In [None]:
def predict(idx, model, avg=None, transform = lambda x : x/255., cols=3, file=DATAFILE, data=None, threshold=.4):
    """Plot the selected images together with the model's cat/dog verdict.

    Parameters:
        idx: a single index or an iterable of indices selecting the images.
        model: object whose ``predict`` method returns one score per image.
        avg: optional array that is added to every image before it is shown.
        transform: callable applied to the selected images before they are
            passed to ``model.predict``; ``None`` means "use them as they are".
        cols: number of images shown per row.
        file: ``.npz`` archive whose ``"data"`` entry is read when ``data``
            is ``None``.
        data: array to take the images from instead of reading ``file``.
        threshold: scores below ``threshold`` count as "cat", scores above
            ``1 - threshold`` as "dog", everything in between as "not sure".

    Returns nothing; the figures are shown and a summary line is printed.
    """
    try:
        idx = list(idx)
    except TypeError:
        # a single, non-iterable index was passed
        idx = [idx]

    if not idx:
        # Nothing selected: skip the prediction and the division by len(idx).
        print(0, " cats (", 0., "%),", 0, " dogs (", 0., "%)")
        return

    if data is None:
        with np.load(file) as datafile:
            data = datafile["data"][idx]
    else:
        data = data[idx]

    if transform is None:
        transform = lambda x : x
    p = model.predict(transform(data))

    cats = 0
    dogs = 0
    total = p.shape[0]
    for row_start in range(0, total, cols):
        # squeeze=False always yields a 2-D array of axes, also for cols == 1
        _, axs = plt.subplots(1, cols, figsize=(8*cols, 8), squeeze=False)
        for i, ax in enumerate(axs[0], start=row_start):
            if i >= total:
                # no image left for this slot of the last row
                ax.axis("off")
                continue

            img = data[i] if avg is None else data[i] + avg
            ax.imshow(img)

            if p[i] < threshold:
                label = "cat"
                cats += 1
            elif p[i] > 1-threshold:
                label = "dog"
                dogs += 1
            else:
                label = "not sure"
            ax.text(.5,0, label+ "; score = " + str(p[i]),
                    horizontalalignment='center', verticalalignment='bottom', transform=ax.axes.transAxes,
                    backgroundcolor="white", size="large")
        plt.show()
    print(cats, " cats (", cats/len(idx)*100., "%),", dogs, " dogs (", dogs/len(idx)*100., "%)")
    
def predictCategorical(idx, model, decode=lambda x : str(x), preproc=lambda x : x/255.
                       , cols=3, file=DATAFILE, data=None):
    """Plot the selected images together with the decoded model output.

    Parameters:
        idx: a single index or an iterable of indices selecting the images.
        model: object whose ``predict`` method returns one row per image.
        decode: callable turning a one-row slice of the prediction into the
            text written onto the image.
        preproc: callable applied to the selected images before they are
            passed to ``model.predict``; ``None`` means "use them as they are".
        cols: number of images shown per row.
        file: ``.npz`` archive whose ``"data"`` entry is read when ``data``
            is ``None``.
        data: array to take the images from instead of reading ``file``.

    Returns nothing; the figures are shown.
    """
    try:
        idx = list(idx)
    except TypeError:
        # a single, non-iterable index was passed
        idx = [idx]

    if not idx:
        # nothing selected, nothing to predict or plot
        return

    if data is None:
        with np.load(file) as datafile:
            data = datafile["data"][idx]
    else:
        data = data[idx]

    p = model.predict(data if preproc is None else preproc(data))

    total = p.shape[0]
    for row_start in range(0, total, cols):
        # squeeze=False always yields a 2-D array of axes, also for cols == 1
        _, axs = plt.subplots(1, cols, figsize=(8*cols, 8), squeeze=False)
        for i, ax in enumerate(axs[0], start=row_start):
            if i >= total:
                # no image left for this slot of the last row
                ax.axis("off")
                continue
            ax.imshow(data[i])
            ax.text(.5,0, decode(p[i:i+1]),
                    horizontalalignment='center', verticalalignment='bottom', transform=ax.axes.transAxes,
                    backgroundcolor="white", size="large")
        plt.show()

### A Rather Simple Network

Next, we define and train a rather simple network to tell cats from dogs. We used this one before on the CIFAR-10 data and it got us to about 80% accuracy there.

In [None]:
# A small convolutional network: 2x2 max-pooling directly on the input,
# followed by four convolutional stages and a dense classifier that ends in a
# single sigmoid unit.
model = Sequential()
model.add(MaxPooling2D(pool_size=(2, 2), input_shape=input_shape))

# stage 1
model.add(Conv2D(48, kernel_size=(5, 5),
                 activation='relu', padding="same"))
model.add(Conv2D(32, (5, 5), activation='relu', padding="same"))
model.add(Conv2D(32, (3, 3), activation='relu', padding="same"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(BatchNormalization())

# stage 2
model.add(Conv2D(96, (5, 5), activation='relu', padding="same"))
model.add(Conv2D(64, (3, 3), activation='relu', padding="same"))
model.add(Conv2D(64, (3, 3), activation='relu', padding="same"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(BatchNormalization())

# stage 3
model.add(Conv2D(160, (3, 3), activation='relu', padding="same"))
model.add(Conv2D(128, (3, 3), activation='relu', padding="same"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(BatchNormalization())

# stage 4 and the dense classifier
model.add(Conv2D(256, (3, 3), activation='relu', padding="same"))
model.add(Flatten())
model.add(Dropout(.3))
model.add(Dense(512, activation='relu'))
model.add(Dropout(.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

# Use the optimizer class itself; the lowercase `adam` alias is not available
# in every Keras release.
opt = keras.optimizers.Adam(lr=0.0001)

model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# model.summary()

In [None]:
# Train on augmented batches; the untouched test split is used for validation.
hist = model.fit_generator(
    train_generator,
    validation_data=(x_test, y_test),
    epochs=10,
    steps_per_epoch=4000 // batch_size,
)

# model.save_weights('weights.h5')
# model.load_weights('weights.h5')

After ten epochs we see a validation accuracy a bit over 70%, and a few more epochs may get this up to ~80% again.

Now, let us have a look at the results this network gives on a few images and the corresponding predictions.

In [None]:
# some arbitrary selection from the data: with no `data` argument, predict()
# reads these indices from DATAFILE and applies its default transform (x/255.)
predict([1,2000,3000,4000,4300,4510], model)

This network is not very good, but decent, given that some of the training images are rather complex.

In [None]:
# some of the training images; x_train was already divided by 255 when the
# data was loaded, hence transform=None
predict(range(9), model, data=x_train, transform=None)

In [None]:
# some of the validation images; x_test was already divided by 255 when the
# data was loaded, hence transform=None
predict(range(9), model, data=x_test, transform=None)

## Using a Pretrained Network

We will now try to use a pre-trained network for our purposes. The network we will try is the inception_v3 net trained on the ImageNet Dataset. This dataset contains images from 1000 categories.

In [None]:
from keras.applications import inception_v3, VGG16

In [None]:
# Full InceptionV3 with its default arguments, i.e. including the ImageNet
# classification head.
inception = inception_v3.InceptionV3()

# The ImageNet weights expect inputs scaled by inception_v3.preprocess_input
# rather than plain x/255. The float copy keeps the raw images, which
# predictCategorical() also displays, untouched.
# NOTE(review): the default InceptionV3 input size is 299x299 -- confirm that
# the images in DATAFILE match it.
predictCategorical(
    [1,2000,3000,4000,4300,4510],
    model=inception,
    decode=lambda x: inception_v3.decode_predictions(x, top=1),
    preproc=lambda x: inception_v3.preprocess_input(x.astype("float32")),
)

# Transfer Learning

Recent winners of the ImageNet competitions are very accurate. Can we adapt such a network so that it does only this one task and focuses on cats and dogs only?

One way in which they are better than our small network is that they are able to generalize concepts better. The deep networks have learned to extract features from images which make up the things they know. By replacing the top layer of the network by one we train ourselves we can make use of the pre-trained network's ability to extract features to distinguish only the classes we want it to:

## Using Bottleneck Features

We will use the VGG16 architecture, pre-trained on the ImageNet dataset ---a model we will learn about later. Because the ImageNet dataset contains several "cat" classes (persian cat, siamese cat, ...) and many "dog" classes among its total of 1000 classes, this model will already have learned features that are relevant to our classification problem. In fact, it is possible that merely recording the softmax predictions of the model over our data rather than the bottleneck features would be enough to solve our dogs vs. cats classification problem extremely well. However, the method we present here is more likely to generalize well to a broader range of problems, including problems featuring classes absent from ImageNet.

Here's what the VGG16 architecture looks like:

![](../../images/vgg16_original.png)

Our strategy will be as follows: we will only instantiate the convolutional part of the model, everything up to the fully-connected layers. We will then run this model on our training and validation data once, recording the output (the "bottleneck features" from the VGG16 model: the last activation maps before the fully-connected layers) as numpy arrays. Then we will train a small fully-connected model on top of these stored features.

The reason why we are storing the features offline rather than adding our fully-connected model directly on top of a frozen convolutional base and running the whole thing, is computational efficiency. Running VGG16 is expensive, especially if you're working on CPU, and we want to only do it once. Note that this prevents us from using data augmentation though.

In [None]:
# Convolutional base of VGG16 only (no dense top), with ImageNet weights.
vgg16 = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

In [None]:
# number of augmented training batches pushed through the convolutional base
n = 2000 // batch_size

x_bottleneck_features_train = []
y_bottleneck_features_train = []
for batches_done, (x_batch, y_batch) in enumerate(train_generator, start=1):
    # keep the VGG16 feature maps together with the labels of the same batch
    x_bottleneck_features_train.append(vgg16.predict(x_batch))
    y_bottleneck_features_train.append(y_batch)
    if batches_done >= n:
        # the generator is endless, so stop once enough batches are recorded
        break
x_bottleneck_features_train = np.concatenate(x_bottleneck_features_train)
y_bottleneck_features_train = np.concatenate(y_bottleneck_features_train)

# the test images are not augmented and can be converted in a single call
x_bottleneck_features_test = vgg16.predict(x_test)

In [None]:
# Small fully-connected classifier that is trained on the stored bottleneck
# features and ends in a single sigmoid unit.
model = Sequential([
    Flatten(input_shape=x_bottleneck_features_train.shape[1:]),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Train the classifier on the recorded features; validate on the features of
# the test images.
model.fit(
    x_bottleneck_features_train,
    y_bottleneck_features_train,
    validation_data=(x_bottleneck_features_test, y_test),
    batch_size=batch_size,
    epochs=10,
)

We reach a validation accuracy of 0.90-0.91: not bad at all. This is definitely partly due to the fact that the base model was trained on a dataset that already featured dogs and cats (among hundreds of other classes).

We also save the weights of this model as we will use it for our next method!

In [None]:
# Store the classifier weights in the current working directory; the
# fine-tuning section loads this file again.
model.save_weights('bottleneck_fc_model.h5')

## Fine-tuning

To further improve our previous result, we can try to "fine-tune" the last convolutional block of the VGG16 model alongside the top-level classifier. Fine-tuning consists in starting from a trained network, then re-training it on a new dataset using very small weight updates. In our case, this can be done in 3 steps:

*  instantiate the convolutional base of VGG16 and load its weights
* add our previously defined fully-connected model on top, and load its weights
* freeze the layers of the VGG16 model up to the last convolutional block

![](../../images/vgg16_modified.png)

Note that:
* in order to perform fine-tuning, all layers should start with properly trained weights: for instance you should not slap a randomly initialized fully-connected network on top of a pre-trained convolutional base. This is because the large gradient updates triggered by the randomly initialized weights would wreck the learned weights in the convolutional base. In our case this is why we first train the top-level classifier, and only then start fine-tuning convolutional weights alongside it.
* we choose to only fine-tune the last convolutional block rather than the entire network in order to prevent overfitting, since the entire network would have a very large entropic capacity and thus a strong tendency to overfit. The features learned by low-level convolutional blocks are more general, less abstract than those found higher-up, so it is sensible to keep the first few blocks fixed (more general features) and only fine-tune the last one (more specialized features).
* fine-tuning should be done with a very slow learning rate, and typically with the SGD optimizer rather than an adaptive learning rate optimizer such as RMSProp. This is to make sure that the magnitude of the updates stays very small, so as not to wreck the previously learned features.

In [None]:
# was already done above if you ran all cells
# vgg16 = VGG16(include_top=False, weights='imagenet',input_shape=(224,224,3))

After instantiating the VGG base and loading its weights, we add our previously trained fully-connected classifier on top:

In [None]:
# Same layer stack as the bottleneck classifier, sized to the VGG16 output so
# that the saved weights fit.
top_model = Sequential([
    Flatten(input_shape=vgg16.output_shape[1:]),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

To do fine-tuning it is necessary to start from a fully trained network, including the top classifier. Thus we now load the weights we saved in the bottleneck method:

In [None]:
# Restore the classifier weights written by the bottleneck-features section.
top_model.load_weights('bottleneck_fc_model.h5')

Now we can add the model on top of the VGG16 convolutional base:

In [None]:
from keras.models import Model

# One end-to-end model: VGG16 input -> convolutional base -> trained classifier.
model = Model(
    inputs=vgg16.input,
    outputs=top_model(vgg16.output),
)

We then proceed to freeze all convolutional layers up to the last convolutional block:

In [None]:
# set the first 15 layers of the combined model to non-trainable
# (their weights will not be updated)
# NOTE(review): the intent is "everything up to the last conv block"; for the
# Keras VGG16 base that should be the input layer plus conv blocks 1-4 --
# confirm with model.summary(). This must run before model.compile().
for layer in model.layers[:15]:
    layer.trainable = False

In [None]:
from keras import optimizers

# SGD with momentum and a very small learning rate keeps the weight updates
# small during fine-tuning.
model.compile(
    optimizer=optimizers.SGD(lr=1e-4, momentum=0.9),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Finally, we start training the whole thing, with a very slow learning rate:

In [None]:
# fine-tune the model on augmented batches, validating on the test split
model.fit_generator(
    train_generator,
    validation_data=(x_test, y_test),
    epochs=10,
    steps_per_epoch=2000 // batch_size,
)

This approach gets us to a validation accuracy of 0.94 after 10 epochs. Great success!

Here are a few more approaches you can try to get to above 0.95:
* more aggressive data augmentation
* more aggressive dropout
* use of L1 and L2 regularization (also known as "weight decay")
* fine-tuning one more convolutional block (alongside greater regularization)

## Outlook: Inception V3

We can also do the same thing with a more recent ImageNet winner: Inception V3, which is actually the one we looked at for the first classifications above.

For variety, we will only replace the top layer of this network by a single dense layer with one output (cat/dog). For brevity, we will not pre-calculate bottleneck features, but just freeze the whole network. This has the advantage that we can train on the whole augmented dataset.

In [None]:
# Pre-trained InceptionV3 without its classification head, sized to our images.
inception = inception_v3.InceptionV3(
    include_top=False,
    weights='imagenet',
    input_tensor=None,
    input_shape=input_shape,
    pooling=None,
)

# Freeze every pre-trained layer so that only the new head is trained.
for pretrained_layer in inception.layers:
    pretrained_layer.trainable = False

# Our own head: flatten the feature maps and map them to one sigmoid output.
x = Flatten()(inception.output)
predictions = Dense(1, activation='sigmoid')(x)
modelInception = Model(inputs=inception.input, outputs=predictions)

modelInception.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)


# modelInception.summary()

This model won the 2016 challenge. We will look at it, too, in more detail in a later session.

In [None]:
# train without augmentation, this shows more gradual improvement
# (validation_data is passed as a tuple, the documented form and the one used
# by the other training calls in this notebook)
modelInception.fit(x_train, y_train
                             , batch_size=batch_size
                             , validation_data=(x_test, y_test)
                             , epochs=8)

This model gives us ~98% accuracy after only 8 epochs and not much work.

Above we trained without data augmentation, just on our bare training data. With augmentation, the validation accuracy ends up at 99% after a few epochs, a value where we might consider our test dataset too small to give us an accurate measurement. The training accuracy remains smaller, probably because of the data augmentation: the randomly distorted training images are harder to classify than the unmodified validation images.

In [None]:
# consider resetting the model above
# (validation_data is passed as a tuple, the documented form and the one used
# by the other training calls in this notebook)
modelInception.fit_generator(train_generator
                             , steps_per_epoch=2000 // batch_size
                             , validation_data=(x_test, y_test)
                             , epochs=2)

In [None]:
# some arbitrary selection from the data: with no `data` argument, predict()
# reads these indices from DATAFILE and applies its default transform (x/255.)
predict([1,2000,3000,4000,4300,4510], modelInception)

In [None]:
# some of the training images; x_train was already divided by 255 when the
# data was loaded, hence transform=None
predict(range(9), modelInception, data=x_train, transform=None)

In [None]:
# some of the validation images; x_test was already divided by 255 when the
# data was loaded, hence transform=None
predict(range(9), modelInception, data=x_test, transform=None)

### Databias

When preparing our dataset we did not include all dog pictures. In fact, we happened to include only pictures of shepherds and huskies, while another breed was left out. See how the inception-based model views these now:

In [None]:
# images 4400-4419 of DATAFILE; predict() loads them from the file and applies
# its default transform (x/255.)
predict(range(4400, 4420), modelInception)

Let us tune the model a bit with a more diverse dataset.

Here we can also see another way to feed images into keras: directly from directories:

## Another Cats-Dogs Dataset

We will start from a setup of only 2000 training examples (1000 per class either a dog or a cat). The data is stored in the `data/cats-dogs` folder which has a `train` data directory and a `validation` data directory containing one subdirectory per image class, filled with `.jpg` images:

```
cats-dogs/
    train/
        dogs/
            dog.001.jpg
            dog.002.jpg
            ...
        cats/
            cat.001.jpg
            cat.002.jpg
            ...
    validation/
        dogs/
            dog.003.jpg
            dog.004.jpg
            ...
        cats/
            cat.003.jpg
            cat.004.jpg
            ...
```

The `validation` set consists of 400 additional samples from each class as validation data, to evaluate our models.

Note that `2000` samples are *very* few examples to learn from, for a classification problem that is far from simple. So this is a challenging machine learning problem, but it is also a realistic one: in a lot of real-world use cases, even small-scale data collection can be extremely expensive or sometimes near-impossible (e.g. in medical imaging). Being able to make the most out of very little data is a key skill of a competent data scientist.

In [None]:
import os
# Root of the directory-based dataset, relative to this notebook.
base_dir = os.path.join('..', '..', 'data', 'cats-dogs')

# One sub-directory per split ...
train_dir, validation_dir = (
    os.path.join(base_dir, split_name) for split_name in ('train', 'validation')
)

# ... and inside each split one sub-directory per class.
train_cats_dir, train_dogs_dir = (
    os.path.join(train_dir, class_name) for class_name in ('cats', 'dogs')
)
validation_cats_dir, validation_dogs_dir = (
    os.path.join(validation_dir, class_name) for class_name in ('cats', 'dogs')
)

The following generator will read the images from this directory tree and also normalize and augment them.

In [None]:
# Training images are rescaled to [0, 1] and randomly augmented ...
train_datagen2 = ImageDataGenerator(
    horizontal_flip=True,
    zoom_range=0.2,
    shear_range=0.2,
    rescale=1. / 255)

# ... validation images are only rescaled.
test_datagen2 = ImageDataGenerator(rescale=1. / 255)

# Options shared by both directory iterators: every image is resized to
# 224x224, and labels are binary because we use the binary_crossentropy loss.
_flow_options = dict(
    target_size=(224, 224),
    batch_size=batch_size,
    class_mode='binary')

train_generator2 = train_datagen2.flow_from_directory(train_dir, **_flow_options)

test_generator2 = test_datagen2.flow_from_directory(validation_dir, **_flow_options)

In [None]:
# consider resetting the model above
# Continue training on the directory-based dataset; validate on its own
# validation images.
modelInception.fit_generator(
    train_generator2,
    epochs=5,
    steps_per_epoch=2000 // batch_size,
    validation_data=test_generator2,
    validation_steps=800 // batch_size,
)

Now, we can have another look at that misunderstood breed:

In [None]:
# the same images 4400-4419 of DATAFILE as before, now scored by the model
# after its additional training on the directory-based dataset
predict(range(4400, 4420), modelInception)