In [1]:
'''This script goes along the blog post
"Building powerful image classification models using very little data"
from blog.keras.io.
It uses data that can be downloaded at:
https://www.kaggle.com/c/dogs-vs-cats/data
In our setup, we:
- created a data/ folder
- created train/ and validation/ subfolders inside data/
- created cats/ and dogs/ subfolders inside train/ and validation/
- put the cat pictures index 0-999 in data/train/cats
- put the cat pictures index 1000-1400 in data/validation/cats
- put the dogs pictures index 12500-13499 in data/train/dogs
- put the dog pictures index 13500-13900 in data/validation/dogs
So that we have 1000 training examples for each class, and 400 validation examples for each class.
In summary, this is our directory structure:
```
data/
    train/
        dogs/
            dog001.jpg
            dog002.jpg
            ...
        cats/
            cat001.jpg
            cat002.jpg
            ...
    validation/
        dogs/
            dog001.jpg
            dog002.jpg
            ...
        cats/
            cat001.jpg
            cat002.jpg
            ...
```
'''
import progressbar
progressbar.streams.flush()
DATA_DIR = '../keras-bottleneck-features'

In [None]:
bar = progressbar.ProgressBar()
start = 1
stop = 1001
bar.start(stop-start)
for x in range(start, stop):
    bar.update(x-start)
    !cp {DATA_DIR}/train/cat.{x}.jpg data/train/cats
bar.finish()

In [16]:
bar = progressbar.ProgressBar()
start = 1001
stop = 1401
bar.start(stop-start)
for x in range(start, stop):
    bar.update(x-start)
    !cp {DATA_DIR}/train/cat.{x}.jpg data/validation/cats
bar.finish()

100% (400 of 400) |#######################| Elapsed Time: 0:00:50 Time: 0:00:50


In [3]:
bar = progressbar.ProgressBar()
start = 1250 # downloaded filenames differed from the original notebook instructions
stop = 2250
bar.start(stop-start)
for x in range(start, stop):
    bar.update(x-start)
    !cp {DATA_DIR}/train/dog.{x}.jpg data/train/dogs
bar.finish()

100% (1000 of 1000) |#####################| Elapsed Time: 0:02:07 Time: 0:02:07


In [2]:
bar = progressbar.ProgressBar()
start = 2250 # downloaded filenames differed from the original notebook instructions
stop = 2650
bar.start(stop-start)
for x in range(start, stop):
    bar.update(x-start)
    !cp {DATA_DIR}/train/dog.{x}.jpg data/validation/dogs
bar.finish()

100% (400 of 400) |#######################| Elapsed Time: 0:00:51 Time: 0:00:51


In [4]:
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras import applications

# dimensions of our images.
img_width, img_height = 150, 150

top_model_weights_path = 'bottleneck_fc_model.h5'
train_data_dir = 'data/train'
validation_data_dir = 'data/validation'
nb_train_samples = 2000
nb_validation_samples = 800
epochs = 50
batch_size = 16


def save_bottlebeck_features():
    datagen = ImageDataGenerator(rescale=1. / 255)

    # build the VGG16 network
    model = applications.VGG16(include_top=False, weights='imagenet')

    generator = datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)
    bottleneck_features_train = model.predict_generator(
        generator, nb_train_samples // batch_size)
    np.save('bottleneck_features_train.npy', bottleneck_features_train)

    generator = datagen.flow_from_directory(
        validation_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode=None,
        shuffle=False)
    bottleneck_features_validation = model.predict_generator(
        generator, nb_validation_samples)
    np.save('bottleneck_features_validation.npy', bottleneck_features_validation)


def train_top_model():
    train_data = np.load(open('bottleneck_features_train.npy'))
    train_labels = np.array(
        [0] * (nb_train_samples / 2) + [1] * (nb_train_samples / 2))

    validation_data = np.load(open('bottleneck_features_validation.npy'))
    validation_labels = np.array(
        [0] * (nb_validation_samples / 2) + [1] * (nb_validation_samples / 2))

    model = Sequential()
    model.add(Flatten(input_shape=train_data.shape[1:]))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(train_data, train_labels,
              epochs=epochs,
              batch_size=batch_size,
              validation_data=(validation_data, validation_labels))
    model.save_weights(top_model_weights_path)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
save_bottlebeck_features()

Found 2000 images belonging to 2 classes.
Found 800 images belonging to 2 classes.


KeyboardInterrupt: 

In [None]:
train_top_model()