In [1]:
'''This script accompanies the blog post
"Building powerful image classification models using very little data"
from blog.keras.io.
It uses data that can be downloaded at:
https://www.kaggle.com/c/dogs-vs-cats/data
In our setup, we:
- created a data/ folder
- created train/ and validation/ subfolders inside data/
- created cats/ and dogs/ subfolders inside train/ and validation/
- put the cat pictures index 0-999 in data/train/cats
- put the cat pictures index 1000-1400 in data/validation/cats
- put the dog pictures index 12500-13499 in data/train/dogs
- put the dog pictures index 13500-13900 in data/validation/dogs
So that we have 1000 training examples for each class, and 400 validation 
examples for each class.
In summary, this is our directory structure:
```
data/
    train/
        dogs/
            dog001.jpg
            dog002.jpg
            ...
        cats/
            cat001.jpg
            cat002.jpg
            ...
    validation/
        dogs/
            dog001.jpg
            dog002.jpg
            ...
        cats/
            cat001.jpg
            cat002.jpg
            ...
```
'''
import numpy as np
import os
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras import applications
import os.path


# Directories handed to ImageDataGenerator.flow_from_directory.
train_data_dir = '../data/train'
validation_data_dir = '../data/validation'

# File-name templates; the '{}' placeholder is filled with the experiment
# name through str.format().
weights_path = '{}_top_model.h5'
train_features_path = '{}_train_features.npy'
train_labels_path = '{}_train_labels.npy'
validation_features_path = '{}_validation_features.npy'
validation_labels_path = '{}_validation_labels.npy'

# TODO: set properly
# Target image size used when loading pictures.
width = 200
height = 200
# Number of images consumed from each directory (in batches of batch_size).
train_samples = 1152
validation_samples = 288
# Number of output classes of the classifier.
categories = 21
# Shared by feature extraction and by top-model training.
batch_size = 4
# Training passes over the cached features.
epochs = 20

Using TensorFlow backend.


In [9]:
def is_train_data_generated(name):
    """Tell whether both cached training arrays for experiment `name` exist.

    Returns True only if the feature file and the label file (built from
    `train_features_path` and `train_labels_path`) are both regular files.
    """
    templates = (train_features_path, train_labels_path)
    return all(os.path.isfile(template.format(name)) for template in templates)

def generate_train_data(name, model):
    """Run the training images through `model` and cache the results on disk.

    Images are read from `train_data_dir` without shuffling, rescaled by
    1/255, loaded at `target_size=(width, height)` and passed to
    `model.predict` in batches of `batch_size`.  The stacked predictions and
    the categorical labels yielded by the generator are saved to the files
    named by `train_features_path` / `train_labels_path` formatted with
    `name`.

    Exactly `train_samples // batch_size` batches are pulled from the
    generator; images beyond that count are not processed.

    Args:
        name: experiment name substituted into the cache-file templates.
        model: object exposing `predict(batch)`; its outputs are what gets
            cached as "features".

    Raises:
        ValueError: if `train_samples // batch_size` is zero, i.e. there
            would be nothing to save.
    """
    import sys  # local import: only needed for the progress output below

    print('generating train data')

    rounds = train_samples // batch_size
    if rounds < 1:
        raise ValueError(
            'train_samples ({}) must be at least batch_size ({})'.format(
                train_samples, batch_size))

    naive_datagen = ImageDataGenerator(rescale=1. / 255)

    dataflow = naive_datagen.flow_from_directory(train_data_dir,
                                                 batch_size=batch_size,
                                                 class_mode='categorical',
                                                 target_size=(width, height),
                                                 shuffle=False)

    print('running {} rounds'.format(rounds))

    # Collect per-batch arrays and concatenate once at the end; growing an
    # array with np.append inside the loop copies everything on every round.
    feature_batches = []
    label_batches = []
    for i in range(rounds):
        # Progress indicator: a counter every 50 rounds, a dot otherwise.
        if i % 50 == 0:
            sys.stdout.write('\n{} / {} .'.format(i, rounds))
        else:
            sys.stdout.write(' .')
        sys.stdout.flush()

        batch = next(dataflow)  # (images, labels)
        feature_batches.append(model.predict(batch[0]))
        label_batches.append(batch[1])
    sys.stdout.write('\n')

    features = np.concatenate(feature_batches, axis=0)
    labels = np.concatenate(label_batches, axis=0)

    # Pass the path itself: np.save then opens the file in binary mode and
    # closes it, instead of relying on a leaked text-mode handle.
    np.save(train_features_path.format(name), features)
    np.save(train_labels_path.format(name), labels)




In [10]:
def is_validation_data_generated(name):
    """Tell whether both cached validation arrays for experiment `name` exist.

    Returns True only if the feature file and the label file (built from
    `validation_features_path` and `validation_labels_path`) are both
    regular files.
    """
    templates = (validation_features_path, validation_labels_path)
    return all(os.path.isfile(template.format(name)) for template in templates)

def generate_validation_data(name, model):
    """Run the validation images through `model` and cache the results on disk.

    Images are read from `validation_data_dir` without shuffling, rescaled
    by 1/255, loaded at `target_size=(width, height)` and passed to
    `model.predict` in batches of `batch_size`.  The stacked predictions and
    the categorical labels yielded by the generator are saved to the files
    named by `validation_features_path` / `validation_labels_path` formatted
    with `name`.

    Exactly `validation_samples // batch_size` batches are pulled from the
    generator; images beyond that count are not processed.

    Args:
        name: experiment name substituted into the cache-file templates.
        model: object exposing `predict(batch)`; its outputs are what gets
            cached as "features".

    Raises:
        ValueError: if `validation_samples // batch_size` is zero, i.e.
            there would be nothing to save.
    """
    import sys  # local import: only needed for the progress output below

    print('generating validation data')

    rounds = validation_samples // batch_size
    if rounds < 1:
        raise ValueError(
            'validation_samples ({}) must be at least batch_size ({})'.format(
                validation_samples, batch_size))

    naive_datagen = ImageDataGenerator(rescale=1. / 255)

    dataflow = naive_datagen.flow_from_directory(validation_data_dir,
                                                 batch_size=batch_size,
                                                 class_mode='categorical',
                                                 target_size=(width, height),
                                                 shuffle=False)

    print('running {} rounds'.format(rounds))

    # Collect per-batch arrays and concatenate once at the end; growing an
    # array with np.append inside the loop copies everything on every round.
    feature_batches = []
    label_batches = []
    for i in range(rounds):
        # Progress indicator: a counter every 50 rounds, a dot otherwise.
        if i % 50 == 0:
            sys.stdout.write('\n{} / {} .'.format(i, rounds))
        else:
            sys.stdout.write(' .')
        sys.stdout.flush()

        batch = next(dataflow)  # (images, labels)
        feature_batches.append(model.predict(batch[0]))
        label_batches.append(batch[1])
    sys.stdout.write('\n')

    features = np.concatenate(feature_batches, axis=0)
    labels = np.concatenate(label_batches, axis=0)

    # Pass the path itself: np.save then opens the file in binary mode and
    # closes it, instead of relying on a leaked text-mode handle.
    np.save(validation_features_path.format(name), features)
    np.save(validation_labels_path.format(name), labels)

In [None]:
def do_VGG16_exp1():
    """Train a small dense classifier on cached VGG16 outputs.

    Steps:
      1. If the cached train and/or validation arrays for this experiment
         are missing, build VGG16 (ImageNet weights, `include_top=False`)
         and generate them.
      2. Load the four cached arrays and print their shapes.
      3. Fit Flatten -> Dense(256, relu) -> Dropout(0.5) ->
         Dense(categories, softmax) with rmsprop and categorical
         cross-entropy, validating on the cached validation arrays.
      4. Save the trained weights to `weights_path` formatted with the
         experiment name.
    """
    name = 'VGG16_exp1'

    need_train = not is_train_data_generated(name)
    need_validation = not is_validation_data_generated(name)
    if need_train or need_validation:
        # Build the network once and share it between both passes instead
        # of instantiating it separately for each missing cache.
        base_model = applications.VGG16(include_top=False, weights='imagenet')
        if need_train:
            generate_train_data(name, base_model)
        if need_validation:
            generate_validation_data(name, base_model)

    print('training the top model')

    # Load by path so numpy opens the files in binary mode and closes them.
    train_features = np.load(train_features_path.format(name))
    train_labels = np.load(train_labels_path.format(name))
    validation_features = np.load(validation_features_path.format(name))
    validation_labels = np.load(validation_labels_path.format(name))

    print(train_features.shape)
    print(train_labels.shape)
    print(validation_features.shape)
    print(validation_labels.shape)

    model = Sequential()
    # Drop the leading sample axis; the rest is the per-sample input shape.
    model.add(Flatten(input_shape=train_features.shape[1:]))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    # Output width follows the configured class count rather than a literal.
    model.add(Dense(categories, activation='softmax'))

    model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'])

    # NOTE(review): `nb_epoch` is kept as in the original; newer Keras
    # releases spell this argument `epochs` - confirm the installed version
    # before renaming.
    model.fit(train_features,
              train_labels,
              batch_size=batch_size,
              nb_epoch=epochs,
              validation_data=(validation_features, validation_labels))
    model.save_weights(weights_path.format(name))

# Run the experiment: generate any missing cached features, then train the
# top model and save its weights.
do_VGG16_exp1()

# # TODO: review this to add checkpoints
# # http://stackoverflow.com/questions/35074549/how-to-load-a-model-from-an-hdf5-file-in-keras

generating validation data
Found 288 images belonging to 21 classes.
running 72 rounds

0 / 72 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
50 / 72 . . . . .