In [2]:
import os
from sklearn.model_selection import train_test_split
import pandas as pd

def prepare_dataframe(folder_dir, test_size=0.2, skip_classes=None):
    #https://stackoverflow.com/questions/42654961/creating-pandas-dataframe-from-os
    res = []
    
    for root, dirs, files in os.walk(folder_dir, topdown=True):
        if len(files) > 0:
            res.extend(list(zip([root]*len(files), files)))

    df = pd.DataFrame(res, columns=['Path', 'File_Name'])

    df = df[df['File_Name'] != 'Thumbs.db']
    df['ClientId'] = df.Path.apply(lambda x: int(x.split("\\")[-1]))
    df = df[df['ClientId'] < 10000]

    df['Full_Path'] = df["Path"] + '\\' + df["File_Name"]
    df['Cat'] = df.File_Name.apply(lambda x: x.split(".")[0].split("_")[-1])

    df = df[df['Cat'].map(df['Cat'].value_counts()) > 1]

    if skip_classes != None:
        df = df[df.Cat.notin(skip_classes)]

    df_train, df_test = train_test_split(df, test_size=0.2, random_state=1, stratify=df.Cat.values)

    return df_train, df_test


In [3]:
df_train, df_test = prepare_dataframe(folder_dir = 'C:\\Users\\Michael\\Feels Like Home\\Francisco Cruz - Fotos_DL\\011_Fotos\\')

## Using VGG19 for feature extraction
Freeze the convolutional base and only train the densely connected network.

In [1]:
from keras.applications import VGG19

conv_base = VGG19(weights="imagenet", include_top=False, input_shape=(150, 150, 3))
conv_base.summary()

Using TensorFlow backend.
Model: "vgg19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150, 150, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 150, 150, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 150, 150, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 75, 75, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 75, 75, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 75, 75, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None,

In [2]:
conv_base.trainable

True

In [9]:
# 1. Option 
# Run the conv_base once with all the images to obtain the features, save the results as a numpy array and then use it as the input to a standalone, densely conncected classifier
# Faster option

from keras.preprocessing.image import ImageDataGenerator
import numpy as np

datagen = ImageDataGenerator(rescale = 1./255)

# Function adapted from Deep Learning with Python (Francois Chollet, 2018)
def extract_features(df, sample_count, batch_size):
    features = np.zeros(shape=(sample_count, 4, 4, 512))
    labels = np.zeros(shape=(sample_count,6))
    generator = datagen.flow_from_dataframe(dataframe=df.iloc[0:sample_count,:], directory=None, x_col='Full_Path', y_col='Cat',
                                            target_size=(150, 150), batch_size=batch_size, class_mode='categorical')
    i=0
    for inputs_batch, labels_batch in generator: 
        print("Batch:", i)
        features_batch = conv_base.predict(inputs_batch)
        features[i * batch_size : (i + 1) * batch_size] = features_batch
        labels[i * batch_size : (i + 1) * batch_size] = labels_batch
        i += 1
        if i * batch_size >= sample_count:
            break
    return features, labels

In [10]:
# Extract features 

train_size = 10
train_features, train_labels = extract_features(df_train, train_size, 100)


Found 10 validated image filenames belonging to 6 classes.
Batch: 0


In [11]:
train_features.shape

(10, 4, 4, 512)

In [43]:
test_size = 2200
test_features, test_labels = extract_features(df_test, test_size, 100)

Found 2200 validated image filenames belonging to 14 classes.
Batch: 0
Batch: 1
Batch: 2
Batch: 3
Batch: 4
Batch: 5
Batch: 6
Batch: 7
Batch: 8
Batch: 9
Batch: 10
Batch: 11
Batch: 12
Batch: 13
Batch: 14
Batch: 15
Batch: 16
Batch: 17
Batch: 18
Batch: 19
Batch: 20
Batch: 21


In [51]:
# Save numpy arrays as .npy 
np.save("test_features.npy", test_features)
np.save("test_labels.npy", test_labels)
np.save("train_features.npy", train_features)
np.save("train_labels.npy", train_labels)

In [52]:
test_features = np.load("test_features.npy")
test_labels = np.load("test_labels.npy")
train_features = np.load("train_features.npy")
train_labels = np.load("train_labels.npy")

In [53]:
# Flatten
train_input = np.reshape(train_features, (train_size, 4 * 4 * 512))
test_input = np.reshape(test_features, (test_size, 4 * 4 * 512))

In [57]:
from keras.models import Sequential
from keras import layers
from keras import optimizers

model = Sequential()
model.add(layers.Dense(256, activation="relu", input_dim=4 * 4 * 512))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(256, activation="relu"))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(14, activation="softmax"))

model.compile(optimizer=optimizers.RMSprop(lr=2e-5),
              loss = 'categorical_crossentropy', metrics = ['accuracy'])
history = model.fit(train_input, train_labels, epochs=100, batch_size=20)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [59]:
scores = model.evaluate(test_input, test_labels)
print(model.metrics_names[1], scores[1]*100)

accuracy 68.77272725105286


## Using VGG19 for fine-tuning
Unfreeze some of the top layer of the convolutional base and train them with the already trained densely connected network.

In [39]:
# Step 1: Combine convolutional base with densely connected network

conv_base = VGG19(weights="imagenet", include_top=False, input_shape=(150, 150, 3))

model = Sequential()
model.add(conv_base)
model.add(layers.Flatten())
model.add(layers.Dense(256, activation="relu"))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(3, activation="softmax"))
model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg19 (Model)                (None, 4, 4, 512)         20024384  
_________________________________________________________________
flatten_4 (Flatten)          (None, 8192)              0         
_________________________________________________________________
dense_19 (Dense)             (None, 256)               2097408   
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 3)                 771       
Total params: 22,122,563
Trainable params: 22,122,563
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Step 2: Freeze the convolutional base and train only densely connected network
conv_base.trainable = False

def get_generator(df, sample_count, batch_size):
    return datagen.flow_from_dataframe(dataframe=df.iloc[0:sample_count,:], directory=None, x_col='Full_Path', y_col='Cat',
                                       target_size=(150, 150), batch_size=batch_size, class_mode='categorical')

train_generator = get_generator(df_train, 150, 10)
test_generator = get_generator(df_test, 20, 2)

model.compile(optimizer=optimizers.RMSprop(lr=2e-5),
              loss = 'categorical_crossentropy', metrics = ['accuracy'])

history = model.fit_generator(train_generator, steps_per_epoch=15, epochs=5)

Found 150 validated image filenames belonging to 3 classes.
Found 20 validated image filenames belonging to 3 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [42]:
# Check accuracy 
model.evaluate_generator(test_generator, steps=11)

[0.950821042060852, 0.6818181872367859]

In [43]:
# Step 3: Set trainable attribute of last n layers of convolutional base to True

n = 5
conv_base.trainable = False 
for layer in conv_base.layers[-n:]:
    layer.trainable = True

model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg19 (Model)                (None, 4, 4, 512)         20024384  
_________________________________________________________________
flatten_4 (Flatten)          (None, 8192)              0         
_________________________________________________________________
dense_19 (Dense)             (None, 256)               2097408   
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 3)                 771       
Total params: 22,122,563
Trainable params: 2,098,179
Non-trainable params: 20,024,384
_________________________________________________________________


In [44]:
# Step 4: Train last layers of convolutional base jointly with densely connected layer 

def get_generator(df, sample_count, batch_size):
    return datagen.flow_from_dataframe(dataframe=df.iloc[0:sample_count,:], directory=None, x_col='Full_Path', y_col='Cat',
                                       target_size=(150, 150), batch_size=batch_size, class_mode='categorical')

train_generator = get_generator(df_train, 150, 10)
test_generator = get_generator(df_test, 20, 2)

model.compile(optimizer=optimizers.RMSprop(lr=2e-5),
              loss = 'categorical_crossentropy', metrics = ['accuracy'])

history = model.fit_generator(train_generator, steps_per_epoch=15, epochs=5)

Found 150 validated image filenames belonging to 3 classes.
Found 20 validated image filenames belonging to 3 classes.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [45]:
# Check accuracy 
model.evaluate_generator(test_generator, steps=11)

[0.571797251701355, 0.8636363744735718]