### Plant seedlings classification with bottleneck features. 

See plant-seedlings-classification-transferLearning.ipynb

In [11]:
from os import listdir
from os import mkdir
from os import makedirs
import os
import shutil
from IPython.display import Image, display
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import vgg16, vgg19
from keras.layers import Dense, GlobalAveragePooling2D, Dropout, Flatten
from keras.models import Model
from keras.models import load_model
from keras import optimizers
import pandas as pd
from skimage import io
import numpy as np
import cv2

%matplotlib inline

In [12]:
# Feature extraction and training need a GPU, so the notebook is usually run on
# FloydHub; flip FLOYDHUB to False to use the local directory layout instead.
FLOYDHUB = True

# (output, train, validation, fake-test, test) locations for the chosen environment.
if FLOYDHUB:
    OUTPUT_DIR, TRAIN_DIR, VALIDATION_DIR, FAKE_TEST_DIR, TEST_DIR = (
        "/output/", "/input/train/", "/input/validation/", "/input/fake-test", "/input/test")
else:
    OUTPUT_DIR, TRAIN_DIR, VALIDATION_DIR, FAKE_TEST_DIR, TEST_DIR = (
        "/tmp/", "train/", "validation/", "fake-test/", "test/")

# Every image is resized to a 224x224 square before being fed to the network;
# the value was picked without any particular tuning.
IMAGE_WIDTH = IMAGE_HEIGHT = 224


In [13]:
# The 12 species labels; used as the column names of the prediction DataFrames
# and to derive the size of the softmax output layer.
# NOTE(review): the column order must match the class indices assigned by
# flow_from_directory (presumably the sorted sub-directory names) - confirm
# against the generators' `class_indices`.
CLASS_NAMES = [
    "Black-grass",
    "Charlock",
    "Cleavers",
    "Common Chickweed",
    "Common wheat",
    "Fat Hen",
    "Loose Silky-bent",
    "Maize",
    "Scentless Mayweed",
    "Shepherds Purse",
    "Small-flowered Cranesbill",
    "Sugar beet",
]

### Prepare prediction functions

In [14]:
# Load every file found in TEST_DIR, resize it to the network input size and
# stack the results into a single array `X` scaled to the [0, 1] range.
real_test_images = []
final_predictions = pd.DataFrame(columns=CLASS_NAMES)

# listdir() returns entries in arbitrary order; sorting makes the order of
# `image_files` (and therefore of the rows of `X`) reproducible between runs.
image_files = sorted(listdir(TEST_DIR))
i = 0  # stays 0 when the directory is empty
for i, image_file in enumerate(image_files, start=1):
    raw_image = io.imread(os.path.join(TEST_DIR, image_file))
    # cv2.resize takes its target size as (width, height).
    scaled_img = cv2.resize(raw_image, (IMAGE_WIDTH, IMAGE_HEIGHT), interpolation=cv2.INTER_CUBIC)
    real_test_images.append(scaled_img)
    if i % 100 == 0:
        print("Loaded", i, "images so far...")

# Build the array directly as float32: dividing a uint8 array by 255 would
# produce float64 and double the memory needed for the whole test set.
X = np.array(real_test_images, dtype=np.float32) / 255
print("Done!")


def predict_and_dump(model_to_use, X_to_use, image_files_to_use, file_name, class_names=None):
    """Predict a species for every image and write a Kaggle submission CSV.

    Parameters
    ----------
    model_to_use : object exposing ``predict(X, verbose=...)``
        Must return one row of per-class scores for each input sample.
    X_to_use : array-like
        Input passed unchanged to ``model_to_use.predict``.
    image_files_to_use : sequence of str
        File names, in the same order as the rows of ``X_to_use``.
    file_name : str
        Destination path of the CSV (columns ``file`` and ``species``).
    class_names : sequence of str, optional
        Label of each score column. Defaults to the module-level ``CLASS_NAMES``.

    Returns
    -------
    (kaggle_data, final_predictions) : tuple of pandas.DataFrame
        ``kaggle_data`` holds the file name and the highest-scoring class per
        image; ``final_predictions`` holds the raw scores, one column per class.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of file names.
    """
    if class_names is None:
        class_names = CLASS_NAMES

    results = model_to_use.predict(X_to_use, verbose=1)
    image_files_to_use = list(image_files_to_use)
    # Fail loudly instead of letting pandas silently truncate or NaN-fill the
    # "species" column when the two inputs are out of sync.
    if len(results) != len(image_files_to_use):
        raise ValueError(
            "Got %d predictions for %d image files" % (len(results), len(image_files_to_use)))

    final_predictions = pd.DataFrame(columns=class_names, data=results)
    kaggle_data = pd.DataFrame(
        {
            "file": image_files_to_use,
            # idxmax(axis=1) yields the column label (class name) with the top score.
            "species": final_predictions.idxmax(axis=1),
        },
        columns=["file", "species"],
    )
    kaggle_data.to_csv(file_name, index=False)
    return kaggle_data, final_predictions


Loaded 100 images so far...
Loaded 200 images so far...
Loaded 300 images so far...
Loaded 400 images so far...
Loaded 500 images so far...
Loaded 600 images so far...
Loaded 700 images so far...
Done!


In [15]:
batch_size = 16

# Pre-processing used while extracting bottleneck features: pixel values are
# rescaled to [0, 1] and no augmentation is applied (both flips are disabled),
# so every image maps to exactly one feature vector.
train_datagen = ImageDataGenerator(
        rescale=1./255,
        horizontal_flip=False,
        vertical_flip=False)

# Validation and fake-test images only get the same rescaling.
validation_datagen = ImageDataGenerator(rescale=1./255)
fake_test_datagen = ImageDataGenerator(rescale=1./255)

# Options shared by the three feature-extraction generators.
feature_flow_options = dict(
        # Keras expects target_size as (height, width).
        target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
        batch_size=batch_size,
        # Yield image batches only; the labels are recovered separately later on.
        class_mode=None,
        # It is very important NOT to shuffle: the features must come out in the
        # same order in which the labels are read afterwards.
        shuffle=False)

# Each generator reads the pictures found in the class sub-folders of its
# directory and yields batches of rescaled images indefinitely.
train_generator = train_datagen.flow_from_directory(TRAIN_DIR, **feature_flow_options)
validation_generator = validation_datagen.flow_from_directory(VALIDATION_DIR, **feature_flow_options)
fake_test_generator = fake_test_datagen.flow_from_directory(FAKE_TEST_DIR, **feature_flow_options)



Found 3806 images belonging to 12 classes.
Found 474 images belonging to 12 classes.
Found 470 images belonging to 12 classes.


In [16]:
# Number of images that fit into whole batches for each split. The counts are
# read from the generators (their `samples` attribute is the number of images
# found on disk) instead of being typed in by hand, so they cannot drift from
# the actual directory contents.
training_samples = (train_generator.samples // batch_size) * batch_size
validation_samples = (validation_generator.samples // batch_size) * batch_size
fake_test_samples = (fake_test_generator.samples // batch_size) * batch_size

In [17]:
# Load VGG16 with ImageNet weights; include_top=False requests the network
# without its top (classifier) layers, so its output is the convolutional
# feature map used below as "bottleneck features".
# NOTE(review): the name `model` is rebound to the new dense classifier in a
# later cell, so this cell must be re-run before re-running feature extraction.
model = vgg16.VGG16(weights='imagenet', include_top=False)

In [18]:
# List every layer of the convolutional base together with its trainable flag.
for layer in model.layers:
    if layer.trainable:
        status = "trainable"
    else:
        status = "NOT trainable"
    print(layer.name, "\t", status)

input_2 	 NOT trainable
block1_conv1 	 trainable
block1_conv2 	 trainable
block1_pool 	 trainable
block2_conv1 	 trainable
block2_conv2 	 trainable
block2_pool 	 trainable
block3_conv1 	 trainable
block3_conv2 	 trainable
block3_conv3 	 trainable
block3_pool 	 trainable
block4_conv1 	 trainable
block4_conv2 	 trainable
block4_conv3 	 trainable
block4_pool 	 trainable
block5_conv1 	 trainable
block5_conv2 	 trainable
block5_conv3 	 trainable
block5_pool 	 trainable


In [19]:
# Push the training images through the convolutional base, one full batch per
# step; only whole batches are requested.
bottleneck_features_train = model.predict_generator(
    train_generator,
    steps=training_samples // batch_size,
    verbose=1)







In [20]:
# Same feature extraction for the validation images (whole batches only).
bottleneck_features_validation = model.predict_generator(
    validation_generator,
    steps=validation_samples // batch_size,
    verbose=1)



In [21]:
# Same feature extraction for the fake-test images (whole batches only).
bottleneck_features_fake_test = model.predict_generator(
    fake_test_generator,
    steps=fake_test_samples // batch_size,
    verbose=1)



### Note that the bottleneck_* arrays are the INPUTS for our new model, which is why they are saved to disk...

In [22]:
# Write each feature array to OUTPUT_DIR as <array name>.npy.
for features_name, features in (
        ('bottleneck_features_train', bottleneck_features_train),
        ('bottleneck_features_validation', bottleneck_features_validation),
        ('bottleneck_features_fake_test', bottleneck_features_fake_test)):
    np.save(OUTPUT_DIR + features_name + '.npy', features)

In [23]:
# Show the shape of each extracted feature array, one per line
# (train, validation, fake-test).
print(bottleneck_features_train.shape,
      bottleneck_features_validation.shape,
      bottleneck_features_fake_test.shape,
      sep="\n")

(3792, 7, 7, 512)
(464, 7, 7, 512)
(464, 7, 7, 512)


#### We also need to get the labels for the features we have loaded... and we need to make sure we get them in the same order

In [24]:
# Twins of the feature-extraction generators that also yield the labels.
# They read the same directories with the same batch size and without
# shuffling, so their labels line up with the bottleneck features.
label_flow_options = dict(
        # Keras expects target_size as (height, width).
        target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
        batch_size=batch_size,
        shuffle=False,
        # We train with categorical_crossentropy, so we need one-hot-encoded labels.
        class_mode='categorical')

train_generator_classes = train_datagen.flow_from_directory(TRAIN_DIR, **label_flow_options)
validation_generator_classes = validation_datagen.flow_from_directory(VALIDATION_DIR, **label_flow_options)
fake_test_generator_classes = fake_test_datagen.flow_from_directory(FAKE_TEST_DIR, **label_flow_options)

Found 3806 images belonging to 12 classes.
Found 474 images belonging to 12 classes.
Found 470 images belonging to 12 classes.


Remember, we used variables

    training_samples
    validation_samples

before to determine how many images will be for training/validation, lets use the same ones to restore the labels

This is a bit indirect, but here's how it works. Essentially, we iterate over the train_generator_classes variable a total of

training_samples // batch_size

times, and do the same with validation_generator_classes. Each iteration yields a batch of X and y elements. Because these generators use class_mode='categorical', every y batch is already one-hot encoded, so we simply stack the y batches into a numpy array holding the labels in the correct order (the generators' classes and class_indices attributes tell you which label column corresponds to which class directory).

Also, regarding how the array is constructed, you REALLY want to check this http://akuederle.com/create-numpy-array-with-for-loop


In [25]:
def _collect_labels(generator, num_batches, num_classes):
    """Stack the label batches of the first `num_batches` batches of `generator`.

    `generator` must yield (images, labels) pairs. The label batches are
    gathered in a list and concatenated once, which avoids re-copying the
    whole array on every iteration, and nothing is read from the generator
    when `num_batches` is 0. Returns an array of shape (n_labels, num_classes).
    """
    # The empty float64 seed keeps the result's dtype and its (0, num_classes)
    # shape for the "no batches" case.
    label_batches = [np.empty((0, num_classes))]
    # range() comes first in zip() so the generator is not advanced once the
    # requested number of batches has been read.
    for remaining, (_, y) in zip(range(num_batches - 1, -1, -1), generator):
        label_batches.append(y)
        if remaining % 100 == 0:
            print(remaining, "to go")
    return np.concatenate(label_batches, axis=0)


num_classes = len(CLASS_NAMES)

print("bottleneck_features_train shape is ", bottleneck_features_train.shape)
train_labels = _collect_labels(train_generator_classes, training_samples // batch_size, num_classes)
print("train_labels shape is", train_labels.shape)

print("bottleneck_features_validation shape is ", bottleneck_features_validation.shape)
validation_labels = _collect_labels(validation_generator_classes, validation_samples // batch_size, num_classes)
print("validation_labels shape is", validation_labels.shape)


bottleneck_features_validation shape is  (3792, 7, 7, 512)
200 to go
100 to go
0 to go
train_labels shape is (3792, 12)
bottleneck_features_validation shape is  (464, 7, 7, 512)
0 to go
validation_labels shape is (464, 12)


In [26]:
# Recover the one-hot labels of the fake-test split, in the same order as the
# bottleneck features extracted from it.
print("bottleneck_features_fake_test shape is ", bottleneck_features_fake_test.shape)
total_iterations = fake_test_samples // batch_size
# Gather the batches in a list and concatenate once; the empty float64 seed
# keeps the result's dtype and its (0, num_classes) shape if there are no batches.
fake_test_label_batches = [np.empty((0, num_classes))]
# range() comes first in zip() so the loop stops after exactly
# `total_iterations` batches, including when that number is 0.
for remaining, (_, y) in zip(range(total_iterations - 1, -1, -1), fake_test_generator_classes):
    fake_test_label_batches.append(y)
    if remaining % 100 == 0:
        print(remaining, "to go")
fake_test_labels = np.concatenate(fake_test_label_batches, axis=0)

print("fake_test_labels shape is", fake_test_labels.shape)

fake_test_features_validation shape is  (464, 7, 7, 512)
0 to go
fake_test_labels shape is (464, 12)


In [27]:
from keras.models import Sequential


# Small dense classifier trained on top of the pre-computed bottleneck
# features: flatten -> 1024-unit ReLU layer -> heavy dropout -> softmax over
# the classes. Note that this rebinds `model`, replacing the VGG16 base.
model = Sequential([
    Flatten(input_shape=bottleneck_features_train.shape[1:]),
    Dense(1024, activation='relu'),
    Dropout(0.75),
    Dense(num_classes, activation='softmax'),
])

adam = optimizers.Adam(
    lr=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=0.0)

model.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 25088)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              25691136  
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 12)                12300     
Total params: 25,703,436
Trainable params: 25,703,436
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Train the dense classifier on the cached bottleneck features and monitor it
# on the validation features after every epoch.
model_history = model.fit(
    x=bottleneck_features_train,
    y=train_labels,
    batch_size=batch_size,
    epochs=50,
    validation_data=(bottleneck_features_validation, validation_labels))

Train on 3792 samples, validate on 464 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [29]:
# Persist only the weights of the trained dense classifier; the fine-tuning
# section loads this same file into its top model.
model.save_weights(OUTPUT_DIR+'bottleneck_fc_model.h5')

### Fine tuning

In [39]:
from keras import applications

# Fresh VGG16 convolutional base for fine-tuning. Unlike the feature-extraction
# model it gets a fixed input shape, which is what gives it a concrete
# output_shape for the top model's Flatten layer to be sized from.
# The shape is taken from the configuration constants (height, width, channels)
# so it always matches the size the generators resize the images to.
vgg16_model = applications.VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(IMAGE_HEIGHT, IMAGE_WIDTH, 3))

In [40]:
# Rebuild the dense classifier with the same layer sizes as the one trained on
# the bottleneck features, then load its saved weights so that fine-tuning
# starts from an already-trained head instead of random weights.
top_model = Sequential()
top_model.add(Flatten(input_shape=vgg16_model.output_shape[1:]))
top_model.add(Dense(1024, activation='relu'))
# NOTE(review): the classifier trained on the bottleneck features used
# Dropout(0.75); 0.5 here loads fine (dropout has no weights) but regularises
# differently - confirm this is intended.
top_model.add(Dropout(0.5))
# One output per species; derived from CLASS_NAMES rather than a literal 12 so
# the head always matches the label set.
top_model.add(Dense(len(CLASS_NAMES), activation="softmax"))
top_model.load_weights(OUTPUT_DIR+'bottleneck_fc_model.h5')


In [41]:
# Chain the pre-trained dense classifier onto the output of the VGG16
# convolutional base, giving one model that maps input images to the
# softmax class scores.
fine_tuned_model = Model(inputs=vgg16_model.input, outputs=top_model(vgg16_model.output))

In [42]:
# Print the layer-by-layer summary of the combined network; the top model
# shows up as a single nested Sequential layer.
fine_tuned_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [43]:
# Freeze everything except the last five layers, i.e. keep only the block5
# layers and the stacked top model trainable during fine-tuning.
trainable_tail = 5
for layer in fine_tuned_model.layers[:-trainable_tail]:
    layer.trainable = False

In [44]:
# Report the number of layers, then whether each one will be updated
# during fine-tuning.
print(len(fine_tuned_model.layers))
for layer in fine_tuned_model.layers:
    status = "\t is trainable" if layer.trainable else "\t is NOT trainable"
    print("Layer ", layer.name, status)

20
Layer  input_4 	 is NOT trainable
Layer  block1_conv1 	 is NOT trainable
Layer  block1_conv2 	 is NOT trainable
Layer  block1_pool 	 is NOT trainable
Layer  block2_conv1 	 is NOT trainable
Layer  block2_conv2 	 is NOT trainable
Layer  block2_pool 	 is NOT trainable
Layer  block3_conv1 	 is NOT trainable
Layer  block3_conv2 	 is NOT trainable
Layer  block3_conv3 	 is NOT trainable
Layer  block3_pool 	 is NOT trainable
Layer  block4_conv1 	 is NOT trainable
Layer  block4_conv2 	 is NOT trainable
Layer  block4_conv3 	 is NOT trainable
Layer  block4_pool 	 is NOT trainable
Layer  block5_conv1 	 is trainable
Layer  block5_conv2 	 is trainable
Layer  block5_conv3 	 is trainable
Layer  block5_pool 	 is trainable
Layer  sequential_3 	 is trainable


In [45]:
batch_size = 16

# Augmentation used while fine-tuning: besides rescaling to [0, 1], training
# images are randomly rotated (up to 180 degrees), shifted by up to half of
# their width/height and flipped along both axes.
train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=180,
        width_shift_range=0.5, # Consider removing...
        height_shift_range=0.5,
        horizontal_flip=True,
        vertical_flip=True)

# Validation and fake-test images are only rescaled, never augmented.
validation_datagen = ImageDataGenerator(rescale=1./255)
fake_test_datagen = ImageDataGenerator(rescale=1./255)

# Options shared by the three labelled generators.
labelled_flow_options = dict(
        # Keras expects target_size as (height, width).
        target_size=(IMAGE_HEIGHT, IMAGE_WIDTH),
        batch_size=batch_size,
        # We train with categorical_crossentropy, so we need one-hot-encoded labels.
        class_mode='categorical')

# Each generator reads the pictures found in the class sub-folders of its
# directory and yields (images, one-hot labels) batches indefinitely.
train_generator = train_datagen.flow_from_directory(TRAIN_DIR, **labelled_flow_options)
validation_generator = validation_datagen.flow_from_directory(VALIDATION_DIR, **labelled_flow_options)
fake_test_generator = fake_test_datagen.flow_from_directory(FAKE_TEST_DIR, **labelled_flow_options)

Found 3806 images belonging to 12 classes.
Found 474 images belonging to 12 classes.
Found 470 images belonging to 12 classes.


In [46]:
# Compile after the trainable flags have been set, then fine-tune the
# unfrozen layers on the augmented training batches.
adam = optimizers.Adam(
    lr=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=0.0)

fine_tuned_model.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy'])

# One epoch covers as many whole batches as fit into the training set;
# validation uses the whole batches of the validation set.
history = fine_tuned_model.fit_generator(
    train_generator,
    epochs=10,
    steps_per_epoch=training_samples // batch_size,
    validation_data=validation_generator,
    validation_steps=validation_samples // batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [49]:
# Score the fine-tuned network on the fake-test split, using whole batches
# only. The model was compiled with a loss and metrics=['accuracy'], so the
# displayed result is [loss, accuracy].
fine_tuned_model.evaluate_generator(fake_test_generator, steps= fake_test_samples // batch_size)

[0.48653128897321635, 0.83620689655172409]