# Machine Learning

## Imports and Configuration

In [1]:
import sys, os
sys.path.append(os.path.abspath('../util'))

# core imports
from keras_tf_util import *
from keras.applications.vgg19 import VGG19 #, preprocess_input, decode_predictions
from keras.applications.resnet50 import ResNet50 #, preprocess_input, decode_predictions

Using TensorFlow backend.


In [2]:
# configure various jupyter defaults
%matplotlib notebook
plt.rcParams['figure.figsize'] = (9,6)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# configure autoreload to automatically reload modules when files are changed
%load_ext autoreload
%autoreload 2

## Setup

In [3]:
current_dir = os.getcwd() + '/'
DATA_HOME_DIR = current_dir + 'data/'
use_sample = False

path = DATA_HOME_DIR if not use_sample else DATA_HOME_DIR + 'sample/'

train_path = path + 'train/'
val_path = path + 'valid/'
test_path = DATA_HOME_DIR + 'test_stg1/'

models_path = (current_dir if not use_sample else path) + 'models/'   # save weights here
results_path = (current_dir if not use_sample else path) + 'results/' # save predictions here
preprocesed_data_path = path + 'preprocesed_data/'  # save preprocessed data used for training here

if not os.path.exists(models_path): os.makedirs(models_path)
if not os.path.exists(results_path): os.makedirs(results_path)
if not os.path.exists(preprocesed_data_path): os.makedirs(preprocesed_data_path)

In [4]:
bs = 12 #12 works on my ubuntu notebook (64 would be better)

## Preprocess data

In [5]:
# get classes, one-hot encoded labels, and filenames
train_classes, train_labels, train_filenames = get_batch_info(train_path)
val_classes, val_labels, val_filenames = get_batch_info(val_path)
test_filenames = get_batch_info(train_path)[2]

# sometimes helpful to have filenames without the parent folder
raw_train_filenames = [ s.split('/')[-1] for s in train_filenames ]
raw_val_filenames = [ s.split('/')[-1] for s in val_filenames ]
raw_test_filenames = [ s.split('/')[-1] for s in test_filenames ]

Found 3277 images belonging to 8 classes.
Found 500 images belonging to 8 classes.
Found 3277 images belonging to 8 classes.


In [6]:
# preprocess and load image data; will reload data if already pre-processed
def load_data(file_path, data_path, target_size=(224,224)):
    if not os.path.exists(file_path):
        d = get_data(data_path, target_size=target_size)
        save_array(file_path, d)
    else:
        d = load_array(file_path)
        print('{0} loaded ...'.format(os.path.basename(file_path)))
        
    return d
        
train_data = load_data(preprocesed_data_path + 'train_data.bc', train_path)
val_data = load_data(preprocesed_data_path + 'val_data.bc', val_path)
test_data = load_data(preprocesed_data_path + 'test_data.bc', test_path)

train_data.bc loaded ...
val_data.bc loaded ...
test_data.bc loaded ...


## VGG19 

### 1. Fine-tune the last layer

In [7]:
limit_mem()

model = finetune(VGG19(), 8)
model.compile(optimizer=Adam(1e-3), loss='categorical_crossentropy', metrics=['accuracy'])

In [8]:
# model.summary()

In [9]:
# model.load_weights(models_path + '1-ft_last_layer.h5')

In [10]:
model.fit(train_data, train_labels, batch_size=bs, epochs=3, validation_data=(val_data, val_labels))

Train on 3277 samples, validate on 500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fab3869db70>

In [11]:
model.evaluate(val_data, val_labels, batch_size=bs)



[0.40102759416401385, 0.87400001859664922]

In [12]:
model.save_weights(models_path + 'vgg19_ft_last_layer.h5')

### Pre-compute convolutional features

In [13]:
limit_mem()

# you have to specify input_shape IF you want to use with pre-computed datasets!!!
conv_model = VGG19(include_top=False, input_shape=(224,224,3))

In [14]:
# conv_model.summary()

In [15]:
# preprocess predictions; will reload data if already pre-processed
def load_pp_data(file_path, model, data, batch_size=64):
    if not os.path.exists(file_path):
        d = model.predict(data, batch_size=batch_size)
        save_array(file_path, d)
        print('{0} saved ...'.format(os.path.basename(file_path)))
    else:
        d = load_array(file_path)
        print('{0} loaded ...'.format(os.path.basename(file_path)))
        
    return d

In [16]:
conv_train_feat_vgg19 = load_pp_data(preprocesed_data_path + 'conv_train_feat_vgg19.bc', conv_model, train_data, bs)
conv_val_feat_vgg19 = load_pp_data(preprocesed_data_path + 'conv_val_feat_vgg19.bc', conv_model, val_data, bs)
conv_test_feat_vgg19 = load_pp_data(preprocesed_data_path + 'conv_test_feat_vgg19.bc', conv_model, test_data, bs)

conv_train_feat_vgg19.bc loaded ...
conv_val_feat_vgg19.bc loaded ...
conv_test_feat_vgg19.bc loaded ...


In [17]:
conv_train_feat_vgg19.shape

(3277, 7, 7, 512)

In [18]:
conv_model.output.shape.as_list()

[None, 7, 7, 512]

### 2. Pre-computed conv. features + FC layers

In [19]:
def build_vgg19_fc_seq(p):
    # using sequential API
    return [
        BatchNormalization(axis=1, input_shape=conv_model.output.shape.as_list()[1:]),
        Dropout(p / 4),
        Flatten(),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(p / 2),
        Dense(8, activation='softmax')
    ]

In [20]:
def build_vgg19_fc_func(p):
    # using functional API
    inputs = Input(shape=conv_model.output.shape.as_list()[1:])
    
    x = BatchNormalization(axis=1)(inputs)
    x = Dropout(p / 4)(x)
    x = Flatten()(x)
    x = Dense(512, activation='relu')(x)
    
    x = BatchNormalization()(x)
    x = Dropout(p)(x)
    x = Dense(512, activation='relu')(x)
    
    x = BatchNormalization()(x)
    x = Dropout(p / 2)(x)
    preds = Dense(8, activation='softmax')(x)
    
    return Model(inputs=inputs, outputs=preds)

In [21]:
p = 0.6

In [22]:
limit_mem()

# model = Sequential(build_vgg19_fc_seq(p))
model = build_vgg19_fc_func(p)
model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [23]:
model.fit(conv_train_feat_vgg19, train_labels, batch_size=bs, epochs=4, 
          validation_data=(conv_val_feat_vgg19, val_labels))

Train on 3277 samples, validate on 500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7faa4b1f6128>

In [24]:
model.optimizer.lr = 1e-4

model.fit(conv_train_feat_vgg19, train_labels, batch_size=bs, epochs=7, 
          validation_data=(conv_val_feat_vgg19, val_labels))

Train on 3277 samples, validate on 500 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7faa4b1634a8>

In [25]:
model.save_weights(models_path + 'vgg19_conv_precomputed.h5')

## ResNet50

In [26]:
limit_mem()

# rn = ResNet50(include_top=False, input_shape=(224,224,3))
# del rn.layers[-1:]
# rn.layers[-1].outbound_nodes = []
# rn.outputs = [rn.layers[-1].output]
# rn.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224,224,3))
rn = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)

In [27]:
rn.output_shape, rn.layers[-1].output_shape

((None, 7, 7, 2048), (None, 7, 7, 2048))

In [28]:
conv_train_feat_rn50 = load_pp_data(preprocesed_data_path + 'conv_train_feat_rn50.bc', rn, train_data, bs)
conv_val_feat_rn50 = load_pp_data(preprocesed_data_path + 'conv_val_feat_rn50.bc', rn, val_data, bs)
conv_test_feat_rn50 = load_pp_data(preprocesed_data_path + 'conv_test_feat_rn50.bc', rn, test_data, bs)

conv_train_feat_rn50.bc loaded ...
conv_val_feat_rn50.bc loaded ...
conv_test_feat_rn50.bc loaded ...


In [29]:
def build_rn50_top_func(p):
    inputs = Input(shape=rn.output.shape.as_list()[1:])
    x = GlobalAveragePooling2D()(inputs)
    x = Dropout(p)(x)
    preds = Dense(8, activation='softmax')(x)
    
    return Model(inputs, preds)

In [30]:
rn_model = build_rn50_top_func(0.2)
rn_model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [31]:
rn_model.fit(conv_train_feat_rn50, train_labels, batch_size=bs, epochs=6, 
          validation_data=(conv_val_feat_rn50, val_labels))

Train on 3277 samples, validate on 500 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7faa28770860>

In [32]:
rn_model.optimizer.lr = 1e-5

rn_model.fit(conv_train_feat_rn50, train_labels, batch_size=bs, epochs=8, 
          validation_data=(conv_val_feat_rn50, val_labels))

Train on 3277 samples, validate on 500 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7faa287703c8>

In [35]:
rn_model.save_weights(models_path + 'rn50_conv_224x224.h5')

#### Resnet with different image resolutions

In [36]:
limit_mem()

base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(400,400,3))
conv_rn50_400 = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)

In [37]:
conv_rn50_400.output_shape, conv_rn50_400.layers[-1].output_shape

((None, 13, 13, 2048), (None, 13, 13, 2048))

In [38]:
rn400_trn_batches = get_batches(train_path, target_size=(400,400), batch_size=bs, shuffle=False)
rn400_val_batches = get_batches(val_path, target_size=(400,400), batch_size=bs, shuffle=False)

Found 3277 images belonging to 8 classes.
Found 500 images belonging to 8 classes.


In [39]:
conv_train_feat_rn50_400 = conv_rn50_400.predict_generator(rn400_trn_batches, rn400_trn_batches.n/bs)
conv_val_feat_rn50_400 = conv_rn50_400.predict_generator(rn400_val_batches, rn400_val_batches.n/bs)

In [40]:
conv_train_feat_rn50_400.shape

(3277, 13, 13, 2048)

In [41]:
conv_rn50_400.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_6 (InputLayer)             (None, 400, 400, 3)   0                                            
____________________________________________________________________________________________________
zero_padding2d_2 (ZeroPadding2D) (None, 406, 406, 3)   0           input_6[0][0]                    
____________________________________________________________________________________________________
conv1 (Conv2D)                   (None, 200, 200, 64)  9472        zero_padding2d_2[0][0]           
____________________________________________________________________________________________________
bn_conv1 (BatchNormalization)    (None, 200, 200, 64)  256         conv1[0][0]                      
___________________________________________________________________________________________

In [42]:
inputs = Input(shape=conv_rn50_400.output_shape[1:])
x = GlobalAveragePooling2D()(inputs)
x = Dropout(0.1)(x)
preds = Dense(8, activation='softmax')(x)

rn400_model = Model(inputs, preds)

In [43]:
rn400_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 13, 13, 2048)      0         
_________________________________________________________________
global_average_pooling2d_2 ( (None, 2048)              0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 8)                 16392     
Total params: 16,392
Trainable params: 16,392
Non-trainable params: 0
_________________________________________________________________


In [44]:
rn400_model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [45]:
rn400_model.fit(conv_train_feat_rn50_400, train_labels, batch_size=bs, epochs=5,
                validation_data=(conv_val_feat_rn50_400, val_labels))

Train on 3277 samples, validate on 500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa9813e2e48>

In [46]:
rn400_model.optimizer.lr = 1e-6

rn400_model.fit(conv_train_feat_rn50_400, train_labels, batch_size=bs, epochs=20,
                validation_data=(conv_val_feat_rn50_400, val_labels))

Train on 3277 samples, validate on 500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa9813b8e48>

In [47]:
rn400_model.save_weights(models_path + 'rn50_conv_400x400.h5')
# rn400_model.load_weights(models_path + 'rn50_400_weights.h5')

## Combine convolutional model used to pre-compute convolutional features with the model that uses them to make final predictions

In [48]:
final_model = Model(inputs=conv_rn50_400.input, outputs=rn400_model(conv_rn50_400.output))
final_model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
# final_model.summary()

In [49]:
final_model.evaluate_generator(rn400_val_batches, rn400_val_batches.n / bs)

[0.21556607327610255, 0.950000009059906]

In [50]:
img_path = val_path + val_filenames[250]
img = image.load_img(img_path, target_size=(400, 400))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
preds = final_model.predict(x)

In [51]:
print(img_path), val_classes[250], np.argmax(preds)

/home/wgilliam/development/_training/ml/fastai-projects/kaggle-fisheries-competition/data/valid/BET/img_07311.jpg


(None, 1, 1)

In [52]:
final_model.save_weights(models_path + 'rn50_conv+top_400x400.h5')
# rn400_model.load_weights(models_path + 'rn50_400_weights.h5')