In [1]:
import os
os.environ["THEANO_FLAGS"] = "device=gpu"
from sklearn.base import BaseEstimator
import os
from lasagne import layers, nonlinearities
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet
import numpy as np

from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import cross_val_score

class Classifier(BaseEstimator):
    """scikit-learn style wrapper that preprocesses images before delegating to ``net``.

    Every public method first runs ``preprocess`` on ``X`` and then forwards
    the call to the method of the same name on the wrapped ``net``.
    """

    def __init__(self, net):
        # The wrapped network is stored untouched; training happens in ``fit``.
        self.net = net

    def preprocess(self, X):
        """Return ``X`` divided by 255, cast to float32, with axes reordered to (0, 3, 1, 2).

        The axis permutation presumably turns (samples, rows, cols, channels)
        images into the (samples, channels, rows, cols) layout implied by the
        networks' ``input_shape=(None, 3, 64, 64)`` -- confirm against the data.
        """
        scaled = (X / 255.).astype(np.float32)
        return scaled.transpose((0, 3, 1, 2))

    def fit(self, X, y):
        """Fit the wrapped network on the preprocessed ``X`` and return ``self``."""
        self.net.fit(self.preprocess(X), y)
        return self

    def predict(self, X):
        """Return ``net.predict`` evaluated on the preprocessed ``X``."""
        return self.net.predict(self.preprocess(X))

    def predict_proba(self, X):
        """Return ``net.predict_proba`` evaluated on the preprocessed ``X``."""
        return self.net.predict_proba(self.preprocess(X))
    

def unit_test(X, y, clf, nb_iter=5):
    """Score ``clf`` by accuracy over ``nb_iter`` stratified shuffle splits.

    Each split holds out 20% of the samples for testing and the splitter is
    seeded with ``random_state=15``. The value returned is whatever
    ``cross_val_score`` produces for these splits.
    """
    splitter = StratifiedShuffleSplit(y, nb_iter, test_size=0.2, random_state=15)
    return cross_val_score(clf, X=X, y=y, scoring='accuracy', cv=splitter)

def unit_test2(X, y, clf, nb_iter=5):
    """Hand-rolled counterpart of ``unit_test`` that returns a list of accuracies.

    The splits are the same as in ``unit_test``: ``nb_iter`` stratified shuffle
    splits, 20% of the samples held out, ``random_state=15``.

    Every split trains a fresh, unfitted clone of ``clf``. Refitting the same
    estimator object would let state learned on one split (network weights,
    callback state) carry over into the next one, whose test samples overlap
    the earlier training samples; the resulting scores would be optimistic for
    ``nb_iter > 1``. Cloning also matches what ``cross_val_score`` does in
    ``unit_test``. As a consequence, the ``clf`` passed in is left unfitted.

    Returns a list with one accuracy (fraction of correct predictions on the
    held-out samples) per split.
    """
    # Imported here so the function does not depend on an extra top-of-file import.
    from sklearn.base import clone

    test_size = 0.2
    random_state = 15
    cv = StratifiedShuffleSplit(y, nb_iter,
                                test_size=test_size,
                                random_state=random_state)
    scores = []
    for train, test in cv:
        fold_clf = clone(clf)  # independent model per split, no leakage between splits
        fold_clf.fit(X[train], y[train])
        acc = (fold_clf.predict(X[test]) == y[test]).mean()
        scores.append(acc)
    return scores

# Read both arrays while the archive is open, then let the context manager
# close the underlying file handle instead of leaving it open for the session.
with np.load("train_64x64.npz") as data:
    X, y = data['X'], data['y']

Using gpu device 0: Tesla K20Xm (CNMeM is disabled)


Let's first make sure we can overfit the training set, by using a CNN (Convolutional Neural Network) big enough to reduce the training error close to 0, without any regularization (e.g. dropout, L2, etc.). In this exploratory phase we want to reduce the training time as much as possible, while still leaving enough epochs of training to see the training loss get close to 0. In this spirit, we will set the number of cross-validation iterations to 1 (instead of the default 5) and the maximum number of training epochs to 30 (we could also increase the maximum number of training epochs even more and stop the training manually once we had enough evidence of overfitting or training was taking too long).

In [11]:
# Baseline CNN: three conv/max-pool stages followed by two dense layers and a
# softmax output, with no regularisation layers.
hyper_parameters = dict(
    # convolution / max-pooling stack
    conv1_num_filters=64, conv1_filter_size=(3, 3), pool1_pool_size=(2, 2),
    conv2_num_filters=128, conv2_filter_size=(2, 2), pool2_pool_size=(2, 2),
    conv3_num_filters=128, conv3_filter_size=(2, 2), pool3_pool_size=(2, 2),
    # fully connected part
    hidden4_num_units=500,
    hidden5_num_units=500,
    output_num_units=18,
    output_nonlinearity=nonlinearities.softmax,
    # optimisation settings
    update_learning_rate=0.01,
    update_momentum=0.9,
    max_epochs=30,
)

net = NeuralNet(
    layers=[
        ('input', layers.InputLayer),
        ('conv1', layers.Conv2DLayer),
        ('pool1', layers.MaxPool2DLayer),
        ('conv2', layers.Conv2DLayer),
        ('pool2', layers.MaxPool2DLayer),
        ('conv3', layers.Conv2DLayer),
        ('pool3', layers.MaxPool2DLayer),
        ('hidden4', layers.DenseLayer),
        ('hidden5', layers.DenseLayer),
        ('output', layers.DenseLayer),
    ],
    input_shape=(None, 3, 64, 64),
    use_label_encoder=True,
    verbose=1,
    **hyper_parameters
)

# X and y are already in memory from the first cell, so the archive is not
# read a second time here. Classifier takes care of the preprocessing.
clf = Classifier(net)

unit_test2(X, y, clf, nb_iter=1)

# Neural Network with 3496370 learnable parameters

## Layer information

  #  name     size
---  -------  ---------
  0  input    3x64x64
  1  conv1    64x62x62
  2  pool1    64x31x31
  3  conv2    128x30x30
  4  pool2    128x15x15
  5  conv3    128x14x14
  6  pool3    128x7x7
  7  hidden4  500
  8  hidden5  500
  9  output   18

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       [36m2.48588[0m       [32m2.45851[0m      1.01113      0.28275  8.67s
      2       [36m2.42201[0m       [32m2.38081[0m      1.01731      0.30981  8.63s
      3       [36m2.32502[0m       [32m2.27260[0m      1.02307      0.34622  8.62s
      4       [36m2.21311[0m       [32m2.19140[0m      1.00991      0.36246  8.62s
      5       [36m2.13006[0m       [32m2.12441[0m      1.00266      0.37809  8.62s
      6       [36m2.04799[0m       [32m2.07170[0m      0.98856      0.38470  8.62s
      7  

[0.34390363815142577]

We can see that while the training loss keeps going down, after a certain epoch the validation loss keeps increasing and the validation accuracy fluctuates; this is evidence of overfitting. 
Our CNN seems big enough to memorize the training set, while still training in a reasonable amount of time. We can now try to deal with the overfitting; a good first choice is to add dropout layers, especially after the fully-connected layers.

In [12]:
# Same architecture as the baseline network, plus a dropout layer after each
# of the two hidden dense layers.
net = NeuralNet(
    layers=[
        ('input', layers.InputLayer),
        ('conv1', layers.Conv2DLayer),
        ('pool1', layers.MaxPool2DLayer),
        ('conv2', layers.Conv2DLayer),
        ('pool2', layers.MaxPool2DLayer),
        ('conv3', layers.Conv2DLayer),
        ('pool3', layers.MaxPool2DLayer),
        ('hidden4', layers.DenseLayer),
        ('dropout4', layers.DropoutLayer),  # added in this experiment
        ('hidden5', layers.DenseLayer),
        ('dropout5', layers.DropoutLayer),  # added in this experiment
        ('output', layers.DenseLayer),
    ],
    input_shape=(None, 3, 64, 64),
    use_label_encoder=True,
    verbose=1,
    # convolution / max-pooling stack
    conv1_num_filters=64,
    conv1_filter_size=(3, 3),
    pool1_pool_size=(2, 2),
    conv2_num_filters=128,
    conv2_filter_size=(2, 2),
    pool2_pool_size=(2, 2),
    conv3_num_filters=128,
    conv3_filter_size=(2, 2),
    pool3_pool_size=(2, 2),
    # fully connected part, each hidden layer followed by dropout
    hidden4_num_units=500,
    dropout4_p=0.5,
    hidden5_num_units=500,
    dropout5_p=0.5,
    output_num_units=18,
    output_nonlinearity=nonlinearities.softmax,
    # optimisation settings
    update_learning_rate=0.01,
    update_momentum=0.9,
    max_epochs=30,
)

clf = Classifier(net)

unit_test2(X, y, clf, nb_iter=1)

# Neural Network with 3496370 learnable parameters

## Layer information

  #  name      size
---  --------  ---------
  0  input     3x64x64
  1  conv1     64x62x62
  2  pool1     64x31x31
  3  conv2     128x30x30
  4  pool2     128x15x15
  5  conv3     128x14x14
  6  pool3     128x7x7
  7  hidden4   500
  8  dropout4  500
  9  hidden5   500
 10  dropout5  500
 11  output    18

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       [36m2.52702[0m       [32m2.47076[0m      1.02277      0.28275  8.69s
      2       [36m2.46551[0m       [32m2.45832[0m      1.00293      0.28275  8.61s
      3       [36m2.45060[0m       [32m2.44256[0m      1.00329      0.28275  8.62s
      4       [36m2.41406[0m       [32m2.37222[0m      1.01764      0.32695  8.61s
      5       [36m2.33618[0m       [32m2.27582[0m      1.02652      0.35883  8.62s
      6       [36m2.26065[0m       [32m2.21

[0.51892822025565388]

TODO: MSR initialization, learning rate annealing, Adam; convolution + max-pooling illustrations

We can see that the CNN is now regularized well enough that it no longer overfits: after 30 epochs of training the training loss hasn't converged and the validation accuracy keeps improving. We can thus increase the maximum number of training epochs, in the hope that this will lead to even better validation accuracy. To stop the training procedure automatically (without having to check every few epochs whether the validation accuracy is still improving and then stopping the procedure manually), we can use a technique called early stopping. We will also increase the maximum number of training epochs to 100.

In [None]:
# code source: 
# http://danielnouri.org/notes/2014/12/17/using-convolutional-neural-nets-to-detect-facial-keypoints-tutorial/
# adapted to use the validation accuracy, rather than the validation loss
class EarlyStopping(object):
    """Callback that stops training once validation accuracy has stopped improving.

    The instance is called with the network and its training history. It keeps
    track of the best ``valid_accuracy`` seen so far, the epoch at which it
    occurred and a snapshot of the network parameters taken at that epoch.
    When an epoch brings no improvement and more than ``patience`` epochs have
    passed since the best one, the snapshot is loaded back into the network
    and ``StopIteration`` is raised.
    """

    def __init__(self, patience=20):
        self.patience = patience
        self.best_valid = -np.inf     # best validation accuracy so far
        self.best_valid_epoch = 0     # epoch at which it was reached
        self.best_weights = None      # parameter snapshot from that epoch

    def __call__(self, nn, train_history):
        latest = train_history[-1]
        accuracy, epoch = latest['valid_accuracy'], latest['epoch']

        if accuracy > self.best_valid:
            # New best epoch: record it together with the current parameters.
            self.best_valid = accuracy
            self.best_valid_epoch = epoch
            self.best_weights = nn.get_all_params_values()
            return

        out_of_patience = self.best_valid_epoch + self.patience < epoch
        if out_of_patience:
            print("Early stopping.")
            print("Best validation accuracy was {:.6f} at epoch {}.".format(
                self.best_valid, self.best_valid_epoch))
            # Roll the network back to the best snapshot before stopping.
            nn.load_params_from(self.best_weights)
            raise StopIteration()
            
# Dropout network from the previous experiment, now allowed to train for up to
# 100 epochs with an EarlyStopping callback invoked after every epoch.
net = NeuralNet(
    layers=[
        ('input', layers.InputLayer),
        ('conv1', layers.Conv2DLayer),
        ('pool1', layers.MaxPool2DLayer),
        ('conv2', layers.Conv2DLayer),
        ('pool2', layers.MaxPool2DLayer),
        ('conv3', layers.Conv2DLayer),
        ('pool3', layers.MaxPool2DLayer),
        ('hidden4', layers.DenseLayer),
        ('dropout4', layers.DropoutLayer),
        ('hidden5', layers.DenseLayer),
        ('dropout5', layers.DropoutLayer),
        ('output', layers.DenseLayer),
    ],
    input_shape=(None, 3, 64, 64),
    use_label_encoder=True,
    verbose=1,
    # convolution / max-pooling stack
    conv1_num_filters=64,
    conv1_filter_size=(3, 3),
    pool1_pool_size=(2, 2),
    conv2_num_filters=128,
    conv2_filter_size=(2, 2),
    pool2_pool_size=(2, 2),
    conv3_num_filters=128,
    conv3_filter_size=(2, 2),
    pool3_pool_size=(2, 2),
    # fully connected part, each hidden layer followed by dropout
    hidden4_num_units=500,
    dropout4_p=0.5,
    hidden5_num_units=500,
    dropout5_p=0.5,
    output_num_units=18,
    output_nonlinearity=nonlinearities.softmax,
    # optimisation settings
    update_learning_rate=0.01,
    update_momentum=0.9,
    max_epochs=100,  # raised from 30 in this experiment
    on_epoch_finished=[
        EarlyStopping(patience=20),  # added in this experiment
    ],
)

clf = Classifier(net)

unit_test2(X, y, clf, nb_iter=1)

# Neural Network with 3496370 learnable parameters

## Layer information

  #  name      size
---  --------  ---------
  0  input     3x64x64
  1  conv1     64x62x62
  2  pool1     64x31x31
  3  conv2     128x30x30
  4  pool2     128x15x15
  5  conv3     128x14x14
  6  pool3     128x7x7
  7  hidden4   500
  8  dropout4  500
  9  hidden5   500
 10  dropout5  500
 11  output    18

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       [36m2.51599[0m       [32m2.47018[0m      1.01855      0.28275  8.68s
      2       [36m2.46085[0m       [32m2.45187[0m      1.00366      0.28275  8.88s
      3       [36m2.43908[0m       [32m2.42682[0m      1.00505      0.28275  8.92s
      4       [36m2.40050[0m       [32m2.36129[0m      1.01661      0.33057  8.89s
      5       [36m2.32766[0m       [32m2.27343[0m      1.02385      0.35882  8.64s
      6       [36m2.26093[0m       [32m2.21

In [3]:
class EarlyStopping(object):
    """Callback that stops training once validation accuracy has stopped improving.

    The instance is called with the network and its training history. It keeps
    track of the best ``valid_accuracy`` seen so far, the epoch at which it
    occurred and a snapshot of the network parameters taken at that epoch.
    When an epoch brings no improvement and more than ``patience`` epochs have
    passed since the best one, the snapshot is loaded back into the network
    and ``StopIteration`` is raised.
    """

    def __init__(self, patience=20):
        self.patience = patience
        self.best_valid = -np.inf     # best validation accuracy so far
        self.best_valid_epoch = 0     # epoch at which it was reached
        self.best_weights = None      # parameter snapshot from that epoch

    def __call__(self, nn, train_history):
        latest = train_history[-1]
        accuracy, epoch = latest['valid_accuracy'], latest['epoch']

        if accuracy > self.best_valid:
            # New best epoch: record it together with the current parameters.
            self.best_valid = accuracy
            self.best_valid_epoch = epoch
            self.best_weights = nn.get_all_params_values()
            return

        out_of_patience = self.best_valid_epoch + self.patience < epoch
        if out_of_patience:
            print("Early stopping.")
            print("Best validation accuracy was {:.6f} at epoch {}.".format(
                self.best_valid, self.best_valid_epoch))
            # Roll the network back to the best snapshot before stopping.
            nn.load_params_from(self.best_weights)
            raise StopIteration()
            

# Early-stopping dropout network with very_leaky_rectify on the two hidden
# dense layers. No nonlinearity is passed for the convolutional layers
# (conv1/conv2/conv3_nonlinearity are not set).
net = NeuralNet(
    layers=[
        ('input', layers.InputLayer),
        ('conv1', layers.Conv2DLayer),
        ('pool1', layers.MaxPool2DLayer),
        ('conv2', layers.Conv2DLayer),
        ('pool2', layers.MaxPool2DLayer),
        ('conv3', layers.Conv2DLayer),
        ('pool3', layers.MaxPool2DLayer),
        ('hidden4', layers.DenseLayer),
        ('dropout4', layers.DropoutLayer),
        ('hidden5', layers.DenseLayer),
        ('dropout5', layers.DropoutLayer),
        ('output', layers.DenseLayer),
    ],
    input_shape=(None, 3, 64, 64),
    use_label_encoder=True,
    verbose=1,
    # convolution / max-pooling stack
    conv1_num_filters=64,
    conv1_filter_size=(3, 3),
    pool1_pool_size=(2, 2),
    conv2_num_filters=128,
    conv2_filter_size=(2, 2),
    pool2_pool_size=(2, 2),
    conv3_num_filters=128,
    conv3_filter_size=(2, 2),
    pool3_pool_size=(2, 2),
    # fully connected part, each hidden layer followed by dropout
    hidden4_num_units=500,
    hidden4_nonlinearity=nonlinearities.very_leaky_rectify,  # changed in this experiment
    dropout4_p=0.5,
    hidden5_num_units=500,
    hidden5_nonlinearity=nonlinearities.very_leaky_rectify,  # changed in this experiment
    dropout5_p=0.5,
    output_num_units=18,
    output_nonlinearity=nonlinearities.softmax,
    # optimisation settings
    update_learning_rate=0.01,
    update_momentum=0.9,
    max_epochs=100,
    on_epoch_finished=[
        EarlyStopping(patience=20),
    ],
)

clf = Classifier(net)

unit_test2(X, y, clf, nb_iter=1)

# Neural Network with 3496370 learnable parameters

## Layer information

  #  name      size
---  --------  ---------
  0  input     3x64x64
  1  conv1     64x62x62
  2  pool1     64x31x31
  3  conv2     128x30x30
  4  pool2     128x15x15
  5  conv3     128x14x14
  6  pool3     128x7x7
  7  hidden4   500
  8  dropout4  500
  9  hidden5   500
 10  dropout5  500
 11  output    18

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       [36m2.50736[0m       [32m2.46144[0m      1.01865      0.28275  8.79s
      2       [36m2.45095[0m       [32m2.43589[0m      1.00618      0.28275  8.77s
      3       [36m2.40795[0m       [32m2.35649[0m      1.02184      0.33207  8.71s
      4       [36m2.31090[0m       [32m2.24590[0m      1.02894      0.35852  8.66s
      5       [36m2.22343[0m       [32m2.16668[0m      1.02619      0.37235  8.65s
      6       [36m2.15814[0m       [32m2.09

[0.50319567354965589]

The train / validation loss ratio still seems to become very small after a certain number of epochs, which suggests that the network is still overfitting. We can try to regularize the CNN even more, by adding dropout between the convolutional layers.

In [4]:

# Previous network with an additional dropout layer after each of the three
# conv/max-pool stages. No nonlinearity is passed for the convolutional layers
# (conv1/conv2/conv3_nonlinearity are not set).
net = NeuralNet(
    layers=[
        ('input', layers.InputLayer),
        ('conv1', layers.Conv2DLayer),
        ('pool1', layers.MaxPool2DLayer),
        ('dropout1', layers.DropoutLayer),  # added in this experiment
        ('conv2', layers.Conv2DLayer),
        ('pool2', layers.MaxPool2DLayer),
        ('dropout2', layers.DropoutLayer),  # added in this experiment
        ('conv3', layers.Conv2DLayer),
        ('pool3', layers.MaxPool2DLayer),
        ('dropout3', layers.DropoutLayer),  # added in this experiment
        ('hidden4', layers.DenseLayer),
        ('dropout4', layers.DropoutLayer),
        ('hidden5', layers.DenseLayer),
        ('dropout5', layers.DropoutLayer),
        ('output', layers.DenseLayer),
    ],
    input_shape=(None, 3, 64, 64),
    use_label_encoder=True,
    verbose=1,
    # conv / max-pool / dropout stage 1
    conv1_num_filters=64,
    conv1_filter_size=(3, 3),
    pool1_pool_size=(2, 2),
    dropout1_p=0.5,
    # conv / max-pool / dropout stage 2
    conv2_num_filters=128,
    conv2_filter_size=(2, 2),
    pool2_pool_size=(2, 2),
    dropout2_p=0.5,
    # conv / max-pool / dropout stage 3
    conv3_num_filters=128,
    conv3_filter_size=(2, 2),
    pool3_pool_size=(2, 2),
    dropout3_p=0.5,
    # fully connected part, each hidden layer followed by dropout
    hidden4_num_units=500,
    hidden4_nonlinearity=nonlinearities.very_leaky_rectify,
    dropout4_p=0.5,
    hidden5_num_units=500,
    hidden5_nonlinearity=nonlinearities.very_leaky_rectify,
    dropout5_p=0.5,
    output_num_units=18,
    output_nonlinearity=nonlinearities.softmax,
    # optimisation settings
    update_learning_rate=0.01,
    update_momentum=0.9,
    max_epochs=100,
    on_epoch_finished=[
        EarlyStopping(patience=20),
    ],
)

clf = Classifier(net)

unit_test2(X, y, clf, nb_iter=1)

# Neural Network with 3496370 learnable parameters

## Layer information

  #  name      size
---  --------  ---------
  0  input     3x64x64
  1  conv1     64x62x62
  2  pool1     64x31x31
  3  dropout1  64x31x31
  4  conv2     128x30x30
  5  pool2     128x15x15
  6  dropout2  128x15x15
  7  conv3     128x14x14
  8  pool3     128x7x7
  9  dropout3  128x7x7
 10  hidden4   500
 11  dropout4  500
 12  hidden5   500
 13  dropout5  500
 14  output    18

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  -----
      1       [36m2.50717[0m       [32m2.53538[0m      0.98888      0.28275  9.17s
      2       [36m2.46273[0m       [32m2.51567[0m      0.97895      0.28275  9.11s
      3       [36m2.45210[0m       [32m2.49026[0m      0.98467      0.28275  9.12s
      4       [36m2.41018[0m       [32m2.43447[0m      0.99002      0.28577  9.18s
      5       [36m2.32680[0m       [32m2.36617[0m      0.

[0.36332350049164208]

At this point we are clearly underfitting. While the fully connected part of the network is probably expressive enough, we can try to make the convolutional part more expressive, so that it extracts better features.

In [None]:

# Variant with a larger convolutional part: conv2 and conv3 use 3x3 filters and
# conv3 has 256 filters. The dense layers are named hidden5/hidden6 here, which
# leaves the "4" slot free for an optional fourth conv stage (kept disabled
# below). No nonlinearity is passed for the convolutional layers.
net = NeuralNet(
    layers=[
        ('input', layers.InputLayer),
        ('conv1', layers.Conv2DLayer),
        ('pool1', layers.MaxPool2DLayer),
        ('dropout1', layers.DropoutLayer),
        ('conv2', layers.Conv2DLayer),
        ('pool2', layers.MaxPool2DLayer),
        ('dropout2', layers.DropoutLayer),
        ('conv3', layers.Conv2DLayer),
        ('pool3', layers.MaxPool2DLayer),
        ('dropout3', layers.DropoutLayer),
        # Disabled fourth stage:
        # ('conv4', layers.Conv2DLayer),
        # ('pool4', layers.MaxPool2DLayer),
        # ('dropout4', layers.DropoutLayer),
        ('hidden5', layers.DenseLayer),
        ('dropout5', layers.DropoutLayer),
        ('hidden6', layers.DenseLayer),
        ('dropout6', layers.DropoutLayer),
        ('output', layers.DenseLayer),
    ],
    input_shape=(None, 3, 64, 64),
    use_label_encoder=True,
    verbose=2,
    # conv / max-pool / dropout stage 1
    conv1_num_filters=64,
    conv1_filter_size=(3, 3),
    pool1_pool_size=(2, 2),
    dropout1_p=0.5,
    # conv / max-pool / dropout stage 2
    conv2_num_filters=128,
    conv2_filter_size=(3, 3),
    pool2_pool_size=(2, 2),
    dropout2_p=0.5,
    # conv / max-pool / dropout stage 3
    conv3_num_filters=256,
    conv3_filter_size=(3, 3),
    pool3_pool_size=(2, 2),
    dropout3_p=0.5,
    # Settings for the disabled fourth stage:
    # conv4_num_filters=512, conv4_filter_size=(3, 3), pool4_pool_size=(2, 2),
    # dropout4_p=0.5,
    # fully connected part, each hidden layer followed by dropout
    hidden5_num_units=500,
    hidden5_nonlinearity=nonlinearities.very_leaky_rectify,
    dropout5_p=0.5,
    hidden6_num_units=500,
    hidden6_nonlinearity=nonlinearities.very_leaky_rectify,
    dropout6_p=0.5,
    output_num_units=18,
    output_nonlinearity=nonlinearities.softmax,
    # optimisation settings
    update_learning_rate=0.01,
    update_momentum=0.9,
    max_epochs=100,
    on_epoch_finished=[
        EarlyStopping(patience=20),
    ],
)

clf = Classifier(net)

unit_test2(X, y, clf, nb_iter=1)