## End-to-end MNIST training

In [1]:
%matplotlib inline
import utils; reload(utils); # init theano GPU
from utils import *
from __future__ import division, print_function

Using cuDNN version 6021 on context None
Mapped name None to device cuda: GeForce GTX 960 (0000:01:00.0)
  from ._conv import register_converters as _register_converters
Using Theano backend.


In [2]:
# SETUP 
batch_size = 256 # MNIST data is small, so my computer can surely handle this
from keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()
(X_train.shape, y_train.shape, X_test.shape, y_test.shape)


((60000, 28, 28), (60000,), (10000, 28, 28), (10000,))

In [3]:
# MNIST contains grayscale images, so we need to expand a dim
# so that we got data in shape: (n_samples, n_channels, h, w)
X_test = np.expand_dims(X_test, 1) # second dimension
X_train = np.expand_dims(X_train, 1) # second dimension

In [5]:
X_train.shape, X_test.shape, X_train.dtype

((60000, 1, 28, 28), (10000, 1, 28, 28), dtype('uint8'))

In [6]:
y_train[:5] # see what categories we have

array([5, 0, 4, 1, 9], dtype=uint8)

In [38]:
??onehot

In [39]:
??to_categorical

In [7]:
# convert y to onehot encoding
y_train = onehot(y_train)
y_test = onehot(y_test)

In [8]:
y_train[:5]

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [9]:
# normalize the input
print(X_train.shape)
mean_px = X_train.mean().astype(np.float32)
std_px = X_train.std().astype(np.float32)

(60000, 1, 28, 28)


In [10]:
def norm_input(x): return (x - mean_px)/std_px

In [10]:
# LINEAR MODEL
def get_lin_model():
    """This is ensentially a multi-output linear model"""
    model = Sequential([
        Lambda(norm_input, input_shape = (1, 28, 28)),
        Flatten(),
        Dense(10, activation='softmax')
    ])
    model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [11]:
lm = get_lin_model()

  .format(self.name, input_shape))


In [11]:
# make keras' numpyArrayIterator
gen = image.ImageDataGenerator()
batches = gen.flow(X_train, y_train, batch_size=batch_size)
test_batches = gen.flow(X_test, y_test, batch_size=batch_size)

# new keras params:
# steps_per_epoch: Total number of steps (batches of samples) 
# before declaring one epoch finished and starting the next
# epoch.  similar for validation_steps
steps_per_epoch = int(np.ceil(batches.n/batch_size))
validation_steps = int(np.ceil(test_batches.n/batch_size))

(batches.n, test_batches.n, steps_per_epoch, validation_steps) # show dims

(60000, 10000, 235, 40)

In [12]:
# fitting data:
# to start, use default learning rate for 1 epoch and see what you get
# for linear model, we get 91% acc, not really great  
lm.fit_generator(batches, steps_per_epoch=steps_per_epoch,
                    epochs=1, 
                    validation_data=test_batches, 
                    validation_steps=validation_steps)

NameError: name 'lm' is not defined

In [13]:
# SINGLE DENSE LAYER
def get_fc_model():
    """Add a fully-connected hidden layer. Old school."""
    model = Sequential([
        Lambda(norm_input, input_shape=(1, 28, 28)),
        Flatten(),
        Dense(1024, activation='softmax'),
        Dense(10, activation='softmax')
        ])
    model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [14]:
fc = get_fc_model()
# With 8 training epochs, I got 93% validation accuracy
# pretty good
fc.fit_generator(batches, epochs=8, 
                 steps_per_epoch=steps_per_epoch,
                 validation_steps=validation_steps, 
                 validation_data=test_batches)

  .format(self.name, input_shape))


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f0fe3214710>

In [15]:
# BASIC VGG-STYLE CNN
def get_model():
    model = Sequential([
        Lambda(norm_input, input_shape=(1, 28, 28)),
        Convolution2D(32, 3, 3, activation='relu'),
        Convolution2D(32, 3, 3, activation='relu'),
        MaxPooling2D(),
        Convolution2D(64, 3, 3, activation='relu'),
        Convolution2D(64, 3, 3, activation='relu'),
        MaxPooling2D(),
        Flatten(),
        Dense(512, activation='relu'),
        Dense(10, activation='softmax')
    ])
    model.compile(Adam(), loss='categorical_crossentropy',
                 metrics=['accuracy'])
    return model

In [16]:
model = get_model()

  """
  
  
  if __name__ == '__main__':
  .format(self.name, input_shape))


In [17]:
model.fit_generator(batches, epochs=1,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

Epoch 1/1


<keras.callbacks.History at 0x7f0f9c4c5e10>

In [18]:
# change learning rate
model.optimizer.lr=0.1

In [19]:
model.fit_generator(batches, epochs=1,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

Epoch 1/1


<keras.callbacks.History at 0x7f0f9b56e1d0>

In [20]:
# one more epoch - we overfitted this time
model.optimizer.lr = 0.05
model.fit_generator(batches, epochs=1,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

Epoch 1/1


<keras.callbacks.History at 0x7f0f9bd72610>

In [21]:
# DATA AUGMENTATION
model = get_model()

  """
  
  
  if __name__ == '__main__':
  .format(self.name, input_shape))


In [22]:
gen = image.ImageDataGenerator(rotation_range=8, width_shift_range=0.08,
                              height_shift_range=0.08, zoom_range=0.08)
batches = gen.flow(X_train, y_train, batch_size=64)
test_batches = gen.flow(X_test, y_test, batch_size=64)

steps_per_epoch = int(np.ceil(batches.n/batch_size))
validation_steps = int(np.ceil(test_batches.n/batch_size))

# TODO: visualize data augmentation

In [24]:
model.fit_generator(batches, epochs=4,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f0ff671aed0>

In [25]:
# BATCH NORM
def get_model_bn():
    model = Sequential([
        Lambda(norm_input, input_shape=(1, 28, 28)),
        Convolution2D(32, 3, 3, activation='relu'),
        BatchNormalization(axis=1),
        Convolution2D(32, 3, 3, activation='relu'),
        MaxPooling2D(),
        BatchNormalization(axis=1),
        Convolution2D(64, 3, 3, activation='relu'),
        BatchNormalization(axis=1),
        Convolution2D(64, 3, 3, activation='relu'),
        MaxPooling2D(),
        Flatten(),
        BatchNormalization(),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dense(10, activation='softmax')
    ])
    model.compile(Adam(), loss='categorical_crossentropy',
                 metrics=['accuracy'])
    return model

# TODO: WHY DO WE NEED AXIS=1 for batchnorm in Conv layers?
# solution is on the forum

In [31]:
model_bn = get_model_bn()

  """
  import sys
  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  .format(self.name, input_shape))


In [33]:
# start with default learning rate first
model_bn.fit_generator(batches, epochs=1,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

Epoch 1/1


<keras.callbacks.History at 0x7f0f86c0d0d0>

In [27]:
model_bn.optimizer.lr = 0.1

In [34]:
model_bn.fit_generator(batches, epochs=4,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f0ffbd8c650>

In [35]:
model_bn.optimizer.lr = 0.01

In [36]:
model_bn.fit_generator(batches, epochs=4,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f0f8bd4dfd0>

In [None]:
model_bn.optimizer.lr = 0.001

In [None]:
model_bn.fit_generator(batches, epochs=8,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

In [40]:
# BATCHNORM + DROPOUT + DATA AUGMENTATION
# Combine all the good stuff so far
def get_model_bn_do():
    model = Sequential([
        Lambda(norm_input, input_shape=(1, 28, 28)),
        Convolution2D(32, 3, 3, activation='relu'),
        BatchNormalization(axis=1),
        Convolution2D(32, 3, 3, activation='relu'),
        MaxPooling2D(),
        BatchNormalization(axis=1),
        Convolution2D(64, 3, 3, activation='relu'),
        BatchNormalization(axis=1),
        Convolution2D(64, 3, 3, activation='relu'),
        MaxPooling2D(),
        Flatten(),
        BatchNormalization(),
        Dense(512, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(10, activation='softmax')
    ])
    model.compile(Adam(), loss='categorical_crossentropy',
                 metrics=['accuracy'])
    return model

In [41]:
model_bn_do = get_model_bn_do()

  
  
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
  .format(self.name, input_shape))


In [111]:
# should ALWAYS use the default learning rates for a couple of epoch
model_bn_do.fit_generator(batches, epochs=1,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

Epoch 1/1


<keras.callbacks.History at 0x7f0f51151590>

In [43]:
model_bn_do.optimizer.lr = 0.1 # shift-M to merge in command mode
model_bn_do.fit_generator(batches, epochs=4,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f0f7db87e50>

In [44]:
model_bn_do.optimizer.lr = 0.01 # shift-M to merge in command mode
model_bn_do.fit_generator(batches, epochs=8,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f0f7d43add0>

In [46]:
model_bn_do.optimizer.lr = 0.001 # shift-M to merge in command mode
model_bn_do.fit_generator(batches, epochs=12,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f0f7db87d90>

In [48]:
# ENSEMBLE ACCROSS ALL THESE MODELS
# TODO: MAKE 6 MODELS LIKE THIS and average them
def fit_model():
    model_bn_do = get_model_bn_do()
    model_bn_do.fit_generator(batches, epochs=8,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)
    
    model_bn_do.optimizer.lr = 0.1 
    model_bn_do.fit_generator(batches, epochs=4,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)
    
    model_bn_do.optimizer.lr = 0.01 
    model_bn_do.fit_generator(batches, epochs=8,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)
    
    model_bn_do.optimizer.lr = 0.001 
    model_bn_do.fit_generator(batches, epochs=8,
                steps_per_epoch=steps_per_epoch,
                validation_steps=validation_steps, 
                validation_data=test_batches)
    return model_bn_do

In [49]:
models = [fit_model() for i in range(6)]

  
  
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
  .format(self.name, input_shape))


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


  .format(self.name, input_shape))


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


  .format(self.name, input_shape))


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


  .format(self.name, input_shape))


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


  .format(self.name, input_shape))


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


  .format(self.name, input_shape))


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [51]:
HOME_DIR = os.getenv('HOME')
path = HOME_DIR + '/data/mnist/'
model_path = path + 'models/'

In [53]:
for i, m in enumerate(models):
    m.save_weights(model_path+'cnn-mnist-'+str(i)+'.pkl')


In [54]:
evals = np.array([m.evaluate(X_test, y_test, batch_size=256) for m in models])



In [55]:
evals.mean(axis=0)

array([0.0193, 0.9937])

In [56]:
all_preds = np.stack([m.predict(X_test, batch_size=256) for m in models])

In [57]:
all_preds.shape

(6, 10000, 10)

In [92]:
avg_preds = all_preds.mean(axis=0).astype(np.float64)
avg_preds.shape, y_test.shape

((10000, 10), (10000, 10))

In [108]:
# keras metric function takes (y_true, y_pred)
# it returns array of size [n_samples] with interger values in [0, 1]
# need to do eval() and mean() to get the accuracy over the whole set
ensemble_acc = keras.metrics.categorical_accuracy(
    y_test, avg_preds).eval().mean()

In [109]:

print(ensemble_acc.shape, avg_preds.shape, y_test.shape)
print(y_test[0,:], avg_preds[0,:])
print(y_test.dtype, avg_preds.dtype)
print(ensemble_acc)

() (10000, 10) (10000, 10)
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.] [4.7674e-07 4.2028e-07 1.1153e-07 2.0570e-07 4.2358e-08 9.1133e-09 3.0495e-10 9.9999e-01 1.4833e-09
 5.5657e-06]
float64 float64
0.9963


In [110]:
# deeper look into the categorical_accuracy function in keras
from keras.metrics import *
res = categorical_accuracy(
    np.array([[0, 1.0, 0], [1.0, 0.0, 0]], dtype=float), 
    np.array([[0, 1.0, 0], [0.0, 1.0, 0]], dtype=float)).eval()
res.mean()

0.5

In [None]:
# TODO: TRAIN FOR LONGER TIME AND PARAM SEARCH FOR THE DATA
