# Digit Recognizer

## 1. notes

For best results, set ```sample_fraction``` to 1 and ```epochs``` to 30 or higher.

A vote for this notebook is highly appreciated!

You can find the latest version of this notebook on [GitHub](http://nbviewer.jupyter.org/github/Brinkhuis/digit_recognizer/blob/master/notebook/cnn_model_advanced.ipynb).

## 2. import packages

In [None]:
# import packages and modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

In [None]:
# import packages and modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from random import randint

In [None]:
# import packages and modules
from keras.models import Sequential, model_from_json
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.utils.np_utils import to_categorical
from keras.optimizers import RMSprop
from keras.callbacks import ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator

## 3. settings

In [None]:
# random seed for reproducibility
# NOTE: only NumPy's global generator is seeded here; `seed` is also passed as
# `random_state` to the data sampling and the train/test split further down.
seed = 777
np.random.seed(seed)

In [None]:
# fraction for data sampling
# a value strictly between 0 and 1 down-samples the train data; any other value keeps all rows
sample_fraction = .1 # fraction = 1 is no sampling (i.e. all train data is used to train the model)

In [None]:
# fraction of the (possibly sampled) train data that train_test_split holds out as the test set
split_fraction = 0.1

In [None]:
# number of epochs the model is trained for
epochs = 10

In [None]:
# batch size of the augmentation generator; also used to derive steps_per_epoch when fitting
batch_size = 64

## 4. data preparation

### 4.1 read data

In [None]:
# read data files (paths are relative to the notebook's working directory)
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

### 4.2 sample data

In [None]:
# down-sample the train data only when the fraction lies strictly between 0 and 1
if 0 < sample_fraction < 1:
    print('Data sampled, {} data points will be used instead of all {} data points!'.format(round(train.shape[0] * sample_fraction), train.shape[0]))
    train = train.sample(frac = sample_fraction, random_state = seed)
else:
    print('Data not sampled, all {} data points will be used!'.format(train.shape[0]))

### 4.3 validate data

In [None]:
# stop early when either dataframe contains at least one null value
assert not train.isnull().values.any(), 'Null values found in train.'
assert not test.isnull().values.any(), 'Null values found in test.'

### 4.4 separate data

In [None]:
# store labels and pixels in separate arrays
# train: column 0 is taken as the label, all remaining columns as pixel values;
# test: every column is taken as a pixel value
y_train = train.iloc[:, 0].values.astype('uint8') # labels
X_train = train.iloc[:, 1:].values.astype('float32') # pixels (float32 so they can be scaled later)
X = test.values.astype('float32') # pixels of the data to predict

In [None]:
# the dataframes are no longer needed once the arrays exist
del train, test

### 4.5 inspect data

In [None]:
# show the value range of the labels and of both pixel arrays
pd.DataFrame([[y_train.min(), X_train.min(), X.min()],
              [y_train.max(), X_train.max(), X.max()]],
             index = ['min', 'max'],
             columns = ['y_train', 'X_train', 'X'])

In [None]:
# show memory usage of the three arrays in MB, rounded to two decimals
pd.DataFrame([round(arr.nbytes / 1024 ** 2, 2) for arr in (y_train, X_train, X)],
             columns = ['memory usage (MB)'],
             index = ['y_train', 'X_train', 'X'])

In [None]:
# plot label distribution
sns.set(style='white', context = 'notebook')
# pass the labels by keyword: newer seaborn releases no longer accept the
# data vector as a positional argument (it would be interpreted as `data`)
sns.countplot(x = y_train, color = 'royalblue')
plt.show()

In [None]:
# plot the first occurrence of every digit
for digit in range(10):
    # index of the first sample labelled `digit`; looked up once per digit
    # instead of rebuilding the list of all ten indices twice per iteration
    first_idx = np.where(y_train == digit)[0][0]
    plt.subplot(2, 5, digit + 1)
    plt.imshow(np.reshape(X_train[first_idx], (28, 28)),
               cmap=plt.get_cmap('gray_r')) # gray is the default grayscale; gray_r is inverse grayscale
    plt.title('index {}'.format(first_idx))
    plt.xticks([]) # hide xticks
    plt.yticks([]) # hide yticks
plt.show()

In [None]:
# plot the first ten occurrences of every digit: one row per digit, one column per occurrence
n = 10
plt.figure(figsize = (8, 8))
for digit in range(10):
    digit_rows = np.where(y_train == digit)[0]  # positions of all samples with this label
    for col in range(n):
        plt.subplot(10, n, digit * n + col + 1)
        plt.imshow(X_train[digit_rows[col]].reshape(28, 28), cmap = plt.get_cmap('gray_r'))
        plt.axis('off')
plt.show()
del n

### 4.6 normalize data

In [None]:
# normalize the data
# both arrays were cast to float32 when they were created, so the in-place
# division keeps that dtype (presumably mapping 8-bit grey levels to [0, 1] -- check the min/max table)
X_train /= 255
X /= 255

### 4.7 reshape data

In [None]:
# reshape the flat pixel rows into 28x28 single-channel images
X_train = np.reshape(X_train, (-1, 28, 28, 1))
X = np.reshape(X, (-1, 28, 28, 1))

# show the resulting shapes
pd.DataFrame([str(arr.shape) for arr in (X_train, X)],
             columns = ['shape'],
             index = ['X_train', 'X'])

### 4.8 encode labels

In [None]:
# encode labels to one hot vectors
# The class count is fixed at 10 to match the 10-unit softmax output layer of
# the model. Deriving it from the labels present in a (sampled) train set
# would yield too few columns whenever a digit happens to be missing.
y_train = to_categorical(y_train, num_classes = 10)

In [None]:
# visualize a randomly chosen label as a one hot vector
fig, ax = plt.subplots()
ax.set_xticks(list(range(10)))
ax.set_yticks(list(range(2)))
ax.xaxis.set_ticks_position('top')
# randint includes BOTH endpoints, so the upper bound must be the last valid
# row index; using shape[0] itself could raise an IndexError
ax.plot(y_train[randint(0, y_train.shape[0] - 1)])
plt.show()

### 4.9 split train/test

In [None]:
# split the train and the validation set for the fitting
# `split_fraction` of the samples is held out as (X_test, y_test); the split is seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = split_fraction, random_state = seed)

## 5. create model

### 5.1 define model

In [None]:
# create model: four conv-conv-pool-dropout stages followed by a dense classifier
model = Sequential([
    # stage 1: two 5x5 convolutions, 32 filters each
    Conv2D(filters = 32, kernel_size = (5,5), padding = 'Same', activation = 'relu', input_shape = (28,28,1)),
    Conv2D(filters = 32, kernel_size = (5,5), padding = 'Same', activation = 'relu'),
    MaxPool2D(pool_size = (2,2)),
    Dropout(0.25),

    # stage 2: two 3x3 convolutions, 64 filters each
    Conv2D(filters = 64, kernel_size = (3,3), padding = 'Same', activation = 'relu'),
    Conv2D(filters = 64, kernel_size = (3,3), padding = 'Same', activation = 'relu'),
    MaxPool2D(pool_size = (2,2), strides = (2,2)),
    Dropout(0.25),

    # stage 3: two 3x3 convolutions, 128 filters each
    Conv2D(filters = 128, kernel_size = (3,3), padding = 'Same', activation = 'relu'),
    Conv2D(filters = 128, kernel_size = (3,3), padding = 'Same', activation = 'relu'),
    MaxPool2D(pool_size = (2,2), strides = (2,2)),
    Dropout(0.25),

    # stage 4: two 3x3 convolutions, 256 filters each, pooled with a 3x3 window
    Conv2D(filters = 256, kernel_size = (3,3), padding = 'Same', activation = 'relu'),
    Conv2D(filters = 256, kernel_size = (3,3), padding = 'Same', activation = 'relu'),
    MaxPool2D(pool_size = (3,3), strides = (3,3)),
    Dropout(0.25),

    # classifier head: flatten, one hidden dense layer, 10-way softmax output
    Flatten(),
    Dense(1024, activation = 'relu'),
    Dropout(0.5),
    Dense(10, activation = 'softmax'),
])

In [None]:
# model summary: print the layers of the model defined above
model.summary()

### 5.2 compile model

In [None]:
# define the optimizer: RMSprop starting at a learning rate of 0.001 with no built-in decay
# (the learning rate is lowered during training by the ReduceLROnPlateau callback instead)
optimizer = RMSprop(lr = 0.001, rho = 0.9, epsilon = 1e-08, decay = 0.0)

In [None]:
# compile the model: categorical cross-entropy fits the one-hot encoded labels; accuracy is tracked as metric
model.compile(optimizer = optimizer, loss = 'categorical_crossentropy', metrics = ['accuracy'])

### 5.3 learning rate annealing

In [None]:
# set learning rate annealer
# halves the learning rate (factor 0.5) once the monitored 'val_acc' has not
# improved for 3 epochs (patience), never going below min_lr = 1e-5
learning_rate_reduction = ReduceLROnPlateau(monitor = 'val_acc', patience = 3, verbose = 0, factor = 0.5, min_lr = 0.00001)

### 5.4 data augmentation

In [None]:
# set arguments for real-time data augmentation
# all centering / normalization / whitening options are switched off; only random
# geometric changes are enabled (rotation, zoom, horizontal and vertical shift).
# Flips are disabled -- presumably because mirrored digits are no longer valid digits.
datagen = ImageDataGenerator(featurewise_center = False,
                             samplewise_center = False,
                             featurewise_std_normalization = False,
                             samplewise_std_normalization = False,
                             zca_whitening = False,
                             rotation_range = 20,
                             zoom_range = 0.1,
                             width_shift_range = 0.1,
                             height_shift_range = 0.1,
                             horizontal_flip = False,
                             vertical_flip = False)

In [None]:
# compute quantities required for featurewise normalization
# NOTE(review): featurewise_center, featurewise_std_normalization and zca_whitening
# are all False in the generator above, so this call is presumably not needed for
# the current configuration -- confirm against the ImageDataGenerator.fit docs
datagen.fit(X_train)

### 5.5 train model

In [None]:
# fit the model
# trains on augmented batches produced by datagen.flow and validates on the
# held-out (X_test, y_test) split; steps_per_epoch uses floor division, so each
# epoch runs as many steps as there are full batches in X_train.
# verbose = 0 suppresses the per-epoch progress output.
history = model.fit_generator(datagen.flow(X_train, y_train, batch_size = batch_size),
                              epochs = epochs, validation_data = (X_test, y_test),
                              verbose = 0, steps_per_epoch = X_train.shape[0] // batch_size, 
                              callbacks = [learning_rate_reduction])

In [None]:
# list the metric names recorded in the training history
pd.DataFrame(index = pd.Index(list(history.history), name = 'dict_keys'))

In [None]:
# per-epoch train and test accuracy plus the absolute gap between the two
acc = pd.DataFrame({'train': history.history['acc'], 'test': history.history['val_acc']})
acc['abs_dist'] = (acc['train'] - acc['test']).abs()

In [None]:
# summarize history for accuracy
plt.figure(figsize = (12, 6))
# the two curves are drawn first so that the legend entries below map onto them
plt.plot(history.history['acc'], color = 'blue')
plt.plot(history.history['val_acc'], color = 'red')

# big dots mark every epoch that reaches the maximum accuracy (train = blue, test = red)
for key, colour in (('acc', 'blue'), ('val_acc', 'red')):
    values = history.history[key]
    best = max(values)  # computed once per curve instead of once per epoch
    for epoch, value in enumerate(values):
        if value == best:
            plt.plot(epoch, value, marker = 'o', markersize = 12, color = colour, linestyle = '')

# small yellow dots mark, on both curves, the epochs where train and test accuracy are closest;
# columns are addressed by name so the result does not depend on the column order of `acc`
for epoch in acc.index[acc['abs_dist'] == acc['abs_dist'].min()]:
    for column in ('train', 'test'):
        plt.plot(epoch, acc.loc[epoch, column], marker = 'o', markersize = 8, color = 'yellow', linestyle = '')

plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc = 'best')
plt.show()

In [None]:
# accuracy summary: best values and the epoch(s) at which they occur
pd.DataFrame([[acc['train'].max(), acc.index[acc['train'] == acc['train'].max()].tolist()],
              [acc['test'].max(), acc.index[acc['test'] == acc['test'].max()].tolist()],
              [acc['abs_dist'].min(), acc.index[acc['abs_dist'] == acc['abs_dist'].min()].tolist()]],
             index = ['maximum accuracy train', 'maximum accuracy test', 'minimum distance train and test accuracy'],
             columns = ['value', 'epoch'])

In [None]:
# per-epoch train and test loss plus the absolute gap between the two
loss = pd.DataFrame({'train': history.history['loss'], 'test': history.history['val_loss']})
loss['abs_dist'] = (loss['train'] - loss['test']).abs()

In [None]:
# summarize history for loss
plt.figure(figsize = (12, 6))
# the two curves are drawn first so that the legend entries below map onto them
plt.plot(history.history['loss'], color = 'blue')
plt.plot(history.history['val_loss'], color = 'red')

# big dots mark every epoch that reaches the minimum loss (train = blue, test = red)
for key, colour in (('loss', 'blue'), ('val_loss', 'red')):
    values = history.history[key]
    best = min(values)  # computed once per curve instead of once per epoch
    for epoch, value in enumerate(values):
        if value == best:
            plt.plot(epoch, value, marker = 'o', markersize = 12, color = colour, linestyle = '')

# small yellow dots mark, on both curves, the epochs where train and test loss are closest;
# columns are addressed by name so the result does not depend on the column order of `loss`
for epoch in loss.index[loss['abs_dist'] == loss['abs_dist'].min()]:
    for column in ('train', 'test'):
        plt.plot(epoch, loss.loc[epoch, column], marker = 'o', markersize = 8, color = 'yellow', linestyle = '')

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc = 'best')
plt.show()

In [None]:
# loss summary: best values and the epoch(s) at which they occur
pd.DataFrame([[loss['train'].min(), loss.index[loss['train'] == loss['train'].min()].tolist()],
              [loss['test'].min(), loss.index[loss['test'] == loss['test'].min()].tolist()],
              [loss['abs_dist'].min(), loss.index[loss['abs_dist'] == loss['abs_dist'].min()].tolist()]],
             index = ['minimum loss train', 'minimum loss test', 'minimum distance train and test loss'],
             columns = ['value', 'epoch'])

In [None]:
# plot the loss and accuracy curves for training and validation
fig, ax = plt.subplots(2, 1)

# top panel: loss (the redundant `axes` keyword was dropped; the line is
# already drawn on ax[0] by calling ax[0].plot)
ax[0].plot(history.history['loss'], color = 'blue', label = 'Training loss')
ax[0].plot(history.history['val_loss'], color = 'red', label = 'validation loss')
ax[0].legend(loc = 'best', shadow = True)

# bottom panel: accuracy
ax[1].plot(history.history['acc'], color = 'blue', label = 'Training accuracy')
ax[1].plot(history.history['val_acc'], color = 'red', label = 'Validation accuracy')
ax[1].legend(loc = 'best', shadow = True)

plt.show()

### 5.6 evaluate model

In [None]:
# baseline error
# measured on the held-out split (X_test, y_test): scoring the samples the model
# was trained on would report an over-optimistic error
scores = model.evaluate(X_test, y_test, verbose = 0)
print('Baseline Error: {:.2f}%'.format(100-scores[1] * 100))

In [None]:
# define plot confusion matrix
def plot_confusion_matrix(cm, classes, title = 'Confusion matrix', cmap = plt.cm.Blues):
    '''
    Draw a confusion matrix as an annotated heat map and show the figure.

    cm      -- 2-D array of counts, indexed as cm[true label, predicted label]
    classes -- tick labels, used for both axes
    title   -- title of the figure
    cmap    -- colormap used for the heat map
    '''
    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation = 0)
    plt.yticks(positions, classes)

    # cells darker than half the maximum get white text so the count stays readable
    half_max = cm.max() / 2
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            if count > half_max:
                text_colour = 'white'
            else:
                text_colour = 'black'
            plt.text(col, row, count,
                     horizontalalignment = 'center',
                     verticalalignment = 'center',
                     color = text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [None]:
# confusion matrix on the held-out split (X_test, y_test), i.e. on samples the model was not trained on
predicted_labels_ohv = model.predict(X_test) # predicted class scores, one row per sample
predicted_labels = np.argmax(predicted_labels_ohv, axis = 1) # convert the score vectors to predicted labels
true_labels = np.argmax(y_test, axis = 1) # convert the one-hot-vectors back to the true labels
# `labels` pins the matrix to one row/column per class, even when a class is
# absent from both the true and the predicted labels of the held-out split
confusion_mtx = confusion_matrix(true_labels, predicted_labels, labels = list(range(y_test.shape[1])))

In [None]:
# print confusion matrix as raw counts (rows: true labels, columns: predicted labels)
print(confusion_mtx)

In [None]:
# plot confusion matrix; the tick labels are the class indices taken from the one-hot width
plot_confusion_matrix(confusion_mtx, classes = range(y_train.shape[1]))

In [None]:
# create confusion table: one row per (true label, predicted label) pair with its count
# the loop bounds come from the matrix itself instead of a hard-coded 10, so a
# matrix with a different number of classes cannot be indexed out of range
n_true, n_pred = confusion_mtx.shape
conf_tbl = [[i, j, confusion_mtx[i, j]] for i in range(n_true) for j in range(n_pred)]
confusion_tbl = pd.DataFrame(conf_tbl, columns = ['True label', 'Predicted label', 'Count'])

In [None]:
# top 5 prediction errors: off-diagonal cells with a non-zero count, largest first
(confusion_tbl[(confusion_tbl['Count'] != 0)
               & (confusion_tbl['True label'] != confusion_tbl['Predicted label'])]
 .sort_values(by = 'Count', ascending = False)
 .head(5)
 .set_index(['True label', 'Predicted label']))

In [None]:
# prediction errors per label: total misclassified count per true label, largest first
(confusion_tbl.loc[(confusion_tbl['Count'] != 0)
                   & (confusion_tbl['True label'] != confusion_tbl['Predicted label']),
                   ['True label', 'Count']]
 .groupby('True label')
 .sum()
 .sort_values(by = 'Count', ascending = False))

## 6. save model

In [None]:
# write the model architecture to disk as JSON
with open('model.json', 'w') as json_file:
    json_file.write(model.to_json())

In [None]:
# serialize weights to HDF5 (the architecture is stored separately in model.json)
model.save_weights('model.h5')

## 7. load model

In [None]:
# load json and create model
# the context manager closes the file even if reading raises
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

In [None]:
# load weights into model (model_from_json restores the architecture only)
loaded_model.load_weights('model.h5')

In [None]:
# compile loaded model with the same optimizer object, loss and metric as the original model
loaded_model.compile(optimizer = optimizer , loss = 'categorical_crossentropy', metrics = ['accuracy'])

## 8. make predictions

In [None]:
# predictions with loaded model
# Sequential.predict_classes is no longer available in newer Keras releases;
# taking the argmax over the softmax output yields the same class labels and
# matches how the confusion matrix cell derives its predictions
predictions = np.argmax(loaded_model.predict(X, verbose = 0), axis = -1)

## 9. save predictions

In [None]:
# write the predictions as an (ImageId, Label) table and show a download link
from IPython.display import FileLink
preds = pd.DataFrame({'ImageId': range(1, len(predictions) + 1), 'Label': predictions})
preds.to_csv('predictions.csv', header = True, index = False)
FileLink('predictions.csv')