# Grand Classification Challenge

The provided file digits.zip contains training and test images of handwritten digits. Each row describes one 28x28-pixel sample image (see a few examples below). The dataset is split into two parts: 500 examples in the training set and 500 examples in the test set. For the training set, the first number in each row is the target class (i.e. 0-9). For the test set, these class assignments are random and can be ignored. The remaining numbers define the image in a sparse representation, i.e. as pairs of PixelID:GrayValue, where black pixels (GrayValue=0) are not explicitly listed.

In [None]:
# load the modules
import numpy as np
from sidekit.libsvm import svmutil
from matplotlib import pyplot as plt
%matplotlib inline

from scipy.ndimage import interpolation

from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

## Load and Inspect the Data Set

Load the dataset using `libsvm` and convert the sparse feature vectors to dense ndarrays, filling in `0` for the black pixels that are not listed explicitly.

In [None]:
def convert_to_ndarray(X_in, n_dim=28*28):
    '''
    Expand sparse samples into a dense 2D array.

    Synopsis:
        X = convert_to_ndarray(X_in)
    Arguments:
        X_in:    sequence of samples, each a dict mapping pixelID -> GreyValue
        n_dim:   number of features (columns) per sample, default 28*28
    Output:
        X:       2D array of non-sparse features (samples x features);
                 pixels that a sample does not list stay 0
    '''
    X_out = np.zeros((len(X_in), n_dim))
    # Each `row` is a view into X_out, so writing to it fills the result.
    for row, sample in zip(X_out, X_in):
        if sample:
            # Pixel IDs are used directly as column indices, so every ID has
            # to be a valid index into a row of length n_dim.
            row[list(sample.keys())] = list(sample.values())
    return X_out

def invert_pixels(X_in):
    '''
    Invert the grey values (new = 255 - old) so that white is 255, not 0.

    Synopsis:
        X = invert_pixels(X_in)
    Arguments:
        X_in:   2D array of samples (samples x features)
    Output:
        X:      2D float array with the inverted grey values, same shape
                as X_in
    '''
    # Operate on the argument itself; reading a module-level array here
    # would make every call return the inverse of that array instead.
    X_out = 255.0 - np.asarray(X_in, dtype=float)
    return X_out

In [None]:
# --- training set: sparse samples plus their class labels ---
y_raw, X_raw = svmutil.svm_read_problem('../data/digits/digit_train')
X = convert_to_ndarray(X_raw)   # densify first ...
X = invert_pixels(X)            # ... then flip the grey scale
y = np.asarray(y_raw).astype(int)

# --- test set: the label column is discarded ---
_, X_validation_raw = svmutil.svm_read_problem('../data/digits/digit_test')
X_validation = invert_pixels(convert_to_ndarray(X_validation_raw))

Visualize the first 12 training images alongside their labels. This gives a good first impression of what the data look like.

In [None]:
# Show the first twelve training digits in a 2 x 6 grid, titled by label.
n_rows, n_cols = 2, 6
fig = plt.figure(1, figsize=(2 * n_cols, 2 * n_rows))
for i in range(n_rows * n_cols):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    digit = X[i].reshape(28, 28)
    ax.imshow(digit, cmap=plt.cm.gray, interpolation="none")
    ax.set_title('label: {:d}'.format(y[i]))
    # blank out the tick labels on both axes
    ax.set_xticklabels('')
    ax.set_yticklabels('')
plt.show()

## Expand the dataset

In [None]:
def rotate_digits(X_in, angle):
    '''
    Rotate every flattened 28x28 image by the same angle.

    Synopsis:
        X = rotate_digits(X_in, angle)
    Arguments:
        X_in:   2D array of flattened 28x28 images (samples x 784)
        angle:  rotation angle handed to scipy's rotate
    Output:
        X:      2D array with the rotated images, same shape as X_in
    '''
    side = 28
    X_out = np.empty(X_in.shape)
    for out_row, flat in zip(X_out, X_in):
        # Rotate the image; areas not covered by the source get cval.
        canvas = interpolation.rotate(flat.reshape(side, side), angle=angle, cval=256)
        # Cut a side x side window; the offset is derived from the row count
        # and used for both axes.
        lo = int((canvas.shape[0] - side) / 2)
        hi = lo + side
        window = canvas[lo:hi, lo:hi]
        # Snap near-white values to 255 and near-black values to 0.
        window[window > 250] = 255
        window[window < 5] = 0
        out_row[:] = window.reshape(side * side)
    return X_out

In [None]:
# naive shift of the flattened image in one direction
def shift_pixels(X_in, pixels, fill_value=255):
    '''
    Shift every flattened sample towards index 0 by a number of positions.

    The shift acts on the flattened feature axis: output position j receives
    input position j + |pixels|. For row-major images this moves the content
    to the left, and the first pixels of each image row spill into the end
    of the previous row.

    Synopsis:
        X = shift_pixels(X_in, pixels)
    Arguments:
        X_in:        2D array of samples (samples x features)
        pixels:      integer shift; only its absolute value is used
        fill_value:  value written to the trailing positions that have no
                     source pixel (default 255, the white level produced by
                     invert_pixels)
    Output:
        X:           2D float array of the shifted samples, same shape
                     as X_in
    '''
    X_in = np.asarray(X_in)
    n_features = X_in.shape[1]
    # A shift larger than the sample leaves nothing to copy.
    shift = min(abs(pixels), n_features)
    # Pre-fill so the vacated tail is well defined instead of uninitialised.
    X_out = np.full(X_in.shape, fill_value, dtype=float)
    X_out[:, :n_features - shift] = X_in[:, shift:]
    return X_out

In [None]:
# expand dataset
# Each expanded set stacks the transformed copies on top of the originals:
# rows 0..N-1 are transformed, rows N..2N-1 are the untouched samples.
X_exp_rot = np.vstack((rotate_digits(X, 5), X))
X_exp_shift = np.vstack((shift_pixels(X, 5), X))
# The labels must follow the same block layout [y; y]. y.repeat(2) would
# interleave them (y0, y0, y1, y1, ...) and no longer match the rows.
y_exp = np.concatenate((y, y))

# make datasets for plotting
X_test1 = shift_pixels(X, 5)
X_test2 = rotate_digits(X, -15)
y_test = y

In [None]:
# plot our datasets to check if everything works
# One 2 x 6 preview per dataset; titles always show the training labels y.
for caption, images in (("shifted", X_test1), ("rotated", X_test2), ("original", X)):
    print (caption)
    fig = plt.figure(1, figsize=(6*2, 2*2))
    for i in range(12):
        ax = fig.add_subplot(2, 6, i + 1)
        picture = images[i].reshape(28, 28)
        ax.imshow(picture, cmap=plt.cm.gray, interpolation="none")
        ax.set_title('label: {:d}'.format(y[i]))
        # blank out the tick labels on both axes
        ax.set_xticklabels('')
        ax.set_yticklabels('')
    plt.show()

## More investigation on the Dataset

### The principal components spectrum

tbd

In [None]:
# Fit a PCA with default settings on the training images and plot the
# explained variance of each component.
pca = PCA().fit(X)
spectrum = pca.explained_variance_

plt.figure(figsize=(7, 6))
plt.plot(spectrum)
plt.ylabel('explained variance')
plt.xlabel('number pc')
plt.show()

### Visualize in 2D
Use PCA to determine the first two principal components and embed the training data in the reduced 2-dimensional space. A very blurry clustering can already be seen, which is an indication of the separability of the digit classes.

In [None]:
# Labels for the expanded sets. Both X_exp_rot and X_exp_shift are stacked as
# [transformed copies; originals], so the matching labels are [y; y].
y_stacked = np.concatenate((y, y))

# Rotated set: fit and project the expanded (rotated + original) data.
pca = PCA(n_components=2)
pca.fit(X_exp_rot)
X_embedded = pca.transform(X_exp_rot)

# Shifted set: components come from the original data, applied to the
# expanded (shifted + original) data. Use pca2 here, not pca.
pca2 = PCA(n_components=2)
pca2.fit(X)
X_embedded2 = pca2.transform(X_exp_shift)

# Original data only.
pca3 = PCA(n_components=2)
pca3.fit(X)
X_embedded3 = pca3.transform(X)

# number of digit classes (0 .. n-1) drawn in each panel
n = 3

plt.figure(figsize=(18, 15))
# (subplot position, embedded points, labels aligned with those points, title)
panels = (
    (221, X_embedded, y_stacked, "test set with reduced dimesions and rotated dataset"),
    (222, X_embedded2, y_stacked, "test set with reduced dimesions and shifted dataset"),
    (223, X_embedded3, y, "test set with reduced dimesions and original dataset"),
)
for position, points, labels, title in panels:
    plt.subplot(position)
    for d in range(n):
        # The boolean mask must have one entry per row of `points`.
        plt.scatter(points[labels==d,0], points[labels==d,1], s=20, label=d)
    plt.clim(0,9)
    plt.title(title)
    plt.xlabel("PC 1")
    plt.ylabel("PC 2")
    plt.legend()

plt.show()

# Classification

Take a classification algorithm of your choice (e.g. MLPs, C-SVMs, KNN) and use the training data to find the (hyper) parameters that will allow you to optimally predict new example digits.

In [None]:
# helper functions
def crossvalidation(clf, X, y, n_folds=8):
    '''
    Average score of a classifier over a K-fold split of the data.

    Synopsis:
        score = crossvalidation(clf, X, y, n_folds=8)
    Arguments:
        clf:      estimator offering fit(X, y) and score(X, y)
        X:        2D array of samples (samples x features)
        y:        1D array of labels, one per row of X
        n_folds:  number of folds given to KFold as n_splits
    Output:
        score:    mean of the per-fold scores
    '''
    splitter = KFold(n_splits=n_folds)
    fold_scores = []
    for train_idx, test_idx in splitter.split(X):
        # train on the fold's training part ...
        clf.fit(X[train_idx], y[train_idx])
        # ... and score on the held-out part
        fold_scores.append(clf.score(X[test_idx], y[test_idx]))
    return np.mean(fold_scores)

def plot_two_params_score(score_array, x, y, xlabel, ylabel, classifier_name):
    '''
    Draw a score matrix as an image on the current matplotlib axes.

    Synopsis:
        plot_two_params_score(score_array, x, y, xlabel, ylabel, name)
    Arguments:
        score_array:      2D array of scores, indexed [x parameter, y parameter]
        x:                1D array of values used as x tick labels
        y:                1D array of values used as y tick labels
        xlabel:           label of the x axis
        ylabel:           label of the y axis
        classifier_name:  name inserted into the title
    Output:
        None; the image, ticks, labels, title and colorbar are added to the
        current figure.
    '''
    best_score = score_array.max()
    x_positions = np.arange(len(x))
    y_positions = np.arange(len(y))

    # transpose so that the first axis of score_array runs along x
    plt.imshow(score_array.T, interpolation='none')
    plt.xticks(x_positions, x, rotation='vertical')
    plt.yticks(y_positions, y)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title('Score of {} [best={:4f}]'.format(classifier_name, best_score))
    plt.colorbar()

## Classification with KNN

tbd

In [None]:
# Grid over the number of principal components and the number of neighbours.
n_pc_array = np.array([1,2,3,4,5,10,20,50,100,200,500])
n_neighbors_array = np.arange(2, 12, 1)
# score_array[i, j] holds the cross-validated score for
# (n_pc_array[i], n_neighbors_array[j]).
score_array = np.zeros((n_pc_array.shape[0], n_neighbors_array.shape[0]))

for i, n_pc in enumerate(n_pc_array):
    # The projection depends only on n_pc, so it is computed once per row
    # instead of once per (n_pc, n_neighbors) pair.
    pca = PCA(n_components=n_pc)
    pca.fit(X_exp_shift)
    X_embedded = pca.transform(X_exp_shift)
    for j, n_neighbors in enumerate(n_neighbors_array):
        knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        # NOTE: y_exp has to be ordered like the rows of X_exp_shift.
        score_array[i,j] = crossvalidation(knn, X_embedded, y_exp)

In [None]:
# Heat map of the KNN scores: principal components on x, neighbours on y.
fig = plt.figure(num=1)
fig.set_size_inches((16, 6))
plot_two_params_score(
    score_array,
    n_pc_array,
    n_neighbors_array,
    xlabel='number pc',
    ylabel='k',
    classifier_name='KNN',
)
plt.show()

## Classification with RandomForest

tbd

In [None]:
# Grid over the forest size and the minimum number of samples per leaf.
n_estimators_array = np.array([10,50,100,200,500,1000])
min_samples_leaf_array = np.arange(2, 5, 1)
tuned_parameters = [{
    'n_estimators': n_estimators_array, 
    'min_samples_leaf': min_samples_leaf_array
}]
rf = RandomForestClassifier(criterion="gini")
clf = GridSearchCV(rf, tuned_parameters, cv=8)
clf.fit(X_exp_shift, y_exp)

# Build score_array[i, j] for (n_estimators_array[i], min_samples_leaf_array[j])
# by looking each result up through its parameter dict. The flat order of
# cv_results_ follows the grid's own parameter ordering, not the order in
# which the keys are written above, so a plain reshape would misplace scores.
row_of = {value: idx for idx, value in enumerate(n_estimators_array.tolist())}
col_of = {value: idx for idx, value in enumerate(min_samples_leaf_array.tolist())}
score_array = np.zeros((n_estimators_array.shape[0], min_samples_leaf_array.shape[0]))
for params, mean_score in zip(clf.cv_results_['params'],
                              clf.cv_results_['mean_test_score']):
    row = row_of[int(params['n_estimators'])]
    col = col_of[int(params['min_samples_leaf'])]
    score_array[row, col] = mean_score

In [None]:
# Heat map of the random-forest scores: estimators on x, leaf size on y.
fig = plt.figure(num=1)
fig.set_size_inches((5, 6))
plot_two_params_score(
    score_array,
    n_estimators_array,
    min_samples_leaf_array,
    xlabel='n estimators',
    ylabel='min samples leaf',
    classifier_name='Random Forest',
)
plt.show()