## **PART I**
* Install the libraries
* Download the dataset
* Reshape the data
* Convert the image data to grayscale

In [0]:
!pip install prettytable
from scipy.io import loadmat
import psutil
import gc # del vaires; gc.collect() 
import os
import urllib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from time import time
from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import cross_validate
from sklearn import tree
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
from prettytable import PrettyTable

# `import urllib` on its own does not guarantee that the `urllib.request`
# submodule is loaded; import it explicitly so the download does not rely on
# another library having imported it as a side effect.
import urllib.request

# Fetch the house-number .mat files. Without a target filename, urlretrieve
# stores each download in a temporary file and returns that file's path.
f_test, _ = urllib.request.urlretrieve('http://ufldl.stanford.edu/housenumbers/test_32x32.mat')
f_train, _ = urllib.request.urlretrieve('http://ufldl.stanford.edu/housenumbers/train_32x32.mat')

# Parse the MATLAB files; the cells below read the 'X' and 'y' entries.
raw_data_train = loadmat(f_train)
raw_data_test = loadmat(f_test)


def _to_gray_rows(images):
    """Mix the three colour channels into grayscale, one row per sample.

    ``images`` is indexed as ``images[:, :, c]`` for channel ``c`` and its
    last axis enumerates the samples (assumed layout: height x width x
    channel x sample -- confirm against the .mat file).  The channels are
    weighted 0.30 / 0.59 / 0.11 and the result has shape
    ``(n_samples, n_pixels)``.
    """
    gray = (images[:, :, 0] * 0.30 +
            images[:, :, 1] * 0.59 +
            images[:, :, 2] * 0.11)
    # Read the sample count from the array instead of hard-coding it, so the
    # same code works for any split (or a subsample) of the dataset.
    n_samples = gray.shape[-1]
    return np.reshape(gray, (-1, n_samples)).T


data_train_gray = _to_gray_rows(raw_data_train.get('X'))
data_test_gray = _to_gray_rows(raw_data_test.get('X'))

# Train rows followed by test rows in a single matrix.
data_conc = np.vstack((data_train_gray, data_test_gray))

# Ground-truth labels: read the stored 'y' arrays and flatten them to 1-D.
label_train = raw_data_train.get('y').ravel()
label_test = raw_data_test.get('y').ravel()

def _to_rgb_rows(images):
    """Flatten a colour image stack to one feature row per sample.

    The three channel slices ``images[:, :, c]`` are placed side by side
    along axis 1 and the result is reshaped to ``(n_samples, n_features)``.
    The sample count is taken from the last axis and the feature count is
    inferred, so no dataset-specific sizes are hard-coded.
    """
    side_by_side = np.hstack((images[:, :, 0],
                              images[:, :, 1],
                              images[:, :, 2]))
    n_samples = side_by_side.shape[-1]
    return np.reshape(side_by_side, (-1, n_samples)).T


data_train = _to_rgb_rows(raw_data_train.get('X'))
data_test = _to_rgb_rows(raw_data_test.get('X'))



## *PART II*

Compare classification performance on the original colour images and on the grayscale-converted images,
using a decision tree with 10-fold cross-validation.

In [0]:
# Both trees get the same fixed seed so that re-running the cell reproduces
# the scores and the colour-vs-gray comparison is not affected by random
# tie-breaking inside the tree builder.
nb = tree.DecisionTreeClassifier(random_state=0)
cv_results = cross_validate(nb, data_train, label_train,
                            return_train_score=False, cv=10)

nb_gray = tree.DecisionTreeClassifier(random_state=0)
cv_results_gray = cross_validate(nb_gray, data_train_gray, label_train,
                                 return_train_score=False, cv=10)

# Report the outcome of the comparison instead of leaving it only in variables.
print('Colour images: mean CV score = %.4f' % np.mean(cv_results['test_score']))
print('Gray images:   mean CV score = %.4f' % np.mean(cv_results_gray['test_score']))

## *PART III*
* Use IncrementalPCA (IPCA) to reduce the data to 100 dimensions

In [0]:
# Fit the incremental PCA on the training rows only, then project both splits
# onto the same 100 components.
pca_gray = IncrementalPCA(n_components=100, batch_size=2000)
pca_gray.fit(data_train_gray)
decom_data_train_gray = pca_gray.transform(data_train_gray)
decom_data_test_gray = pca_gray.transform(data_test_gray)

# Cumulative explained-variance percentage after keeping 1, 2, ..., k components.
# The curve covers exactly the fitted components: the earlier range(200) ran
# past the 100 available ratios and drew a meaningless flat tail, and the x
# axis started at 0 although the first point already includes one component.
cumulative_pct = 100 * np.cumsum(pca_gray.explained_variance_ratio_)
component_counts = np.arange(1, cumulative_pct.size + 1)
plt.plot(component_counts, cumulative_pct, color='blue', label='entropy')
plt.ylabel('entropy percent/%')
plt.xlabel('dimension')

## *PART IV*
* Use GridSearchCV to tune the hyperparameters
* Predict the test labels using the optimal parameters

**Note:** This part takes a long time (about 10 hours), because GridSearchCV evaluates every parameter combination using 10-fold CV.

The best estimator found by the search is recorded here:
KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=15, p=2,
           weights='distance')

In [0]:
# Hyperparameter grid for the k-nearest-neighbours classifier on the
# PCA-reduced grayscale features.
classifier_Knn_gray = KNeighborsClassifier()
parameters = {'n_neighbors': [3, 5, 10, 15, 20, 25, 30],
              'weights': ['distance', 'uniform'],
              'algorithm': ['kd_tree']}

# Request 10 folds explicitly: without `cv` the search uses the library's
# default fold count, which is not the 10-fold CV this notebook describes.
modifiy = GridSearchCV(classifier_Knn_gray, parameters, cv=10)
modifiy.fit(decom_data_train_gray, label_train)

## *PART V*
* Pass the parameters to the classifier, train it on the training data, and predict the test labels

In [0]:
# Final classifier, configured with the settings recorded from the grid
# search, trained on the PCA-reduced training features.
Knn_gray = KNeighborsClassifier(
    n_neighbors=15,
    weights='distance',
    algorithm='kd_tree',
    leaf_size=30,
    p=2,
    metric='minkowski',
    metric_params=None,
    n_jobs=1,
)
Knn_gray.fit(decom_data_train_gray, label_train)

# Predicted labels for the PCA-reduced test features.
result_test_gray = Knn_gray.predict(decom_data_test_gray)

## *PART VI*
* Compute the confusion matrix and plot it

In [0]:
# Axis tick labels for the ten classes.
class_names = list(range(10))

# Relabel class 10 as 0 (in place) in both the true and the predicted labels,
# presumably because the dataset encodes the digit zero as 10 -- confirm.
label_test[label_test == 10] = 0
result_test_gray[result_test_gray == 10] = 0

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : numpy.ndarray
        Square matrix of counts; rows are drawn as the true label and
        columns as the predicted label.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool
        When True, every row is divided by its total so each cell shows the
        fraction of that true class.  A row whose total is zero is left as
        zeros instead of producing NaN.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colour map used for the heat map.

    The matrix is drawn on the current matplotlib figure; nothing is returned.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; a class without any
        # true samples would otherwise yield 0/0 = NaN and break the
        # colour threshold computed below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions are shown with two decimals, raw counts as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run tight_layout last so the axis labels are included in the layout.
    plt.tight_layout()
    
# Show floats with two decimals when arrays are printed.
np.set_printoptions(precision=2)

# Confusion matrix over the ten digit classes, true labels against predictions.
cnf_matrix = confusion_matrix(label_test, result_test_gray,
                              labels=list(range(10)))

# Draw the raw (non-normalized) counts on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=class_names,
    title='Confusion matrix, without normalization',
)
plt.show()

## *PART VII*
* Compute TP, FN and FP for each class and print a table of precision, recall, F1-score and support

In [0]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Per-class metrics derived from the confusion matrix (rows = true labels,
# columns = predicted labels).
label = np.arange(0, 10)
x = PrettyTable(['Label', 'Precision', 'recall', 'F1-score', 'Support'])
for i in label:
    TP = cnf_matrix[i, i]
    FN = np.sum(cnf_matrix[i]) - TP       # true class i, predicted as something else
    FP = np.sum(cnf_matrix[:, i]) - TP    # predicted as i, actually another class
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    # F1 is the harmonic mean of precision and recall, i.e.
    # 2*TP / (2*TP + FP + FN); the factor 2 in the numerator was missing.
    F_measure = _safe_ratio(2 * TP, 2 * TP + FP + FN)
    Support = TP + FN                     # number of true samples of class i
    x.add_row([label[i], round(precision, 3), round(recall, 3),
               round(F_measure, 3), round(Support, 0)])
print('Table 1. Precision,Recall,F1 and Support score for each label')
print(x)