# Gaussian Naive Bayes classifier, on all datasets

#### Import statements

In [14]:
from sklearn import metrics, datasets
import numpy as np
import MNIST
from gaussianNaiveBayesianClassifier import GaussianNaiveBayesianClassifier
%config IPCompleter.greedy=True


### 1. Digits dataset
Load the dataset and split it into 70% training and 30% test data.

In [15]:
# Load the scikit-learn handwritten-digits dataset. `digits.data` is the
# feature matrix and `digits.target` the matching label vector.
digits = datasets.load_digits()

# Ordered (unshuffled) split: the first 70% of the rows are used for training
# and the remaining rows for testing.
num_split = int(0.7*len(digits.data))
train_features, test_features = digits.data[:num_split], digits.data[num_split:]
train_labels, test_labels = digits.target[:num_split], digits.target[num_split:]

# Sanity check: the two parts must add up to the size of the full dataset.
print("Number of training examples: ",len(train_features))
print("Number of test examples: ",len(test_features))
print("Number of total examples:", len(train_features)+len(test_features))

Number of training examples:  1257
Number of test examples:  540
Number of total examples: 1797


In [16]:
# Quick look at the training split: the feature-matrix shape on the first
# line, followed by the first 30 labels.
print(train_features.shape, train_labels[:30], sep="\n")


(1257, 64)
[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]


In [4]:
# Create the project's GaussianNaiveBayesianClassifier and train it on the
# current training split.
gbc = GaussianNaiveBayesianClassifier()
# NOTE(review): the third argument is presumably the number of classes
# (digits 0-9) — confirm against GaussianNaiveBayesianClassifier.fit.
gbc.fit(train_features, train_labels, 10)
# Predictions for the held-out test split; `y_pred` is read by the
# evaluation cell that follows.
y_pred = gbc.predict(test_features)

In [5]:
# Evaluate the predictions against the true test labels: per-class
# precision/recall/F1 first, then the confusion matrix.
report = metrics.classification_report(test_labels, y_pred)
confusion = metrics.confusion_matrix(test_labels, y_pred)

print("Classification report SKLearn digits:\n%s\n" % report)
print("Confusion matrix SKLearn digits:\n%s" % confusion)

Classification report SKLearn digits:
              precision    recall  f1-score   support

           0       0.98      0.94      0.96        53
           1       0.72      0.74      0.73        53
           2       0.98      0.85      0.91        53
           3       0.93      0.74      0.82        53
           4       0.98      0.91      0.95        57
           5       0.87      0.95      0.91        56
           6       0.96      0.98      0.97        54
           7       0.78      0.83      0.80        54
           8       0.64      0.71      0.67        52
           9       0.73      0.82      0.77        55

    accuracy                           0.85       540
   macro avg       0.86      0.85      0.85       540
weighted avg       0.86      0.85      0.85       540


Confusion matrix SKLearn digits:
[[50  1  0  0  1  0  0  0  1  0]
 [ 1 39  0  0  0  0  0  0  3 10]
 [ 0  3 45  1  0  0  1  0  0  3]
 [ 0  0  0 39  0  2  0  2  9  1]
 [ 0  0  0  0 52  0  0  4  1  0]
 [ 0

### 2. Digits summarised dataset

Take the digits data and reduce every pixel value to one of three levels: dark (0), gray (1) or light (2).

In [6]:
# Quantise the digits pixel values into three levels. The dark / gray / light
# labels follow this notebook's own naming:
#   value <  DARK_BELOW   -> 0 (dark)
#   value >  LIGHT_ABOVE  -> 2 (light)
#   anything else         -> 1 (gray)
DARK_BELOW = 5
LIGHT_ABOVE = 10

flat_digits_data = digits.data.flatten()

# Vectorised replacement for the element-by-element Python loop: start with
# every pixel marked gray (1.0), then overwrite the two outer ranges using
# boolean masks. The result is a float array, exactly as before.
digits_data_summarised = np.ones(len(flat_digits_data))
digits_data_summarised[flat_digits_data < DARK_BELOW] = 0   # dark
digits_data_summarised[flat_digits_data > LIGHT_ABOVE] = 2  # light

# Restore the layout of the original feature matrix.
digits_data_summarised = digits_data_summarised.reshape(digits.data.shape)

# Both shapes must match: the transform only changes values, not layout.
print(digits.data.shape)
print(digits_data_summarised.shape)

(1797, 64)
(1797, 64)


Split into a training set (70%) and a test set (30%).

In [7]:
# Ordered 70/30 split of the summarised features. The labels are taken from
# the original `digits.target`, cut at the same index.
num_split = int(0.7*len(digits.data))
train_features, test_features = (
    digits_data_summarised[:num_split],
    digits_data_summarised[num_split:],
)
train_labels, test_labels = digits.target[:num_split], digits.target[num_split:]

# Sanity check: the two parts must add up to the size of the full dataset.
print("Number of training examples: ",len(train_features))
print("Number of test examples: ",len(test_features))
print("Number of total examples:", len(train_features)+len(test_features))

Number of training examples:  1257
Number of test examples:  540
Number of total examples: 1797


In [8]:
# Quick look at the summarised split: shapes of the training features and
# labels, then one summarised test sample (index 5).
print(train_features.shape, train_labels.shape, test_features[5], sep="\n")

(1257, 64)
(1257,)
[0. 0. 2. 2. 2. 1. 0. 0. 0. 1. 2. 1. 1. 2. 0. 0. 0. 0. 2. 0. 1. 2. 1. 0.
 0. 0. 2. 2. 2. 2. 1. 0. 0. 0. 0. 0. 0. 2. 1. 0. 0. 0. 0. 0. 0. 2. 1. 0.
 0. 0. 0. 0. 2. 2. 0. 0. 0. 0. 2. 2. 2. 1. 0. 0.]


In [9]:
# Re-train the existing classifier instance `gbc` (created in an earlier
# cell) on the summarised training split.
# NOTE(review): assumes fit() discards parameters learned by a previous
# fit() call — confirm in GaussianNaiveBayesianClassifier.
gbc.fit(train_features, train_labels, 10)
# Predictions for the summarised test split; `y_pred` is read by the
# evaluation cell that follows.
y_pred = gbc.predict(test_features)

In [10]:
# Evaluate the predictions on the summarised data: per-class
# precision/recall/F1 first, then the confusion matrix.
report = metrics.classification_report(test_labels, y_pred)
confusion = metrics.confusion_matrix(test_labels, y_pred)

print("Classification report SKLearn digits_summarised:\n%s\n" % report)
print("Confusion matrix SKLearn digits_summarised:\n%s" % confusion)

Classification report SKLearn digits_summarised:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        53
           1       0.86      0.68      0.76        53
           2       0.98      0.92      0.95        53
           3       0.93      0.75      0.83        53
           4       0.95      0.93      0.94        57
           5       0.92      0.86      0.89        56
           6       0.98      0.91      0.94        54
           7       0.84      0.89      0.86        54
           8       0.63      0.88      0.74        52
           9       0.70      0.82      0.76        55

    accuracy                           0.86       540
   macro avg       0.87      0.86      0.86       540
weighted avg       0.87      0.86      0.86       540


Confusion matrix SKLearn digits_summarised:
[[49  0  0  0  3  0  0  0  1  0]
 [ 0 36  0  0  0  1  0  0  4 12]
 [ 1  0 49  0  0  0  0  0  2  1]
 [ 0  1  0 40  0  0  0  3  9  0]
 [ 0  0  0  0 5

### 3. MNIST_light dataset

Load dataset and split

In [32]:
# Build the project's MNIST helper from the PNG files matched by the glob
# pattern below.
mnist = MNIST.MNISTData('MNIST_Light/*/*.png')


# get_data() is unpacked as: train features, test features, train labels,
# test labels (note the order: both feature sets come before the labels).
train_features, test_features, train_labels, test_labels = mnist.get_data()  #70% train, 30% test

# NOTE(review): multiplying by 255 suggests the features are stored scaled
# to [0, 1] — confirm in MNIST.MNISTData.
print(train_features*255)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [33]:
# Re-train the existing classifier instance `gbc` (created in an earlier
# cell) on the MNIST_Light training split.
# NOTE(review): assumes fit() discards parameters learned by a previous
# fit() call — confirm in GaussianNaiveBayesianClassifier.
gbc.fit(train_features, train_labels, 10)
# Predictions for the MNIST_Light test split; `y_pred` is read by the
# evaluation cell that follows.
y_pred = gbc.predict(test_features)

In [34]:
# Evaluate the MNIST_Light predictions: per-class precision/recall/F1 first,
# then the confusion matrix.
report = metrics.classification_report(test_labels, y_pred)
confusion = metrics.confusion_matrix(test_labels, y_pred)

print("Classification report SKLearn:\n%s\n" % report)
print("Confusion matrix SKLearn:\n%s" % confusion)

Classification report SKLearn:
              precision    recall  f1-score   support

           0       0.85      0.93      0.89       164
           1       0.80      0.95      0.87       152
           2       0.81      0.58      0.68       155
           3       0.85      0.71      0.77       154
           4       0.85      0.61      0.71       143
           5       0.89      0.46      0.61       141
           6       0.81      0.93      0.86       143
           7       0.97      0.76      0.85       158
           8       0.47      0.80      0.59       132
           9       0.65      0.88      0.75       158

    accuracy                           0.76      1500
   macro avg       0.79      0.76      0.76      1500
weighted avg       0.80      0.76      0.76      1500


Confusion matrix SKLearn:
[[153   0   2   0   0   1   2   0   5   1]
 [  0 144   0   0   0   0   1   0   5   2]
 [  4   5  90   8   0   1  17   1  29   0]
 [  2   5  12 109   0   1   2   1  18   4]
 [  3   1  