# NaiveBayesianClassifier on all datasets

#### Import statements

In [26]:
from sklearn import metrics, datasets
import numpy as np
import MNIST
from naiveBayesianClassifier import NaiveBayesianClassifier
%config IPCompleter.greedy=True


### 1. Digits dataset
Load and split into 70% training and 30% test

In [36]:
# Load scikit-learn's bundled digits dataset (the recorded output reports 1797 examples in total).
digits = datasets.load_digits()

# Sequential, unshuffled 70/30 split: the first 70% of the rows are used for
# training and the remaining rows for testing.
num_split = int(0.7 * len(digits.data))
train_features, test_features = digits.data[:num_split], digits.data[num_split:]
train_labels, test_labels = digits.target[:num_split], digits.target[num_split:]

# Report the split sizes; the last line doubles as a check that no row was lost.
print("Number of training examples: ", len(train_features))
print("Number of test examples: ", len(test_features))
print("Number of total examples:", len(train_features) + len(test_features))

Number of training examples:  1257
Number of test examples:  540
Number of total examples: 1797


In [37]:
# Sanity check of the training split: feature-matrix shape on the first line,
# the first 30 labels on the second.
print(train_features.shape, train_labels[:30], sep="\n")


(1257, 64)
[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]


In [38]:
# Train the project's NaiveBayesianClassifier on the raw digits split and
# predict labels for the held-out rows.
# The third positional argument (10) is presumably the number of classes
# (digits 0-9) -- TODO confirm against NaiveBayesianClassifier.fit.
# NOTE(review): the recorded output of this cell shows an array being printed,
# so fit() or predict() appears to print as a side effect -- confirm in the class.
nbc = NaiveBayesianClassifier()
nbc.fit(train_features, train_labels, 10)
y_pred = nbc.predict(test_features)

[0.14285714 0.08730159 0.05555556 0.06349206 0.06349206 0.1031746
 0.08730159 0.04761905 0.07142857 0.05555556 0.04761905 0.03968254
 0.03968254 0.         0.04761905 0.02380952 0.02380952]


In [39]:
# Per-class precision / recall / F1 for the raw digits predictions.
print(
    "Classification report SKLearn digits:\n%s\n"
    % (metrics.classification_report(test_labels, y_pred),)
)

# Confusion matrix: rows are the true labels, columns the predicted labels.
print(
    "Confusion matrix SKLearn digits:\n%s"
    % (metrics.confusion_matrix(test_labels, y_pred),)
)

Classification report SKLearn digits:
              precision    recall  f1-score   support

           0       0.24      1.00      0.39        53
           1       0.84      0.51      0.64        53
           2       1.00      0.36      0.53        53
           3       1.00      0.58      0.74        53
           4       0.95      0.65      0.77        57
           5       0.88      0.52      0.65        56
           6       1.00      0.65      0.79        54
           7       0.86      0.59      0.70        54
           8       0.72      0.65      0.69        52
           9       0.75      0.65      0.70        55

    accuracy                           0.62       540
   macro avg       0.83      0.62      0.66       540
weighted avg       0.83      0.62      0.66       540


Confusion matrix SKLearn digits:
[[53  0  0  0  0  0  0  0  0  0]
 [20 27  0  0  1  0  0  0  3  2]
 [28  0 19  0  0  0  0  0  5  1]
 [13  0  0 31  0  3  0  1  3  2]
 [20  0  0  0 37  0  0  0  0  0]
 [19

### 2. Digits summarised dataset

Load the data and reduce every pixel to one of three intensity levels: dark, gray or light.

In [31]:
# Quantise every pixel intensity into three levels:
#   value < 5         -> 0 (dark)
#   5 <= value <= 10  -> 1 (gray)
#   value > 10        -> 2 (light)
flat_digits_data = digits.data.flatten()

# Vectorised with np.select: the conditions are checked in order and anything
# matching neither of them falls back to the gray level. Float choices keep
# the result a float64 array.
digits_data_summarised = np.select(
    [flat_digits_data < 5, flat_digits_data > 10],
    [0.0, 2.0],
    default=1.0,
).reshape(digits.data.shape)

# The summarised array must keep the (samples, pixels) layout of the original.
print(digits.data.shape)
print(digits_data_summarised.shape)

(1797, 64)
(1797, 64)


Split into training and test sets (70/30).

In [32]:
# Sequential, unshuffled 70/30 split of the three-level features; the labels
# still come from the original digits dataset.
num_split = int(0.7 * len(digits.data))
train_features, test_features = (
    digits_data_summarised[:num_split],
    digits_data_summarised[num_split:],
)
train_labels, test_labels = digits.target[:num_split], digits.target[num_split:]

# Report the split sizes and their sum.
print("Number of training examples: ", len(train_features))
print("Number of test examples: ", len(test_features))
print("Number of total examples:", len(train_features) + len(test_features))

Number of training examples:  1257
Number of test examples:  540
Number of total examples: 1797


In [34]:
# Shapes of the training features and labels, one per line.
print(train_features.shape, train_labels.shape, sep="\n")

(1257, 64)
(1257,)


In [35]:
# Refit the `nbc` instance created earlier in the notebook on the three-level
# features, then predict labels for the held-out rows.
# NOTE(review): whether fit() fully discards the state of the previous fit is
# not visible from this notebook -- confirm in NaiveBayesianClassifier.
# The third positional argument (10) is presumably the number of classes -- TODO confirm.
nbc.fit(train_features, train_labels, 10)
y_pred = nbc.predict(test_features)

[0.41269841 0.41269841 0.17460317 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.        ]


In [25]:
# Per-class precision / recall / F1 for the three-level digits predictions.
print(
    "Classification report SKLearn digits_summarised:\n%s\n"
    % (metrics.classification_report(test_labels, y_pred),)
)

# Confusion matrix: rows are the true labels, columns the predicted labels.
print(
    "Confusion matrix SKLearn digits_summarised:\n%s"
    % (metrics.confusion_matrix(test_labels, y_pred),)
)

Classification report SKLearn digits_summarised:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82        53
           1       0.82      0.70      0.76        53
           2       0.94      0.87      0.90        53
           3       0.85      0.75      0.80        53
           4       0.91      0.89      0.90        57
           5       0.82      0.88      0.84        56
           6       0.98      0.87      0.92        54
           7       0.83      0.83      0.83        54
           8       0.69      0.77      0.73        52
           9       0.66      0.85      0.75        55

    accuracy                           0.82       540
   macro avg       0.83      0.82      0.83       540
weighted avg       0.83      0.82      0.83       540


Confusion matrix SKLearn digits_summarised:
[[43  0  1  0  4  3  0  0  2  0]
 [ 0 37  0  0  0  1  0  0  2 13]
 [ 1  0 46  4  0  1  0  0  0  1]
 [ 0  3  1 40  0  1  0  3  4  1]
 [ 1  0  0  0 5

### 3. MNIST_light dataset

Load dataset and split

In [11]:
# Build the project's MNIST helper from a glob over the MNIST_Light PNG files.
# (MNISTData is project code not shown here; presumably it loads the matching
# images -- confirm in the MNIST module.)
mnist = MNIST.MNISTData('MNIST_Light/*/*.png')

# get_data() returns the split as (train X, test X, train y, test y);
# according to the original author's note it is 70% train / 30% test.
train_features, test_features, train_labels, test_labels = mnist.get_data()

# Scale both feature sets by 255 -- the original note says this undoes the
# normalization applied by the loader.
train_features, test_features = train_features * 255, test_features * 255

In [12]:
# Refit the existing `nbc` classifier on the MNIST_Light training split and
# predict labels for the test split.
# The third positional argument (10) is presumably the number of classes and the
# fourth (255) presumably the maximum pixel value after the x255 rescale --
# TODO confirm both against NaiveBayesianClassifier.fit.
# NOTE(review): the recorded classification report for this run (accuracy 0.11,
# recall 1.00 for class 0 and 0.00 for every other class) is consistent with
# every test sample being predicted as class 0. The cause is not visible in this
# notebook -- check the classifier (e.g. zero probabilities or numeric underflow)
# and whether the rescaled features are what fit() expects.
nbc.fit(train_features, train_labels, 10, 255)
y_pred = nbc.predict(test_features)

[0.86629526 0.00278552 0.         0.00835655 0.         0.00278552
 0.00557103 0.00278552 0.00278552 0.         0.         0.00835655
 0.00835655 0.00557103 0.00557103 0.00278552 0.00278552 0.00278552
 0.         0.00278552 0.00557103 0.00278552 0.         0.
 0.         0.         0.         0.00278552 0.00557103 0.
 0.         0.         0.00278552 0.00278552 0.00557103 0.
 0.00835655 0.00557103 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.00278552 0.         0.         0.         0.
 0.         0.         0.00278552 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.00278552 0.         0.         0.
 0.         0.00278552 0.       

In [13]:
# Per-class precision / recall / F1 for the MNIST_Light predictions.
print(
    "Classification report SKLearn:\n%s\n"
    % (metrics.classification_report(test_labels, y_pred),)
)

# Confusion matrix: rows are the true labels, columns the predicted labels.
print(
    "Confusion matrix SKLearn:\n%s"
    % (metrics.confusion_matrix(test_labels, y_pred),)
)

Classification report SKLearn:
              precision    recall  f1-score   support

           0       0.11      1.00      0.20       164
           1       0.00      0.00      0.00       152
           2       0.00      0.00      0.00       155
           3       0.00      0.00      0.00       154
           4       0.00      0.00      0.00       143
           5       0.00      0.00      0.00       141
           6       0.00      0.00      0.00       143
           7       0.00      0.00      0.00       158
           8       0.00      0.00      0.00       132
           9       0.00      0.00      0.00       158

    accuracy                           0.11      1500
   macro avg       0.01      0.10      0.02      1500
weighted avg       0.01      0.11      0.02      1500


Confusion matrix SKLearn:
[[164   0   0   0   0   0   0   0   0   0]
 [152   0   0   0   0   0   0   0   0   0]
 [155   0   0   0   0   0   0   0   0   0]
 [154   0   0   0   0   0   0   0   0   0]
 [143   0  

  _warn_prf(average, modifier, msg_start, len(result))
