In [74]:
import os
import numpy
import pandas as pd
from PIL import Image

In [75]:
data_matrix = numpy.ndarray(shape=(400, 10304), dtype=numpy.dtype('u1'))  # 8-bit unsigned int
label_vector = numpy.empty(shape=(400, 1), dtype=numpy.dtype('u1'))

i = 0
for dirname, _, filenames in os.walk('dataset'):
    for filename in filenames:
        image = Image.open(os.path.join(dirname, filename))
        numpy_image = numpy.asarray(image).flatten()
        data_matrix[i] = numpy_image.copy()
        label_vector[i] = int(dirname[dirname.rindex('\\') + 2:])
        i += 1

print(data_matrix)
print(data_matrix.shape)

[[ 48  49  45 ...  47  46  46]
 [ 34  34  33 ...  37  40  33]
 [ 60  60  62 ...  32  34  34]
 ...
 [112 109 116 ...  93  88  92]
 [111 114 112 ...  88  86  92]
 [110 112 113 ...  92  87  90]]
(400, 10304)


In [76]:
# Odd rows for training
train_data = data_matrix[::2, :]
y_train = label_vector[::2, :].ravel()

# Even rows for testing
test_data = data_matrix[1::2, :]
y_test = label_vector[1::2, :].ravel()

print(test_data)
print(test_data.shape)
print("\n")
print(train_data)
print(train_data.shape)

[[ 34  34  33 ...  37  40  33]
 [ 39  44  53 ...  29  26  29]
 [ 64  76  80 ...  35  37  39]
 ...
 [110 109 111 ...  94  88  90]
 [112 109 116 ...  93  88  92]
 [110 112 113 ...  92  87  90]]
(200, 10304)


[[ 48  49  45 ...  47  46  46]
 [ 60  60  62 ...  32  34  34]
 [ 63  53  35 ...  41  10  24]
 ...
 [114 110 112 ...  93  89  86]
 [113 112 111 ...  87  87  89]
 [111 114 112 ...  88  86  92]]
(200, 10304)


In [77]:
df = pd.DataFrame(train_data)
df['class'] = y_train

In [78]:
#classes --->classes matrix
#mean_mat ---> mean for each class matrix
#samples_c ---> nu of samples per class

classes = numpy.sort(df['class'].unique())
mean_mat = df.groupby('class').mean().values
samples_c = df.groupby('class').size().values

In [79]:
#between classes matrix
#nk (nu of samples for class)
#uk (mean of class)
#u  (mean of all classes)

u = mean_mat.mean(0).reshape(1, train_data.shape[1])

B = numpy.zeros(shape=(train_data.shape[1], train_data.shape[1]))

for i in range(classes.size):
    nk = samples_c[i]
    uk = mean_mat[i].reshape(1, train_data.shape[1])
    t = uk - u
    B += nk * numpy.matmul(t.T, t)

print(B)

[[231615.82  228816.94  231111.73  ... -41390.46  -19667.31   -7796.11 ]
 [228816.94  226547.58  228482.91  ... -39573.22  -18219.37   -7079.97 ]
 [231111.73  228482.91  231351.995 ... -39955.09  -18238.365  -7093.165]
 ...
 [-41390.46  -39573.22  -39955.09  ... 335064.38  304869.63  288210.43 ]
 [-19667.31  -18219.37  -18238.365 ... 304869.63  292480.555 277844.955]
 [ -7796.11   -7079.97   -7093.165 ... 288210.43  277844.955 272640.155]]


In [80]:
#within class matrix
#class_ss --> class samples
#class_m --> class mean

S = numpy.zeros(shape=(train_data.shape[1], train_data.shape[1]))

for i in range(classes.size):
    class_ss = df.loc[df['class'] == i + 1].values[:, :-1]
    class_m = mean_mat[i].reshape(1, train_data.shape[1])
    centered_data = numpy.subtract(class_ss, class_m)
    s = numpy.matmul(centered_data.T, centered_data)
    S = S + s

print(S)

[[ 26322.   24844.8  24771.6 ...   -757.2  -2197.8   3702.4]
 [ 24844.8  25913.6  24942.4 ...    190.6   -395.4   4566. ]
 [ 24771.6  24942.4  25832.4 ...   -331.2    332.4   6442.8]
 ...
 [  -757.2    190.6   -331.2 ... 143575.2  90017.8  74673.8]
 [ -2197.8   -395.4    332.4 ...  90017.8  96311.6  82691. ]
 [  3702.4   4566.    6442.8 ...  74673.8  82691.   95567.6]]


In [81]:
S_inv = numpy.linalg.inv(S)
print(S_inv)

[[-1.48334431e+10  2.19806018e+10 -1.11231903e+10 ...  1.61426255e+10
   5.71164011e+09  5.31346143e+09]
 [-3.45817617e+08 -1.71984349e+10  2.47060253e+10 ... -1.33535827e+09
  -5.49533009e+09  1.74066133e+09]
 [-1.08400987e+10 -3.82258231e+09 -3.78457935e+09 ... -2.09934602e+10
  -7.18811238e+09 -5.90215670e+09]
 ...
 [ 2.11771705e+09 -1.00350918e+09  4.12195984e+08 ...  3.39594566e+09
   5.35317908e+08  1.71767263e+09]
 [-1.94660029e+09 -6.97764844e+08  3.05241252e+09 ... -2.99356813e+09
  -8.28629144e+08 -1.00106820e+09]
 [ 6.76654735e+09 -3.63735882e+09  4.71473434e+08 ...  5.76514010e+09
   1.36148421e+09  2.11845303e+09]]


In [113]:
eigen_values, eigen_vectors = numpy.linalg.eigh(numpy.matmul(S_inv, B))

In [114]:


#sort the eigenvalues and eigenvectors in descending order
sorted_index = numpy.argsort(eigen_values)[::-1]
sorted_eigenvalue = eigen_values[sorted_index]
sorted_eigenvectors = eigen_vectors[:, sorted_index]

# sorted_eigenvectors = numpy.real_if_close(sorted_eigenvectors)

sorted_eigenvectors

array([[ 4.42897293e-03,  6.14092590e-03, -7.27808637e-03, ...,
         7.66368283e-03, -5.96815885e-03, -4.45112875e-03],
       [ 4.64659855e-03,  6.38154560e-03, -7.30239870e-03, ...,
         7.67581073e-03, -6.19456437e-03, -4.66707273e-03],
       [ 4.75891055e-03,  6.39704160e-03, -6.67463601e-03, ...,
         7.08289035e-03, -6.24385237e-03, -4.78066622e-03],
       ...,
       [ 2.16435775e-05, -2.42301559e-03,  1.34663337e-02, ...,
         1.33437567e-02, -1.98744040e-03,  7.00140600e-05],
       [ 3.67864458e-03, -8.21830212e-04, -1.11083481e-02, ...,
        -1.07152147e-02, -1.22788676e-03,  3.66218921e-03],
       [-1.37936054e-03,  2.03117430e-04,  1.77564896e-02, ...,
         1.73438821e-02,  8.75985301e-04, -1.32082063e-03]])

In [115]:
U = sorted_eigenvectors[:, 0:39]
U.shape

(10304, 39)

In [116]:
X_train = numpy.matmul(train_data, U)
X_test = numpy.matmul(test_data, U)

print(X_train.shape)
print(X_test.shape)

(200, 39)
(200, 39)


In [120]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train, y_train)

# Predict on dataset which model has not seen before
y_pred = knn.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))




Accuracy: 0.955
