In [1]:
import os
import numpy
from PIL import Image

In [2]:
# Pixel matrix: one row per image, one column per value of the flattened image.
data_matrix = numpy.empty(shape=(400, 10304), dtype=numpy.dtype('u1'))  # 8-bit unsigned int
# Column vector holding the integer label parsed from each image's folder name.
label_vector = numpy.empty(shape=(400,1), dtype=numpy.dtype('u1'))

i = 0
for dirname, dirnames, filenames in os.walk('dataset'):
    # os.walk yields entries in an arbitrary, platform-dependent order; sorting
    # the sub-directories in place (and the files below) pins the row order so
    # the alternating train/test split made later is reproducible.
    dirnames.sort()
    for filename in sorted(filenames):
        # The context manager closes the file once its pixels have been copied
        # into the matrix row (flatten() already returns a fresh array).
        with Image.open(os.path.join(dirname, filename)) as image:
            data_matrix[i] = numpy.asarray(image).flatten()
        # The label is the folder name without its one-character prefix
        # (presumably folders are named like "s12" -> 12; confirm against the
        # dataset). basename() works with any path separator, unlike searching
        # for a literal backslash.
        label_vector[i] = int(os.path.basename(dirname)[1:])
        i += 1

# Rows that were never filled would silently contain uninitialised memory.
assert i == len(data_matrix), f"expected {len(data_matrix)} images, loaded {i}"

print(data_matrix)

[[ 48  49  45 ...  47  46  46]
 [ 34  34  33 ...  37  40  33]
 [ 60  60  62 ...  32  34  34]
 ...
 [112 109 116 ...  93  88  92]
 [111 114 112 ...  88  86  92]
 [110 112 113 ...  92  87  90]]


In [3]:
# Training set: rows 0, 2, 4, ... (i.e. the 1st, 3rd, 5th, ... images)
train_data = data_matrix[0::2, :]
y_train = numpy.ravel(label_vector[0::2, :])

# Test set: rows 1, 3, 5, ... (i.e. the 2nd, 4th, 6th, ... images)
test_data = data_matrix[1::2, :]
y_test = numpy.ravel(label_vector[1::2, :])

# Show the test matrix, a blank-line gap, then the training matrix.
print(test_data, "\n", train_data, sep="\n")

[[ 34  34  33 ...  37  40  33]
 [ 39  44  53 ...  29  26  29]
 [ 64  76  80 ...  35  37  39]
 ...
 [110 109 111 ...  94  88  90]
 [112 109 116 ...  93  88  92]
 [110 112 113 ...  92  87  90]]


[[ 48  49  45 ...  47  46  46]
 [ 60  60  62 ...  32  34  34]
 [ 63  53  35 ...  41  10  24]
 ...
 [114 110 112 ...  93  89  86]
 [113 112 111 ...  87  87  89]
 [111 114 112 ...  88  86  92]]


In [4]:
# PCA
def PCA(D, alpha):
    """Return the leading principal directions of ``D``.

    Rows of ``D`` are treated as observations and columns as variables.
    The covariance matrix of the mean-centred data is eigen-decomposed and
    the eigenvectors are ordered by decreasing eigenvalue. The smallest
    number of leading eigenvectors whose eigenvalues sum to at least a
    fraction ``alpha`` of the eigenvalue total is kept.

    Parameters
    ----------
    D : array-like, 2-D
        Data matrix, one observation per row.
    alpha : float
        Fraction of the total eigenvalue sum to retain. At least one
        component is always returned. If the fraction is never reached
        (e.g. ``alpha > 1``, or ``alpha == 1`` combined with rounding
        error), every component is returned instead of indexing past the
        end of the eigenvalue array.

    Returns
    -------
    numpy.ndarray
        Matrix whose columns are the selected eigenvectors, with shape
        ``(n_variables, n_components)``.
    """
    mean = numpy.mean(D, axis=0)
    centered_data = numpy.subtract(D, mean)
    cov_matrix = numpy.cov(centered_data, rowvar=False)
    # eigh is the solver for symmetric matrices such as a covariance matrix.
    eigen_values, eigen_vectors = numpy.linalg.eigh(cov_matrix)

    # Sort the eigenvalues and eigenvectors in descending order.
    sorted_index = numpy.argsort(eigen_values)[::-1]
    sorted_eigenvalue = eigen_values[sorted_index]
    sorted_eigenvectors = eigen_vectors[:, sorted_index]

    # explained[k - 1] is the fraction covered by the first k eigenvalues.
    explained = numpy.cumsum(sorted_eigenvalue) / sorted_eigenvalue.sum()

    # First position where the fraction is no longer below alpha. Written as
    # the negation of "< alpha" so that NaN ratios (zero total variance) stop
    # at the first component, exactly as a "while ratio < alpha" loop would.
    reached = numpy.flatnonzero(~(explained < alpha))
    if reached.size:
        components = int(reached[0]) + 1
    else:
        # Threshold never met: keep everything rather than run off the end.
        components = sorted_eigenvalue.size

    reduced_basis = sorted_eigenvectors[:, 0:components]

    return reduced_basis

In [5]:
# Basis of principal directions fitted on the training images only,
# keeping enough components to reach a 0.90 share of the eigenvalue total.
U = PCA(D=train_data, alpha=0.90)

(200, 76)
(200, 76)


In [9]:
# The basis U was derived from mean-centred training data, so both sets are
# centred with the *training* mean before projection. Skipping this offsets
# every score by the constant U.T @ mean: Euclidean nearest-neighbour results
# are unchanged, but the values are not true principal-component scores.
train_mean = numpy.mean(train_data, axis=0)

# Each row of the result holds one image's coordinates in the reduced basis.
X_train = numpy.dot(train_data - train_mean, U)
X_test = numpy.dot(test_data - train_mean, U)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix ,accuracy_score

# 1-nearest-neighbour classifier trained on the projected training images.
# fit() returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Classify the held-out images, which played no part in fitting.
y_pred = knn.predict(X_test)

# Confusion matrix first, then the per-class report, then overall accuracy.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)
print("Accuracy:", accuracy_score(y_test, y_pred))

[[5 0 0 ... 0 0 0]
 [0 5 0 ... 0 0 0]
 [0 0 5 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 0]
 [0 0 0 ... 0 5 0]
 [0 0 0 ... 0 0 3]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       0.83      1.00      0.91         5
           3       1.00      1.00      1.00         5
           4       0.83      1.00      0.91         5
           5       0.67      0.80      0.73         5
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         5
           8       0.83      1.00      0.91         5
           9       1.00      1.00      1.00         5
          10       1.00      0.60      0.75         5
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         5
          13       0.83      1.00      0.91         5
          14       1.00      1.00      1.00         5
          15       0.83      1.00      0.91         5
          16   