In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import random
from PIL import Image
import os

# Generate Data Matrix

The dataset consists of 40 persons, each referenced by a folder named s#. Each person's folder contains 10 different grayscale pictures of that person, giving a total of 400 samples. Each sample is represented by a vector of 10304 elements (112x92 pixels).

In [None]:
# Number of person folders (s1 .. s<number_of_persons>) that the loader iterates over.
number_of_persons = 40

In [None]:
def read_single_image(image_path, width=92, height=112):
    """Read a binary (P5) PGM file and return its pixels as a flat list of ints.

    The file must start with exactly three header lines: ``P5``,
    ``<width> <height>`` and ``255``, followed by ``width * height`` raw bytes.

    Args:
        image_path: Path of the PGM file to read.
        width: Expected image width in pixels (default 92).
        height: Expected image height in pixels (default 112).

    Returns:
        A list of ``width * height`` integers in the range 0..255, in file order.

    Raises:
        ValueError: If a header line does not match the expected value or the
            file holds fewer pixel bytes than ``width * height``.
    """
    pixel_count = width * height
    with open(image_path, 'rb') as f:
        magic = f.readline()
        dimensions = f.readline()
        max_value = f.readline()
        # Explicit checks instead of assert: asserts disappear under ``python -O``.
        if magic != b'P5\n':
            raise ValueError(f'{image_path}: not a binary PGM (P5) file')
        if dimensions != f'{width} {height}\n'.encode('ascii'):
            raise ValueError(f'{image_path}: expected a {width}x{height} image')
        if max_value != b'255\n':
            raise ValueError(f'{image_path}: expected a maximum gray value of 255')
        # One bulk read instead of one read() call per pixel.
        raw = f.read(pixel_count)
    if len(raw) != pixel_count:
        raise ValueError(
            f'{image_path}: expected {pixel_count} pixel bytes, found {len(raw)}')
    # Iterating over bytes yields ints, the same values ord() produced per byte.
    return list(raw)

In [None]:
def construct_data_frame(path='E:/Term 07/Pattern Recognition/Projects/Project 01/People/',
                         samples_per_person=10):
    """Load every person's images into one data matrix plus a label list.

    Folders ``s1`` .. ``s<number_of_persons>`` under ``path`` are visited in
    order, and files ``1.pgm`` .. ``<samples_per_person>.pgm`` are read from each.

    Args:
        path: Directory that contains the ``s#`` person folders. Defaults to
            the location originally hard-coded in this notebook.
        samples_per_person: Number of images read from each folder (default 10).

    Returns:
        A tuple ``(images, persons)``: ``images`` is a NumPy array with one row
        per image, and ``persons`` is a list with the person number (as a
        string) for each row, in the same order.
    """
    images = []
    persons = []
    print('Reading Started')
    for person_id in range(1, number_of_persons + 1):
        person_dir = os.path.join(path, 's' + str(person_id))
        for sample_id in range(1, samples_per_person + 1):
            persons.append(str(person_id))
            image_file = os.path.join(person_dir, str(sample_id) + '.pgm')
            images.append(read_single_image(image_file))
    print('Reading Finished')
    print('Number of Images loaded: ',len(images))
    # Guard the first-row lookup so an empty load reports 0 instead of raising.
    print('Size of Vector per Image: ',len(images[0]) if images else 0)
    images = np.array(images)
    return images, persons

In [None]:
# D holds one row per loaded image; labels holds the matching person label per row.
D, labels = construct_data_frame()

# Test-Train Split

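The data matrix is split into a training matrix and a testing matrix. Within each person's 10 samples, the positions listed in `train_indices` go to training and the remaining positions go to testing. With the indices used below (7 of every 10 positions), this gives a 280x10304 training matrix and a 120x10304 testing matrix rather than two equal 200x10304 halves.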

In [None]:
def custom_train_test_split(data, labels, samples_no, train_indices):
    """Split rows into train and test sets by their position modulo 10.

    Row ``i`` (for ``i`` in ``0 .. samples_no - 1``) goes to the training set
    when ``i % 10`` is in ``train_indices``; otherwise it goes to the test set.

    Args:
        data: Indexable collection of samples.
        labels: Indexable collection of labels, aligned with ``data``.
        samples_no: Number of leading rows to distribute.
        train_indices: Positions (0-9) within each group of ten that are
            assigned to training.

    Returns:
        Four lists: ``X_train, y_train, X_test, y_test``.
    """
    # Decide membership once per row, then gather each side in row order.
    is_train = [position % 10 in train_indices for position in range(samples_no)]
    train_rows = [row for row, flag in enumerate(is_train) if flag]
    test_rows = [row for row, flag in enumerate(is_train) if not flag]
    return (
        [data[row] for row in train_rows],
        [labels[row] for row in train_rows],
        [data[row] for row in test_rows],
        [labels[row] for row in test_rows],
    )

In [None]:
# Positions 0, 1, 3, 4, 6, 7 and 9 of every block of ten rows are used for
# training; the remaining positions (2, 5, 8) are used for testing.
X_train, y_train, X_test, y_test = custom_train_test_split(
    data=D,
    labels=labels,
    samples_no=len(D),
    train_indices=[0, 1, 3, 4, 6, 7, 9],
)
# Each frame is indexed by the label of its row.
train_data = pd.DataFrame(data=X_train, index=y_train)
test_data = pd.DataFrame(data=X_test, index=y_test)
print('Train Data\n', train_data)
print('Test Data\n', test_data)

# Classifier

The KNN classifier is created for a specified value of K, chosen from 1, 3, 5, and 7. The classifier is trained on the training data and label matrices. It then predicts the class of each test sample, and the predictions are compared with the expected labels to measure the classifier's accuracy.

In [None]:
def classify(X_train, y_train, X_test, y_test, n_neighbors):
    """Fit a K-nearest-neighbours classifier and report its test accuracy.

    Args:
        X_train: Training samples, one row per sample.
        y_train: Labels for ``X_train``.
        X_test: Test samples, one row per sample.
        y_test: Expected labels for ``X_test``.
        n_neighbors: Value of K passed to ``KNeighborsClassifier``.

    Returns:
        A tuple ``(acc, simple_classifier)``: ``acc`` is the fraction of test
        samples whose predicted label equals the expected label, and
        ``simple_classifier`` is the fitted classifier.

    Raises:
        ValueError: If ``X_test`` is empty (accuracy would be undefined).
    """
    simple_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    simple_classifier.fit(X_train, y_train)
    test_samples = len(X_test)
    if test_samples == 0:
        raise ValueError('X_test must contain at least one sample')
    # One batched prediction instead of a separate predict() call per sample.
    y_predict = simple_classifier.predict(X_test)
    # Element-wise comparison against the expected labels; count the matches.
    correct = int(np.count_nonzero(y_predict == np.asarray(y_test)))
    acc = correct / test_samples
    print(f'Acc at K = {n_neighbors}: {acc*100} %')
    return acc, simple_classifier

# PCA

Mean Calculation

In [None]:
def calculate_mean(data):
    """Return the per-feature (column-wise) mean of a samples-by-features matrix.

    The axis is given explicitly: without it, ``np.mean`` collapses an ndarray
    to one global scalar, and the result for a DataFrame depends on the
    installed pandas version.

    Args:
        data: 2-D array-like or DataFrame with one sample per row.

    Returns:
        A 1-D array (or a pandas Series for DataFrame input) holding the mean
        of every column.
    """
    return np.mean(data, axis=0)

Centering Data

In [None]:
def centeralize(data):
    """Return ``data`` with the mean from ``calculate_mean(data)`` subtracted.

    The subtraction relies on broadcasting between ``data`` and the mean.
    """
    center = calculate_mean(data)
    return data - center.T

Covariance Matrix Calculation

In [None]:
def calculate_covariance_matrix(data):
    """Return ``Z^T Z / n``, with ``Z = centeralize(data)`` and ``n = len(data)``.

    The divisor is the number of rows of ``data`` (not ``n - 1``).
    """
    centered = centeralize(data)
    scatter = np.matmul(np.transpose(centered), centered)
    return scatter / len(data)

Eigenvalues and Eigenvectors Calculation

In [None]:
def calculate_eigen_vectors(data):
    """Eigen-decompose the covariance matrix of ``data``, largest eigenvalue first.

    Returns:
        A tuple ``(eig_values, eig_vectors)``: the eigenvalues are in
        descending order, and column ``j`` of ``eig_vectors`` is the
        eigenvector belonging to ``eig_values[j]``.
    """
    values, vectors = np.linalg.eigh(calculate_covariance_matrix(data))
    # Reverse the ascending argsort so the largest eigenvalue comes first.
    descending = values.argsort()[::-1]
    return values[descending], vectors[:, descending]

Fraction of Total Variance

In [None]:
def dimensionality(alpha, eig_values):
    """Return how many leading eigenvalues are needed to reach fraction ``alpha``.

    Eigenvalues are accumulated in the given order. The count of values
    consumed is returned as soon as the running sum divided by the total sum
    is at least ``alpha``. If that never happens, the number of eigenvalues is
    returned (0 for an empty input).
    """
    total = np.sum(eig_values)
    running = 0
    taken = 0
    for taken, value in enumerate(eig_values, start=1):
        running = running + value
        if running / total >= alpha:
            return taken
    return taken

Reducing Eigenvectors

In [None]:
def calculate_reduced_dimensions(alpha_values, eig_values):
    """Return ``dimensionality(alpha, eig_values)`` for each alpha, in input order."""
    return [dimensionality(alpha, eig_values) for alpha in alpha_values]

In [None]:
def PCA(data, alpha_values):
    """Compute the sorted eigenvectors of ``data`` and a dimension count per alpha.

    Returns:
        A tuple ``(r, eig_vectors)``: ``r`` is the list produced by
        ``calculate_reduced_dimensions`` for ``alpha_values``, and
        ``eig_vectors`` is the eigenvector matrix returned by
        ``calculate_eigen_vectors``.
    """
    sorted_values, sorted_vectors = calculate_eigen_vectors(data)
    reduced = calculate_reduced_dimensions(alpha_values, sorted_values)
    return reduced, sorted_vectors

# Analysis

In [None]:
# Fractions of total variance (alpha) for which a reduced dimension R is computed.
a = [0.8, 0.85, 0.9, 0.95]
# Mean of the training data; reused below to center the test data and by predict().
mean = calculate_mean(train_data)
# Centered training data.
z = centeralize(train_data)
# r[n] is the dimension count for a[n]; eig_vectors columns are sorted by descending eigenvalue.
r, eig_vectors = PCA(train_data, a)
print(r)
# j tracks the position in `a` that corresponds to the current entry of `r`.
j = 0
# Filled alpha-major, K-minor: four classifiers (K = 1, 3, 5, 7) per alpha value.
classifiers = []
for i in r:
    print('for Alpha = ',a[j],' : R = ',i,'\n')
    j += 1
    # Projection matrix: the first i eigenvectors.
    U = eig_vectors[:,0:i]
    projected_train_data = np.array(np.matmul(z, U))
    # The test data is centered with the training mean before projection.
    projected_test_data = np.array(np.matmul(X_test - np.array(mean).T, U))
    acc = []
    k_values = [1, 3, 5, 7]
    for k in k_values:
        accuracy, classifier = classify(projected_train_data, y_train, projected_test_data, y_test, k)
        acc.append(accuracy)
        classifiers.append(classifier)
    # One accuracy-versus-K plot per alpha value.
    plt.plot(k_values, acc)
    plt.xlabel('Number of neighbors')
    plt.ylabel('Acc')
    plt.title('K tunning graph')
    plt.show()
print('Number of Built Classifiers: ',len(classifiers))

# Single Image Prediction

In [None]:
def predict(img,c,r):
    """Project one image onto the first ``r`` eigenvectors and classify it.

    Reads the notebook-level ``mean`` and ``eig_vectors`` variables.

    Args:
        img: Image vector to classify.
        c: Fitted classifier whose ``predict`` method is called.
        r: Number of leading eigenvector columns used for the projection.

    Returns:
        The result of ``c.predict`` on the single projected sample.
    """
    centered = img - np.array(mean).T
    projection = np.array(np.matmul(centered, eig_vectors[:, :r]))
    return c.predict([projection])

Trying to test the 16th image from the test matrix using the 11th classifier (Alpha = 0.9, R = 55, K = 5).

In [None]:
# classifiers[10] is the third K value (5) for the third alpha (a[2] = 0.9), so it
# was trained on data projected with r[2] components. Use that same count here
# instead of a hard-coded number so the feature count always matches.
print('Predicted Class ',predict(X_test[15],classifiers[10],r[2]))
print('Expected Class ',y_test[15])