## Data preparation


### Reading Data


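- Read the images from the dataset (`read_data`)

- Convert the images to grayscale

- Flatten each image into a vector of 10304 values

- Stack the 400 vectors into a single data matrix (note: `read_data` as written below loads only folders 1–4, i.e. 40 vectors)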



In [None]:
import cv2
import numpy as np

# printf-style template for one image file: the first %d is the subject folder
# number, the second %d is the image number inside that folder. Forward slashes
# are used because they are accepted as separators on Windows as well as on
# POSIX systems, whereas backslashes only work on Windows.
path = "dataset/s%d/%d.pgm"

def read_data(num_subjects=4, images_per_subject=10):
    """Load the face images into a data matrix with one flattened image per row.

    Images are located through the module-level ``path`` template, read with
    OpenCV, converted to grayscale and flattened to 10304 values each.

    Args:
        num_subjects: Number of subject folders to read, numbered from 1.
            The default of 4 keeps the behaviour of the original hard-coded
            loop; pass a larger value to load more folders.
        images_per_subject: Number of images read from each folder, numbered
            from 1.

    Returns:
        A ``(data_matrix, label_vector)`` tuple. ``data_matrix`` is a float
        array of shape ``(num_subjects * images_per_subject, 10304)`` and
        ``label_vector`` is a float array holding the folder number of each row.

    Raises:
        FileNotFoundError: If an image file cannot be read.
    """
    vector_length = 10304
    total_images = num_subjects * images_per_subject

    # Allocate the result once; growing it with np.append inside the loop
    # would copy the whole matrix on every iteration.
    data_matrix = np.empty((total_images, vector_length))
    label_vector = np.repeat(
        np.arange(1, num_subjects + 1, dtype=float), images_per_subject
    )

    row = 0
    for folder_counter in range(1, num_subjects + 1):
        for image_counter in range(1, images_per_subject + 1):
            image_path = path % (folder_counter, image_counter)
            image = cv2.imread(image_path)
            # cv2.imread signals failure by returning None instead of raising,
            # so fail here with the offending path rather than inside cvtColor.
            if image is None:
                raise FileNotFoundError("could not read image: %s" % image_path)
            gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            data_matrix[row] = gray_image.flatten()
            row += 1

    print(data_matrix.shape)

    return data_matrix, label_vector


# Load the images once and print the label stored for each loaded row.
dm, lv = read_data()
print(lv)

### Splitting the data into training and test sets

In [2]:


def split_data(data_matrix, label_vector):
    """Split samples and labels into alternating test and training halves.

    Rows at even 0-based positions (0, 2, 4, ...) form the test half and rows
    at odd 0-based positions (1, 3, 5, ...) form the training half. The same
    selection is applied to ``label_vector``.

    Returns:
        A tuple in the order
        ``(test_data, training_data, training_labels, test_labels)``.
        Note that the data comes back test-first while the labels come back
        training-first; callers must unpack in exactly this order.
    """
    even_positions = slice(None, None, 2)
    odd_positions = slice(1, None, 2)

    held_out_rows = data_matrix[even_positions]
    held_out_labels = label_vector[even_positions]

    fitting_rows = data_matrix[odd_positions]
    fitting_labels = label_vector[odd_positions]

    return held_out_rows, fitting_rows, fitting_labels, held_out_labels



## PCA Algorithm

In [None]:

def PCA(training_data, alpha):
    """Project ``training_data`` onto its leading principal components.

    The data is centred with its column-wise mean, the covariance matrix of
    the features is eigendecomposed, and the smallest number of leading
    eigenvectors whose eigenvalues account for at least a fraction ``alpha``
    of the eigenvalue total is kept.

    Args:
        training_data: Array with one sample per row and one feature per column.
        alpha: Fraction of the eigenvalue total that the kept components must
            reach. If no prefix of the components reaches it, all components
            are kept.

    Returns:
        A ``(reduced_data, projection_matrix)`` tuple. ``projection_matrix``
        has one kept eigenvector per column, and ``reduced_data`` is the
        centred training data multiplied by it (one sample per row).
    """
    mean_vector = np.mean(training_data, axis=0)
    centered_training_data = training_data - mean_vector

    # bias=True normalises by the number of samples. atleast_2d keeps the
    # single-feature case (where np.cov returns a scalar) a valid matrix.
    covariance_matrix = np.atleast_2d(
        np.cov(np.transpose(centered_training_data), bias=True)
    )

    # A covariance matrix is symmetric, so use eigh: it guarantees real
    # eigenvalues and eigenvectors, whereas the general-purpose eig can
    # return complex results caused by round-off.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # eigh returns eigenvalues in ascending order; process largest first.
    sorted_indices = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalues = eigenvalues[sorted_indices]

    # Running fraction of the eigenvalue total covered by the first k components.
    explained_fraction = np.cumsum(sorted_eigenvalues) / np.sum(sorted_eigenvalues)
    reached = np.flatnonzero(explained_fraction >= alpha)
    if reached.size:
        num_components = int(reached[0]) + 1
    else:
        # alpha is never reached (e.g. alpha > 1): keep every component
        # instead of indexing past the end of the eigenvalue array.
        num_components = sorted_eigenvalues.size

    projection_matrix = eigenvectors[:, sorted_indices[:num_components]]

    reduced_dimensionality_data = centered_training_data.dot(projection_matrix)

    return reduced_dimensionality_data, projection_matrix



## PCA Classification analysis

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# End-to-end evaluation: load the images, split them, fit PCA on the training
# half, project the test half into the same subspace and score a 1-NN classifier.
data_matrix, label_vector = read_data()

# split_data returns (test_data, training_data, training_labels, test_labels);
# unpack in exactly that order so each data half stays paired with its own labels.
test_data, training_data, training_labels, test_labels = split_data(data_matrix, label_vector)

reduced_dimensionality_data, projection_matrix = PCA(training_data, 0.91)

print("after PCA")

# PCA subtracts the training mean before projecting, so the test samples must
# be shifted by that same mean to land in the same coordinate system. The
# product is arranged as samples x components, which is the layout PCA returns
# for the training data and the layout the classifier expects.
training_mean = np.mean(training_data, axis=0)
reduced_dimensionality_test_data = (test_data - training_mean).dot(projection_matrix)

print("after reducing test data")

knn_classifier = KNeighborsClassifier(n_neighbors=1)

knn_classifier.fit(reduced_dimensionality_data, training_labels)

print("after fit")

test_pred = knn_classifier.predict(reduced_dimensionality_test_data)

print("after prediction")

accuracy = accuracy_score(test_labels, test_pred)

print("Accuracy:", accuracy)


(40, 10304)
