## Data preparation


### Reading Data


- Read the images from the dataset

- Convert the images to grayscale

- Convert each image to a vector of 10304 values

- Stack the 400 vectors into a single data matrix



In [2]:
import cv2
import numpy as np



def process_image(path):
    """Load the image at ``path`` and return it as a flat grayscale vector.

    Args:
        path: File-system path of the image to load.

    Returns:
        A 1-D NumPy array holding the grayscale pixel values in row-major
        order.

    Raises:
        OSError: If OpenCV cannot read an image from ``path``.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising,
        # which would otherwise surface as an opaque error inside cvtColor.
        raise OSError(f"could not read image: {path}")

    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # flatten() already returns a fresh ndarray, so no extra copy is needed.
    return gray_image.flatten()

def read_data(no_of_folders=40, no_of_images=10):
    """Load the face images into a data matrix with one row per image.

    Images are read from ``dataset/s<folder>/<image>.pgm`` for every folder
    number in ``1..no_of_folders`` and every image number in
    ``1..no_of_images``.

    Args:
        no_of_folders: Number of ``s<N>`` folders to read. Defaults to 40.
        no_of_images: Number of images to read from each folder. Defaults
            to 10.

    Returns:
        A tuple ``(data_matrix, label_vector)``. ``data_matrix`` is a float
        array with one flattened image per row; ``label_vector`` is a float
        array holding the folder number of each row.
    """
    # Local import keeps this block self-contained.
    import os

    rows = []
    labels = []
    for folder_counter in range(1, no_of_folders + 1):
        for image_counter in range(1, no_of_images + 1):
            # os.path.join yields the same backslash path on Windows and a
            # valid path on other platforms.
            image_path = os.path.join(
                "dataset", "s%d" % folder_counter, "%d.pgm" % image_counter
            )
            rows.append(process_image(image_path))
            labels.append(folder_counter)

    # Build the matrix once at the end; growing it with np.append copies the
    # whole matrix on every iteration.
    if rows:
        data_matrix = np.array(rows, dtype=np.float64)
    else:
        # Keep the empty-result shape the notebook expects (10304 columns).
        data_matrix = np.empty((0, 10304))
    label_vector = np.array(labels, dtype=np.float64)

    print(data_matrix.shape)

    return data_matrix, label_vector


def read_nonfaces(no_of_images):
    """Load the non-face images into a data matrix with one row per image.

    Images are read from ``processed_nonfaces_dataset/<i>.jpg`` for every
    ``i`` in ``1..no_of_images``.

    Args:
        no_of_images: Number of images to read.

    Returns:
        A float array with one flattened image per row.
    """
    # Local import keeps this block self-contained.
    import os

    # Collect the rows first and build the matrix once; np.append inside the
    # loop copies the whole matrix on every iteration.
    rows = [
        process_image(os.path.join("processed_nonfaces_dataset", "%d.jpg" % i))
        for i in range(1, no_of_images + 1)
    ]

    if not rows:
        # Keep the empty-result shape the notebook expects (10304 columns).
        return np.empty((0, 10304))
    return np.array(rows, dtype=np.float64)



# Load the faces dataset: 40 folders with 10 images in each.
dm, lv = read_data(no_of_folders=40, no_of_images=10)

# Load the first 10 non-face images and report the resulting matrix shape.
dm2 = read_nonfaces(no_of_images=10)
print(dm2.shape)


(400, 10304)
(10, 10304)


### Splitting data into training and testing sets

In [13]:


def split_data(data_matrix, label_vector):
    """Split samples and labels into test and training halves by row position.

    Rows at even indices (0, 2, 4, ...) become the test set and rows at odd
    indices (1, 3, 5, ...) become the training set.

    Args:
        data_matrix: Array with one sample per row.
        label_vector: Array with one label per sample, in the same order.

    Returns:
        A tuple ``(test_data, training_data, training_labels, test_labels)``.
        Note the order: both data arrays come first, then the training labels,
        then the test labels.
    """
    even_rows = slice(None, None, 2)
    odd_rows = slice(1, None, 2)

    test_data, test_labels = data_matrix[even_rows], label_vector[even_rows]
    training_data, training_labels = data_matrix[odd_rows], label_vector[odd_rows]

    return test_data, training_data, training_labels, test_labels



## PCA Algorithm

In [3]:

def PCA(training_data, alpha):
    """Project ``training_data`` onto its leading principal components.

    The components are the eigenvectors of the covariance matrix, ordered by
    decreasing eigenvalue. The smallest leading set whose eigenvalues sum to
    at least ``alpha`` of the eigenvalue total is kept. If no such set exists
    (for example because of floating-point rounding at ``alpha = 1``), all
    components are kept.

    Args:
        training_data: 2-D array with one sample per row.
        alpha: Fraction of the total variance to retain.

    Returns:
        A tuple ``(reduced_data, projection_matrix)``. ``projection_matrix``
        has one column per kept component; ``reduced_data`` is
        ``training_data`` multiplied by ``projection_matrix``.
    """
    mean_vector = np.mean(training_data, axis=0)
    centered_training_data = training_data - mean_vector

    # bias=True normalises by the number of samples instead of samples - 1.
    covariance_matrix = np.cov(np.transpose(centered_training_data), bias=True)

    # eigh is appropriate because the covariance matrix is symmetric.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    sorted_indices = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalues = eigenvalues[sorted_indices]
    sorted_eigenvectors = eigenvectors[:, sorted_indices]

    # Running share of the total variance explained by the first k components.
    explained_fraction = np.cumsum(sorted_eigenvalues) / np.sum(sorted_eigenvalues)

    # First position where the running share is no longer below alpha.
    reached = np.flatnonzero(~(explained_fraction < alpha))
    if reached.size:
        component_count = int(reached[0]) + 1
    else:
        # The threshold is never reached: keep every component rather than
        # indexing past the end of the eigenvalue array.
        component_count = sorted_eigenvalues.shape[0]

    projection_matrix = sorted_eigenvectors[:, :component_count]

    print("multiplying the two matrices")
    # The raw (uncentered) data is projected here; callers project their test
    # data with the same matrix in the same way, so the two stay consistent.
    reduced_dimensionality_data = np.transpose(projection_matrix).dot(np.transpose(training_data))

    return np.transpose(reduced_dimensionality_data), projection_matrix



## PCA Classification analysis

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# read_data requires the folder and image counts: 40 folders of 10 images.
data_matrix, label_vector = read_data(40, 10)

test_data, training_data, training_labels, test_labels = split_data(data_matrix, label_vector)

# Keep enough principal components to explain 95% of the variance.
reduced_dimensionality_data, projection_matrix = PCA(training_data, 0.95)

print("after PCA")

# Project the test samples with the matrix learned from the training data.
reduced_dimensionality_test_data = np.transpose(projection_matrix).dot(np.transpose(test_data))
reduced_dimensionality_test_data = np.transpose(reduced_dimensionality_test_data)


print("after reducing test data")

# Classify each test sample by its single nearest training sample.
knn_classifier = KNeighborsClassifier(n_neighbors=1)

knn_classifier.fit(reduced_dimensionality_data, training_labels)

print("after fit")

test_pred = knn_classifier.predict(reduced_dimensionality_test_data)

print("after prediction")

accuracy = accuracy_score(test_labels, test_pred)

print("Accuracy:", accuracy)


(400, 10304)


NameError: name 'split_data' is not defined

## Accuracy for every value of alpha using PCA.

As the table below shows, accuracy increases as alpha increases from 0.8 to 0.9.

At alpha = 0.95, however, the accuracy drops slightly. Possible reasons:
- Overfitting: the additional components may capture noise or irrelevant details in the data, which can lead to overfitting.
- Curse of dimensionality: with many features the data becomes sparse and distances between points become less meaningful.
- Not all variance is useful: PCA keeps the components that explain the most variance, but not all variance is useful for classification.

|  Alpha   | Accuracy |
| -------- | -------- |
| 0.8      | 0.93     |
| 0.85     | 0.94     |
| 0.9      | 0.945    |
| 0.95     | 0.935    |


## LDA 

In [20]:
import scipy as sc
import numpy as np

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def LDA(training_data, training_labels):
    """Project ``training_data`` onto its linear-discriminant directions.

    Builds the between-class scatter matrix ``S_b`` and the within-class
    scatter matrix ``S_w``, then keeps the eigenvectors of ``inv(S_w) @ S_b``
    that belong to the ``(number of classes - 1)`` largest eigenvalues.

    Args:
        training_data: 2-D NumPy array with one sample per row.
        training_labels: 1-D NumPy array with the class label of each row.

    Returns:
        A tuple ``(reduced_data, projection_matrix)``. ``projection_matrix``
        has one real-valued column per kept direction; ``reduced_data`` is
        ``training_data`` multiplied by ``projection_matrix``.

    Note:
        ``S_w`` is inverted directly with ``np.linalg.inv``, which raises
        ``numpy.linalg.LinAlgError`` when the matrix is singular.
    """
    # Import the submodule explicitly: a bare ``import scipy`` is not
    # guaranteed to expose ``scipy.linalg`` as an attribute on every version.
    from scipy import linalg as scipy_linalg

    unique_values, counts = np.unique(training_labels, return_counts=True)

    mean_vector = np.mean(training_data, axis=0)
    between_class_scatter = 0
    within_class_scatter = 0
    for class_label, class_size in zip(unique_values, counts):
        print(class_label, class_size)
        class_samples = training_data[training_labels == class_label]
        class_mean = np.mean(class_samples, axis=0)

        # Between-class scatter: class size times the outer product of the
        # offset between the class mean and the overall mean.
        mean_offset = class_mean - mean_vector
        between_class_scatter += class_size * np.outer(mean_offset, mean_offset)

        # Within-class scatter: spread of the samples around their class mean.
        centered_samples = class_samples - class_mean
        within_class_scatter += np.transpose(centered_samples).dot(centered_samples)

    eigenvalues, eigenvectors = scipy_linalg.eig(
        np.linalg.inv(within_class_scatter).dot(between_class_scatter)
    )

    # eig returns complex values; rank the directions by the real part, which
    # is the only part kept in the projection matrix below.
    sorted_indices = np.argsort(eigenvalues.real)[::-1]
    sorted_eigenvectors = eigenvectors[:, sorted_indices]

    projection_matrix = sorted_eigenvectors[:, :unique_values.shape[0] - 1]
    projection_matrix = np.real(projection_matrix)

    reduced_dimensionality_data = np.transpose(projection_matrix).dot(np.transpose(training_data))

    return np.transpose(reduced_dimensionality_data), projection_matrix



# Small two-class, two-feature sample used to check LDA by hand:
# the first five rows belong to class 1 and the last five to class 2.
training_data = np.array(
    [
        [4, 2], [2, 4], [2, 3], [3, 6], [4, 4],
        [9, 10], [6, 8], [9, 5], [8, 7], [10, 8],
    ]
)
training_labels = np.array([1] * 5 + [2] * 5)
LDA(training_data, training_labels)





1 5
2 5


(array([[ 4.46966918],
        [ 3.48662485],
        [ 3.06936143],
        [ 5.22993727],
        [ 5.30419602],
        [12.35170446],
        [ 8.79082087],
        [10.26538736],
        [10.19112861],
        [12.4259632 ]]),
 array([[0.90878558],
        [0.41726342]]))

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score



# read_data requires the folder and image counts: 40 folders of 10 images.
data_matrix, label_vector = read_data(40, 10)

test_data, training_data, training_labels, test_labels = split_data(data_matrix, label_vector)

# LDA is supervised, so it needs the training labels as well as the data.
reduced_dimensionality_data, projection_matrix = LDA(training_data, training_labels)

# Project the test samples with the matrix learned from the training data.
reduced_dimensionality_test_data = np.transpose(projection_matrix).dot(np.transpose(test_data))
reduced_dimensionality_test_data = np.transpose(reduced_dimensionality_test_data)

# Classify each test sample by its single nearest training sample.
knn_classifier = KNeighborsClassifier(n_neighbors=1)

knn_classifier.fit(reduced_dimensionality_data, training_labels)

test_pred = knn_classifier.predict(reduced_dimensionality_test_data)

accuracy = accuracy_score(test_labels, test_pred)

print("Accuracy:", accuracy)



(400, 10304)
Accuracy: 0.935


### Classifier Tuning

In [None]:
def test_with_k(reduced_dimensionality_data, training_labels, reduced_dimensionality_test_data, test_labels, k):
    """Fit a k-nearest-neighbours classifier and report its test accuracy.

    Args:
        reduced_dimensionality_data: Training samples used to fit the model.
        training_labels: Labels of the training samples.
        reduced_dimensionality_test_data: Samples to classify.
        test_labels: True labels of the test samples.
        k: Number of neighbours the classifier uses.

    Returns:
        The accuracy score of the predictions against ``test_labels``. The
        same value is also printed.
    """
    # fit() returns the fitted estimator, so the calls can be chained.
    predicted_labels = (
        KNeighborsClassifier(n_neighbors=k)
        .fit(reduced_dimensionality_data, training_labels)
        .predict(reduced_dimensionality_test_data)
    )

    accuracy = accuracy_score(test_labels, predicted_labels)
    print("Accuracy:", accuracy)
    return accuracy

#### LDA with k=1,3,5,7

In [None]:
import matplotlib.pyplot as plt

# read_data requires the folder and image counts: 40 folders of 10 images.
data_matrix, label_vector = read_data(40, 10)

test_data, training_data, training_labels, test_labels = split_data(data_matrix, label_vector)

# LDA is supervised, so it needs the training labels as well as the data.
reduced_dimensionality_data, projection_matrix = LDA(training_data, training_labels)

# Project the test samples with the matrix learned from the training data.
reduced_dimensionality_test_data = np.transpose(projection_matrix).dot(np.transpose(test_data))
reduced_dimensionality_test_data = np.transpose(reduced_dimensionality_test_data)

# Evaluate the classifier for k = 1, 3, 5, 7.
k_values = list(range(1, 8, 2))
LDA_performance_with_recpect_to_k = [
    test_with_k(reduced_dimensionality_data, training_labels, reduced_dimensionality_test_data, test_labels, k)
    for k in k_values
]

plt.plot(k_values, LDA_performance_with_recpect_to_k, marker='o', linestyle='-')

# Add labels and title
plt.xlabel('Number of neighbors (k)')
plt.ylabel('Accuracy')
plt.title('LDA: k-NN accuracy vs. k')

# Show the plot
plt.grid(True)
plt.show()

#### PCA with k=1,3,5,7

In [31]:
import matplotlib.pyplot as plt

# read_data requires the folder and image counts: 40 folders of 10 images.
data_matrix, label_vector = read_data(40, 10)

test_data, training_data, training_labels, test_labels = split_data(data_matrix, label_vector)

# PCA requires the variance fraction to retain; 0.95 matches the value used
# in the PCA classification cell.
reduced_dimensionality_data, projection_matrix = PCA(training_data, 0.95)

# Project the test samples with the matrix learned from the training data.
reduced_dimensionality_test_data = np.transpose(projection_matrix).dot(np.transpose(test_data))
reduced_dimensionality_test_data = np.transpose(reduced_dimensionality_test_data)

# Evaluate the classifier for k = 1, 3, 5, 7. The results go into the PCA
# list; appending to the LDA list would mix the two experiments and make the
# plotted series longer than the list of k values.
k_values = list(range(1, 8, 2))
PCA_performance_with_recpect_to_k = [
    test_with_k(reduced_dimensionality_data, training_labels, reduced_dimensionality_test_data, test_labels, k)
    for k in k_values
]

plt.plot(k_values, PCA_performance_with_recpect_to_k, marker='o', linestyle='-')

# Add labels and title
plt.xlabel('Number of neighbors (k)')
plt.ylabel('Accuracy')
plt.title('PCA: k-NN accuracy vs. k')

# Show the plot
plt.grid(True)
plt.show()

[[1, 2], [3, 4]]
