- [read data](#read-data)
- [reshape data](#reshape-data)
- [split data](#split-data)
- [PCA](#PCA)
- [K-Means Clustering](#K-Means-Clustering)
- [KNN](#KNN)

In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# Root folder of the face dataset; the loading cell expects sub-folders s1 ... s40 inside it.
dataset_path = "data"

## read data

In [5]:
images = []
labels = []

# One sub-folder per subject: <dataset_path>/s1 ... <dataset_path>/s40.
for label in range(1, 41):
    person_path = os.path.join(dataset_path, f"s{label}")
    # os.listdir returns entries in arbitrary, platform-dependent order. Sorting makes the
    # sample order (and therefore the index-parity train/test split further down) identical
    # on every run and every machine.
    # NOTE: anything cached from a differently ordered run (e.g. eigenvalues.npy /
    # eigenvectors.npy) should be regenerated after this change.
    for img_name in sorted(os.listdir(person_path)):
        img_path = os.path.join(person_path, img_name)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising; fail loudly
            # here rather than letting a None end up inside the image array.
            raise OSError(f"Could not read image: {img_path}")
        images.append(img)
        labels.append(label)

In [6]:
# Turn the Python lists built while loading into NumPy arrays.
images, labels = np.array(images), np.array(labels)

# Quick sanity check: number of samples and the shape of a single image.
print("Total images:", images.shape[0])
print("Image shape:", images[0].shape)

Total images: 400
Image shape: (112, 92)


## reshape data

In [9]:
# Flatten every 2-D image into a 1-D pixel vector (one row per image). The row length is
# inferred from the data instead of hard-coding 112 * 92 = 10304, so other image sizes work
# and re-running the cell on already flattened data leaves it unchanged.
images = images.reshape(len(images), -1)
print("Image shape:", images.shape)
print(images)

Image shape: (400, 10304)
[[ 48  49  45 ...  47  46  46]
 [ 34  34  33 ...  37  40  33]
 [ 60  60  62 ...  32  34  34]
 ...
 [129 127 133 ...  93  93  93]
 [125 119 124 ...  36  39  40]
 [119 120 120 ...  89  94  85]]


## split data

In [11]:
# Alternate the samples between the two sets: even indices -> testing, odd -> training.
# Strided slicing covers however many samples were loaded instead of assuming exactly 400.
# The results stay plain lists (of row vectors / scalar labels), as the following cells expect.
images_testing = list(images[0::2])
labels_testing = list(labels[0::2])

images_training = list(images[1::2])
labels_training = list(labels[1::2])

In [13]:
# Report the size of each split; images and labels of the same split must agree.
for caption, subset in (
    ("Test images:", images_testing),
    ("Train images:", images_training),
    ("Test labels:", labels_testing),
    ("Train labels:", labels_training),
):
    print(caption, len(subset))

Test images: 200
Train images: 200
Test labels: 200
Train labels: 200


In [15]:
# Convert the four split lists into NumPy arrays (the conversions are independent).
labels_testing, labels_training = np.array(labels_testing), np.array(labels_training)
images_testing, images_training = np.array(images_testing), np.array(images_training)

# Show the label vectors: testing first, then training, one per line.
print(labels_testing, labels_training, sep="\n")

[ 1  1  1  1  1  2  2  2  2  2  3  3  3  3  3  4  4  4  4  4  5  5  5  5
  5  6  6  6  6  6  7  7  7  7  7  8  8  8  8  8  9  9  9  9  9 10 10 10
 10 10 11 11 11 11 11 12 12 12 12 12 13 13 13 13 13 14 14 14 14 14 15 15
 15 15 15 16 16 16 16 16 17 17 17 17 17 18 18 18 18 18 19 19 19 19 19 20
 20 20 20 20 21 21 21 21 21 22 22 22 22 22 23 23 23 23 23 24 24 24 24 24
 25 25 25 25 25 26 26 26 26 26 27 27 27 27 27 28 28 28 28 28 29 29 29 29
 29 30 30 30 30 30 31 31 31 31 31 32 32 32 32 32 33 33 33 33 33 34 34 34
 34 34 35 35 35 35 35 36 36 36 36 36 37 37 37 37 37 38 38 38 38 38 39 39
 39 39 39 40 40 40 40 40]
[ 1  1  1  1  1  2  2  2  2  2  3  3  3  3  3  4  4  4  4  4  5  5  5  5
  5  6  6  6  6  6  7  7  7  7  7  8  8  8  8  8  9  9  9  9  9 10 10 10
 10 10 11 11 11 11 11 12 12 12 12 12 13 13 13 13 13 14 14 14 14 14 15 15
 15 15 15 16 16 16 16 16 17 17 17 17 17 18 18 18 18 18 19 19 19 19 19 20
 20 20 20 20 21 21 21 21 21 22 22 22 22 22 23 23 23 23 23 24 24 24 24 24
 25 25 25 25 25 26 26 26 

## PCA

In [137]:
def calc_eigen(Data_centered):
    """Eigen-decompose the feature covariance of ``Data_centered`` and cache it on disk.

    Rows of ``Data_centered`` are treated as observations and columns as variables
    (``rowvar=False``). The eigenvalues are written to ``eigenvalues.npy`` and the
    eigenvectors to ``eigenvectors.npy`` in the current working directory.
    Nothing is returned.
    """
    eigenvalues, eigenvectors = np.linalg.eigh(np.cov(Data_centered, rowvar=False))

    for filename, array in (
        ("eigenvalues.npy", eigenvalues),
        ("eigenvectors.npy", eigenvectors),
    ):
        np.save(filename, array)
    
class PCA:
    """Principal component analysis that keeps just enough components to reach a
    requested fraction of the total variance.

    The eigen-decomposition is read from ``eigenvalues.npy`` / ``eigenvectors.npy``
    (the files written by ``calc_eigen``). The cache is NOT keyed to the data: apart
    from a dimension check it is trusted to belong to the matrix passed to ``fit``,
    so delete the files whenever the training data changes.

    Attributes:
        variance_threshold: cumulative explained-variance ratio to reach (set by
            ``fit_transform``); ``None`` means "keep every component".
        mean: per-feature mean of the data given to ``fit``.
        components: matrix whose columns are the selected eigenvectors.
        explained_variance: eigenvalues belonging to the selected components.
    """

    def __init__(self):
        self.variance_threshold = None
        self.mean = None
        self.components = None
        self.explained_variance = None

    def _load_eigen(self, X_centered):
        """Return ``(eig_vals, eig_vecs)`` from the on-disk cache, creating it if absent."""
        try:
            eig_vals = np.load('eigenvalues.npy')
            eig_vecs = np.load('eigenvectors.npy')
        except FileNotFoundError:
            # No cache yet (fresh checkout / fresh kernel): compute it once. This is the
            # expensive step for high-dimensional data, which is why it is cached.
            calc_eigen(X_centered)
            eig_vals = np.load('eigenvalues.npy')
            eig_vecs = np.load('eigenvectors.npy')

        # A cache produced for data with another number of features cannot be used.
        n_features = X_centered.shape[-1]
        if eig_vecs.shape != (n_features, n_features) or eig_vals.shape != (n_features,):
            raise ValueError(
                "Cached eigen-decomposition does not match the data "
                f"({eig_vecs.shape} eigenvectors for {n_features} features); "
                "delete eigenvalues.npy / eigenvectors.npy and fit again."
            )
        return eig_vals, eig_vecs

    def fit(self, X):
        """Learn the mean and the principal components for ``X`` (rows = samples).

        Uses ``self.variance_threshold`` to decide how many components to keep.
        Raises ``ValueError`` if the cached decomposition has the wrong dimensions.
        """
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean

        eig_vals, eig_vecs = self._load_eigen(X_centered)

        # Order the eigenpairs from the largest to the smallest eigenvalue.
        sorted_idx = np.argsort(eig_vals)[::-1]
        eig_vals = eig_vals[sorted_idx]
        eig_vecs = eig_vecs[:, sorted_idx]

        # Share of the variance carried by each direction, e.g. [0.4, 0.3, 0.2, 0.1],
        # and its running total, e.g. [0.4, 0.7, 0.9, 1.0].
        explained_variance_ratio = eig_vals / np.sum(eig_vals)
        cumulative_variance = np.cumsum(explained_variance_ratio)

        if self.variance_threshold is None:
            # fit() called directly, without a threshold: keep everything.
            num_components = len(eig_vals)
        else:
            reached = cumulative_variance >= self.variance_threshold
            if reached.any():
                # First position where the running total reaches the threshold.
                num_components = int(np.argmax(reached)) + 1
            else:
                # argmax of an all-False mask is 0, which would silently keep a single
                # component; an unreachable threshold means "keep all of them".
                num_components = len(eig_vals)

        self.components = eig_vecs[:, :num_components]       # all rows, first num_components columns
        self.explained_variance = eig_vals[:num_components]

    def transform(self, X):
        """Centre ``X`` with the fitted mean and project it onto the kept components."""
        X_centered = X - self.mean
        return np.dot(X_centered, self.components)

    def fit_transform(self, X, variance_threshold):
        """Store ``variance_threshold``, fit on ``X`` and return the projection of ``X``."""
        self.variance_threshold = variance_threshold
        self.fit(X)
        return self.transform(X)


In [135]:
# Project the training set once per variance threshold and keep every projection,
# keyed by its threshold. The single `pca` object is re-fitted on each pass, so after
# the loop it holds the state of the last threshold only.
pca = PCA()
alphas = [0.8, 0.85, 0.9, 0.95]
D_pcas = {}
print(f"Original shape: {images_training.shape}")

for alpha in alphas:
    D_pca = D_pcas[alpha] = pca.fit_transform(images_training, alpha)
    print(f"Alpha: {alpha} Reduced shape: {D_pca.shape}")


Original shape: (200, 10304)
Alpha: 0.8 Reduced shape: (200, 36)
Alpha: 0.85 Reduced shape: (200, 52)
Alpha: 0.9 Reduced shape: (200, 76)
Alpha: 0.95 Reduced shape: (200, 117)


## K-Means Clustering

## KNN

In [None]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two equally shaped vectors.

    Args:
        x1, x2: array-likes of numbers with the same shape.

    Returns:
        A single non-negative float: sqrt(sum((x1 - x2) ** 2)).
    """
    # Work in float64: unsigned-integer pixel data would wrap around on subtraction.
    diff = np.asarray(x1, dtype=np.float64) - np.asarray(x2, dtype=np.float64)
    # Sum the squared differences over all components so that one scalar comes back,
    # not an element-wise array.
    distance = np.sqrt(np.sum(diff ** 2))
    return distance
    
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Args:
        images_training: training samples, one sample per entry along the first axis.
        labels_training: label of each training sample, in the same order.
        k: number of neighbours that vote; must be at least 1.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """

    def __init__(self, images_training, labels_training, k):
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")
        self.images_training = images_training
        self.labels_training = labels_training
        self.k = k

    def predict(self, imgs):
        """Return a list with the predicted label of every sample in ``imgs``."""
        predictions = [self._predict(img) for img in imgs]
        return predictions

    def _predict(self, img):
        """Return the majority label among the ``k`` training samples closest to ``img``."""
        # Float64 avoids wrap-around when the samples are unsigned-integer pixels.
        train = np.asarray(self.images_training, dtype=np.float64)
        query = np.asarray(img, dtype=np.float64)

        # One Euclidean distance per training sample, computed for all samples at once.
        diffs = train.reshape(len(train), -1) - query.reshape(-1)
        distances = np.sqrt(np.sum(diffs ** 2, axis=1))

        # Indices that would sort the distances, e.g. [4, 1, 3, 2] -> [1, 3, 2, 0];
        # a stable sort keeps equally distant samples in their training order.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        k_nearest_labels = [self.labels_training[i] for i in k_indices]

        # most_common(1) yields [(label, count)]; only the label is the prediction.
        # Equal counts keep first-seen order, so ties go to the nearest neighbour's label.
        return Counter(k_nearest_labels).most_common(1)[0][0]
