## Prepare for learning

In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import scipy.io as sio
from google.colab import drive
from scipy import ndimage
from sklearn.neighbors import KNeighborsClassifier

### Load Dataset

In [None]:
# Mount Google Drive so the dataset stored there is visible to the Colab runtime.
drive.mount('/content/drive')
mat = sio.loadmat('/content/drive/MyDrive/face.mat')

# 'X' holds one flattened image per column; the first row of 'l' is the
# matching flat vector of labels.
imgs_raw, labels = mat['X'], mat['l'][0]
print(imgs_raw.shape, labels.shape)

Mounted at /content/drive
(2576, 520) (520,)


In [None]:
# View every column of imgs_raw (one flattened image) as a 46x56 2-D array.
imgs = [flat_img.reshape(46, 56) for flat_img in imgs_raw.T]

### Split data into train and test datasets (8:2, with the train set further split into four equal subsets 2:2:2:2)

In [None]:
# Split the data into a train set and a test set (8:2 within every run of ten
# images); the train share is further divided into four equal subsets
# (2:2:2:2) that are later fed to the incremental PCA one at a time.

SPLIT_SEED = 0                                  # fixed so the split survives Restart & Run All
FOLD_PATTERN = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4]   # fold id for ten consecutive images; 0 = test
N_GROUPS = 52                                   # number of ten-image runs in the dataset

rng = np.random.default_rng(SPLIT_SEED)

# Shuffle the fold pattern independently for every run of ten images.
# NOTE(review): this balances the folds per identity only if the images of one
# identity are stored as ten consecutive columns - confirm against the dataset.
idx = np.concatenate([rng.permutation(FOLD_PATTERN) for _ in range(N_GROUPS)])
assert idx.size == labels.size == imgs_raw.shape[1], "fold ids must cover every image exactly once"

is_test = idx == 0

# Boolean column masks select the images of each fold directly.
train_imgs_raw_1 = imgs_raw[:, idx == 1]
train_imgs_raw_2 = imgs_raw[:, idx == 2]
train_imgs_raw_3 = imgs_raw[:, idx == 3]
train_imgs_raw_4 = imgs_raw[:, idx == 4]
train_imgs_raw = imgs_raw[:, ~is_test]
test_imgs_raw = imgs_raw[:, is_test]

train_labels_1 = labels[idx == 1]
train_labels_2 = labels[idx == 2]
train_labels_3 = labels[idx == 3]
train_labels_4 = labels[idx == 4]
train_labels = labels[~is_test]
test_labels = labels[is_test]

# Report the sizes of the arrays that the labels actually name (the full train
# set, not just its first subset).
print("Data Dimension")
print("train_imgs_raw : ", train_imgs_raw.shape[0], " * ", train_imgs_raw.shape[1])
print("test_imgs_raw  : ", test_imgs_raw.shape[0], " * ", test_imgs_raw.shape[1])
print("train_labels : ", len(train_labels))
print("test_labels  : ", len(test_labels))

Data Dimension
train_imgs_raw :  2576  *  104
test_imgs_raw  :  2576  *  104
train_labels :  104
test_labels  :  104


## 2. Incremental PCA

In [None]:
def PCA(train_imgs_raw, M):
  """Fit a PCA eigenspace to a set of column-wise samples.

  The eigenvectors are obtained from the small N x N matrix A^T A (A being the
  mean-centred data) and then mapped back to the original space and
  normalised to unit length.

  Args:
    train_imgs_raw: 2-D array with one sample per column.
    M: number of leading components to keep; must be at least 1. If M exceeds
      the number of samples, every available component is kept.

  Returns:
    avg: per-row mean of the samples.
    N: number of samples (columns).
    P: matrix whose columns are the kept unit eigenvectors, ordered by
      ascending eigenvalue.
    S: P * diag(eigenvalues) * P^T, i.e. the reconstruction of A A^T from the
      kept components. The eigenvalues are NOT divided by N.
    T: seconds spent in the timed section (the normalisation step).

  Raises:
    ValueError: if M is smaller than 1.
  """
  # A slice such as x[-0:] selects everything, so M == 0 would silently return
  # all components instead of none; reject it up front.
  if M < 1:
    raise ValueError("M must be a positive integer")

  avg = np.mean(train_imgs_raw, axis=1)
  A = (train_imgs_raw.T - avg).T
  N = train_imgs_raw.shape[1]

  # eigh returns eigenvalues in ascending order, so the last M are the largest.
  S_low = np.dot(A.T, A)
  w_low, v_low = np.linalg.eigh(S_low)
  v_low_to_high = np.dot(A, v_low)

  # NOTE(review): only the normalisation is timed, not the eigen-decomposition;
  # confirm this is the cost the charts are meant to show.
  start = time.perf_counter()
  v_low_to_high = v_low_to_high / np.linalg.norm(v_low_to_high, axis=0)
  stop = time.perf_counter()

  P = v_low_to_high[:, -M:]
  L = np.diag(w_low[-M:])
  S = np.dot(np.dot(P, L), P.T)

  return avg, N, P, S, stop - start

In [None]:
def increment_PCA(D):
  """Build a D-dimensional eigenspace by merging the training subsets one by one.

  Starts from a PCA model of train_imgs_raw_1 and successively merges in the
  models of train_imgs_raw_2, train_imgs_raw_3 and train_imgs_raw_4 (all read
  from module scope).

  Args:
    D: number of components kept for every per-subset model and for the
      merged model after each merge.

  Returns:
    avg: mean of all merged samples.
    N: total number of merged samples.
    P: matrix whose columns are the D leading eigenvectors of the merged
      model, ordered by ascending eigenvalue.
    S: merged scatter matrix (not divided by the sample count).
    T: accumulated seconds of the timed sections of every PCA call and merge.
  """
  avg1, N1, P1, S1, T1 = PCA(train_imgs_raw_1, D)

  for subset in (train_imgs_raw_2, train_imgs_raw_3, train_imgs_raw_4):
    avg2, N2, P2, S2, T2 = PCA(subset, D)

    N3 = N1 + N2
    avg3 = (N1*avg1 + N2*avg2)/N3

    # PCA() returns un-normalised scatter matrices, which combine additively
    # plus a rank-one term for the shift between the two means. The means are
    # 1-D, so np.outer is required: np.dot would give a scalar here.
    avg_diff = avg1 - avg2
    S3 = S1 + S2 + (N1*N2/N3)*np.outer(avg_diff, avg_diff)

    # Orthonormal basis spanning both eigenspaces and the mean difference.
    start = time.perf_counter()
    Phi, _ = np.linalg.qr(np.column_stack((P1, P2, avg_diff)))
    stop = time.perf_counter()

    # Solve the small eigenproblem inside span(Phi); eigh sorts ascending, so
    # the last D columns are the leading directions of the merged model.
    _, R = np.linalg.eigh(np.dot(np.dot(Phi.T, S3), Phi))
    P3 = np.dot(Phi, R[:, -D:])
    T3 = T1 + T2 + (stop - start)

    avg1, N1, P1, S1, T1 = avg3, N3, P3, S3, T3

  # Return only after every subset has been merged.
  return avg1, N1, P1, S1, T1

In [None]:
# Baseline: ordinary (batch) PCA fitted on the whole training set in one go.
scoreChartInfo, reconstructionErrorChartInfo = [], []

m = 150
avg_img = train_imgs_raw.mean(axis=1)
A = (train_imgs_raw.T-avg_img).T

# Eigen-decompose the small matrix A^T A; only this call is timed.
S_low = np.dot(A.T, A)
start_time = time.time()
w_low, v_low = np.linalg.eigh(S_low)
end_time = time.time()

# Map the eigenvectors back to image space, scale them to unit length and keep
# the last m rows (eigh sorts ascending, so these have the largest eigenvalues).
v_low_to_high = np.dot(A, v_low)
v_low_to_high /= np.linalg.norm(v_low_to_high, axis=0)
v_low_to_high = v_low_to_high.T
v_final = v_low_to_high[-m:]

train_proj = np.dot(train_imgs_raw.T-avg_img, v_final.T)
test_proj = np.dot(test_imgs_raw.T-avg_img, v_final.T)

# Score a k-NN classifier on the projected data for k = 1..9.
for i in range(1, 10):
  classifier = KNeighborsClassifier(n_neighbors=i).fit(train_proj, train_labels)

  train_score = classifier.score(train_proj, train_labels)
  test_score = classifier.score(test_proj, test_labels)

  scoreChartInfo.extend([
    {"KNN Classifier Score": train_score, "k": i, "version": "original", "dataset": "train"},
    {"KNN Classifier Score": test_score, "k": i, "version": "original", "dataset": "test"},
  ])

  print(f"For {i}NN classifier, train score is {train_score} and test score is {test_score}")

# Reconstruct the images from their projections and average the per-image L2 error.
train_reconst = np.dot(train_proj, v_final) + avg_img
test_reconst = np.dot(test_proj, v_final) + avg_img

train_error = np.linalg.norm(train_reconst-train_imgs_raw.T, axis=1).mean()
test_error = np.linalg.norm(test_reconst-test_imgs_raw.T, axis=1).mean()

time_spent = end_time - start_time
reconstructionErrorChartInfo.extend([
  {"Reconstruction Error": train_error, "Time Taken": time_spent, "version": "original", "dataset": "train"},
  {"Reconstruction Error": test_error, "Time Taken": time_spent, "version": "original", "dataset": "test"},
])

print(f">>>Reconstruction train error is {train_error}\n>>>Reconstruction test error is {test_error}\n>>>Time taken: {time_spent}s\n")

For 1NN classifier, train score is 1.0 and test score is 0.5096153846153846
For 2NN classifier, train score is 0.7668269230769231 and test score is 0.47115384615384615
For 3NN classifier, train score is 0.7451923076923077 and test score is 0.4423076923076923
For 4NN classifier, train score is 0.6995192307692307 and test score is 0.41346153846153844
For 5NN classifier, train score is 0.6610576923076923 and test score is 0.41346153846153844
For 6NN classifier, train score is 0.6322115384615384 and test score is 0.4326923076923077
For 7NN classifier, train score is 0.6033653846153846 and test score is 0.38461538461538464
For 8NN classifier, train score is 0.5769230769230769 and test score is 0.3942307692307692
For 9NN classifier, train score is 0.5504807692307693 and test score is 0.36538461538461536
>>>Reconstruction train error is 405.11380685802385
>>>Reconstruction test error is 642.2050579158112
>>>Time taken: 0.07824182510375977s



In [None]:
# Repeat the k-NN and reconstruction evaluation for incrementally built eigenspaces.
D_list = [10, 50, 70, 150]
for D in D_list:
  print(f"Incremental PCA for dimension: {D}")
  avg, N, P, S, T = increment_PCA(D)
  version = f"incremental(d={D})"

  # Project both splits onto the returned basis after removing the returned mean.
  train_proj = np.dot(train_imgs_raw.T-avg, P)
  test_proj = np.dot(test_imgs_raw.T-avg, P)

  for i in range(1, 10):
    classifier = KNeighborsClassifier(n_neighbors=i).fit(train_proj, train_labels)

    train_score = classifier.score(train_proj, train_labels)
    test_score = classifier.score(test_proj, test_labels)

    print(f"    For {i}NN classifier, train score is {train_score} and test score is {test_score}")
    scoreChartInfo.extend([
      {"KNN Classifier Score": train_score, "k": i, "version": version, "dataset": "train"},
      {"KNN Classifier Score": test_score, "k": i, "version": version, "dataset": "test"},
    ])

  # Reconstruct from the projections and average the per-image L2 error.
  train_reconst = np.dot(train_proj, P.T) + avg
  test_reconst = np.dot(test_proj, P.T) + avg

  train_error = np.linalg.norm(train_reconst-train_imgs_raw.T, axis=1).mean()
  test_error = np.linalg.norm(test_reconst-test_imgs_raw.T, axis=1).mean()
  print(f"  Reconstruction train error is {train_error}\n  Reconstruction test error is {test_error}")
  print(f"  Training time is {T}\n\n")

  reconstructionErrorChartInfo.extend([
    {"Reconstruction Error": train_error, "Time Taken": T, "version": version, "dataset": "train"},
    {"Reconstruction Error": test_error, "Time Taken": T, "version": version, "dataset": "test"},
  ])

Incremental PCA for dimension: 10
    For 1NN classifier, train score is 1.0 and test score is 0.38461538461538464
    For 2NN classifier, train score is 0.7211538461538461 and test score is 0.3942307692307692
    For 3NN classifier, train score is 0.6706730769230769 and test score is 0.41346153846153844
    For 4NN classifier, train score is 0.6322115384615384 and test score is 0.38461538461538464
    For 5NN classifier, train score is 0.6105769230769231 and test score is 0.38461538461538464
    For 6NN classifier, train score is 0.5961538461538461 and test score is 0.36538461538461536
    For 7NN classifier, train score is 0.5552884615384616 and test score is 0.3557692307692308
    For 8NN classifier, train score is 0.5192307692307693 and test score is 0.3557692307692308
    For 9NN classifier, train score is 0.4951923076923077 and test score is 0.3269230769230769
  Reconstruction train error is 1118.241203422458
  Reconstruction test error is 1141.7549975682223
  Training time is 0.

In [None]:
# Bar chart of the recorded time per PCA variant, one panel per dataset split.
# (plotly.express is imported as px in the notebook's top import cell.)
px.bar(
  reconstructionErrorChartInfo,
  x="version",
  y="Time Taken",
  facet_col="dataset",
)

In [None]:
# Bar chart of the mean reconstruction error per PCA variant, one panel per dataset split.
px.bar(
  reconstructionErrorChartInfo,
  x="version",
  y="Reconstruction Error",
  facet_col="dataset",
)

In [None]:
# Line chart of k-NN accuracy against k, one line per PCA variant and one panel per dataset split.
px.line(
  scoreChartInfo,
  x="k",
  y="KNN Classifier Score",
  color="version",
  facet_col="dataset",
)