In [113]:
import numpy as np
from PIL import Image
from random import shuffle
from sklearn.neighbors import KNeighborsClassifier

In [114]:
# Load the two saved arrays that later cells use as the PCA and LDA
# projection matrices for the final test-set evaluation.
train_PCA_vecs, train_LDA_vecs = (
    np.load(npy_path)
    for npy_path in ('train_PCA_vecs.npy', 'train_LDA_vecs.npy')
)

In [115]:
# Inspect the dimensions of the loaded PCA matrix (rows x columns).
train_PCA_vecs.shape

(2576, 279)

In [116]:
# Inspect the dimensions of the loaded LDA matrix; its row count is what later
# cells use as the number of PCA columns to keep before applying LDA.
train_LDA_vecs.shape

(240, 39)

In [117]:
# Training set: samples 1..7 of each class 1..40, read as grayscale and
# flattened to one float vector per image.
train_imgs = []
train_labels = []  # class ids, 1...40
for class_id in range(1, 41):
    for sample_id in range(1, 8):
        img_path = 'hw2_data/hw2-2_data/{}_{}.png'.format(class_id, sample_id)
        pixels = np.asarray(Image.open(img_path).convert('L'))
        train_imgs.append(pixels.ravel().astype(float))
        train_labels.append(class_id)
train_imgs = np.array(train_imgs)

# Per-pixel mean over all training images; subtracted before projecting.
mean_img = train_imgs.mean(axis=0)

In [118]:
# Report the training matrix shape, the label count and the mean-image shape,
# one per line.
print(train_imgs.shape, len(train_labels), mean_img.shape, sep='\n')

(280, 2576)
280
(2576,)


In [119]:
# Test set: samples 8..10 of each class 1..40, read as grayscale and
# flattened to one float vector per image.
test_imgs = []
test_labels = []  # class ids, 1...40
for class_id in range(1, 41):
    for sample_id in range(8, 11):
        img_path = 'hw2_data/hw2-2_data/{}_{}.png'.format(class_id, sample_id)
        pixels = np.asarray(Image.open(img_path).convert('L'))
        test_imgs.append(pixels.ravel().astype(float))
        test_labels.append(class_id)
test_imgs = np.array(test_imgs)

In [120]:
# Report the test matrix shape and the label count, one per line.
print(test_imgs.shape, len(test_labels), sep='\n')

(120, 2576)
120


In [121]:
# Randomly partition the 7 per-class sample positions (0..6) into 3 validation
# folds. The shuffle uses a fixed seed so the fold assignment -- and therefore
# every cross-validation score computed from it -- is identical after a kernel
# restart.
SPLIT_SEED = 0
idxs = list(range(7))
np.random.RandomState(SPLIT_SEED).shuffle(idxs)
val_idxs = np.array_split(idxs,3)

In [122]:
# Display the folds: each array lists the per-class sample positions held out
# for validation in that fold.
val_idxs

[array([0, 4, 3]), array([6, 5]), array([2, 1])]

In [125]:
def KNN_acc(K, train_imgs_dim_reduced, train_labels, test_imgs_dim_reduced, test_labels):
    """Fit a K-nearest-neighbours classifier and return its accuracy.

    Args:
        K: number of neighbours passed to ``KNeighborsClassifier``.
        train_imgs_dim_reduced: training feature vectors, one row per sample.
        train_labels: labels of the training rows.
        test_imgs_dim_reduced: feature vectors to classify, one row per sample.
        test_labels: true labels of the rows in ``test_imgs_dim_reduced``.

    Returns:
        The fraction of test samples whose predicted label equals the true
        label (the value is returned, not printed).

    Raises:
        ValueError: if ``test_labels`` does not have one label per prediction.
    """
    neigh = KNeighborsClassifier(n_neighbors=K)
    neigh.fit(train_imgs_dim_reduced, train_labels)

    pred_labels = np.asarray(neigh.predict(test_imgs_dim_reduced))
    # Compare array against array so the check is element-wise whether the
    # caller passed a list or an ndarray of labels.
    true_labels = np.asarray(test_labels)
    if pred_labels.shape != true_labels.shape:
        # Without this guard a mismatch would broadcast or collapse to a single
        # boolean and yield a misleading score.
        raise ValueError(
            'test_labels has shape {} but {} predictions were made'.format(
                true_labels.shape, pred_labels.shape))
    return (pred_labels == true_labels).sum() / len(true_labels)

## PCA

In [127]:
# 3-fold cross-validation over the number of PCA components (N) and the number
# of KNN neighbours (K), followed by a test-set evaluation of the best pair.
NUM_FOLDS = 3
K_options = [1,3,5]
N_options = [3,10,39]
SAMPLES_PER_CLASS = 7  # train_imgs holds 7 consecutive images per class

# --- Per-fold preparation ----------------------------------------------------
# The train/validation split and the PCA basis depend only on the fold, so they
# are computed once here rather than once per (N, K) pair in the grid search.
all_labels = np.asarray(train_labels)
# Position of every training image within its class (0..6); val_idxs[fold]
# lists the positions held out for validation in that fold.
sample_pos = np.arange(len(train_imgs)) % SAMPLES_PER_CLASS
max_N = max(N_options)

pca_folds = []
for fold in range(NUM_FOLDS):
    is_val = np.isin(sample_pos, val_idxs[fold])
    my_train, my_val = train_imgs[~is_val], train_imgs[is_val]
    my_train_labels, my_val_labels = all_labels[~is_val], all_labels[is_val]
    my_mean_img = my_train.mean(axis=0)

    # The covariance matrix is symmetric, so use eigh: it returns real values
    # with eigenvalues in ascending order. Reorder to descending so that the
    # leading columns are the principal components with the largest variance
    # (np.linalg.eig gives no ordering guarantee and may return complex output).
    train_cov_matrix = np.cov(my_train, rowvar=False)
    train_eig_vals, train_eig_vecs = np.linalg.eigh(train_cov_matrix)
    train_eig_vecs = train_eig_vecs[:, np.argsort(train_eig_vals)[::-1]]

    # Project once onto the largest basis needed; the projection onto the
    # first N components is simply the first N columns of this result.
    top_vecs = train_eig_vecs[:, :max_N]
    train_imgs_PCA = (my_train - my_mean_img).dot(top_vecs)
    val_imgs_PCA = (my_val - my_mean_img).dot(top_vecs)
    pca_folds.append((train_imgs_PCA, my_train_labels, val_imgs_PCA, my_val_labels))

# --- Grid search -------------------------------------------------------------
# best_acc starts below any reachable accuracy so the first configuration
# always initialises best_N / best_K.
best_acc, best_N, best_K = -1, None, None
for N in N_options:
    for K in K_options:
        mean_acc = 0
        for fold_train, fold_train_labels, fold_val, fold_val_labels in pca_folds:
            mean_acc += KNN_acc(K, fold_train[:, :N], fold_train_labels,
                                fold_val[:, :N], fold_val_labels) / NUM_FOLDS
        print('N: {}   K: {}   val_acc: {}'.format(N, K, mean_acc))
        if mean_acc > best_acc:
            best_acc = mean_acc
            best_N = N
            best_K = K
print()
print('best_N: {}   best_K: {}   best_val_acc: {}'.format(best_N, best_K, best_acc))

# --- Final evaluation --------------------------------------------------------
# Uses the full training set with the PCA basis loaded from disk.
train_imgs_PCA = (train_imgs - mean_img).dot(train_PCA_vecs[:,0:best_N])
test_imgs_PCA = (test_imgs - mean_img).dot(train_PCA_vecs[:,0:best_N])
test_acc = KNN_acc(best_K, train_imgs_PCA, train_labels, test_imgs_PCA, test_labels)
print('test_acc:', test_acc)



N: 3   K: 1   val_acc: 0.6888888888888889
N: 3   K: 3   val_acc: 0.6124999999999999
N: 3   K: 5   val_acc: 0.5513888888888888
N: 10   K: 1   val_acc: 0.8888888888888888
N: 10   K: 3   val_acc: 0.7819444444444446
N: 10   K: 5   val_acc: 0.7013888888888888
N: 39   K: 1   val_acc: 0.9180555555555556
N: 39   K: 3   val_acc: 0.8722222222222222
N: 39   K: 5   val_acc: 0.7805555555555556

best_N: 39   best_K: 1   best_val_acc: 0.9180555555555556
test_acc: 0.9583333333333334


## LDA

In [108]:
N = 10
# Two-stage projection: first onto as many PCA columns as train_LDA_vecs has
# rows, then onto the first N LDA columns.
pca_basis = train_PCA_vecs[:, :train_LDA_vecs.shape[0]]
lda_basis = train_LDA_vecs[:, :N]
train_imgs_LDA = np.dot(np.dot(train_imgs - mean_img, pca_basis), lda_basis)
test_imgs_LDA = np.dot(np.dot(test_imgs - mean_img, pca_basis), lda_basis)

In [112]:
# Inspect the training matrix dimensions (images x flattened pixels).
train_imgs.shape

(280, 2576)

In [130]:
from numpy.linalg import inv

In [133]:
# 3-fold cross-validation over the number of LDA directions (N) and the number
# of KNN neighbours (K), followed by a test-set evaluation of the best pair.
NUM_FOLDS = 3
K_options = [1,3,5]
N_options = [3,10,39]
SAMPLES_PER_CLASS = 7  # train_imgs holds 7 consecutive images per class
NUM_CLASSES = 40       # labels run from 1 to 40

# --- Per-fold preparation ----------------------------------------------------
# The split, the PCA basis and the LDA basis depend only on the fold, so they
# are computed once here rather than once per (N, K) pair in the grid search.
all_labels = np.asarray(train_labels)
# Position of every training image within its class (0..6); val_idxs[fold]
# lists the positions held out for validation in that fold.
sample_pos = np.arange(len(train_imgs)) % SAMPLES_PER_CLASS
max_N = max(N_options)

lda_folds = []
for fold in range(NUM_FOLDS):
    is_val = np.isin(sample_pos, val_idxs[fold])
    my_train, my_val = train_imgs[~is_val], train_imgs[is_val]
    my_train_labels, my_val_labels = all_labels[~is_val], all_labels[is_val]
    my_mean_img = my_train.mean(axis=0)

    # The covariance matrix is symmetric, so use eigh: real output with
    # eigenvalues in ascending order. Reorder to descending so the leading
    # columns are the components with the largest variance (np.linalg.eig
    # gives no ordering guarantee and may return complex output).
    train_cov_matrix = np.cov(my_train, rowvar=False)
    train_eig_vals, train_eig_vecs = np.linalg.eigh(train_cov_matrix)
    train_eig_vecs = train_eig_vecs[:, np.argsort(train_eig_vals)[::-1]]

    # Reduce to (number of training images - number of classes) PCA dimensions
    # before LDA -- presumably so the within-class scatter S_W is invertible.
    eigenfaces_used = len(my_train) - NUM_CLASSES
    pca_basis = train_eig_vecs[:, :eigenfaces_used]
    train_imgs_PCA = (my_train - my_mean_img).dot(pca_basis)
    val_imgs_PCA = (my_val - my_mean_img).dot(pca_basis)

    # Within-class scatter: sum over classes of the scatter around the class mean.
    S_W = np.zeros((eigenfaces_used, eigenfaces_used))
    class_means = []
    for target_label in range(1, NUM_CLASSES + 1):
        x = train_imgs_PCA[my_train_labels == target_label]
        class_mean = x.mean(axis=0)
        class_means.append(class_mean)
        x_minus_u = x - class_mean
        S_W += x_minus_u.T.dot(x_minus_u)
    class_means = np.array(class_means)

    # Between-class scatter: scatter of the class means around the global mean.
    PCA_global_mean = train_imgs_PCA.mean(axis=0)
    u_class_minus_u_global = class_means - PCA_global_mean
    S_B = u_class_minus_u_global.T.dot(u_class_minus_u_global)

    # Eigenvectors of S_W^-1 S_B; solve() computes the product without forming
    # the explicit inverse. eig returns them in no particular order, so sort by
    # descending eigenvalue before taking the leading columns, and keep only
    # the real part (the original cast to float discarded the imaginary part).
    LDA_eig_vals, W = np.linalg.eig(np.linalg.solve(S_W, S_B))
    W = W[:, np.argsort(LDA_eig_vals.real)[::-1]].real

    # Project once onto the largest basis needed; the projection onto the
    # first N directions is simply the first N columns of this result.
    train_imgs_LDA = train_imgs_PCA.dot(W[:, :max_N])
    val_imgs_LDA = val_imgs_PCA.dot(W[:, :max_N])
    lda_folds.append((train_imgs_LDA, my_train_labels, val_imgs_LDA, my_val_labels))

# --- Grid search -------------------------------------------------------------
# best_acc starts below any reachable accuracy so the first configuration
# always initialises best_N / best_K.
best_acc, best_N, best_K = -1, None, None
for N in N_options:
    for K in K_options:
        mean_acc = 0
        for fold_train, fold_train_labels, fold_val, fold_val_labels in lda_folds:
            mean_acc += KNN_acc(K, fold_train[:, :N], fold_train_labels,
                                fold_val[:, :N], fold_val_labels) / NUM_FOLDS
        print('N: {}   K: {}   val_acc: {}'.format(N, K, mean_acc))
        if mean_acc > best_acc:
            best_acc = mean_acc
            best_N = N
            best_K = K
print()
print('best_N: {}   best_K: {}   best_val_acc: {}'.format(best_N, best_K, best_acc))

# --- Final evaluation --------------------------------------------------------
# Uses the full training set with the PCA and LDA bases loaded from disk.
train_imgs_LDA = (train_imgs - mean_img).dot(train_PCA_vecs[:,0:train_LDA_vecs.shape[0]]).dot(train_LDA_vecs[:,0:best_N])
test_imgs_LDA = (test_imgs - mean_img).dot(train_PCA_vecs[:,0:train_LDA_vecs.shape[0]]).dot(train_LDA_vecs[:,0:best_N])

test_acc = KNN_acc(best_K, train_imgs_LDA, train_labels, test_imgs_LDA, test_labels)
print('test_acc:', test_acc)



N: 3   K: 1   val_acc: 0.41944444444444445
N: 3   K: 3   val_acc: 0.4152777777777778
N: 3   K: 5   val_acc: 0.4138888888888889
N: 10   K: 1   val_acc: 0.7833333333333332
N: 10   K: 3   val_acc: 0.7833333333333332
N: 10   K: 5   val_acc: 0.786111111111111
N: 39   K: 1   val_acc: 0.9236111111111112
N: 39   K: 3   val_acc: 0.9291666666666667
N: 39   K: 5   val_acc: 0.9319444444444445

best_N: 39   best_K: 5   best_val_acc: 0.9319444444444445


ValueError: Found input variables with inconsistent numbers of samples: [280, 200]

In [136]:
# Re-run of the final LDA test evaluation (the same two statements that end the
# LDA cross-validation cell); relies on best_K, train_imgs_LDA and
# test_imgs_LDA still being present in the kernel from that cell.
test_acc = KNN_acc(best_K, train_imgs_LDA, train_labels, test_imgs_LDA, test_labels)
print('test_acc:', test_acc)

test_acc: 0.9166666666666666
