In [1]:
import glob
import os

%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import sklearn.decomposition

import imageio
import numpy as np
import math

In [2]:
# Directory (relative to the notebook) that is handed to ex6_load_dataset below.
dataset_path = './toy-dataset/'

In [3]:
def read_pgm(f):
    """Read the pixel raster of a binary PGM image from an open file object.

    The header is consumed line by line: the magic-number line, a
    ``width height`` line and the maximum-grey-value line.  Header lines that
    start with ``#`` are treated as comments and skipped.  The raster is then
    read as one byte per pixel, so only images whose samples fit in a single
    byte are decoded correctly.

    Parameters
    ----------
    f : binary file object
        Must be opened in binary mode and positioned at the start of the
        image; ``readline`` and ``read`` are used.

    Returns
    -------
    list of list of int
        ``height`` rows, each holding ``width`` pixel values.

    Raises
    ------
    ValueError
        If a header field is not an integer, the size line has fewer than two
        fields, a dimension is negative, or the raster holds fewer than
        ``width * height`` bytes.
    """
    def next_header_line():
        # Comment lines may be interleaved with the header fields; skip them.
        line = f.readline()
        while line.startswith(b'#'):
            line = f.readline()
        return line

    next_header_line()  # magic-number line; its content is not validated

    dims = [int(token) for token in next_header_line().split()]
    if len(dims) < 2:
        raise ValueError("PGM size line must contain width and height")
    # PGM lists the width (columns) first and the height (rows) second.
    width, height = dims[0], dims[1]
    if width < 0 or height < 0:
        raise ValueError("PGM dimensions must be non-negative")

    # The maximum grey value is parsed so a malformed header still fails
    # here, but the value itself is not needed for one-byte samples.
    int(next_header_line())

    expected = width * height
    raster = f.read(expected)
    if len(raster) != expected:
        raise ValueError(
            "PGM raster is truncated: expected %d bytes, got %d"
            % (expected, len(raster))
        )

    # Indexing/iterating bytes yields ints, so each slice becomes one row.
    return [list(raster[r * width:(r + 1) * width]) for r in range(height)]

In [4]:
def ex6_load_dataset(path):
    """Load the PGM images found below ``path`` and group them by class.

    Every directory name encountered while walking ``path`` becomes a class
    label.  A file is assigned to each class whose label occurs as a
    substring of the file name; files matching no label are skipped without
    being opened.  Each image is parsed with ``read_pgm`` and scaled by its
    own maximum so the brightest pixel is 1.0 (an all-zero image is left as
    zeros instead of being divided by zero).

    Parameters
    ----------
    path : str
        Root directory of the dataset.

    Returns
    -------
    dict
        Maps each directory name to a NumPy array built from the list of
        normalised images collected for it (an empty array if none matched).
    """
    data = {}
    # os.walk is top-down by default, so a directory's name is registered
    # before the files inside it are visited.
    for root, dirs, files in os.walk(path):
        for d in dirs:
            # setdefault keeps images already collected under this label if
            # the same directory name shows up again deeper in the tree.
            data.setdefault(d, [])
        for name in files:
            labels = [d for d in data if d in name]
            if not labels:
                continue
            # The context manager closes the handle even if parsing fails.
            with open(os.path.join(root, name), 'rb') as f:
                pixels = np.array(read_pgm(f), dtype=float)
            peak = pixels.max()
            if peak > 0:
                pixels = pixels / peak
            for label in labels:
                data[label].append(pixels)
    return {key: np.array(images) for key, images in data.items()}

In [5]:
# Load the dataset: a dict mapping each directory name to an array of normalised images.
data = ex6_load_dataset(dataset_path)

In [6]:
def ex6_split_dataset(data, test_fraction):
    """Randomly split every class into training and test samples.

    For each class, ``int(n * test_fraction)`` samples go to the test set and
    the remainder to the training set; the assignment is drawn from NumPy's
    global random state via ``np.random.permutation``.  The input dictionary
    and its arrays are not modified, so the function can be called repeatedly
    on the same ``data``.

    Parameters
    ----------
    data : dict
        Maps a class label to an array whose first axis indexes the samples.
    test_fraction : float
        Fraction of each class reserved for testing; must lie in ``[0, 1]``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Training and test sets as object arrays of shape ``(n_samples, 2)``.
        Column 0 holds the sample with a leading singleton axis (shape
        ``(1, *sample_shape)``), column 1 holds the class label.

    Raises
    ------
    ValueError
        If ``test_fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= test_fraction <= 1:
        raise ValueError("test_fraction must lie in [0, 1]")

    def pack(samples):
        # Fill an object array cell by cell: passing a list of
        # (ndarray, label) tuples straight to np.array is a ragged
        # construction, which recent NumPy versions reject.
        packed = np.empty((len(samples), 2), dtype=object)
        for row, (image, label) in enumerate(samples):
            packed[row, 0] = image
            packed[row, 1] = label
        return packed

    training_data = []
    test_data = []

    for key, values in data.items():
        values = np.asarray(values)
        n_samples = values.shape[0]
        test_size = int(n_samples * test_fraction)
        train_size = n_samples - test_size

        # One permutation yields a uniformly random partition without
        # shuffling or deleting from the caller's array.
        order = np.random.permutation(n_samples)
        for i in order[:train_size]:
            # values[[i]] is a copy that keeps the leading singleton axis.
            training_data.append((values[[i]], key))
        for i in order[train_size:]:
            test_data.append((values[[i]], key))

    return pack(training_data), pack(test_data)

In [7]:
# Randomly split every class, reserving half of its samples for testing.
train, test = ex6_split_dataset(data, 0.5)

In [9]:
def ex6_feature_extraction_simple(training_data, test_data):
    """Turn every sample into a flat raw-pixel feature vector.

    Parameters
    ----------
    training_data, test_data : sequence of (image, label)
        Each element is indexable, with the image at position 0 and the
        label at position 1.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Object arrays of shape ``(n_samples, 2)``; column 0 holds the
        flattened 1-D image, column 1 the unchanged label.
    """
    def flatten_samples(samples):
        samples = list(samples)
        # Fill an object array cell by cell: np.array on a list of
        # (ndarray, label) tuples is a ragged construction, which recent
        # NumPy versions reject.
        features = np.empty((len(samples), 2), dtype=object)
        for row, sample in enumerate(samples):
            features[row, 0] = np.array(sample[0]).flatten()
            features[row, 1] = sample[1]
        return features

    return flatten_samples(training_data), flatten_samples(test_data)

In [10]:
# Raw-pixel features: each image flattened to a 1-D vector and paired with its label.
simple_train, simple_test = ex6_feature_extraction_simple(train, test)

In [11]:
def ex6_feature_extraction_pca(training_data, test_data, k=5):
    """Project samples onto the top-``k`` principal components of the training set.

    The per-feature mean and the principal axes are estimated from the
    training samples only; both training and test samples are centred with
    that training mean before being projected.

    Parameters
    ----------
    training_data, test_data : sequence of (image, label)
        Each element is indexable with the image at position 0; images are
        flattened to vectors of a common length ``p``.
    k : int, optional
        Number of principal components to keep (``1 <= k <= p``).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Projected training features of shape ``(k, n_train)`` and projected
        test features of shape ``(k, n_test)``; row ``i`` corresponds to the
        component with the ``i``-th largest eigenvalue.

    Raises
    ------
    ValueError
        If there are no training samples or ``k`` is outside ``[1, p]``.
    """
    def as_columns(samples, n_features=0):
        # Stack flattened samples as columns -> matrix of shape p x n.
        rows = [np.asarray(sample[0], dtype=float).ravel() for sample in samples]
        if not rows:
            return np.zeros((n_features, 0))
        return np.array(rows).T

    # training matrix of shape p x n
    training_data_matrix = as_columns(training_data)
    n_features, n_train = training_data_matrix.shape
    if n_train == 0:
        raise ValueError("training_data must contain at least one sample")
    if not 1 <= k <= n_features:
        raise ValueError("k must satisfy 1 <= k <= number of features")

    # mean of shape p x 1: one mean per feature, taken across the samples
    mean = training_data_matrix.mean(axis=1, keepdims=True)
    centered_train = training_data_matrix - mean

    # cov matrix of shape p x p (unbiased; the max() avoids 0/0 for n == 1)
    cov_matrix = np.dot(centered_train, centered_train.T) / max(n_train - 1, 1)

    # eigh is meant for symmetric matrices: real output, eigenvalues in
    # ascending order, eigenvectors stored as the COLUMNS of eig_vec.
    eig_val, eig_vec = np.linalg.eigh(cov_matrix)

    # matrix a of shape p x k holding the directions of largest variance
    top = np.argsort(eig_val)[::-1][:k]
    a = eig_vec[:, top]

    # project training data -> k x n_train
    proj_train_features = np.dot(a.T, centered_train)

    # test data is centred with the TRAINING mean before projecting
    test_data_matrix = as_columns(test_data, n_features)
    proj_test_features = np.dot(a.T, test_data_matrix - mean)

    return proj_train_features, proj_test_features

In [12]:
# PCA features for both splits, using the function's default k=5.
pca_train, pca_test = ex6_feature_extraction_pca(train, test)