In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA as SklearnPCA
from sklearn.preprocessing import StandardScaler

# Principal Component Analysis

In [2]:
class PCA:
    """Principal Component Analysis via SVD of the feature covariance matrix.

    Parameters
    ----------
    k : int
        Number of principal components (reduced features) to keep.
    scaling : bool, default False
        If True, standardize every feature to zero mean and unit variance
        before the decomposition. If False, the data is only mean-centred.

    Attributes (set by ``fit_transform``)
    -------------------------------------
    data_cov : covariance matrix of the (scaled or centred) data,
        shape (#features, #features).
    eig_vec : principal axes stored as columns, shape (#features, #features).
    eig_val : variance along each axis in descending order, shape (#features, ).
    """

    def __init__(self, k, scaling=False):

        # scalar : #reduced features
        self.k = k
        self.scaling = scaling

    def standardScaler(self, data):
        """Standardize each column of ``data`` to zero mean and unit variance.

        ``data`` has shape (#samples, #features); the result has the same shape.
        Constant columns are mapped to all zeros instead of NaN.
        """

        # shape(#features, )
        data_mean = data.mean(axis=0)
        data_std = data.std(axis=0)

        # A constant feature has zero std; divide by 1 instead so it becomes
        # a column of zeros rather than 0/0 = NaN (which would break the SVD).
        data_std = np.where(data_std == 0, 1.0, data_std)

        # shape(#samples, #features)
        scaled_data = (data - data_mean) / data_std

        return scaled_data

    def covariance(self, data):
        """Return the unbiased covariance matrix of ``data``.

        ``data`` has shape (#features, #samples); each row (feature) is
        centred on its own mean and the result, of shape
        (#features, #features), is normalized by (#samples - 1).
        """

        # shape(#features, #samples)
        data = (data.T - data.T.mean(axis=0)).T

        # shape(#features, #features)
        return (data.dot(data.T))/(data.shape[1]-1)

    def fit_transform(self, data):
        """Fit the principal axes on ``data`` and project ``data`` onto them.

        Parameters
        ----------
        data : array of shape (#samples, #features)

        Returns
        -------
        Array of shape (#samples, k) with the scores on the first ``k``
        principal components.
        """

        # data : shape(#samples, #features)

        if self.scaling:
            # shape(#samples, #features)
            data = self.standardScaler(data)
        else:
            # The covariance is computed on centred data, so the projection
            # must use centred data as well; otherwise every score carries a
            # constant offset that depends on the feature means.
            data = data - data.mean(axis=0)

        # shape(#features, #features)
        self.data_cov = self.covariance(data.T)

        # eig_vec : shape(#features, #features)
        # eig_val : shape(#features, )
        # The covariance matrix is symmetric positive semi-definite, so its
        # SVD yields its eigenvectors and eigenvalues.
        self.eig_vec, self.eig_val, _ = np.linalg.svd(self.data_cov)
        # The sign of each axis is arbitrary; keep the flipped convention so
        # previously produced scores do not change sign.
        self.eig_vec = (-1) * self.eig_vec

        # np.linalg.svd returns singular values in descending order, so the
        # first k columns are already the top-k components.
        # shape(#samples, # reduced features)
        return data.dot(self.eig_vec[:, :self.k])

### Data Preparation

In [3]:
# Load the Iris table and keep every column except the class label as the
# feature matrix X.
df = pd.read_csv('data/Iris.csv')
X = df.drop(columns=['Species']).values
print(X.shape)

(150, 4)


#### Feature scaling

In [4]:
# Standardize every feature to zero mean and unit variance; X_std is the
# input for the scikit-learn cross-check further down.
X_std = StandardScaler().fit(X).transform(X)
print(X_std.shape)

(150, 4)


### Train

Reducing the data from 4 dimensions to 2 dimensions.

In [5]:
# Fit the PCA class defined above on the raw features; scaling=True makes it
# standardize the data internally before the decomposition.
p = PCA(k=2, scaling=True)
X_pca = p.fit_transform(X)
print(X_pca.shape)
X_pca

(150, 2)


array([[-2.26454173e+00,  5.05703903e-01],
       [-2.08642550e+00, -6.55404729e-01],
       [-2.36795045e+00, -3.18477311e-01],
       [-2.30419716e+00, -5.75367713e-01],
       [-2.38877749e+00,  6.74767397e-01],
       [-2.07053681e+00,  1.51854856e+00],
       [-2.44571134e+00,  7.45626750e-02],
       [-2.23384186e+00,  2.47613932e-01],
       [-2.34195768e+00, -1.09514636e+00],
       [-2.18867576e+00, -4.48629048e-01],
       [-2.16348656e+00,  1.07059558e+00],
       [-2.32737775e+00,  1.58587455e-01],
       [-2.22408272e+00, -7.09118158e-01],
       [-2.63971626e+00, -9.38281982e-01],
       [-2.19229151e+00,  1.88997851e+00],
       [-2.25146521e+00,  2.72237108e+00],
       [-2.20275048e+00,  1.51375028e+00],
       [-2.19017916e+00,  5.14304308e-01],
       [-1.89407429e+00,  1.43111071e+00],
       [-2.33994907e+00,  1.15803343e+00],
       [-1.91455639e+00,  4.30465163e-01],
       [-2.20464540e+00,  9.52457317e-01],
       [-2.77416979e+00,  4.89517027e-01],
       [-1.

# Cross check with sklearn

### Train

Reducing the data from 4 dimensions to 2 dimensions.

In [6]:
# SklearnPCA is scikit-learn's PCA under an alias, so the notebook's own PCA
# class is not shadowed and the earlier cells can still be re-run.
pc = SklearnPCA(n_components=2)

# Store the reference result under its own name instead of overwriting X_pca,
# so both projections stay available for comparison.
X_pca_sklearn = pc.fit_transform(X_std)
print(X_pca_sklearn.shape)

# The sign of each principal component is arbitrary, so compare magnitudes.
np.testing.assert_allclose(np.abs(X_pca_sklearn), np.abs(X_pca), atol=1e-6)
X_pca_sklearn

(150, 2)


array([[-2.26454173e+00,  5.05703903e-01],
       [-2.08642550e+00, -6.55404729e-01],
       [-2.36795045e+00, -3.18477311e-01],
       [-2.30419716e+00, -5.75367713e-01],
       [-2.38877749e+00,  6.74767397e-01],
       [-2.07053681e+00,  1.51854856e+00],
       [-2.44571134e+00,  7.45626750e-02],
       [-2.23384186e+00,  2.47613932e-01],
       [-2.34195768e+00, -1.09514636e+00],
       [-2.18867576e+00, -4.48629048e-01],
       [-2.16348656e+00,  1.07059558e+00],
       [-2.32737775e+00,  1.58587455e-01],
       [-2.22408272e+00, -7.09118158e-01],
       [-2.63971626e+00, -9.38281982e-01],
       [-2.19229151e+00,  1.88997851e+00],
       [-2.25146521e+00,  2.72237108e+00],
       [-2.20275048e+00,  1.51375028e+00],
       [-2.19017916e+00,  5.14304308e-01],
       [-1.89407429e+00,  1.43111071e+00],
       [-2.33994907e+00,  1.15803343e+00],
       [-1.91455639e+00,  4.30465163e-01],
       [-2.20464540e+00,  9.52457317e-01],
       [-2.77416979e+00,  4.89517027e-01],
       [-1.