# Implementation of Principal Component Analysis from scratch


In [1]:
import numpy as np
import pandas as pd

# Loading Data
This is the Iris dataset that has been deposited in the UCI Machine Learning Repository
(https://archive.ics.uci.edu/ml/datasets/Iris).

The iris dataset contains measurements for 150 iris flowers from three different species.<br>

The three classes in the Iris dataset are:

1. Iris-setosa (n=50)<br>
2. Iris-versicolor (n=50)<br>
3. Iris-virginica (n=50)<br>

And the four features in the Iris dataset are:

1. sepal length in cm<br>
2. sepal width in cm<br>
3. petal length in cm<br>
4. petal width in cm<br>

In [3]:
# Read the Iris CSV into a DataFrame; the path is relative to the notebook's working directory.
df = pd.read_csv("./iris-species/Iris.csv")

In [4]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


## Separating labels and data

In [5]:
# Drop only the rows in which every column is missing; df is modified in place.
df.dropna(how="all", inplace=True)

# Features: every column except the class label and the row identifier.
X = df.drop(columns=["Species", "Id"])
# Labels: the species name of each sample.
Y = df["Species"]

In [7]:
# Preview the feature matrix; the Species and Id columns were dropped above.
X.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## Step 1: Calculating Mean

In [44]:
# Column-wise mean of every feature; the result is a Series indexed by feature name.
mean_X = X.mean(axis=0)
mean_X

SepalLengthCm    5.843333
SepalWidthCm     3.054000
PetalLengthCm    3.758667
PetalWidthCm     1.198667
dtype: float64

## Step 2: Subtracting Mean from data

In [12]:
# Centre each feature on zero by removing its column mean.
# Note: despite the name, X_std is only mean-centred; it is not scaled to unit variance.
X_std = X.sub(mean_X, axis="columns")
X_std.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,-0.743333,0.446,-2.358667,-0.998667
1,-0.943333,-0.054,-2.358667,-0.998667
2,-1.143333,0.146,-2.458667,-0.998667
3,-1.243333,0.046,-2.258667,-0.998667
4,-0.843333,0.546,-2.358667,-0.998667


## Step 3: Calculating the covariance matrix

In [15]:
# Feature-by-feature covariance matrix. rowvar=False tells NumPy that each
# column is a variable and each row an observation, so no transpose is needed.
cov_mat = np.cov(X_std, rowvar=False)
cov_mat

array([[ 0.68569351, -0.03926846,  1.27368233,  0.5169038 ],
       [-0.03926846,  0.18800403, -0.32171275, -0.11798121],
       [ 1.27368233, -0.32171275,  3.11317942,  1.29638747],
       [ 0.5169038 , -0.11798121,  1.29638747,  0.58241432]])

## Step 4: Calculating eigenvalues and eigenvectors

In [29]:
# Eigen-decomposition of the covariance matrix. np.linalg.eig makes no promise
# about the order of the eigenvalues it returns, so sort the eigenpairs from the
# largest to the smallest eigenvalue. Later steps can then rely on column
# position reflecting how much variance each component explains.
evalues, evectors = np.linalg.eig(cov_mat)
descending = np.argsort(evalues)[::-1]
evalues = evalues[descending]
evectors = evectors[:, descending]  # eigenvectors are stored as columns

In [30]:
# Eigenvalues of the covariance matrix: the variance captured along each eigenvector.
evalues

array([4.22484077, 0.24224357, 0.07852391, 0.02368303])

In [31]:
# Eigenvectors are stored as columns: evectors[:, i] belongs to evalues[i].
evectors

array([[ 0.36158968, -0.65653988, -0.58099728,  0.31725455],
       [-0.08226889, -0.72971237,  0.59641809, -0.32409435],
       [ 0.85657211,  0.1757674 ,  0.07252408, -0.47971899],
       [ 0.35884393,  0.07470647,  0.54906091,  0.75112056]])

## Step 5: Preparing feature vector
### Removing the eigenvector with the smallest eigenvalue

In [39]:
# Drop the eigenvector that explains the least variance. The column is located
# via the smallest eigenvalue rather than a hard-coded index, because
# np.linalg.eig does not guarantee that eigenvalues come back sorted.
# Eigenvectors are the columns of `evectors`, hence axis=1.
FeatureVectors = np.delete(evectors, np.argmin(evalues), axis=1)

In [40]:
# Inspect the retained eigenvectors (one per column).
FeatureVectors

array([[ 0.36158968, -0.65653988, -0.58099728],
       [-0.08226889, -0.72971237,  0.59641809],
       [ 0.85657211,  0.1757674 ,  0.07252408],
       [ 0.35884393,  0.07470647,  0.54906091]])

## Step 6: Deriving the new data by projecting onto the feature vectors

In [42]:
# Project the centred samples onto the retained eigenvectors:
# (n_components x n_features) . (n_features x n_samples)
# -> one row per component, one column per sample.
FinalData = np.dot(FeatureVectors.T, X_std.T)

In [43]:
# Rows are the retained components, columns are the samples.
FinalData.shape

(3, 150)