# Machine Learning in Python: Performing Principal Component Analysis (PCA)

## 1. Iris data set
### Load library

In [1]:
from sklearn import datasets

### Load dataset

In [2]:
iris = datasets.load_iris()

### Inpunt features

In [3]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

### Output features

In [4]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

### Assigning Input (X) and Output (Y) variables
Let's assign the 4 input variables to X and the output variable (class label) to Y

In [5]:
X = iris.data
Y = iris.target

### Let's examine the data dimension

In [6]:
X.shape, Y.shape

((150, 4), (150,))

# 2. PCA analysis
### 2.1. Load library

In [7]:
from sklearn.preprocessing import scale # Data scaling
from sklearn import decomposition #PCA
import pandas as pd # pandas

### 2.2. Data scaling

In [8]:
X = scale(X)

### 2.3. Perform PCA analysis
Here we define the number of PC to use as 3



In [9]:
pca = decomposition.PCA(n_components=3)
pca.fit(X)

PCA(n_components=3)

### 2.4. Compute and retrieve the scores values

In [10]:
scores = pca.transform(X)

In [11]:
scores_df = pd.DataFrame(scores, columns=['PC1', 'PC2', 'PC3'])
scores_df

Unnamed: 0,PC1,PC2,PC3
0,-2.264703,0.480027,-0.127706
1,-2.080961,-0.674134,-0.234609
2,-2.364229,-0.341908,0.044201
3,-2.299384,-0.597395,0.091290
4,-2.389842,0.646835,0.015738
...,...,...,...
145,1.870503,0.386966,0.256274
146,1.564580,-0.896687,-0.026371
147,1.521170,0.269069,0.180178
148,1.372788,1.011254,0.933395


In [12]:
Y_label = []

for i in Y:
  if i == 0:
    Y_label.append('Setosa')
  elif i == 1:
    Y_label.append('Versicolor')
  else:
    Y_label.append('Virginica')

Species = pd.DataFrame(Y_label, columns=['Species'])

In [13]:
df_scores = pd.concat([scores_df, Species], axis=1)

In [14]:
df_scores

Unnamed: 0,PC1,PC2,PC3,Species
0,-2.264703,0.480027,-0.127706,Setosa
1,-2.080961,-0.674134,-0.234609,Setosa
2,-2.364229,-0.341908,0.044201,Setosa
3,-2.299384,-0.597395,0.091290,Setosa
4,-2.389842,0.646835,0.015738,Setosa
...,...,...,...,...
145,1.870503,0.386966,0.256274,Virginica
146,1.564580,-0.896687,-0.026371,Virginica
147,1.521170,0.269069,0.180178,Virginica
148,1.372788,1.011254,0.933395,Virginica


### 2.5. Retrieve the loadings values

In [15]:
loadings = pca.components_.T
df_loadings = pd.DataFrame(loadings, columns=['PC1', 'PC2','PC3'], index=iris.feature_names)
df_loadings

Unnamed: 0,PC1,PC2,PC3
sepal length (cm),0.521066,0.377418,-0.719566
sepal width (cm),-0.269347,0.923296,0.244382
petal length (cm),0.580413,0.024492,0.142126
petal width (cm),0.564857,0.066942,0.634273


### 2.6. Explained variance for each PC

In [16]:
explained_variance = pca.explained_variance_ratio_
explained_variance

array([0.72962445, 0.22850762, 0.03668922])

how pc1 contributes to the total variance, how pc2 contributes to the total variance and how pc3 contributes to the total variance.

## 3. Scree Plot
### 3.1. Import library

In [29]:
import numpy as np


### 3.2. Preparing explained variance and cumulative variance
#### 3.2.1. Preparing the explained variance data

In [18]:
explained_variance

array([0.72962445, 0.22850762, 0.03668922])

In [19]:
expalained_variance = np.insert (explained_variance , 0, 0)

In [20]:
explained_variance

array([0.72962445, 0.22850762, 0.03668922])

#### 3.2.2. Preparing the cumulative variance data

In [21]:
cumulative_variance = np.cumsum(np.round(explained_variance, decimals = 3))

In [22]:
cumulative_variance

array([0.73 , 0.959, 0.996])

#### 3.2.3. Combining the dataframe

In [23]:
pc_df = pd.DataFrame(['','PC1','PC2','PC3'], columns = ['PC'])

In [24]:
explained_variance_df = pd.DataFrame(explained_variance, columns = ['Explained Variance'])

In [25]:
cumulative_variance_df = pd.DataFrame(cumulative_variance, columns = ['Cumulative Variance'])

In [26]:
df_explained_variance = pd.concat([pc_df, explained_variance_df, cumulative_variance_df], axis = 1)

In [27]:
df_explained_variance

Unnamed: 0,PC,Explained Variance,Cumulative Variance
0,,0.729624,0.73
1,PC1,0.228508,0.959
2,PC2,0.036689,0.996
3,PC3,,
