## Principal Component Analysis and Dimension Reduction

In [1]:
import numpy as np
import numpy.linalg as LA
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the iris sample dataset that ships with plotly express.
df0 = px.data.iris()

In [3]:
# Restrict the analysis to the four measurement columns.
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
df = df0.loc[:, columns]
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [4]:
# Manual z-score standardization (no sklearn StandardScaler involved).
# DataFrame.std() defaults to ddof=1 (sample standard deviation), the same
# convention DataFrame.cov() uses, so the covariance of the standardized data
# shown below has ones on its diagonal.
df1 = df.copy()
df_standardized_manual = df1.sub(df1.mean()).div(df1.std())
df_standardized_manual.cov()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.109369,0.871754,0.817954
sepal_width,-0.109369,1.0,-0.420516,-0.356544
petal_length,0.871754,-0.420516,1.0,0.962757
petal_width,0.817954,-0.356544,0.962757,1.0


In [5]:
# Redo the manual standardization with the population standard deviation
# (ddof=0). DataFrame.cov() still normalises with ddof=1, which is why the
# diagonal of the matrix below is slightly above 1 instead of exactly 1.
df1 = df.copy()  # fresh copy of the original data frame
df_standardized_manual = df1.sub(df1.mean()).div(df1.std(ddof=0))
df_standardized_manual.cov()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.006711,-0.110103,0.877605,0.823443
sepal_width,-0.110103,1.006711,-0.423338,-0.358937
petal_length,0.877605,-0.423338,1.006711,0.969219
petal_width,0.823443,-0.358937,0.969219,1.006711


In [6]:

# Eigendecomposition of the covariance matrix of the standardized data.
# Both results are NumPy arrays: each eigenvalue is the variance of the data
# along its eigenvector, and the eigenvectors (stored as the COLUMNS of
# `eigenvectors`) are the directions along which that variance is measured.
eigenvalues, eigenvectors = LA.eig(df_standardized_manual.cov())

# LA.eig() makes no promise about ordering, but the following cells treat
# index 0 / 1 as the first / second principal component. Sort the eigenpairs by
# descending eigenvalue so that assumption always holds.
order = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[order]
eigenvectors = eigenvectors[:, order]

In [7]:
# Display the eigenvalues (variance along each eigenvector direction).
eigenvalues

array([2.93035378, 0.92740362, 0.14834223, 0.02074601])

In [8]:
# Display the eigenvector matrix; column i is the eigenvector paired with eigenvalues[i].
eigenvectors

array([[ 0.52237162, -0.37231836, -0.72101681,  0.26199559],
       [-0.26335492, -0.92555649,  0.24203288, -0.12413481],
       [ 0.58125401, -0.02109478,  0.14089226, -0.80115427],
       [ 0.56561105, -0.06541577,  0.6338014 ,  0.52354627]])

In [9]:
# Project the standardized data onto the eigenvector basis (one output column
# per eigenvector), then keep the first two columns as the 2-D plot axes.
df_pca = pd.DataFrame(np.matmul(df_standardized_manual, eigenvectors))
df_pca_2D = df_pca[[0, 1]].rename(columns={0: "x", 1: "y"})
df_pca_2D

Unnamed: 0,x,y
0,-2.264542,-0.505704
1,-2.086426,0.655405
2,-2.367950,0.318477
3,-2.304197,0.575368
4,-2.388777,-0.674767
...,...,...
145,1.870522,-0.382822
146,1.558492,0.905314
147,1.520845,-0.266795
148,1.376391,-1.016362


In [10]:
# Scatter plot of the first two manually computed principal components.
# Each axis label reports that component's share of the total variance
# (its eigenvalue divided by the sum of all eigenvalues).
px.scatter(
    df_pca_2D,
    x="x",
    y="y",
    title="PCA Results of IRIS dataset",
    labels=dict(
        x=f"Principal Component 1: {eigenvalues[0]/np.sum(eigenvalues)*100:.1f}%",
        y=f"Principal Component 2: {eigenvalues[1]/np.sum(eigenvalues)*100:.1f}%",
    ),
    height=500,
)

In [11]:
# Cluster the 2-D PCA projection into three groups. The fixed random_state
# keeps the run reproducible.
kmeans = KMeans(n_clusters=3, n_init="auto", random_state=0)
kmeans.fit(df_pca_2D)

In [12]:
# Same PCA scatter, coloured by the k-means cluster assigned to each point.
# The colours are cluster ids produced by KMeans, not the true iris species,
# so the legend is titled as clusters to avoid misreading the plot.
px.scatter(
    df_pca_2D,
    x="x",
    y="y",
    color=kmeans.labels_.astype(str),  # strings give discrete colours rather than a continuous scale
    title="PCA Results of IRIS dataset",
    labels={
        "x": f"Principal Component 1: {eigenvalues[0]/np.sum(eigenvalues)*100:.1f}%",
        "y": f"Principal Component 2: {eigenvalues[1]/np.sum(eigenvalues)*100:.1f}%",
        "color": "K-means cluster",
    },
    height=500,
)

In [13]:
# Standardize with scikit-learn instead of by hand: StandardScaler centres each
# column and scales it to unit variance, returning a NumPy array by default.
# StandardScaler is imported in the imports cell at the top of the notebook so
# that all dependencies are declared in one place.
scaler = StandardScaler()
df_standardized = scaler.fit_transform(df)


In [14]:
# Fit scikit-learn's PCA keeping all four components and project the
# StandardScaler output onto them in one step.
pca = PCA(n_components=4)
principal_components = pca.fit_transform(df_standardized)

In [15]:
# Wrap the sklearn projection in a DataFrame for display.
# Note: this rebinds `df_pca`, which previously held the manual projection.
df_pca = pd.DataFrame(principal_components)
df_pca

Unnamed: 0,0,1,2,3
0,-2.264542,0.505704,-0.121943,-0.023073
1,-2.086426,-0.655405,-0.227251,-0.103208
2,-2.367950,-0.318477,0.051480,-0.027825
3,-2.304197,-0.575368,0.098860,0.066311
4,-2.388777,0.674767,0.021428,0.037397
...,...,...,...,...
145,1.870522,0.382822,0.254532,-0.388890
146,1.558492,-0.905314,-0.025382,-0.221322
147,1.520845,0.266795,0.179277,-0.118903
148,1.376391,1.016362,0.931405,-0.024146


In [16]:
# Covariance of the projected data: the off-diagonal entries are ~0 (the
# components are uncorrelated) and the diagonal holds the per-component variances.
df_pca.cov()

Unnamed: 0,0,1,2,3
0,2.930354,1.430623e-16,-1.22199e-16,5.960929e-18
1,1.430623e-16,0.9274036,5.197185e-17,-1.490232e-18
2,-1.22199e-16,5.197185e-17,0.1483422,-7.451160999999999e-19
3,5.960929e-18,-1.490232e-18,-7.451160999999999e-19,0.02074601


In [17]:
# Keep the first two sklearn principal components, renamed to plot-friendly axis names.
df_pca_2D = df_pca[[0, 1]].rename(columns={0: "x", 1: "y"})
df_pca_2D

Unnamed: 0,x,y
0,-2.264542,0.505704
1,-2.086426,-0.655405
2,-2.367950,-0.318477
3,-2.304197,-0.575368
4,-2.388777,0.674767
...,...,...
145,1.870522,0.382822
146,1.558492,-0.905314
147,1.520845,0.266795
148,1.376391,1.016362


In [18]:
# Scatter plot of the first two sklearn principal components. The axis labels
# take each component's share of variance straight from the fitted PCA object.
px.scatter(
    df_pca_2D,
    x="x",
    y="y",
    title="PCA Results of IRIS dataset",
    labels=dict(
        x=f"Principal Component 1: {pca.explained_variance_ratio_[0]*100:.1f}%",
        y=f"Principal Component 2: {pca.explained_variance_ratio_[1]*100:.1f}%",
    ),
    height=500,
)

In [19]:
# Fraction of the total variance captured by each principal component.
pca.explained_variance_ratio_

array([0.72770452, 0.23030523, 0.03683832, 0.00515193])

In [20]:
# Variance along each principal component; the values match the eigenvalues
# displayed earlier for the manual eigendecomposition.
pca.explained_variance_ 

array([2.93035378, 0.92740362, 0.14834223, 0.02074601])

In [21]:
# Sanity check: normalising the variances by their sum reproduces explained_variance_ratio_.
pca.explained_variance_ /pca.explained_variance_.sum()

array([0.72770452, 0.23030523, 0.03683832, 0.00515193])

In [22]:
# Pull the fitted results out of the PCA object: the per-component variances
# and the component matrix. Note that this rebinds `eigenvalues`, which earlier
# held the result of the manual eigendecomposition.
eigenvalues, components = pca.explained_variance_, pca.components_

In [23]:
# Display the variances taken from the fitted PCA object.
eigenvalues

array([2.93035378, 0.92740362, 0.14834223, 0.02074601])

In [24]:
# Display the component matrix: one principal component per row, one original feature per column.
components

array([[ 0.52237162, -0.26335492,  0.58125401,  0.56561105],
       [ 0.37231836,  0.92555649,  0.02109478,  0.06541577],
       [-0.72101681,  0.24203288,  0.14089226,  0.6338014 ],
       [-0.26199559,  0.12413481,  0.80115427, -0.52354627]])

In [25]:
# Transposing puts one component per column, the same layout as the
# `eigenvectors` matrix from LA.eig (individual columns may differ in sign).
components.T

array([[ 0.52237162,  0.37231836, -0.72101681, -0.26199559],
       [-0.26335492,  0.92555649,  0.24203288,  0.12413481],
       [ 0.58125401,  0.02109478,  0.14089226,  0.80115427],
       [ 0.56561105,  0.06541577,  0.6338014 , -0.52354627]])

In [None]:
# Project the sklearn-standardized data onto all principal axes by hand; since
# the standardized data is already centred, this should reproduce
# `principal_components`. The input is `df_standardized` (the StandardScaler
# output): the `df_scaled` name used here before is not defined anywhere in
# the notebook, so the cell failed with NameError.
pd.DataFrame(np.matmul(df_standardized, np.transpose(components)))

In [None]:
# Same manual projection, restricted to the first two principal axes (rows 0-1
# of `components`), giving the 2-D reduced representation. The input is
# `df_standardized` (the StandardScaler output): the `df_scaled` name used here
# before is not defined anywhere in the notebook, so the cell failed with NameError.
pd.DataFrame(np.matmul(df_standardized, np.transpose(components[0:2, :])))