# Principal Component Analysis

In [3]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib widget
import common

In [4]:
# Load the "CleanedData" dataset through the project-local `common` helper module.
# NOTE(review): presumably this returns a pandas DataFrame (later cells call .loc/.drop on it)
# with at least the columns RID, VISCODE and DX — confirm against common.loadFile.
data = common.loadFile("CleanedData")

Before we start, we need to set aside the columns that the PCA will not use; therefore, we remove "RID" and "VISCODE" from the dataset.
The column "DX" is our target: it is kept out of the PCA input and is later used to colour the points in our graphs.

In [5]:
# Set aside the diagnosis label and the patient identifier as single-column 2-D arrays.
y = data[['DX']].values
patients = data[['RID']].values

# Everything except the identifiers and the label is kept as PCA input.
x = data.drop(columns=["RID", "VISCODE", "DX"])

Once that is done, it is time to scale the data, bringing the mean of each feature to 0 and its variance to 1.

In [6]:
# Standardize every feature column (zero mean, unit variance). By default fit_transform
# returns a NumPy array, so `x` is no longer a DataFrame after this line.
x = StandardScaler().fit_transform(x)
# data = data.sample(frac=1).reset_index(drop=True) # In case you need to randomize the lines
# NOTE(review): do not enable the shuffle at this position. `x` has already been extracted
# from `data`, so reordering `data` here would misalign the DX labels that are later joined
# onto the PCA output. If shuffling is needed, do it right after loading, before x/y are derived.

PCA Projection

In [7]:
# Fit a 10-component PCA on `x` (the scaled feature matrix) and project the rows onto it.
pca = PCA(n_components=10)
principalComponents = pca.fit_transform(x)

# Derive the column names from the number of projected columns instead of a hand-typed
# list, so the labels cannot drift out of sync with n_components.
componentNames = ['principal component {}'.format(i)
                  for i in range(1, principalComponents.shape[1] + 1)]
principalDf = pd.DataFrame(data=principalComponents, columns=componentNames)

# pd.concat(axis=1) aligns rows by index label, and principalDf always carries a fresh
# 0..n-1 index. Resetting the index of the label column makes the join positional, so a
# non-default index on `data` cannot introduce NaN rows or shift labels onto the wrong rows.
finalDf = pd.concat([principalDf, data[['DX']].reset_index(drop=True)], axis=1)
finalDf.head(5)

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,principal component 9,principal component 10,DX
0,-2.010043,0.092752,2.23022,0.362468,1.423488,3.325878,0.033419,-0.36268,-1.320133,0.513975,1
1,-1.001153,-1.625148,5.468004,-0.11287,1.16541,2.105341,0.547053,0.878741,0.44615,-0.372421,1
2,-0.034755,-0.547715,5.905928,-0.831589,1.339325,1.228721,1.725662,1.048602,0.332997,0.273529,1
3,-3.088482,-3.483007,0.804803,1.185235,1.162946,0.894524,-1.034714,1.050683,0.904149,-0.239758,1
4,-1.344012,-3.64098,1.988552,1.59232,3.03425,1.548755,-1.875531,-0.165516,1.10103,0.148524,1


In [8]:
# Scatter plot of the first two principal components, coloured by diagnosis (DX).
plt.close()  # close the current figure (if any) before creating a new one
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 Component PCA', fontsize=20)

# DX codes with their legend label and marker colour, kept side by side so that
# each label is attached to the artist it describes rather than matched by draw order.
targets = [0, 1, 2]
labels = ['CN', 'MCI', 'Dementia']
colors = ['r', 'y', 'b']
for target, label, color in zip(targets, labels, colors):
    indicesToKeep = finalDf['DX'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
               finalDf.loc[indicesToKeep, 'principal component 2'],
               c=color,
               s=50,
               label=label)

ax.legend()
# Pass True explicitly: a bare grid() call toggles the current state instead of enabling it.
ax.grid(True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [9]:
# Fraction of the total variance captured by each fitted component, in component order.
pca.explained_variance_ratio_


array([0.18543   , 0.05538921, 0.03611538, 0.02198693, 0.01527396,
       0.01361526, 0.01250761, 0.01248018, 0.01137561, 0.01056115])

In [10]:
# Scatter plot of principal components 4 and 5, coloured by diagnosis (DX).
plt.close()  # close the current figure (if any) before creating a new one
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 4', fontsize=15)
ax.set_ylabel('Principal Component 5', fontsize=15)
ax.set_title('2 Component PCA', fontsize=20)

# DX codes with their legend label and marker colour, kept side by side so that
# each label is attached to the artist it describes rather than matched by draw order.
targets = [0, 1, 2]
labels = ['CN', 'MCI', 'Dementia']
colors = ['r', 'y', 'b']
for target, label, color in zip(targets, labels, colors):
    indicesToKeep = finalDf['DX'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 4'],
               finalDf.loc[indicesToKeep, 'principal component 5'],
               c=color,
               s=50,
               label=label)

ax.legend()
# Pass True explicitly: a bare grid() call toggles the current state instead of enabling it.
ax.grid(True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [11]:
# 3-D scatter plot of the first three principal components, coloured by diagnosis (DX).
plt.close()  # close the current figure (if any) before creating a new one
fig = plt.figure()
# Create the 3-D axes on this figure explicitly rather than through the pyplot state machine.
ax = fig.add_subplot(1, 1, 1, projection='3d')

ax.set_xlabel('Principal Component 1', fontsize=12)
ax.set_ylabel('Principal Component 2', fontsize=12)
ax.set_zlabel('Principal Component 3', fontsize=12)
ax.set_title('3 Component PCA', fontsize=20)

# DX codes with their legend label and marker colour, kept side by side so that
# each label is attached to the artist it describes rather than matched by draw order.
targets = [0, 1, 2]
labels = ['CN', 'MCI', 'Dementia']
colors = ['r', 'y', 'b']
for target, label, color in zip(targets, labels, colors):
    indicesToKeep = finalDf['DX'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
               finalDf.loc[indicesToKeep, 'principal component 2'],
               finalDf.loc[indicesToKeep, 'principal component 3'],
               c=color,
               s=50,
               label=label)

ax.legend()
ax.grid(True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …