# PCA

![pca2d](PCA_2d.svg)
![pca](pca.png)

## Customer Satisfaction Analysis

A survey was conducted to evaluate 20 different healthcare structures. 200 customers rated, on a 1-10 scale, each of six features of the service:

1. Courtesy
2. Clarity
3. Competence
4. Condition (of the structure)
5. Promptness (of the service)
6. Opening times

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

In [None]:
df2 = pd.read_csv("csat.csv")
print(df2.head(10))

df2.describe()


In [None]:
%matplotlib inline
df2.boxplot()


In [None]:
#dataset scaling and visualizing
from sklearn.preprocessing import StandardScaler
# Only the fit is needed here: it learns the per-column mean and standard deviation.
# The previous extra scaler2.transform(...) call threw its result away, and copy=False
# only acted on the temporary produced by astype(float), so neither had any effect.
scaler2 = StandardScaler()
scaler2.fit(df2.astype(float))
# df2 itself is not modified by the scaler, so this boxplot still shows the raw 1-10 scores
df2.boxplot()

In [None]:
# Standardize the survey scores and keep the original feature names as column labels
scaled_values = scaler2.transform(df2.astype(float))
df2_scaled = pd.DataFrame(scaled_values, columns=df2.columns)
df2_scaled.head()

In [None]:
df2_scaled.boxplot()

In [None]:
# Pairwise Pearson correlation matrix of the survey features
# (same table as correlating every column with every other one by hand, in a single call)
df2.corr()

In [None]:
#PCA fit
from sklearn.decomposition import PCA
# n_components can be an integer (e.g. 10), a fraction between 0 and 1 of the total variance to retain, or left unset as here, in which case all the components are kept
pca2 = PCA()
pca2.fit(df2_scaled) #The fit learns some quantities from the data, most importantly the "components" and "explained variance"

In [None]:
#let's use the pca to transform the dataset
df2_pca = pd.DataFrame(pca2.transform(df2_scaled))
df2_pca

In [None]:
#Let's analyse what happened
#VISUALIZE The amount of variance explained by each principal component (pca2 was created without limiting the number of components).
pd.DataFrame(pca2.explained_variance_).transpose()

In [None]:
#VISUALIZE The percentage of variance explained by each of the selected components.
explained_var=pd.DataFrame(pca2.explained_variance_ratio_).transpose()
explained_var

In [None]:
#VISUALIZE The cumulative percentage of explained variance
cum_explained_var=np.cumsum(pca2.explained_variance_ratio_)
pd.DataFrame(cum_explained_var).transpose()

In [None]:
%matplotlib inline
import seaborn as sns
ax = sns.barplot( data=explained_var)

In [None]:
# Loadings table: one row per principal component, one column per survey feature.
# The row labels are derived from the fitted model instead of a hard-coded list of six names,
# so the cell keeps working if the number of components changes.
pc_labels = ['PC{}'.format(i + 1) for i in range(pca2.components_.shape[0])]
pd.DataFrame(pca2.components_, index=pc_labels, columns=df2.columns)


In [None]:

def myplot(score,coeff,labels=None):
    """Draw a biplot of the first two principal components on the current axes.

    The samples are drawn as a scatter plot, each axis divided by its own
    range so the cloud fits in the fixed [-1, 1] window, and every variable
    is drawn as a red arrow from the origin with a green text label.

    Parameters
    ----------
    score : 2-D array
        Column 0 is used as the x coordinate and column 1 as the y coordinate
        of each sample.
    coeff : 2-D array
        One row per variable; columns 0 and 1 give the arrow end point.
    labels : sequence, optional
        Text for each arrow, indexed like the rows of ``coeff``. When omitted
        the arrows are labelled "Var1", "Var2", ...
    """
    xs = score[:,0]
    ys = score[:,1]
    x_span = xs.max() - xs.min()
    y_span = ys.max() - ys.min()
    # A constant column has zero range; leave it unscaled instead of dividing by zero
    scalex = 1.0/x_span if x_span else 1.0
    scaley = 1.0/y_span if y_span else 1.0
    plt.scatter(xs * scalex,ys * scaley)
    for i in range(coeff.shape[0]):
        plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = 'r',alpha = 0.5)
        caption = "Var"+str(i+1) if labels is None else labels[i]
        # Place the label slightly beyond the arrow tip so the two do not overlap
        plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, caption, color = 'g', ha = 'center', va = 'center')
    plt.xlim(-1,1)
    plt.ylim(-1,1)
    plt.xlabel("PC{}".format(1))
    plt.ylabel("PC{}".format(2))
    plt.grid()

#Draw the biplot using only the first 2 PCs: sample scores and the matching loadings.
scores_2d = pca2.transform(df2_scaled)[:, :2]
loadings_2d = pca2.components_[:2, :].T
myplot(scores_2d, loadings_2d, df2.columns)
plt.show()

### 1st component:

The variables

   - Condition
   - Promptness
   - Opening-times

show a high correlation with the first component. This component can be summarized as an index of the **structure’s performance**

### 2nd component:

The variables
   - Courtesy
   - Clarity
   - Competence

show a high correlation with the second component. 
This component can be summarized as an index of the **personnel’s performance**

**Notice that the Principal Components have negative values in the variables that they explain.**

In [None]:
# Name the score columns PC1..PCk from the actual number of columns rather than a fixed list of six
df2_pca.columns=['PC{}'.format(i + 1) for i in range(df2_pca.shape[1])]
df2_pca

In [None]:
# Centers on the first two principal components, coloured by their PC6 score
p1 = sns.scatterplot(data=df2_pca, x="PC1", y="PC2",
                     hue="PC6", alpha=.3, legend=False);

# write each center's row number next to its point
for line, (pc1_value, pc2_value) in enumerate(zip(df2_pca.PC1, df2_pca.PC2)):
    p1.text(pc1_value, pc2_value, line, horizontalalignment='left', size='medium', color='black')


We conclude that:
 - centers 18 and 11 have GOOD infrastructure but BAD service
 - the group near 1, 4, 6 has BAD infrastructure but GOOD service quality 
 - the group 0, 3, 2, 9 has GOOD infrastructure and service
 - center 17 has BAD infrastructure and service!

## Breast cancer wisconsin (diagnostic) dataset

In [None]:
#load one of the toy datasets bundled with scikit-learn
#(these small standard datasets do not require downloading any file from an external website)
from sklearn.datasets import load_breast_cancer
dataset = load_breast_cancer() #The breast cancer dataset is a classic and very easy binary classification dataset.

#wrap the feature matrix in a dataframe labelled with the feature names
columns = dataset.feature_names
dataset_df = pd.DataFrame(dataset.data, columns=columns)

print(dataset["DESCR"])

In [None]:
dataset.target

In [None]:
dataset_df


In [None]:
#dataset visualization tools
%matplotlib inline
dataset_df.boxplot()
dataset_df.head()

In [None]:
#dataset scaling and visualizing
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() #or alternatively use MinMaxScaler
# Rebind dataset_df to the standardized values explicitly. The previous version called
# transform() with copy=False and discarded the result, counting on the DataFrame being
# modified in place; that is not guaranteed, and when it does not happen every later
# cell (including the PCA) silently runs on unscaled data.
dataset_df = pd.DataFrame(scaler.fit_transform(dataset_df),
                          columns=dataset_df.columns,
                          index=dataset_df.index)
dataset_df.boxplot()
dataset_df.head()

In [None]:
dataset_df.columns

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Two of the features plotted against each other, coloured by the class label
x = dataset_df['texture error']
y = dataset_df['worst symmetry']

plt.scatter(x, y, c=dataset.target, alpha=0.2)
plt.show()

In [None]:
%matplotlib inline
import seaborn as sns

# Pair plot of the first eight features, coloured by the class label.
# Copy AFTER slicing: adding a column to the result of .iloc[...] on a frame
# can trigger pandas' chained-assignment warning, whereas an explicit copy
# of the slice is an independent frame that is safe to extend.
df_sample = dataset_df.iloc[:, :8].copy()
df_sample['target'] = dataset.target
sns.pairplot(df_sample, hue='target')

In [None]:
#PCA fit
from sklearn.decomposition import PCA
# n_components can be an integer (here 10), a fraction between 0 and 1 of the total variance to retain, or left unset, in which case all the components are kept
pca = PCA(n_components=10)
pca.fit(dataset_df) #The fit learns some quantities from the data, most importantly the "components" and "explained variance"

In [None]:
#let's use the pca to transform the dataset
x_pca = pca.transform(dataset_df)
print("Dataset shape before PCA: ", dataset_df.shape)
print("Dataset shape after PCA: ", x_pca.shape)

In [None]:
#Let's analyse what happened
#VISUALIZE The amount of variance explained by each of the 10 selected principal components.
pd.DataFrame(pca.explained_variance_).transpose()

In [None]:
#VISUALIZE The percentage of variance explained by each of the selected components.
explained_var=pd.DataFrame(pca.explained_variance_ratio_).transpose()
explained_var

In [None]:
%matplotlib inline
import seaborn as sns
sns.barplot( data=explained_var)

In [None]:
#VISUALIZE The cumulative percentage of explained variance
cum_explained_var = np.cumsum(pca.explained_variance_ratio_)
pd.DataFrame(cum_explained_var).transpose()

In [None]:
#PRINT the total percentage of explained variance 
print(cum_explained_var[-1]) 

In [None]:
#CHOOSING THE NUMBER OF COMPONENTS - we can plot the cumulative percentage of explained variance
import matplotlib.pyplot as plt
plt.plot(cum_explained_var)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

This previous curve quantifies how much of the total, 30-dimensional variance is contained within the first 10 components. 
For example, we see that the first 4 components contain approximately 79% of the variance, 
while you need around 6 components to describe close to 95% of the variance.

In [None]:
# Loadings table: one row per principal component, one column per original feature.
# Row labels follow the number of fitted components instead of a fixed list of ten names.
pc_names = ['pc{}'.format(i + 1) for i in range(pca.components_.shape[0])]
pd.DataFrame(pca.components_, index=pc_names, columns=dataset_df.columns)

In [None]:
# Let's see the coordinates of the data in the PCA space.
# Column names are generated from the width of x_pca, so the cell still works
# when the PCA is refitted with a different number of components.
principalDf = pd.DataFrame(data = x_pca
             , columns = ['pc{}'.format(i + 1) for i in range(x_pca.shape[1])])
principalDf.head()

In [None]:
principalDf['target']=dataset.target
dataset.target_names

In [None]:
# Samples projected on the first principal component only (all drawn on the line y = 0)
zero_line = [0] * len(principalDf)
sns.scatterplot(data=principalDf, x="pc1", y=zero_line,
                hue="target", alpha=.2);

In [None]:
# The data in the first two PCA 
sns.scatterplot(x="pc1", y="pc2",
              hue="target", alpha=.3,
              data=principalDf);

In [None]:
# Samples in the space of the first three principal components

from mpl_toolkits.mplot3d import axes3d

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
pc1_scores, pc2_scores, pc3_scores = (principalDf[name] for name in ('pc1', 'pc2', 'pc3'))
ax.scatter(pc1_scores, pc2_scores, pc3_scores, c=principalDf['target'], s=40)
# elevation and azimuth of the camera, in degrees
ax.view_init(60, 60)
plt.show()

In [None]:
#APART FROM EXPLICITLY CHOOSING THE NUMBER OF PRINCIPAL COMPONENTS, YOU CAN RESORT TO SOME AUTOMATIC TOOLS SUCH AS:

#(1) You can let the sklearn PCA implementation choose the number of components:
    #Set n_components == 'mle' and svd_solver == 'full' and Minka’s MLE is used to guess the dimension. 
    
pca = PCA(n_components='mle',svd_solver='full') 
pca.fit(dataset_df)
pca.n_components_ 
#and then transform the dataset as we have already seen above

In [None]:
#let's use the pca to transform the dataset
x_pca = pca.transform(dataset_df)
print("Dataset shape before PCA: ", dataset_df.shape)
print("Dataset shape after PCA: ", x_pca.shape)

In [None]:
#OR (2) you can ask for the components able to explain a certain percentage of variance by using:
    #Set 0 < n_components < 1 and svd_solver == 'full' to select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components.

pca = PCA(n_components=0.9,svd_solver='full') 
pca.fit(dataset_df)
pca.n_components_ 
#and then transform the dataset as we have already seen above

In [None]:
#let's use the pca to transform the dataset
x_pca = pca.transform(dataset_df)
print("Dataset shape before PCA: ", dataset_df.shape)
print("Dataset shape after PCA: ", x_pca.shape)

## mtcars

In [None]:
# Import CSV mtcars
cars = pd.read_csv('mtcars.csv',index_col = 0)

The car types are a mix that includes sedans (Datsun, Ford, Honda,…), luxury sedans (Mercedes, Cadillac,..), muscle cars (Javelin, Challenger, Camaro…) and high-end sports cars (Porsche, Lotus, Maserati, Ferrari…)

- 	mpg 	Miles/US Gallon 	mpg is the determinant of fuel efficiency
- 	cyl 	Number of cylinders 	Data includes vehicles with 4,6,8 cylinder engines.
- 	disp 	Displacement (cu.in.) 	Displacement measures overall volume in the engine as a factor of cylinder circumference, depth and total number of cylinders. This metric gives a good proxy for the total amount of power the engine can generate.
- 	hp 	Gross horsepower 	Gross horsepower measures the theoretical output of an engine’s power output
- 	drat 	Rear axle ratio 	The rear axle gear ratio indicates the number of turns of the drive shaft for every one rotation of the wheel axle. 
- 	wt 	Weight (1000 lbs) 	Weight of the car, in thousands of pounds.
- 	qsec 	1/4 mile time 	A performance measure, primarily of acceleration. Fastest time to travel 1/4 mile from standstill (in seconds).
- 	vs 	V/S 	Binary variable signaling the engine cylinder configuration a V-shape (vs=0) or Straight Line (vs=1). V==0 and S==1. 
- 	am 	Transmission Type 	A binary variable signaling whether vehicle has automatic (am=0) or manual (am=1) transmission configuration.
- 	gear 	Number of forward gears 	Number of gears in the transmission.
- 	carb 	Number of carburetors 	The number of carburetor barrels.

In [None]:
cars.shape

In [None]:
#standardize the car features and show them side by side in one boxplot
from sklearn.preprocessing import StandardScaler
scaler3 = StandardScaler(copy=False) #or alternatively use MinMaxScaler
cars_standardized = scaler3.fit_transform(cars.astype(float))
df_cars = pd.DataFrame(cars_standardized, columns=cars.columns)
df_cars.boxplot()

In [None]:
from sklearn.decomposition import PCA
pca3 = PCA()
pca3.fit(df_cars)

In [None]:
explained = pd.DataFrame(pca3.explained_variance_ratio_).transpose()
sns.barplot(data=explained)

In [None]:
pd.DataFrame(pca3.components_,columns=cars.columns)

## pd.DataFrame(pca3.components_,columns=cars.columns)

In [None]:
# Scores of every car on the principal components, indexed like the original table.
# Column names follow the number of fitted components instead of a fixed list of eleven.
cars_pca = pd.DataFrame(pca3.transform(df_cars),
                        columns=['pc{}'.format(i + 1) for i in range(pca3.components_.shape[0])],
                        index=cars.index.values)
cars_pca.head()

In [None]:
p2=sns.scatterplot(x="pc1", y="pc2",
              alpha=.3,
              data=cars_pca);
# add annotations one by one with a loop
# Walk the rows directly instead of looking values up with an integer key:
# cars_pca is indexed by the labels taken from cars.index, and integer keys on a
# non-integer index rely on a positional fallback that pandas has deprecated.
for car_label, pc1_value, pc2_value in zip(cars_pca.index, cars_pca["pc1"], cars_pca["pc2"]):
    p2.text(pc1_value, pc2_value, car_label, horizontalalignment='left', size='medium', color='black')


## Iris database

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets

iris = datasets.load_iris()


In [None]:
print(iris['DESCR'])

In [None]:
#build the iris dataframe, labelling the columns with the feature names
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [None]:
from sklearn.preprocessing import StandardScaler
iris_scaler = StandardScaler() #or alternatively use MinMaxScaler
# Rebind iris_df to the standardized values explicitly. The previous version discarded
# the result of transform() and counted on copy=False modifying the DataFrame in place,
# which is not guaranteed; when it does not happen the PCA below runs on unscaled data.
iris_df = pd.DataFrame(iris_scaler.fit_transform(iris_df),
                       columns=iris_df.columns,
                       index=iris_df.index)
iris_df.boxplot()

In [None]:
#PCA fit
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(iris_df)

In [None]:
#VISUALIZE The percentage of variance explained by each of the selected components.
pd.DataFrame(pca.explained_variance_ratio_).transpose()


In [None]:
explained = pd.DataFrame(pca.explained_variance_ratio_).transpose()
sns.barplot(data=explained)

In [None]:
pd.DataFrame(pca.components_,columns=iris_df.columns)

In [None]:
# Let's see the coordinates of the data in the PCA space.
# Column names follow the number of fitted components instead of a fixed list of four.
iris_pca = pd.DataFrame(pca.transform(iris_df),
                        columns=['pc{}'.format(i + 1) for i in range(pca.components_.shape[0])],
                        index=iris_df.index.values)
iris_pca 


In [None]:
# The data in the first PCA 
sns.scatterplot(x="pc1", y=0,
              hue=iris['target'], alpha=.8,
              data=iris_pca,
               palette="deep");

In [None]:
# The data in the first two PCA 
sns.scatterplot(x="pc1", y="pc2",
              hue=iris['target'], alpha=.3,
              data=iris_pca);