# PCA: Principal Component Analysis 


https://medium.com/apprentice-journal/pca-application-in-machine-learning-4827c07a61db

Principal Component Analysis (PCA) is an unsupervised, non-parametric statistical technique primarily used for dimensionality reduction in machine learning.

Example with Iris
https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
import pandas as pd
from bokeh.io import output_notebook, show
from bokeh.plotting import figure

output_notebook()
%matplotlib inline

#import pour la classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

#import PCA
from sklearn.decomposition import PCA




# Load data

In [2]:
from sklearn.datasets import load_iris
# Load the Iris dataset through scikit-learn's built-in loader.
iris = load_iris()
# print(iris)


In [3]:
# Separate the feature matrix from the class labels.
X, y = iris.data, iris.target

# Echo the feature matrix as the cell output.
X



array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
# Human-readable class names matching the integer labels in iris.target.
noms_target = iris.target_names
print(noms_target)

['setosa' 'versicolor' 'virginica']


# Effectuer le PCA

In [5]:
# Fit a PCA that keeps two components, then project the samples onto them.
pca = PCA(n_components=2)
pca.fit(X)
X_resultat = pca.transform(X)

# Echo the projected coordinates as the cell output.
X_resultat

array([[-2.68412563,  0.31939725],
       [-2.71414169, -0.17700123],
       [-2.88899057, -0.14494943],
       [-2.74534286, -0.31829898],
       [-2.72871654,  0.32675451],
       [-2.28085963,  0.74133045],
       [-2.82053775, -0.08946138],
       [-2.62614497,  0.16338496],
       [-2.88638273, -0.57831175],
       [-2.6727558 , -0.11377425],
       [-2.50694709,  0.6450689 ],
       [-2.61275523,  0.01472994],
       [-2.78610927, -0.235112  ],
       [-3.22380374, -0.51139459],
       [-2.64475039,  1.17876464],
       [-2.38603903,  1.33806233],
       [-2.62352788,  0.81067951],
       [-2.64829671,  0.31184914],
       [-2.19982032,  0.87283904],
       [-2.5879864 ,  0.51356031],
       [-2.31025622,  0.39134594],
       [-2.54370523,  0.43299606],
       [-3.21593942,  0.13346807],
       [-2.30273318,  0.09870885],
       [-2.35575405, -0.03728186],
       [-2.50666891, -0.14601688],
       [-2.46882007,  0.13095149],
       [-2.56231991,  0.36771886],
       [-2.63953472,

# Pourcentage de variance pris en charge par les 2 composantes PCA.

In [6]:
# Report the share of the variance captured by each of the two fitted components.
ratios = pca.explained_variance_ratio_
for rang in (1, 2):
    print(f"Composant {rang}: {ratios[rang - 1]!s}")

Composant 1: 0.924618723201727
Composant 2: 0.05306648311706785


In [7]:
#On voit que les 2 composantes prennent en charge presque la totalité de la variance.  


## Tracé des points du dataset dans le système de coordonnées (la base) PCA.  Les labels utilisés sont ceux des targets. On prépare d’abord les points en couleurs.


In [8]:
# Map each class label to a display colour, giving one colour per sample.
echelle_couleur = {0: 'red', 1: 'blue', 2: 'orange'}
couleurs = [echelle_couleur[valeur] for valeur in y]

# Plot the samples in the plane of the first two principal components.
p_fig = figure(title="PCA Iris DataSet - 2 composant reduction", tools='')
p1_values = X_resultat[:, 0]
p2_values = X_resultat[:, 1]
# scatter() draws circle markers by default; circle(size=...) is deprecated
# since Bokeh 3.4, so scatter() keeps the plot working on current releases.
p_fig.scatter(x=p1_values, y=p2_values, size=3, color=couleurs)
show(p_fig)



## On utilise maintenant une technique simple pour évaluer si le choix de n=2 est adéquat. On refait la même procédure, mais avec toutes les composantes :

In [9]:
# No number of components is given, so the PCA keeps all of them.
pca = PCA()
pca.fit(X)
X_resultat = pca.transform(X)

In [10]:
# Fraction of the total variance explained by each principal component.
variance_explique = pca.explained_variance_ratio_
print(variance_explique)

[0.92461872 0.05306648 0.01710261 0.00521218]


In [11]:
# Principal axes of the fitted PCA: one row per component, one column per input feature.
print(pca.components_)

[[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
 [ 0.65658877  0.73016143 -0.17337266 -0.07548102]
 [-0.58202985  0.59791083  0.07623608  0.54583143]
 [-0.31548719  0.3197231   0.47983899 -0.75365743]]


In [12]:
# 1-based component ranks, derived from the number of explained-variance
# entries so the x axis stays correct whatever the number of features.
comp_id = list(range(1, len(variance_explique) + 1))

# Line plot of the explained-variance ratio against the component rank.
p_pca = figure(title="Composantes pricipales", tools='', x_axis_label="principal component", y_axis_label="Variance Expliquee")
p_pca.line(comp_id, variance_explique, color='red')
show(p_pca)

In [13]:
# On voit qu’avec 2 composantes, on est capable de capturer l’essentiel des features.


# On évalue maintenant si la vitesse de traitement est améliorée en prenant moins de features ou plus de features

In [14]:
# Hold out 20% of the samples for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)


In [15]:
# Pour simplifier, on utilise la fonction `%%timeit` pour effectuer une classification selon la sélection de features. 
# L’algorithme de classification utilisé est le KNN

%%timeit
modele = KNeighborsClassifier(2)
modele.fit(Xtrain, y_train)


UsageError: Line magic function `%%timeit` not found.
