### Principal Component Analysis applied to the Iris dataset

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from sklearn import decomposition
from sklearn.cross_validation import cross_val_score

# Enlarge the default figure and font sizes so the plots are easier to read
plt.rcParams.update({
    'figure.figsize': (6, 4),
    'font.size': 14,
})

In [None]:
# Read the Iris data straight from the UCI repository. The CSV is read with
# header=None, so the column names are supplied explicitly.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
col_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
]
iris = pd.read_csv(url, names=col_names, header=None)

In [None]:
# Feature matrix X: the four measurement columns
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = iris[feature_cols]
# Species labels as they appear in the raw data
target_names = iris['species']
# Preview the first rows of the feature matrix
X.head()

In [None]:
# scikit-learn needs numeric features and response, so encode each iris
# species as an integer code and use that column as the response y
iris['species_num'] = iris['species'].map({
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2,
})
y = iris['species_num']

In [None]:
# Reduce the feature matrix to three principal components
from sklearn import decomposition
pca3 = decomposition.PCA(n_components=3)
X_trf_3 = pca3.fit_transform(X)
# Show the first five transformed rows
print(X_trf_3[:5])

In [None]:
# Explained-variance ratio of each of the three fitted components
pca3.explained_variance_ratio_

In [None]:
# Combined explained-variance ratio of the three components
np.sum(pca3.explained_variance_ratio_)

In [None]:
# Reduce the feature matrix to two principal components and report the
# combined explained-variance ratio
pca2 = decomposition.PCA(n_components=2)
X_trf_2 = pca2.fit_transform(X)
np.sum(pca2.explained_variance_ratio_)

In [None]:
# Reduce the feature matrix to a single principal component and report its
# explained-variance ratio
pca1 = decomposition.PCA(n_components=1)
X_trf_1 = pca1.fit_transform(X)
np.sum(pca1.explained_variance_ratio_)

In [None]:
# Keep all components (as many as there are original features).
# Alternate way to use all 4 components: decomposition.PCA(n_components=4)
pca_all = decomposition.PCA()
X_trf_all = pca_all.fit_transform(X)
np.sum(pca_all.explained_variance_ratio_)

In [None]:
# Choosing components: fit all four components, print how much variance each
# one explains, and plot the explained-variance ratio per component.
pca4 = decomposition.PCA(n_components=4)
X_trf = pca4.fit_transform(X)

# Use print() calls with one pre-formatted argument: the former Python 2 print
# statements are a SyntaxError on Python 3, and this form emits the same text
# (label, two spaces, value) on both versions.
print('explained variance ratio (all 4 components):  {}'.format(
    pca4.explained_variance_ratio_))
print('sum of explained variance ratio (all 4 components):  {}'.format(
    pca4.explained_variance_ratio_.sum()))

# Clear the current axes before drawing the line plot
plt.cla()
plt.plot(pca4.explained_variance_ratio_)
plt.title('Variance explained by each principal component\n')
# NOTE(review): the plotted values are explained_variance_ratio_ entries
# (ratios), although the label says '%' -- confirm the intended unit.
plt.ylabel(' % Variance Explained')
plt.xlabel('Principal component')

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression on the original (untransformed) features
logistic = LogisticRegression()
log_reg = logistic.fit(X, y)
# Mean accuracy across the cv=10 cross-validation scores
scores = cross_val_score(log_reg, X, y, scoring='accuracy', cv=10)
np.mean(scores)

In [None]:
from sklearn.pipeline import Pipeline

# PCA with three components followed by logistic regression, scored as one
# pipeline; report the mean accuracy across the cv=10 scores
pipe_trf_3 = Pipeline([
    ('pca_3', decomposition.PCA(n_components=3)),
    ('logistic', LogisticRegression()),
])
scores_trf_3 = cross_val_score(pipe_trf_3, X, y, scoring='accuracy', cv=10)
np.mean(scores_trf_3)

In [None]:
# PCA with two components followed by logistic regression, scored as one
# pipeline; report the mean accuracy across the cv=10 scores
pipe_trf_2 = Pipeline([
    ('pca_2', decomposition.PCA(n_components=2)),
    ('logistic', LogisticRegression()),
])
scores_trf_2 = cross_val_score(pipe_trf_2, X, y, scoring='accuracy', cv=10)
np.mean(scores_trf_2)

In [None]:
# Pipeline that chains a one-component PCA with a logistic regression model;
# cross_val_score evaluates the whole pipeline and the mean accuracy across
# the cv=10 scores is reported
pipe_trf_1 = Pipeline([
    ('pca_1', decomposition.PCA(n_components=1)),
    ('logistic', LogisticRegression()),
])
scores_trf_1 = cross_val_score(pipe_trf_1, X, y, scoring='accuracy', cv=10)
np.mean(scores_trf_1)

In [None]:
# Two-component PCA + logistic regression pipeline, rebuilt and re-scored;
# report the mean accuracy across the cv=10 scores
pipe_trf_2 = Pipeline([
    ('pca_2', decomposition.PCA(n_components=2)),
    ('logistic', LogisticRegression()),
])
scores_trf_2 = cross_val_score(pipe_trf_2, X, y, scoring='accuracy', cv=10)
np.mean(scores_trf_2)

In [None]:
# Map the two-component projection back into the original feature space,
# then score logistic regression on the reconstructed data
X_reconstituted2 = pca2.inverse_transform(X_trf_2)
scores_trf_recon2 = cross_val_score(
    log_reg, X_reconstituted2, y, scoring='accuracy', cv=10)
np.mean(scores_trf_recon2)

In [None]:
# Map the one-component projection back into the original feature space,
# then score logistic regression on the reconstructed data
X_reconstituted1 = pca1.inverse_transform(X_trf_1)
scores_trf_recon1 = cross_val_score(
    log_reg, X_reconstituted1, y, scoring='accuracy', cv=10)
np.mean(scores_trf_recon1)

In [None]:
# Map the three-component projection back into the original feature space,
# then score logistic regression on the reconstructed data
X_reconstituted3 = pca3.inverse_transform(X_trf_3)
scores_trf_recon3 = cross_val_score(
    log_reg, X_reconstituted3, y, scoring='accuracy', cv=10)
np.mean(scores_trf_recon3)

In [None]:
# Fit a four-component PCA, map the projection back into the original feature
# space, and score logistic regression on the reconstructed data.
pca4 = decomposition.PCA(n_components=4)
X_trf_4 = pca4.fit_transform(X)
# (A mid-cell `pca4.explained_variance_ratio_.sum()` expression was dropped:
# its value was discarded, since only a cell's last expression is displayed.)
X_reconstituted4 = pca4.inverse_transform(X_trf_4)
scores_trf_recon4 = cross_val_score(log_reg, X_reconstituted4, y, cv=10, scoring='accuracy')
scores_trf_recon4.mean()

In [None]:
# Print the first reconstructed sample from each of the 1- to 4-component
# reconstructions for side-by-side comparison. print() calls with a single
# argument replace the Python 2 print statements, which are a SyntaxError on
# Python 3; the output is the same on both versions.
print(X_reconstituted1[0])
print(X_reconstituted2[0])
print(X_reconstituted3[0])
print(X_reconstituted4[0])