## Comparison of PCA and LDA
- Principal Component Analysis (PCA) identifies the combinations of attributes (principal components, i.e. directions in the feature space) that account for the most variance in the data.
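- Linear Discriminant Analysis (LDA) tries to identify the combinations of attributes (discriminant directions) that account for the most variance between classes, i.e. the directions that best separate the classes. In particular, LDA, in contrast to PCA, is a supervised method: it uses the known class labels.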

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn import datasets

from sklearn.decomposition import PCA

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Apply seaborn's default theme so that the matplotlib figures drawn below use it.
sns.set()

In [None]:
# Tidy DataFrame version of iris from seaborn's example datasets
# (NOTE: seaborn fetches this online unless it is already cached locally).
iris_df = sns.load_dataset('iris')

# Preview the first five rows.
iris_df.head(5)

In [None]:
# Reshape to long form: one row per (species, measure) pair, so that each
# measurement can be drawn in its own facet below.
iris_long = iris_df.melt(
    id_vars=['species'],
    var_name='measure',
    value_name='value',
)

# Preview the first five rows of the reshaped frame.
iris_long.head(5)

In [None]:
# One facet (column) per measurement; within each facet the values are plotted
# per species and coloured by species. The trailing semicolon suppresses the
# notebook's echo of the returned grid object.
sns.catplot(
    data=iris_long,
    x="species",
    y="value",
    hue="species",
    col="measure",
    height=10,
    aspect=0.5,
);


In [None]:
# Use the iris dataset from sklearn

In [None]:
# Load scikit-learn's copy of iris and pull out the pieces used below:
# the feature matrix, the integer class labels, and the human-readable names.
iris = datasets.load_iris()

X, y = iris.data, iris.target
target_names, feature_names = iris.target_names, iris.feature_names

In [None]:
# First five rows of the feature matrix.
X[:5]

In [None]:
# Class labels of the first five samples.
y[:5]

In [None]:
# Show the class names taken from iris.target_names.
target_names

In [None]:
# Show the column names taken from iris.feature_names.
feature_names

### PCA with 1 component

In [None]:
# PCA: keep only the first principal component.
pca = PCA(n_components=1)

# Learn the component from X, then project X onto it.
pca.fit(X)
X_r = pca.transform(X)

# Shape of the projected data (samples x retained components).
X_r.shape

In [None]:
# First five samples after projection onto the single component.
X_r[:5]

In [None]:
# A zero for every projected sample; used as the y-coordinate in the 1-D plot below.
np.zeros(len(X_r))

In [None]:
# Draw the 1-D projection as a strip: every point sits at height zero and is
# coloured by its class label. The trailing semicolon suppresses the notebook's
# echo of the returned artist.
plt.scatter(
    X_r[:, 0],
    np.zeros(len(X_r)),
    c=iris.target,
    s=120,
    marker='o',
    edgecolors='white',
    alpha=0.8,
);

In [None]:
# Share of the total variance explained by the single retained component.
print(pca.explained_variance_ratio_)

### PCA with 2 components

In [None]:
# PCA: keep the first two principal components.
# Note: this rebinds `pca` and `X_r`, replacing the 1-component versions above.
pca = PCA(n_components=2)

# Learn the components from X, then project X onto them.
pca.fit(X)
X_r = pca.transform(X)

# Shape of the projected data (samples x retained components).
X_r.shape

In [None]:
# First five samples after projection onto the two components.
X_r[:5]

In [None]:
# Map the 2-D projection back into the original feature space and show the
# first five rows, for comparison with X[:5] displayed earlier.
pca.inverse_transform(X_r)[:5]

In [None]:
# Share of the total variance explained by each of the two retained components.
print(pca.explained_variance_ratio_)

### LDA (fit based on class)

In [None]:
# LDA: unlike PCA, the fit uses the class labels y as well as X.
lda = LinearDiscriminantAnalysis(n_components=2)

# Learn the discriminant directions, then project X onto them.
lda.fit(X, y)
X_r2 = lda.transform(X)

In [None]:
# First five samples after the LDA projection.
X_r2[:5]

In [None]:
# Share of variance explained by each of the two LDA components.
print(lda.explained_variance_ratio_)

In [None]:
# Side-by-side comparison of the 2-D PCA projection (X_r) and the 2-D LDA
# projection (X_r2), with one colour per class.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 7))

colors = ['navy', 'turquoise', 'darkorange']
lw = 2

# (axes, projected data, title, x-label, y-label) for each panel. Both panels
# go through the same drawing code so that they are styled identically and can
# be compared directly.
panels = (
    (ax1, X_r, 'PCA of IRIS dataset',
     '1st principal component', '2nd principal component'),
    (ax2, X_r2, 'LDA of IRIS dataset',
     '1st component', '2nd component'),
)

for ax, projected, title, xlabel, ylabel in panels:
    # Class i in `y` corresponds to target_names[i], so enumerate the names
    # instead of hard-coding the label values.
    for i, (color, target_name) in enumerate(zip(colors, target_names)):
        in_class = y == i
        ax.scatter(projected[in_class, 0], projected[in_class, 1],
                   color=color, alpha=.8, lw=lw, label=target_name)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.legend(loc='best')
    # Request the grid explicitly: a bare ax.grid() *toggles* visibility, so
    # when a grid is already on (e.g. via the seaborn theme set earlier in this
    # notebook) it would switch the grid off instead of showing it.
    ax.grid(True)

plt.tight_layout()