[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncerdan/HandsOnML/blob/master/Ch_08_Dimensionality_Reduction.ipynb)

# PCA

## Principal Components

In [2]:
# Build a small synthetic 3-D dataset: noisy points lying close to a
# tilted 2-D plane (the third feature is a noisy mix of the first two).
import numpy as np

np.random.seed(4)   # fixed seed so every run produces the same points

m = 60              # number of samples
w1, w2 = 0.1, 0.3   # weights tying the third feature to the first two
noise = 0.1         # scale of the Gaussian noise

# NOTE: the random draws below must stay in this order (angles first, then
# one noise vector per column) to reproduce the same data for this seed.
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
cos_a, sin_a = np.cos(angles), np.sin(angles)

col0 = cos_a + sin_a / 2 + noise * np.random.randn(m) / 2
col1 = sin_a * 0.7 + noise * np.random.randn(m) / 2
col2 = col0 * w1 + col1 * w2 + noise * np.random.randn(m)

# assemble the (m, 3) data matrix, one feature per column
X = np.column_stack((col0, col1, col2))

In [4]:
# PCA by hand: the principal components come out of the SVD of the data.
# When doing this manually the dataset must be mean-centered first!

# center every feature, then decompose
X_centered = X - X.mean(axis=0)
U, s, vT = np.linalg.svd(X_centered)

# each row of vT is one principal component; grab the first two
c1, c2 = vT[0], vT[1]

## Projecting Down into d Dimensions

In [5]:
# Projecting onto the first d principal components reduces the data to
# d dimensions. Here: d = 2.

W_2 = vT[:2].T              # columns are the first two PCs
X_2D = X_centered @ W_2     # coordinates of each sample in the 2-D subspace

## Using Sklearn

In [7]:
from sklearn.decomposition import PCA

# fit a 2-component PCA and project the data down to 2-D in one step
pca = PCA(n_components=2)
X_2D = pca.fit_transform(X)

# components_ stores one principal component per row,
# i.e. it is the transpose of the W_d matrix
print('Components:', pca.components_)

# so the first PC is simply the first row
print('First PC:', pca.components_[0])

Components: [[-0.93636116 -0.29854881 -0.18465208]
 [ 0.34027485 -0.90119108 -0.2684542 ]]
First PC: [-0.93636116 -0.29854881 -0.18465208]


## Explained Variance Ratio

In [9]:
# Fraction of the dataset's variance that lies along each retained PC.
# The two ratios displayed for this fit are ~84.2% and ~14.6%.
pca.explained_variance_ratio_   # so a third PC would carry only ~1.1% (1 - 0.842 - 0.146)

array([0.84248607, 0.14631839])

## Choosing the Right Number of Dimensions

In [10]:
# Load the MNIST dataset from OpenML for the experiments that follow.
from sklearn.datasets import fetch_openml

mnist = fetch_openml(name='mnist_784', version=1)

# keep a copy of the labels cast to unsigned 8-bit integers
mnist_target = mnist['target'].astype('uint8')

In [12]:
# Partition MNIST into training and test sets.
from sklearn.model_selection import train_test_split

X = mnist['data']
# Use the labels that were already cast to uint8 when the dataset was loaded;
# re-reading mnist['target'] here would silently discard that conversion.
y = mnist_target

# Fix the shuffle seed so the split -- and every number derived from it, such
# as the component count needed for 95% variance -- is reproducible across
# runs. No test_size is passed, so scikit-learn's default (25% test) applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [14]:
# Fit PCA with every component kept, then look for the smallest number of
# dimensions whose cumulative explained variance is at least 95%.

pca = PCA().fit(X_train)
cumulativeSum = pca.explained_variance_ratio_.cumsum()

# argmax on a boolean array returns the index of the first True;
# adding 1 turns that zero-based index into a component count
reaches_target = cumulativeSum >= 0.95
d = reaches_target.argmax() + 1
print(d)

154


In [16]:
# With d known, we could refit PCA keeping exactly d components and
# project the training set down to d dimensions.
pca = PCA(n_components=d)
X_reduced = pca.fit_transform(X_train)

In [17]:
# However, this search is built into the PCA class: if n_components is a
# float between 0 and 1, it is treated as the minimum fraction of variance
# to retain, so no manual cumulative-sum step is needed.

pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

## PCA for Compression

In [None]:
# TODO: write the PCA-for-compression example for this section.
# (Kept as a comment: a bare `TODO` name raises NameError on Run All.)