# 第 10 章 PCA 算法

In [1]:
import numpy as np
np.set_printoptions(suppress=True)

# Toy data set: 5 samples (rows) x 2 features (columns), stored as floats.
# The two columns live on very different scales.
A = np.array(
    [
        [3, 2000],
        [2, 3000],
        [4, 5000],
        [5, 8000],
        [1, 2000],
    ],
    dtype=float,
)

In [2]:
A

array([[   3., 2000.],
       [   2., 3000.],
       [   4., 5000.],
       [   5., 8000.],
       [   1., 2000.]])

## 手写 PCA

In [3]:
# Per-column (per-feature) mean of the data.
mean = np.mean(A, axis=0)
# Centered matrix: each column has its own mean subtracted.
norm = A - mean
# Sanity check: print the column means of the centered data.
print(np.mean(norm, axis=0))

[0. 0.]


去均值以后，再计算均值，就是 $0$ 了。

In [9]:
norm

array([[    0., -2000.],
       [   -1., -1000.],
       [    1.,  1000.],
       [    2.,  4000.],
       [   -2., -2000.]])

In [4]:
# Singular value decomposition of the centered data.
# U: left-singular vectors, S: singular values, VT: transposed right-singular
# vectors (its first row is used below as the first principal component).
U, S, VT = np.linalg.svd(norm)

In [5]:
U

array([[-0.39223216,  0.68640654, -0.22308683,  0.35519197,  0.44617366],
       [-0.19611618, -0.29417417,  0.16243237,  0.86202009, -0.32486473],
       [ 0.19611618,  0.29417417,  0.92172121,  0.03032723,  0.15655759],
       [ 0.78446452, -0.0980582 , -0.22308683,  0.35519197,  0.44617366],
       [-0.39223237, -0.58834834,  0.15655759, -0.06065446,  0.68688482]])

In [6]:
S

array([5099.0202528 ,    1.56892885])

In [7]:
VT

array([[ 0.00053846,  0.99999986],
       [ 0.99999986, -0.00053846]])

In [8]:
# Keep only the first column of U, shaped as an (n, 1) column vector.
U_reduce = U[:, 0].reshape(-1, 1)

# Scale it by the first singular value to obtain the 1-D representation.
X_reduce = np.dot(U_reduce, S[0])
X_reduce

array([[-1999.99971006],
       [-1000.00039349],
       [ 1000.00039349],
       [ 4000.00049704],
       [-2000.00078698]])

In [11]:
# First principal component: the first row of VT, reshaped to a column vector.
principal_component = VT[0, :].reshape(-1, 1)

In [12]:
# Project the centered data onto the first principal component.
X_reduced2 = np.dot(norm, principal_component)
X_reduced2

array([[-1999.99971006],
       [-1000.00039349],
       [ 1000.00039349],
       [ 4000.00049704],
       [-2000.00078698]])

In [19]:
# Recovered data: map the 1-D projection back into the original feature
# space, then add the column means back.
np.dot(X_reduced2, principal_component.T) + mean

array([[   1.92307713, 2000.00057988],
       [   2.46153828, 2999.99975148],
       [   3.53846172, 5000.00024852],
       [   5.15384631, 7999.99991716],
       [   1.92307655, 1999.99950296]])

In [14]:
norm

array([[    0., -2000.],
       [   -1., -1000.],
       [    1.,  1000.],
       [    2.,  4000.],
       [   -2., -2000.]])

## 使用 scikit-learn 进行 PCA 降维

In [17]:
X_reduced2

array([[-1999.99971006],
       [-1000.00039349],
       [ 1000.00039349],
       [ 4000.00049704],
       [-2000.00078698]])

In [41]:
import sklearn

print(sklearn.__version__)

0.20.3


In [16]:
from sklearn.decomposition import PCA
# Fit a one-component PCA on the raw data (fit returns the estimator itself),
# then project the same data onto that component.
pca = PCA(n_components=1).fit(A)
X_reduce_sklearn = pca.transform(A)
X_reduce_sklearn

array([[-1999.99971006],
       [-1000.00039349],
       [ 1000.00039349],
       [ 4000.00049704],
       [-2000.00078698]])

In [18]:
# Map the 1-D projection back to the original two-feature space.
X_recover_sklearn = pca.inverse_transform(X_reduce_sklearn)
X_recover_sklearn

array([[   1.92307713, 2000.00057988],
       [   2.46153828, 2999.99975148],
       [   3.53846172, 5000.00024852],
       [   5.15384631, 7999.99991716],
       [   1.92307655, 1999.99950296]])

In [20]:
# Manual reconstruction again, shown for comparison with X_recover_sklearn.
X_reduced2.dot(principal_component.T) + mean

array([[   1.92307713, 2000.00057988],
       [   2.46153828, 2999.99975148],
       [   3.53846172, 5000.00024852],
       [   5.15384631, 7999.99991716],
       [   1.92307655, 1999.99950296]])

In [23]:
S

array([5099.0202528 ,    1.56892885])

In [22]:
pca.singular_values_

array([5099.0202528])

In [26]:
principal_component

array([[0.00053846],
       [0.99999986]])

In [36]:
pca.components_

array([[0.00053846, 0.99999986]])

In [30]:
pca.explained_variance_

array([6500001.88461556])

In [40]:
# Share of the total variance carried by each component.
# Variance is proportional to the SQUARED singular values, so the ratio has to
# be built from S ** 2. Normalizing S itself (S / S.sum()) gives 0.9996924 for
# the first component and does not agree with pca.explained_variance_ratio_.
S ** 2 / (S ** 2).sum()

array([0.99999991, 0.00000009])

In [35]:
pca.explained_variance_ratio_

array([0.99999991])

In [42]:
dir(pca)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_fit',
 '_fit_full',
 '_fit_svd_solver',
 '_fit_truncated',
 '_get_param_names',
 'components_',
 'copy',
 'explained_variance_',
 'explained_variance_ratio_',
 'fit',
 'fit_transform',
 'get_covariance',
 'get_params',
 'get_precision',
 'inverse_transform',
 'iterated_power',
 'mean_',
 'n_components',
 'n_components_',
 'n_features_',
 'n_samples_',
 'noise_variance_',
 'random_state',
 'score',
 'score_samples',
 'set_params',
 'singular_values_',
 'svd_solver',
 'tol',
 '

## 在 scikit-learn 中使用 PCA（先标准化，再降维）

In [61]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def std_PCA(**argv):
    """Build an unfitted pipeline that standardizes features, then applies PCA.

    All keyword arguments are forwarded unchanged to ``PCA``.

    Returns:
        A ``Pipeline`` with two steps, named ``'scaler'`` (``StandardScaler``)
        and ``'pca'`` (``PCA``), in that order.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA(**argv)),
    ]
    return Pipeline(steps)

In [62]:
A

array([[   3., 2000.],
       [   2., 3000.],
       [   4., 5000.],
       [   5., 8000.],
       [   1., 2000.]])

In [63]:
# Build the scaler + PCA pipeline, fit it on A and project A to one dimension.
# NOTE(review): this rebinds the name `std_PCA` from the factory function to
# the Pipeline instance, so the function is no longer reachable afterwards and
# re-running this cell alone fails. Later cells rely on `std_PCA` being the
# fitted pipeline.
std_PCA = std_PCA(n_components=1)
R2 = std_PCA.fit(A).transform(A)

In [64]:
R2

array([[-0.62017367],
       [-0.81008684],
       [ 0.81008684],
       [ 2.24034735],
       [-1.62017367]])

In [66]:
# inverse_transform runs the pipeline's steps in reverse order
A_recover2 = std_PCA.inverse_transform(R2)
A_recover2

array([[   2.37982633, 3000.        ],
       [   2.18991316, 2693.77422517],
       [   3.81008684, 5306.22577483],
       [   5.24034735, 7612.45154966],
       [   1.37982633, 1387.54845034]])

In [68]:
A

array([[   3., 2000.],
       [   2., 3000.],
       [   4., 5000.],
       [   5., 8000.],
       [   1., 2000.]])

In [73]:
# For comparison: PCA applied directly to the raw data A (no scaling step),
# reduced to one component and mapped straight back to the feature space.
# Note: this rebinds `pca`, replacing the estimator fitted earlier.
pca = PCA(n_components=1)
A_reduced = pca.fit_transform(A)
pca.inverse_transform(A_reduced)

array([[   1.92307713, 2000.00057988],
       [   2.46153828, 2999.99975148],
       [   3.53846172, 5000.00024852],
       [   5.15384631, 7999.99991716],
       [   1.92307655, 1999.99950296]])