# 常用的与线性代数相关的函数

def mean(a, axis, dtype, out, keepdims)

def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None)

# PCA

创建三维数据集

In [68]:
import numpy as np

# Build a reproducible, noisy 3-D data set: the first two coordinates trace a
# curve parameterised by `angles`, the third is a weighted sum of the first
# two plus noise.
np.random.seed(4)  # fixed seed so every run draws the same samples
m = 60  # number of samples
w1, w2 = 0.1, 0.3  # weights of the first two coordinates in the third one
noise = 0.1  # scale of the Gaussian noise

# The random draws below stay in the order rand, randn, randn, randn so the
# generated numbers do not change.
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
cos_angles = np.cos(angles)
sin_angles = np.sin(angles)

first_col = cos_angles + sin_angles / 2 + noise * np.random.randn(m) / 2
second_col = sin_angles * 0.7 + noise * np.random.randn(m) / 2
third_col = first_col * w1 + second_col * w2 + noise * np.random.randn(m)

# One row per sample, one column per coordinate.
X = np.column_stack((first_col, second_col, third_col))

主成分矩阵

$
\mathbf{V} =
\begin{pmatrix}
  \mid & \mid & & \mid \\
  \mathbf{c_1} & \mathbf{c_2} & \cdots & \mathbf{c_n} \\
  \mid & \mid & & \mid
\end{pmatrix}
$

$X = U \cdot \Sigma \cdot V^{T}$

In [69]:
# Centre every column on zero, then factorise the centred data with SVD.
column_means = X.mean(axis=0)
X_centered = X - column_means

# np.linalg.svd returns the right singular vectors already transposed, so
# each ROW of V is one principal direction.
svd_factors = np.linalg.svd(X_centered)
U, s, V = svd_factors

# Show the principal directions followed by the singular values laid out as
# a diagonal matrix.
print(V, np.diag(s), sep="\n")

[[ 0.93636116  0.29854881  0.18465208]
 [-0.34027485  0.90119108  0.2684542 ]
 [-0.08626012 -0.31420255  0.94542898]]
[[6.77645005 0.         0.        ]
 [0.         2.82403671 0.        ]
 [0.         0.         0.78116597]]


In [70]:
# The first two principal components are the first two rows of V
# (equivalently, the first two columns of V.T).
c1 = V[0, :]
c2 = V[1, :]
print(c1)

[0.93636116 0.29854881 0.18465208]


In [71]:
# Verify the factorisation: rebuild the centred data from U, s and V.
m, n = X.shape

# svd returned the singular values as the 1-D array s; embed them in the
# top-left n x n corner of a zero matrix shaped like the data so that the
# product U . S . V is defined.
S = np.zeros(X_centered.shape)
S[:n, :n] = np.diag(s)

rebuilt = U @ S @ V
np.allclose(X_centered, rebuilt)

True

低维度投影

一旦确定了所有的主成分，就可以将数据集投影到由前d个主成分定义的超平面上，从而将数据集的维度降到d维。选择这个超平面，能确保投影保留尽可能多的方差。

将训练集投影到低维度：$$X_{d\text{-proj}} = X \cdot W_d$$

In [72]:
# Projection matrix: the first two principal directions as columns.
# NOTE: this rebinds w2, which held the scalar weight 0.3 when the data set
# was generated.
w2 = V[:2].T

# Project the centred samples onto the plane spanned by those directions.
X2D = X_centered @ w2
print(X2D.shape)

(60, 2)


# NumPy实现PCA

In [38]:
#使用\datasets\testSet.txt数据集
import numpy as np
def loadDataset(file):
    """Load a tab-separated text file of numbers into a NumPy matrix.

    Every non-blank line of the file becomes one row; the tab-separated
    fields of the line are converted with ``float``.

    Args:
        file: Path of the text file to read.

    Returns:
        An ``np.matrix`` with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    with open(file, 'r') as pf:
        stripped_lines = (line.strip() for line in pf)
        # Blank lines (for example a trailing empty line) hold no sample;
        # skipping them keeps float('') from raising ValueError.
        rows = [
            [float(field) for field in text.split('\t')]
            for text in stripped_lines
            if text
        ]
    # np.mat was removed in NumPy 2.0. np.asmatrix is the same function under
    # its surviving name and still returns an np.matrix.
    return np.asmatrix(rows)

In [45]:
# Build the path from its components instead of hard-coding Windows '\\'
# separators: os.path.join yields the same string on Windows and a valid
# path on POSIX systems.
import os

datamat = loadDataset(os.path.join('.', 'datasets', 'testSet.txt'))
print(datamat.shape)

(1000, 2)


In [78]:
def pac(datamat, topn=1):
    """Reduce ``datamat`` with PCA and rebuild it from the kept components.

    The data is centred per feature, the covariance matrix of the features
    is eigen-decomposed, and the centred data is projected onto the
    eigenvectors that belong to the ``topn`` largest eigenvalues. The
    covariance matrix and the shapes of both results are printed.

    Args:
        datamat: 2-D data with one sample per row and one feature per column
            (``np.cov`` is called with ``rowvar=False``).
        topn: Number of principal components to keep. Defaults to 1, the
            value that used to be hard-coded.

    Returns:
        A tuple ``(low_d_datamat, reconmat)``: the centred data projected
        onto the kept eigenvectors, and that projection mapped back into the
        original feature space with the feature means added again.

    Raises:
        ValueError: If ``topn`` is smaller than 1.
    """
    if topn < 1:
        raise ValueError("topn must be at least 1")

    mean_vals = np.mean(datamat, axis=0)
    mean_proc = datamat - mean_vals  # centre every feature on zero
    covmat = np.cov(mean_proc, rowvar=False)  # covariance between features
    print(covmat)

    # np.mat was removed in NumPy 2.0; np.asmatrix is its surviving name.
    eigvals, eigvects = np.linalg.eig(np.asmatrix(covmat))

    # argsort is ascending, so reverse it and keep the first `topn` indices:
    # these are the eigenvalues from largest to smallest.
    eigval_index = np.argsort(eigvals)[::-1][:topn]
    red_eigvects = eigvects[:, eigval_index]

    # The kept eigenvectors of the covariance matrix form the projection.
    low_d_datamat = mean_proc.dot(red_eigvects)
    # Map the projection back to feature space and undo the centring.
    reconmat = low_d_datamat.dot(red_eigvects.T) + mean_vals
    print(low_d_datamat.shape)
    print(reconmat.shape)
    return low_d_datamat, reconmat

In [79]:
# Run the PCA routine on the loaded data set; it returns the low-dimensional
# projection and the reconstruction, in that order.
pca_outputs = pac(datamat)
low_d_datamat, reconmat = pca_outputs

[[1.05198368 1.1246314 ]
 [1.1246314  2.21166499]]
(1000, 1)
(1000, 2)
[[-2.51033597]]
