## 编写my_PCA

In [1]:
import numpy as np

#### 协方差特征值my_PCA

In [2]:
#eigenvalue
def my_PCA(data,k):
    n_samples,n_features = data.shape
    # 求均值
    mean = np.array([np.mean(data[:,i]) for i in range(n_features)])
    # 去中心化
    normal_data = data - mean
    # 得到协方差矩阵
    matrix_ = np.dot(np.transpose(normal_data),normal_data)/(n_samples-1)
    # 特征值和特征矩阵
    eig_val,eig_vec = np.linalg.eig(matrix_)
    # 从大到小排序
    eigIndex = np.argsort(eig_val)
    eigVecIndex = eigIndex[:-(k+1):-1]
    feature = eig_vec[:,eigVecIndex]
    new_data = np.dot(normal_data,feature)
    # 降维后的数据映射回原空间
    rec_data = np.dot(new_data,np.transpose(feature))
    return rec_data

#### SVD my_PCA

In [3]:
#SVD
def my_PCA_SVD(data,k):
    n_samples,n_features = data.shape
    # 求均值
    mean = np.array([np.mean(data[:,i]) for i in range(n_features)])
    # 去中心化
    normal_data = data - mean
    # 得到协方差矩阵
    matrix_ = np.dot(np.transpose(normal_data),normal_data)/(n_samples-1)
    # SVD分解
    u,d,v=np.linalg.svd(matrix_)
    return np.dot(data,u[:,:k])

#### 通过对lfw数据降维比较速度

In [4]:
from time import time
from sklearn.datasets import fetch_lfw_people
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
X = lfw_people.data

t0=time()
new_data=my_PCA(X,150)
print("eigenvalue:done in %0.3fs"%(time()-t0))

t0=time()
new_data=my_PCA_SVD(X,150)
print("SVD:done in %0.3fs"%(time()-t0))

eigenvalue:done in 2.620s
SVD:done in 2.583s


可以发现，每次pca处理的时间有一定差异，但SVD所用时间均比协方差特征值短。

## ex1.npz分类

#### 采用PCA+SVM进行分类

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
x=np.load('ex1.npz')
X=x['X']
y=x['y']

#### 分为训练集和测试集

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

#### PCA聚类

In [7]:
n_components=100
pca=PCA(n_components=n_components,svd_solver='randomized',whiten=True).fit(X_train)
X_train_pca=pca.transform(X_train)
X_test_pca=pca.transform(X_test)

#### 高斯核函数SVM分类，并使用GirdSearch寻找更优参量

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")  # 忽略版本问题

param_grid = {'C': np.linspace(0.01,5,15),
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced'), param_grid
)
clf = clf.fit(X_train_pca, y_train)
clf.best_estimator_

SVC(C=1.4357142857142857, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.005, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### 直接打印classification_report评估分类结果

In [9]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(X_test_pca)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.80      0.86        15
           1       0.78      0.85      0.81        46
           2       0.76      0.90      0.83        21
           3       0.86      0.91      0.88        92
           4       0.94      0.73      0.82        22
           5       0.94      0.84      0.89        19
           6       0.86      0.70      0.78        27

   micro avg       0.85      0.85      0.85       242
   macro avg       0.87      0.82      0.84       242
weighted avg       0.85      0.85      0.85       242



可见分类成功率基本都在80%以上，成功率还可以接收

#### 对测试数据分类并写入output.txt

In [10]:
x1=np.load('test.npz')
X_test1=x1['X']
X_test1_pca=pca.transform(X_test1)
y1=clf.predict(X_test1_pca)

In [11]:
f="output.txt"
with open(f,"w") as file:
    for i in y1:
        file.write(" %s"%(str(i)))