In [1]:
##this is the code for the T1 1)
from time import time
import logging
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import numpy as np

def meanX(dataX):
    return np.mean(dataX,axis=0)

def my_pca_FV(X_train, X_test):
    average = meanX(X_train) 
    m, n = np.shape(X_train)
    data_adjust = []
    avgs = np.tile(average, (m, 1))
    data_adjust = X_train - avgs
    covX = np.cov(data_adjust.T)   #计算协方差矩阵
    featValue, featVec=  np.linalg.eig(covX)  #求解协方差矩阵的特征值和特征向量
    index = np.argsort(-featValue) #依照featValue进行从大到小排序
    finalData = []
    selectVec = np.matrix(featVec.T[index[:n_components]]) #这里要转置，和下面SVD对比
    X_train_pca = -(data_adjust * selectVec.T).real/700

    average = meanX(X_test) 
    m, n = np.shape(X_test)
    data_adjust = []
    avgs = np.tile(average, (m, 1))
    data_adjust = X_test - avgs
    X_test_pca = -(data_adjust * selectVec.T).real/700
    return X_train_pca, X_test_pca

def my_pca_SVD(X_train, X_test):
    X_train_mypca=[]
    X_test_mypca=[]
    average = meanX(X_train) 
    m, n = np.shape(X_train)
    data_adjust = []
    avgs = np.tile(average, (m, 1))
    data_adjust = X_train - avgs
    u,v,w=np.linalg.svd(data_adjust)
    featValue, featVec=  np.linalg.eig(covX)  #求解协方差矩阵的特征值和特征向量
    index = np.argsort(-v) #依照featValue进行从大到小排序
    selectVec = np.matrix(w[index[:n_components]]) #注意这个地方不需要转置！
    X_train_mypca = -(data_adjust * selectVec.T).real/700

    average = meanX(X_test) 
    m, n = np.shape(X_test)
    data_adjust = []
    avgs = np.tile(average, (m, 1))
    data_adjust = X_test - avgs
    X_test_mypca = -(data_adjust * selectVec.T).real/700
    return X_train_mypca, X_test_mypca


print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')


# #############################################################################
# Download the data, if not already on disk and load it as numpy arrays

lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape

# for machine learning we use the 2 data directly (as relative pixel
# positions info is ignored by this model)
X = lfw_people.data
n_features = X.shape[1]

# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)


# #############################################################################
# Split into a training set and a test set using a stratified k fold

# split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)


# #############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150
print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca,X_test_pca=my_pca_FV(X_train,X_test)
#X_train_pca,X_test_pca=my_pca_SVD(X_train,X_test)
print("done in %0.3fs" % (time() - t0))


# #############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                   param_grid, cv=5, iid=False)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)


# #############################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))


# #############################################################################
# Qualitative evaluation of the predictions using matplotlib

def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits"""
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())


# plot the result of the prediction on a portion of the test set

def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)

prediction_titles = [title(y_pred, y_test, target_names, i)
                     for i in range(y_pred.shape[0])]

plot_gallery(X_test, prediction_titles, h, w)

Automatically created module for IPython interactive environment
Total dataset size:
n_samples: 639
n_features: 1850
n_classes: 2
Projecting the input data on the eigenfaces orthonormal basis
done in 2.374s
Fitting the classifier to the training set
done in 3.145s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Predicting people's names on the test set
done in 0.006s
                   precision    recall  f1-score   support

    George W Bush       0.98      0.98      0.98       138
Gerhard Schroeder       0.86      0.86      0.86        22

      avg / total       0.96      0.96      0.96       160

[[135   3]
 [  3  19]]


# T1-1

## 说明

我直接将训练集数据和测试集数据作为函数参数，输出降维后的训练数据和测试数据，注意测试数据的降维必须使用训练数据的特征方向。另外，svd分解的w矩阵是转置过的。

我将特征值法的函数命名为my_pca_FV，SVD方法命名为my_pca_SVD，分别调用两个函数记录运行时间。

## 结果

特征值法2.455秒，SVD法2.347秒，我没有使用任何优化方法，与sklearn（0.005s）相比还是非常慢，不过直接SVD分解和直接求协方差矩阵的特征值所用时间似乎差不多。

In [3]:
##this is the code for the T1 2)
from time import time
import logging
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import numpy as np


print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')


# #############################################################################
# Download the data, if not already on disk and load it as numpy arrays

x = np.load('ex1.npz')
X = x['X'] 
y = x['y']

work = np.load('test.npz')
work_x = work['X']


# #############################################################################
# Split into a training set and a test set using a stratified k fold

# split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)#在生成结果的程序中，训练集占比百分之百


# #############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150

print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train.shape[0]))
t0 = time()
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))


print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
work_pca = pca.transform(work_x)
print("done in %0.3fs" % (time() - t0))

# #############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1, 2, 5, 10, 20, 50, 1e2, 500],
              'gamma': [0.0001, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                   param_grid, cv=5, iid=False)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)


# #############################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print('match: {0}/{1}'.format(np.sum(np.equal(y_pred,y_test)),y_test.shape[0]))

work_y=clf.predict(work_pca)
#file_output = open("output.txt",'w',encoding='utf-16')
#for i in work_y:
    #file_output.write(str(i)+' ')
#file_output.close()

Automatically created module for IPython interactive environment
Extracting the top 150 eigenfaces from 724 faces
done in 0.339s
Projecting the input data on the eigenfaces orthonormal basis
done in 0.073s
Fitting the classifier to the training set
done in 56.095s
Best estimator found by grid search:
SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.002, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Predicting people's names on the test set
done in 0.031s
match: 197/242


# T1-2

## 说明

在生成真正输出数据的时候，我将全部x数据作为训练集。

这里的程序展示的是一个训练效果，此时将x划分为1:3的测试/训练集

## 结果

运行多次发现正确率可以到达83.3％左右，估计output.txt代表的结果可能稍好一点（因为训练集更大）。