In [1]:
import matplotlib  # noqa
import matplotlib.pyplot as plt  # noqa

# Select the "TkAgg" backend for matplotlib for the rest of the session.
# NOTE(review): presumably chosen so figures open in separate Tk windows
# instead of inline in the notebook; this needs a Tk-capable display — confirm.
matplotlib.use("TkAgg")

In [8]:
import numpy as np

from scipy.stats import sem

from sklearn.svm import SVC
from sklearn.datasets import fetch_olivetti_faces
from sklearn.metrics import (
    classification_report,
    confusion_matrix
)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    KFold,
)

In [3]:
# Fetch the Olivetti faces dataset; the samples and labels are read later
# through `data.data` and `data.target`.
data = fetch_olivetti_faces()

In [4]:
# Print the description text that ships with the dataset object.
print(data.DESCR)

.. _olivetti_faces_dataset:

The Olivetti faces dataset
--------------------------

`This dataset contains a set of face images`_ taken between April 1992 and 
April 1994 at AT&T Laboratories Cambridge. The
:func:`sklearn.datasets.fetch_olivetti_faces` function is the data
fetching / caching function that downloads the data
archive from AT&T.

.. _This dataset contains a set of face images: http://www.cl.cam.ac.uk/research/dtg/attarchive/facedatabase.html

As described on the original website:

    There are ten different images of each of 40 distinct subjects. For some
    subjects, the images were taken at different times, varying the lighting,
    facial expressions (open / closed eyes, smiling / not smiling) and facial
    details (glasses / no glasses). All the images were taken against a dark
    homogeneous background with the subjects in an upright, frontal position 
    (with tolerance for some side movement).

**Data Set Characteristics:**

    Classes                        

In [5]:
# Support-vector classifier with a linear kernel; every other
# hyperparameter is left at the library default.
svc_1 = SVC(kernel="linear")

In [6]:
def evaluate_cross_validation(clf, x, y, k):
    """Run shuffled k-fold cross-validation and print the results.

    Prints the array of per-fold scores, followed by a summary line with
    their mean and the standard error of the mean.

    Args:
        clf: Estimator passed to ``cross_val_score``.
        x: Samples.
        y: Labels for ``x``.
        k: Number of folds.

    Returns:
        None; the scores are only printed.
    """
    # Build the K-fold cross-validation iterator; the fixed seed keeps the
    # shuffled folds reproducible between runs.
    folds = KFold(n_splits=k, shuffle=True, random_state=0)
    fold_scores = cross_val_score(clf, x, y, cv=folds)
    print(fold_scores)

    mean_score = np.mean(fold_scores)
    std_error = sem(fold_scores)
    print("Mean score: {0:.3f} (+/-{1:.3f})".format(mean_score, std_error))

In [9]:
def train_and_evaluate(clf, x_train, x_test, y_train, t_test):
    """Fit ``clf`` on the training split and print its evaluation.

    Prints, in order: the accuracy on the training set, the accuracy on the
    test set, the classification report and the confusion matrix for the
    test-set predictions. ``clf`` is fitted in place.

    Args:
        clf: Estimator exposing ``fit``, ``score`` and ``predict``.
        x_train: Training samples.
        x_test: Held-out samples.
        y_train: Labels for ``x_train``.
        t_test: Labels for ``x_test``. The (misspelled) parameter name is
            kept so that existing keyword callers continue to work.

    Returns:
        None; all results are printed.
    """
    # Use the labels that were actually passed in. The body previously read a
    # notebook-global ``y_test``, ignoring this argument and failing with a
    # NameError whenever that global did not exist.
    y_test = t_test

    clf.fit(x_train, y_train)

    print("Accuracy on training set:")
    print(clf.score(x_train, y_train))
    print("Accuracy on testing set:")
    print(clf.score(x_test, y_test))

    y_pred = clf.predict(x_test)
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))


In [10]:
# Split samples and labels into train and test parts; test_size=0.25 holds
# out a quarter of the data, and the fixed random_state keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=0,
)
# 5-fold cross-validation on the training part only, so the held-out test
# part plays no role in this estimate.
evaluate_cross_validation(svc_1, x_train, y_train, 5)
# Fit on the full training part, then report on both parts.
train_and_evaluate(svc_1, x_train, x_test, y_train, y_test)

[0.93333333 0.86666667 0.91666667 0.93333333 0.91666667]
Mean score: 0.913 (+/-0.012)
Accuracy on training set:
1.0
Accuracy on testing set:
0.99
              precision    recall  f1-score   support

           0       0.86      1.00      0.92         6
           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         4
           7       1.00      0.67      0.80         3
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         4
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         2
          13       1.00      1.00      1.00         3
          14       1.00      1.00      1.00         5
          15       1.00      1.00      1.00