# 天使人综合征与正常人脸二分类非端到端模型

In [94]:
from deepface import DeepFace
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import math
import joblib
from sklearn.metrics import classification_report

设置数据源路径以及保存模型位置

In [95]:
NORMALIZED_IMG_DIR = './dataset/'
clf_model_name = 'angelman-normal.pkl'
pca_model_name = 'angelman-normal_pca.pkl'

读取所有遗传病类型

In [96]:
class_list = os.listdir(NORMALIZED_IMG_DIR)

引入FaceNet

In [97]:
model = DeepFace.build_model('Facenet')

使用FaceNet获取人脸embedding特征向量

In [98]:
embeddings = {}

In [99]:
for class_name in class_list:
#     embeddings[class_name] = DeepFace.represent(os.listdir(os.path.join(NORMALIZED_IMG_DIR, class_name)), model_name = 'Facenet', enforce_detection=False)
    embeddings[class_name] = []
    for file_name in os.listdir(os.path.join(NORMALIZED_IMG_DIR, class_name)):
        print(f"getting {file_name}'s embedding")
        embeddings[class_name].append(
            DeepFace.represent(os.path.join(NORMALIZED_IMG_DIR, class_name, file_name), model_name = 'Facenet', model=model, enforce_detection=False)
        )

getting 0.jpg's embedding
getting 1.jpg's embedding
getting 10.jpg's embedding
getting 11.jpg's embedding
getting 12.jpg's embedding
getting 13.jpg's embedding
getting 14.jpg's embedding
getting 15.jpg's embedding
getting 16.jpg's embedding
getting 17.jpg's embedding
getting 18.jpg's embedding
getting 19.jpg's embedding
getting 2.jpg's embedding
getting 20.jpg's embedding
getting 21.jpg's embedding
getting 23.jpg's embedding
getting 3.jpg's embedding
getting 4.jpg's embedding
getting 5.jpg's embedding
getting 6.jpg's embedding
getting 7.jpg's embedding
getting 8.jpg's embedding
getting 9.jpg's embedding
getting 1491.png's embedding
getting 1690.png's embedding
getting 1739.png's embedding
getting 1742.png's embedding
getting 1824.png's embedding
getting 1924.png's embedding
getting 2152.png's embedding
getting 2383.png's embedding
getting 2662.png's embedding
getting 2719.png's embedding
getting 2784.png's embedding
getting 2827.png's embedding
getting 2967.png's embedding
getting 3624

In [100]:
X = []
Y = []
for key, value in embeddings.items():
    X = X + value
    Y = Y + [key] * len(value)

In [118]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3)

网格搜索

In [119]:
best_score = -1
best_kernel = ''
best_pca_dim = -1
min_dim = math.inf
for class_name in class_list:
    min_dim = min(min_dim, len(os.listdir(os.path.join(NORMALIZED_IMG_DIR, class_name))))
min_dim = min(min_dim, 129)
print(f'searching pca dim in range 1 t4o {min_dim}')
for n_dim in range(1, min_dim):
    pca = PCA(n_components=n_dim)
    pca = pca.fit(X)
    X_dr = pca.transform(Xtrain)
    kernels = ["linear","poly","rbf","sigmoid"]
    for kernel in kernels:
        clf = SVC(kernel=kernel)
        score = cross_val_score(clf, X_dr, Ytrain, cv=4, scoring='accuracy').mean()
        if score >= best_score:
            best_score = score
            best_pca_dim = n_dim
            best_kernel = kernel
            print("The accuracy under kernel %s and pca dimension %d is %f" % (kernel, n_dim, score))

searching pca dim in range 1 t4o 23
The accuracy under kernel linear and pca dimension 1 is 0.437500
The accuracy under kernel poly and pca dimension 1 is 0.562500
The accuracy under kernel rbf and pca dimension 1 is 0.656250
The accuracy under kernel linear and pca dimension 2 is 0.656250
The accuracy under kernel poly and pca dimension 2 is 0.687500
The accuracy under kernel sigmoid and pca dimension 2 is 0.687500
The accuracy under kernel linear and pca dimension 3 is 0.687500
The accuracy under kernel poly and pca dimension 3 is 0.718750
The accuracy under kernel sigmoid and pca dimension 3 is 0.718750
The accuracy under kernel linear and pca dimension 4 is 0.718750
The accuracy under kernel poly and pca dimension 4 is 0.718750
The accuracy under kernel rbf and pca dimension 4 is 0.718750
The accuracy under kernel sigmoid and pca dimension 4 is 0.750000
The accuracy under kernel linear and pca dimension 10 is 0.750000
The accuracy under kernel rbf and pca dimension 14 is 0.750000


使用最优参数构建模型

In [120]:
pca = PCA(n_components=best_pca_dim)
pca = pca.fit(X)
X_dr = pca.transform(X)
joblib.dump(pca, pca_model_name)

['angelman-normal_pca.pkl']

In [121]:
clf = SVC(kernel=best_kernel, probability = True)
X_train_dr = pca.transform(Xtrain)
X_test_dr = pca.transform(Xtest)
clf.fit(X_train_dr, Ytrain)
Y_test_predict = clf.predict(X_test_dr)
target_names = class_list
print(classification_report(Ytest, Y_test_predict, target_names=target_names))

              precision    recall  f1-score   support

    Angelman       0.83      0.62      0.71         8
      normal       0.62      0.83      0.71         6

    accuracy                           0.71        14
   macro avg       0.73      0.73      0.71        14
weighted avg       0.74      0.71      0.71        14



In [105]:
joblib.dump(clf, clf_model_name)

['angelman-normal.pkl']

部分情况，上述搜索出的最优参数在测试集上并不一定是最优解，也一定程度上说明了参数量过少

In [87]:
pca = PCA(n_components=12)
pca = pca.fit(X)
X_dr = pca.transform(X)
joblib.dump(pca, pca_model_name)

['angelman-normal_pca.pkl']

In [88]:
clf = SVC(kernel='rbf', probability = True)
X_train_dr = pca.transform(Xtrain)
X_test_dr = pca.transform(Xtest)
clf.fit(X_train_dr, Ytrain)
Y_test_predict = clf.predict(X_test_dr)
target_names = class_list
print(classification_report(Ytest, Y_test_predict, target_names=target_names))

              precision    recall  f1-score   support

    Angelman       1.00      0.71      0.83         7
      normal       0.78      1.00      0.88         7

    accuracy                           0.86        14
   macro avg       0.89      0.86      0.85        14
weighted avg       0.89      0.86      0.85        14

