In [13]:
import logging

# Emit timestamped progress records (level INFO and above) via the root logger.
# No `stream`/`filename` is passed, so basicConfig attaches a StreamHandler,
# which writes to sys.stderr by default.
logging.basicConfig(
    format='%(asctime)s %(message)s',
    level=logging.INFO,
)

In [14]:
import sklearn
import numpy as np
# Load the positive and negative sample arrays and their matching file names.
# NOTE(review): the file-name arrays are read with allow_pickle=True, which
# unpickles stored objects — only load .npy files from a trusted source.
pos = np.load('pic_data/pos.npy')
pos_filename = np.load('pic_data/pos_filename.npy', allow_pickle=True)
neg = np.load('pic_data/neg.npy')
neg_filename = np.load('pic_data/neg_filename.npy', allow_pickle=True)

# Assemble the training set: positive rows first, then negative rows, with
# labels 1 (pos) and 0 (neg) laid out in the same order as the stacked data.
train_set = sklearn.utils.Bunch(
    data=np.concatenate((pos, neg)),
    target=np.array([1] * len(pos) + [0] * len(neg)),
    # Indexed by label value: target_names[0] is 'neg', target_names[1] is 'pos'.
    target_names=['neg', 'pos'],
    filename=np.concatenate((pos_filename, neg_filename)),
)

In [15]:
# Image geometry used for plotting. The height and width are fixed constants
# (they are not read from the data), so check that they agree with the number
# of features per sample before any later step reshapes a row to (h, w).
n_samples, h, w = len(train_set.target), 64, 64
n_features = train_set.data.shape[1]
if h * w != n_features:
    raise ValueError(
        "expected %d features per sample for %dx%d images, got %d"
        % (h * w, h, w, n_features)
    )

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % len(train_set.target_names))

Total dataset size:
n_samples: 3111
n_features: 4096
n_classes: 2


In [16]:
# Load the test samples and their file names (no labels are attached here).
# NOTE(review): allow_pickle=True unpickles stored objects — only load .npy
# files from a trusted source.
test = np.load('pic_data/test.npy')
test_filename = np.load('pic_data/test_filename.npy', allow_pickle=True)

# Bundle both arrays so they can be accessed as attributes of one object.
test_set = sklearn.utils.Bunch(data=test, filename=test_filename)

In [17]:
from time import time
from sklearn.decomposition import PCA

t0 = time()
# Reduce every sample to 150 whitened principal components.
# With this many samples/features and only 150 components, svd_solver='auto'
# selects the randomized solver, whose result depends on the random state.
# Pin random_state so the projection, the eigenfaces and everything trained
# on them are reproducible from one run to the next.
pca = PCA(n_components=150, whiten=True, random_state=0)

print("Projecting the input data on the eigenfaces orthonormal basis")
# Fit the basis on the training data only, then apply the same projection to
# the test data.
X_train_pca = pca.fit_transform(train_set.data)
X_test_pca = pca.transform(test_set.data)

# One (h, w) image per principal component, for plotting.
eigenfaces = pca.components_.reshape((pca.n_components_, h, w))
print("done in %0.3fs" % (time() - t0))

Projecting the input data on the eigenfaces orthonormal basis
done in 0.988s


In [18]:
from sklearn.model_selection import GridSearchCV, learning_curve, validation_curve
from sklearn.svm import SVC

# Train an RBF-kernel SVM, choosing C and gamma by cross-validated grid search.

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# probability=True makes each fit run an extra internal cross-validation to
# calibrate predict_proba, so searching all 30 (C, gamma) candidates serially
# is slow. n_jobs=-1 spreads the candidate fits over all available cores
# without changing which parameters are evaluated.
# random_state seeds the shuffling used for the probability calibration so
# that predict_proba output is reproducible between runs.
clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced', probability=True, random_state=0),
    param_grid,
    n_jobs=-1,
)
# train_sizes_abs,train_scores,test_scores,fit_times,score_times = learning_curve(clf,X_train_pca, train_set.target, n_jobs=12, return_times=True)
clf = clf.fit(X_train_pca, train_set.target)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting the classifier to the training set


KeyboardInterrupt: 

In [None]:
import pandas as pd
# Per-class probabilities for every training sample, saved with one row per
# file name and one column per entry of target_names.
train_proba = clf.predict_proba(X_train_pca)
pd.DataFrame(
    train_proba,
    index=train_set.filename,
    columns=train_set.target_names,
).to_csv('result/svm_embedding.csv', float_format='%.16f')

# Copy column 1 of the probabilities into the 'svm_pos' column of the summary
# table, addressing rows by file name, then overwrite the summary file.
sum_file = pd.read_csv('result/sum_train.csv', index_col=0)
for row, name in enumerate(train_set.filename):
    sum_file.loc[name, 'svm_pos'] = train_proba[row, 1]
sum_file.to_csv('result/sum_train.csv', float_format='%.16f')

In [None]:
# Per-class probabilities for every test sample, saved with one row per file
# name (columns keep their default integer labels).
pred_proba = clf.predict_proba(X_test_pca)
pd.DataFrame(
    pred_proba,
    index=test_set.filename,
).to_csv('result/pic_SVM_result.csv', float_format='%.16f')

# Copy column 1 of the probabilities into the 'svm_pos' column of the test
# summary table, addressing rows by file name, then overwrite the file.
sum_file = pd.read_csv('result/sum_test.csv', index_col=0)
for row, name in enumerate(test_set.filename):
    sum_file.loc[name, 'svm_pos'] = pred_proba[row, 1]
sum_file.to_csv('result/sum_test.csv', float_format='%.16f')

In [None]:
import matplotlib.pyplot as plt
# Qualitative evaluation of the predictions using matplotlib

def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Plot a grid of grayscale images, each with a title above it.

    Parameters
    ----------
    images : sequence of arrays
        Images to draw; each item is reshaped to ``(h, w)`` before display.
    titles : sequence of str
        Title for each image, in the same order as ``images``.
    h, w : int
        Height and width each image is reshaped to.
    n_row, n_col : int
        Grid dimensions. At most ``n_row * n_col`` images are drawn.

    If fewer images (or titles) than grid cells are supplied, only the
    available ones are drawn and the remaining cells stay empty, instead of
    failing with an IndexError.
    """
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    # Never index past the end of either input sequence.
    n_plots = min(n_row * n_col, len(images), len(titles))
    for i in range(n_plots):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        # Hide the axis ticks; they carry no information for an image.
        plt.xticks(())
        plt.yticks(())


# plot the result of the prediction on a portion of the test set

def title(y_pred, y_val, target_names, i):
    """Build the two-line caption comparing prediction and truth for sample ``i``.

    ``y_pred[i]`` and ``y_val[i]`` are used as indices into ``target_names``.
    Only the part of each name after its last space is shown (the whole name
    when it contains no space).
    """
    def last_word(label):
        # Text following the final space, or the full name if there is none.
        return target_names[label].rpartition(' ')[2]

    predicted = last_word(y_pred[i])
    actual = last_word(y_val[i])
    return 'predicted: %s\ntrue:      %s' % (predicted, actual)

#prediction_titles = [title(y_pred, y_val, train_set.target_names, i) for i in range(y_pred.shape[0])]

#plot_gallery(X_val, prediction_titles, h, w)

# Plot a gallery of the leading eigenfaces (the first PCA components)

# One numbered title per eigenface, in component order.
eigenface_titles = ["eigenface %d" % k for k, _ in enumerate(eigenfaces)]
plot_gallery(eigenfaces, eigenface_titles, h, w)

plt.show()