In [24]:
import pandas as pd
WORKSPACE = '/Users/houtonglei/OneDrive - stu.hqu.edu.cn/数学建模/2021美赛/'
DATASET_INDEX = WORKSPACE + '2021_ICM_ProblemC/2021MCM_ProblemC_Images_by_GlobalID.xlsx'
dataset_index = pd.read_excel(DATASET_INDEX)

In [25]:
import logging

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

%matplotlib inline

In [26]:
import sklearn
import numpy as np
pos = np.load('pic_data/pos.npy')
neg = np.load('pic_data/neg.npy')
train_set = sklearn.utils.Bunch()
train_set.data = np.concatenate((pos,neg))
train_set.target = np.array([1]*len(pos)+[0]*len(neg))
train_set.target_names = ['neg''pos']

In [27]:
# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = [len(train_set.target), 64, 64]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % train_set.data.shape[1])
print("n_classes: %d" % len(train_set.target_names))

Total dataset size:
n_samples: 3111
n_features: 4096
n_classes: 1


In [28]:
test = np.load('pic_data/test.npy')
test_filename = np.load('pic_data/test_filename.npy',allow_pickle=True)
test_set = sklearn.utils.Bunch()
test_set.data = test
test_set.filename = test_filename

In [29]:
from time import time
from sklearn.decomposition import PCA

# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeleddataset): unsupervised feature extraction / dimensionality reduction
n_components = 150

print("Extracting the top %d eigenfaces from %d faces"% (n_components, train_set.data.shape[0]))
t0 = time()
pca = PCA(n_components=n_components, svd_solver='randomized',whiten=True).fit(train_set.data)
print("done in %0.3fs" % (time() - t0))

eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(train_set.data)
X_test_pca = pca.transform(test_set.data)
print("done in %0.3fs" % (time() - t0))

Extracting the top 150 eigenfaces from 3111 faces
done in 1.146s
Projecting the input data on the eigenfaces orthonormal basis
done in 0.119s


In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced', probability=True) , param_grid)
clf = clf.fit(X_train_pca, train_set.target)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting the classifier to the training set
done in 203.896s
Best estimator found by grid search:
SVC(C=1000.0, class_weight='balanced', gamma=0.005, probability=True)


In [None]:
train_proba = clf.predict_proba(train_set.data)
pd.DataFrame(train_proba,index=train_set.filenames,columns=train_set.target_names).to_csv('result/svm_embedding.csv', float_format='%.16f')

In [33]:
pred_proba = clf.predict_proba(X_test_pca)
pd.DataFrame(pred_proba,index=test_set.filename).to_csv('result/pic_SVM_result.csv', float_format='%.16f')