# Install and Import scikit-optimize

In [1]:
# Import scikit-optimize (module name `skopt`) and print the installed version
# so the environment used for this run is recorded in the notebook output.
import skopt
print('skopt %s' % skopt.__version__)

skopt 0.8.1


# Load Dataset

In [2]:
from pandas import read_csv
# header=None: the file has no header row, so the first line is read as data
# and the columns are labelled with integers starting at 0.
dataframe = read_csv('ionosphere.csv', header=None)

In [3]:
# Show the loaded frame as the cell output (pandas abbreviates large frames with "...").
dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.00000,0.03760,...,-0.51171,0.41078,-0.46168,0.21266,-0.34090,0.42267,-0.54487,0.18641,-0.45300,g
1,1,0,1.00000,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.00000,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.19040,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.00000,-0.03365,1.00000,0.00485,1.00000,-0.12062,0.88965,0.01198,...,-0.40220,0.58984,-0.22145,0.43100,-0.17365,0.60436,-0.24180,0.56045,-0.38238,g
3,1,0,1.00000,-0.45161,1.00000,1.00000,0.71216,-1.00000,0.00000,0.00000,...,0.90695,0.51613,1.00000,1.00000,-0.20099,0.25682,1.00000,-0.32382,1.00000,b
4,1,0,1.00000,-0.02401,0.94140,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.13290,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,1,0,0.83508,0.08298,0.73739,-0.14706,0.84349,-0.05567,0.90441,-0.04622,...,-0.04202,0.83479,0.00123,1.00000,0.12815,0.86660,-0.10714,0.90546,-0.04307,g
347,1,0,0.95113,0.00419,0.95183,-0.02723,0.93438,-0.01920,0.94590,0.01606,...,0.01361,0.93522,0.04925,0.93159,0.08168,0.94066,-0.00035,0.91483,0.04712,g
348,1,0,0.94701,-0.00034,0.93207,-0.03227,0.95177,-0.03431,0.95584,0.02446,...,0.03193,0.92489,0.02542,0.92120,0.02242,0.92459,0.00442,0.92697,-0.00577,g
349,1,0,0.90608,-0.01657,0.98122,-0.01989,0.95691,-0.03646,0.85746,0.00110,...,-0.02099,0.89147,-0.07760,0.82983,-0.17238,0.96022,-0.03757,0.87403,-0.16243,g


In [4]:
# Convert the frame to a NumPy array, then split it column-wise:
# every column except the last forms the feature matrix X,
# and the last column is the target vector y.
data = dataframe.values
X = data[:, :-1]
y = data[:, -1]
# Print both shapes as a quick sanity check of the split.
print(X.shape, y.shape)

(351, 34) (351,)


# Import Other Packages

In [5]:
from numpy import mean  # compute the mean
from numpy import std   # compute the standard deviation
from sklearn.model_selection import cross_val_score # compute cross-validation scores, avoiding the one-sided estimate obtained from a single validation split
from sklearn.model_selection import RepeatedStratifiedKFold # repeated stratified k-fold cross-validation, a common cross-validation scheme
from sklearn.svm import SVC # support vector classifier (SVM)

# Training Without Tuning Hyperparameters 

In [6]:
# Baseline: an SVC with default hyperparameters, evaluated with repeated
# stratified k-fold cross-validation.
model = SVC()

# Each repeat splits the dataset into n_splits folds; every fold is used once
# for validation while the remaining n_splits - 1 folds are used for training.
# The whole procedure is repeated n_repeats times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# One accuracy value is produced per (repeat, fold) pair.
m_scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)
# Report the mean accuracy with its standard deviation in parentheses.
print('Accuracy: %.3f (%.3f)' % (mean(m_scores), std(m_scores)))

Accuracy: 0.937 (0.038)


# Tune Hyperparameters
調整超參數的方法有很多種（例如 scikit-learn 的 GridSearchCV、RandomizedSearchCV 等等），這裡我們使用 scikit-optimize 提供的 Bayesian Optimization（BayesSearchCV）對我們的 SVM 模型進行超參數調整

SVM可調整的超參數如下：
- C, the regularization parameter.
- kernel, the type of kernel used in the model.
- degree, used for the polynomial kernel.
- gamma, used in most other kernels.

In [7]:
from skopt import BayesSearchCV

# BayesSearchCV takes its search space as a dict that maps each hyperparameter
# name to the range (or list of choices) to explore.
params = {
    'C': (1e-6, 100.0, 'log-uniform'),       # regularization strength, sampled on a log scale
    'gamma': (1e-6, 100.0, 'log-uniform'),   # kernel coefficient, sampled on a log scale
    'degree': (1, 5),                        # polynomial degree (integer bounds)
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # candidate kernel types
}

In [8]:
# Configure the Bayesian hyperparameter search: the estimator to tune (an SVC)
# and the search space defined in `params`.
# Candidates are evaluated by cross-validation, so the splitter is passed in as `cv`;
# it uses the same repeated stratified 10-fold settings as the untuned baseline.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# scoring='accuracy' makes the metric explicit and identical to the baseline evaluation.
# random_state seeds the optimizer's sampling so a fresh "Restart & Run All"
# repeats the same search instead of drawing different candidates every run.
search = BayesSearchCV(
    estimator=SVC(),
    search_spaces=params,
    scoring='accuracy',
    n_jobs=-1,
    cv=cv,
    random_state=1,
)

In [9]:
# Run the search on the full dataset.
search.fit(X, y)
# Print the best cross-validated score, then on the next line the
# hyperparameter combination that achieved it.
print(search.best_score_, search.best_params_, sep='\n')



0.9515669515669516
OrderedDict([('C', 9.595708891378774), ('degree', 5), ('gamma', 0.06458522648595245), ('kernel', 'rbf')])
