# 実験　
## GridSearchに事前に標準化した訓練データを入れる場合と、pipelineで標準化の操作を渡す場合の性能の違い
Pythonで始める機械学習6章いわく、GridSearchに標準化した訓練データをそのまま入れると、検証データの情報がリークするため、性能が変わってしまうとあったので、確かめてみる。  
結果：変わった。やはりPipelineで訓練データと検証データを別々に標準化しないとよくないみたい。変化が顕著なのは線形分離モデルがはっきり表れる。これは、データのスケーリングに敏感な分類器であるから。  
逆に、スケーリングに鈍感な決定木やランダムフォレストでも値が変わったのが気になる。シード値をそろえたので、変化しているのはスケールだけのはずなのだが。。。  

In [1]:
import sklearn
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from IPython.display import display
import seaborn as sns

## Breast Cancer wisconsinのデータセット

In [2]:
import urllib
try:
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases'
                     '/breast-cancer-wisconsin/wdbc.data', header=None)
except urllib.error.URLError:
    df = pd.read_csv('https://raw.githubusercontent.com/rasbt/'
                     'python-machine-learning-book/master/code/'
                     'datasets/wdbc/wdbc.data', header=None)
print('rows, columns:', df.shape)

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
X=df.iloc[:,2:]
y=df.iloc[:,1]
y=le.fit_transform(y)

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
df.head()

rows, columns: (569, 32)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## ロジスティクス回帰

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Piplelineによって、訓練データと検証データを別々に標準化する場合
pipe=Pipeline([('sc',StandardScaler()),('lr',LogisticRegression())])
grid_dict=[{'lr__C':[10**i for i in range(-2,2)]}]

cv=StratifiedKFold(n_splits=10,shuffle=False,random_state=0)
gs=GridSearchCV(estimator=pipe,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
#Breast Cancer wisconsinのデータセット
gs.fit(X_train,y_train)
print('validation score:',gs.best_score_)
print('best parameter:',gs.best_params_)
print('accuracy train:%.3f, test:%.3f (pipe):' % (gs.score(X_train,y_train),gs.score(X_test,y_test)))
display(pd.DataFrame(gs.cv_results_))

sc=StandardScaler()
X_train_std=sc.fit_transform(X_train)
X_test_std=sc.transform(X_test)

lr=LogisticRegression()
grid_dict=[{'C':[10**i for i in range(-2,2)]}]
gs_std=GridSearchCV(estimator=lr,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
gs_std.fit(X_train_std,y_train)

print('validation score:',gs_std.best_score_)
print('best parameter:',gs_std.best_params_)
print('accuracy train:%.3f, test:%.3f (without pipe):' % (gs_std.score(X_train_std,y_train),gs_std.score(X_test_std,y_test)))
display(pd.DataFrame(gs_std.cv_results_))

validation score: 0.9849246231155779
best parameter: {'lr__C': 1}
accuracy train:0.990, test:0.977 (pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lr__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01111,0.007396,0.002741,0.001096,0.01,{'lr__C': 0.01},0.975,0.975,0.95,0.95,0.975,0.95,1.0,0.975,1.0,0.973684,0.972362,0.017529,4
1,0.015665,0.003222,0.003343,0.000644,0.1,{'lr__C': 0.1},0.975,1.0,0.95,0.95,0.975,0.975,1.0,1.0,0.975,1.0,0.979899,0.018701,2
2,0.008879,0.000928,0.001937,0.000337,1.0,{'lr__C': 1},1.0,1.0,0.975,0.95,0.975,0.95,1.0,1.0,1.0,1.0,0.984925,0.020022,1
3,0.009275,0.000776,0.001912,0.000236,10.0,{'lr__C': 10},1.0,1.0,0.95,0.925,1.0,0.925,1.0,1.0,0.975,1.0,0.977387,0.030551,3


validation score: 0.9849246231155779
best parameter: {'C': 1}
accuracy train:0.990, test:0.977 (without pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002396,0.000805,0.000447,0.00048,0.01,{'C': 0.01},0.95,0.975,0.95,0.95,0.975,0.95,1.0,0.975,1.0,0.973684,0.969849,0.018722,4
1,0.002994,0.000997,0.000283,0.00034,0.1,{'C': 0.1},0.975,1.0,0.95,0.95,0.975,0.975,1.0,1.0,0.975,1.0,0.979899,0.018701,2
2,0.00382,0.000782,0.000448,0.00047,1.0,{'C': 1},1.0,1.0,0.975,0.95,0.975,0.95,1.0,1.0,1.0,1.0,0.984925,0.020022,1
3,0.005485,0.000922,0.000498,0.000498,10.0,{'C': 10},1.0,1.0,0.95,0.925,1.0,0.925,1.0,1.0,0.975,1.0,0.977387,0.030551,3


## ランダムフォレスト

In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Piplelineによって、訓練データと検証データを別々に標準化する場合
pipe=Pipeline([('sc',StandardScaler()),('forest',RandomForestClassifier())])
grid_dict=[{'forest__max_depth':[i for i in range(3,6)]}]

cv=StratifiedKFold(n_splits=10,shuffle=False,random_state=0)
gs=GridSearchCV(estimator=pipe,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
#Breast Cancer wisconsinのデータセット
gs.fit(X_train,y_train)
print('validation score:',gs.best_score_)
print('best parameter:',gs.best_params_)
print('accuracy train:%.3f, test:%.3f (pipe):' % (gs.score(X_train,y_train),gs.score(X_test,y_test)))
display(pd.DataFrame(gs.cv_results_))

sc=StandardScaler()
X_train_std=sc.fit_transform(X_train)
X_test_std=sc.transform(X_test)
cv=StratifiedKFold(n_splits=10,shuffle=False,random_state=0)
forest=RandomForestClassifier()
grid_dict=[{'max_depth':[i for i in range(3,6)]}]
gs_std=GridSearchCV(estimator=forest,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
gs_std.fit(X_train_std,y_train)

print('validation score:',gs_std.best_score_)
print('best parameter:',gs.best_params_)
print('accuracy train:%.3f, test:%.3f (without pipe):' % (gs_std.score(X_train_std,y_train),gs_std.score(X_test_std,y_test)))
display(pd.DataFrame(gs_std.cv_results_))

validation score: 0.9522613065326633
best parameter: {'forest__max_depth': 5}
accuracy train:0.992, test:0.942 (pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_forest__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.03017,0.006366,0.005635,0.003168,3,{'forest__max_depth': 3},0.925,1.0,0.925,0.9,0.925,0.975,0.9,0.975,0.95,0.947368,0.942211,0.031746,2
1,0.040506,0.005227,0.005387,0.000929,4,{'forest__max_depth': 4},0.9,1.0,0.925,0.875,0.925,0.925,0.925,0.975,0.975,0.973684,0.939698,0.037312,3
2,0.02957,0.004531,0.003811,0.00061,5,{'forest__max_depth': 5},0.975,1.0,0.975,0.9,0.975,0.925,0.9,0.975,0.975,0.921053,0.952261,0.034721,1


validation score: 0.9522613065326633
best parameter: {'forest__max_depth': 5}
accuracy train:0.975, test:0.965 (without pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.025728,0.003426,0.002497,0.000504,3,{'max_depth': 3},0.95,0.975,0.925,0.875,0.975,0.975,0.925,0.975,1.0,0.947368,0.952261,0.034482,1
1,0.032585,0.003536,0.002765,0.0007,4,{'max_depth': 4},0.9,0.975,0.925,0.9,0.95,0.95,0.95,0.975,0.925,0.973684,0.942211,0.027325,2
2,0.024186,0.005159,0.002429,0.000437,5,{'max_depth': 5},0.925,1.0,0.925,0.9,0.9,0.975,0.95,0.975,0.95,0.921053,0.942211,0.032006,2


In [27]:
print(gs_std.estimator)
print(gs.estimator)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Pipeline(memory=None,
         steps=[('sc',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('forest',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                    

## 決定木

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Piplelineによって、訓練データと検証データを別々に標準化する場合
pipe=Pipeline([('sc',StandardScaler()),('tree',DecisionTreeClassifier())])
grid_dict=[{'tree__max_depth':[i for i in range(3,6)]}]

cv=StratifiedKFold(n_splits=10,shuffle=False,random_state=0)
gs=GridSearchCV(estimator=pipe,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
#Breast Cancer wisconsinのデータセット
gs.fit(X_train,y_train)
print('validation score:',gs.best_score_)
print('best parameter:',gs.best_params_)
print('accuracy train:%.3f, test:%.3f (pipe):' % (gs.score(X_train,y_train),gs.score(X_test,y_test)))
display(pd.DataFrame(gs.cv_results_))

sc=StandardScaler()
X_train_std=sc.fit_transform(X_train)
X_test_std=sc.transform(X_test)
cv=StratifiedKFold(n_splits=10,shuffle=False,random_state=0)

tree=DecisionTreeClassifier()
grid_dict=[{'max_depth':[i for i in range(3,6)]}]
gs_std=GridSearchCV(estimator=tree,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
gs_std.fit(X_train_std,y_train)

print('validation score:',gs_std.best_score_)
print('best parameter:',gs.best_params_)
print('accuracy train:%.3f, test:%.3f (without pipe):' % (gs_std.score(X_train_std,y_train),gs_std.score(X_test_std,y_test)))
display(pd.DataFrame(gs_std.cv_results_))

validation score: 0.9321608040201005
best parameter: {'tree__max_depth': 3}
accuracy train:0.967, test:0.947 (pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tree__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.021469,0.004981,0.003286,0.000473,3,{'tree__max_depth': 3},0.95,0.975,0.875,0.925,0.925,0.925,0.9,0.975,0.95,0.921053,0.932161,0.029873,1
1,0.017127,0.003039,0.00339,0.00182,4,{'tree__max_depth': 4},0.925,0.9,0.85,0.875,0.95,0.925,0.9,0.975,0.95,0.894737,0.914573,0.036025,3
2,0.01378,0.001157,0.002096,0.000299,5,{'tree__max_depth': 5},0.925,0.9,0.875,0.875,0.95,0.925,0.9,0.975,0.925,0.921053,0.917085,0.029683,2


validation score: 0.9271356783919598
best parameter: {'tree__max_depth': 3}
accuracy train:0.967, test:0.936 (without pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0047,0.000852,0.000558,0.000427,3,{'max_depth': 3},0.925,0.975,0.875,0.925,0.925,0.925,0.925,1.0,0.925,0.868421,0.927136,0.036999,1
1,0.006578,0.000664,0.000608,0.00046,4,{'max_depth': 4},0.925,0.9,0.85,0.875,0.975,0.925,0.9,1.0,0.9,0.921053,0.917085,0.041958,2
2,0.005986,0.001297,0.000405,0.000503,5,{'max_depth': 5},0.95,0.9,0.85,0.9,0.975,0.9,0.925,1.0,0.875,0.894737,0.917085,0.043695,2


## linearSVM

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Piplelineによって、訓練データと検証データを別々に標準化する場合
pipe=Pipeline([('sc',StandardScaler()),('svm',SVC(kernel='linear'))])
grid_dict=[{'svm__C':[10**i for i in range(-2,2)]}]

cv=StratifiedKFold(n_splits=10,shuffle=False,random_state=0)
gs=GridSearchCV(estimator=pipe,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
#Breast Cancer wisconsinのデータセット
gs.fit(X_train,y_train)
print('validation score:',gs.best_score_)
print('best parameter:',gs.best_params_)
print('accuracy train:%.3f, test:%.3f (pipe):' % (gs.score(X_train,y_train),gs.score(X_test,y_test)))
display(pd.DataFrame(gs.cv_results_))

sc=StandardScaler()
X_train_std=sc.fit_transform(X_train)
X_test_std=sc.transform(X_test)

svm=SVC(kernel='linear')
grid_dict=[{'C':[10**i for i in range(-2,2)]}]
gs_std=GridSearchCV(estimator=lr,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
gs_std.fit(X_train_std,y_train)

print('validation score:',gs_std.best_score_)
print('best parameter:',gs.best_params_)
print('accuracy train:%.3f, test:%.3f (without pipe):' % (gs_std.score(X_train_std,y_train),gs_std.score(X_test_std,y_test)))
display(pd.DataFrame(gs_std.cv_results_))

validation score: 0.9824120603015075
best parameter: {'svm__C': 0.1}
accuracy train:0.987, test:0.971 (pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svm__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00958,0.003359,0.002804,0.000807,0.01,{'svm__C': 0.01},0.925,0.975,0.975,0.925,0.975,0.95,0.975,0.975,0.95,1.0,0.962312,0.022953,4
1,0.012628,0.002354,0.003123,0.0007,0.1,{'svm__C': 0.1},0.975,1.0,0.975,0.95,0.975,0.975,1.0,1.0,0.975,1.0,0.982412,0.016,1
2,0.008149,0.000741,0.001969,0.000231,1.0,{'svm__C': 1},1.0,1.0,0.975,0.925,0.975,0.95,0.975,1.0,0.975,0.973684,0.974874,0.02242,2
3,0.010874,0.001436,0.001792,0.000599,10.0,{'svm__C': 10},0.975,1.0,0.95,0.925,1.0,0.925,1.0,1.0,0.975,1.0,0.974874,0.029601,2


validation score: 0.9849246231155779
best parameter: {'svm__C': 0.1}
accuracy train:0.990, test:0.977 (without pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003335,0.001729,0.000387,0.000475,0.01,{'C': 0.01},0.95,0.975,0.95,0.95,0.975,0.95,1.0,0.975,1.0,0.973684,0.969849,0.018722,4
1,0.002821,0.000977,0.00049,0.000517,0.1,{'C': 0.1},0.975,1.0,0.95,0.95,0.975,0.975,1.0,1.0,0.975,1.0,0.979899,0.018701,2
2,0.003111,0.001138,0.000422,0.000468,1.0,{'C': 1},1.0,1.0,0.975,0.95,0.975,0.95,1.0,1.0,1.0,1.0,0.984925,0.020022,1
3,0.004636,0.000516,0.000427,0.000527,10.0,{'C': 10},1.0,1.0,0.95,0.925,1.0,0.925,1.0,1.0,0.975,1.0,0.977387,0.030551,3


## kernelSVM

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Piplelineによって、訓練データと検証データを別々に標準化する場合
pipe=Pipeline([('sc',StandardScaler()),('svm',SVC(kernel='rbf'))])
grid_dict=[{'svm__C':[10**i for i in range(-2,2)]}]

cv=StratifiedKFold(n_splits=10,shuffle=False,random_state=0)
gs=GridSearchCV(estimator=pipe,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
#Breast Cancer wisconsinのデータセット
gs.fit(X_train,y_train)
print('validation score:',gs.best_score_)
print('best parameter:',gs.best_params_)
print('accuracy train:%.3f, test:%.3f (pipe):' % (gs.score(X_train,y_train),gs.score(X_test,y_test)))
display(pd.DataFrame(gs.cv_results_))

sc=StandardScaler()
X_train_std=sc.fit_transform(X_train)
X_test_std=sc.transform(X_test)

svm=SVC(kernel='rbf')
grid_dict=[{'C':[10**i for i in range(-2,2)]}]
gs_std=GridSearchCV(estimator=lr,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
gs_std.fit(X_train_std,y_train)

print('validation score:',gs_std.best_score_)
print('best parameter:',gs_std.best_params_)
print('accuracy train:%.3f, test:%.3f (without pipe):' % (gs_std.score(X_train_std,y_train),gs_std.score(X_test_std,y_test)))
display(pd.DataFrame(gs_std.cv_results_))

validation score: 0.9798994974874372
best parameter: {'svm__C': 1}
accuracy train:0.982, test:0.977 (pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svm__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.026626,0.001673,0.004487,0.00092,0.01,{'svm__C': 0.01},0.625,0.625,0.625,0.625,0.625,0.625,0.625,0.625,0.625,0.631579,0.625628,0.001933,4
1,0.018649,0.002363,0.004058,0.001294,0.1,{'svm__C': 0.1},0.875,0.975,0.95,0.925,0.95,0.925,0.95,0.975,0.975,0.973684,0.947236,0.030419,3
2,0.01036,0.001078,0.002177,0.000631,1.0,{'svm__C': 1},1.0,1.0,0.95,0.925,0.975,0.95,1.0,1.0,1.0,1.0,0.979899,0.026956,1
3,0.009375,0.000798,0.002195,0.000399,10.0,{'svm__C': 10},0.975,1.0,0.925,0.925,0.95,0.95,0.975,1.0,1.0,0.947368,0.964824,0.028034,2


validation score: 0.9849246231155779
best parameter: {'C': 1}
accuracy train:0.990, test:0.977 (without pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002588,0.000892,0.0003,0.000423,0.01,{'C': 0.01},0.95,0.975,0.95,0.95,0.975,0.95,1.0,0.975,1.0,0.973684,0.969849,0.018722,4
1,0.003031,0.000761,0.000397,0.000488,0.1,{'C': 0.1},0.975,1.0,0.95,0.95,0.975,0.975,1.0,1.0,0.975,1.0,0.979899,0.018701,2
2,0.004753,0.00153,0.0005,0.000502,1.0,{'C': 1},1.0,1.0,0.975,0.95,0.975,0.95,1.0,1.0,1.0,1.0,0.984925,0.020022,1
3,0.006617,0.001391,0.000759,0.000404,10.0,{'C': 10},1.0,1.0,0.95,0.925,1.0,0.925,1.0,1.0,0.975,1.0,0.977387,0.030551,3


## XGBoost

In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# Piplelineによって、訓練データと検証データを別々に標準化する場合
pipe=Pipeline([('sc',StandardScaler()),('xgb',xgb.XGBClassifier())])
grid_dict=[{'xgb__C':[10**i for i in range(-2,2)]}]

cv=StratifiedKFold(n_splits=10,shuffle=False,random_state=0)
gs=GridSearchCV(estimator=pipe,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
#Breast Cancer wisconsinのデータセット
gs.fit(X_train,y_train)
print('validation score:',gs.best_score_)
print('best parameter:',gs.best_params_)
print('accuracy train:%.3f, test:%.3f (pipe):' % (gs.score(X_train,y_train),gs.score(X_test,y_test)))
display(pd.DataFrame(gs.cv_results_))

sc=StandardScaler()
X_train_std=sc.fit_transform(X_train)
X_test_std=sc.transform(X_test)

xgb_clf=xgb.XGBClassifier()
grid_dict=[{'C':[10**i for i in range(-2,2)]}]
gs_std=GridSearchCV(estimator=xgb_clf,param_grid=grid_dict,scoring='accuracy',cv=cv,n_jobs=-1)
gs_std.fit(X_train_std,y_train)

print('validation score:',gs_std.best_score_)
print('best parameter:',gs_std.best_params_)
print('accuracy train:%.3f, test:%.3f (without pipe):' % (gs_std.score(X_train_std,y_train),gs_std.score(X_test_std,y_test)))
display(pd.DataFrame(gs_std.cv_results_))

validation score: 0.9522613065326633
best parameter: {'xgb__C': 0.01}
accuracy train:1.000, test:0.965 (pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgb__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.712094,0.072522,0.007002,0.000964,0.01,{'xgb__C': 0.01},0.975,1.0,0.925,0.875,0.975,0.925,0.95,0.975,0.95,0.973684,0.952261,0.034338,1
1,0.485578,0.083811,0.004906,0.001235,0.1,{'xgb__C': 0.1},0.975,1.0,0.925,0.875,0.975,0.925,0.95,0.975,0.95,0.973684,0.952261,0.034338,1
2,0.432922,0.052472,0.004487,0.000772,1.0,{'xgb__C': 1},0.975,1.0,0.925,0.875,0.975,0.925,0.95,0.975,0.95,0.973684,0.952261,0.034338,1
3,0.343147,0.066725,0.003582,0.000782,10.0,{'xgb__C': 10},0.975,1.0,0.925,0.875,0.975,0.925,0.95,0.975,0.95,0.973684,0.952261,0.034338,1


validation score: 0.9522613065326633
best parameter: {'C': 0.01}
accuracy train:1.000, test:0.965 (without pipe):


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.396811,0.037817,0.002305,0.000383,0.01,{'C': 0.01},0.975,1.0,0.925,0.875,0.975,0.925,0.95,0.975,0.95,0.973684,0.952261,0.034338,1
1,0.414287,0.034955,0.002376,0.000932,0.1,{'C': 0.1},0.975,1.0,0.925,0.875,0.975,0.925,0.95,0.975,0.95,0.973684,0.952261,0.034338,1
2,0.424135,0.034763,0.002094,0.000414,1.0,{'C': 1},0.975,1.0,0.925,0.875,0.975,0.925,0.95,0.975,0.95,0.973684,0.952261,0.034338,1
3,0.340787,0.065174,0.001717,0.000323,10.0,{'C': 10},0.975,1.0,0.925,0.875,0.975,0.925,0.95,0.975,0.95,0.973684,0.952261,0.034338,1
