In [1]:
#A_ch06_001_Pipelines_a.ipynb
#演算法 Chains 和 Pipelines
#使用 Pipeline 類別來建立轉換和建模的鏈結過程
#並結合 Pipeline和 GridSearchCV,一次就搜尋完所有參數可能組合。

#1. 使用 Cancer Dataset
#2. 以MinMaxScaler 作前處理-->改善SVM

In [2]:
#step 0: 範例資料集 --> breast_cancer
from sklearn.datasets import load_breast_cancer

In [3]:
#step 1: 演算法 --> SVC, 預處理 --> MinMaxScaler
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Step 2: load the breast cancer dataset and split it into train/test parts.
# random_state=0 seeds the split so it is repeatable from run to run.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    random_state=0,
)

In [5]:
# Step 3: create the MinMaxScaler transformer 'scaler'.
# Once fitted on the training set, this estimator scales and translates each
# feature individually into the given range, i.e. between zero and one.
scaler = MinMaxScaler()

In [6]:
# Fit the transformer 'scaler' on the training features X_train only.
scaler = scaler.fit(X_train)

In [7]:
#你可以檢視 estimator 'scaler',例如:scaler?、scaler.data_max_、scaler.data_min_、scaler.data_range_

In [8]:
# Step 4: rescale the training data 'X_train' with scaler.transform().
X_train_scaled = scaler.transform(X_train)

In [9]:
# Step 5: build the SVC model 'svcModel' (no arguments, so library defaults apply).
svcModel = SVC()

In [10]:
# Step 6: train the model on the rescaled training data 'X_train_scaled'.
# Left as the cell's last expression so the fitted estimator's repr is displayed.
svcModel.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
# Step 7: score svcModel.
# Step 7.1: first rescale 'X_test' into 'X_test_scaled' via scaler.transform(),
#           reusing the scaler that was fitted on X_train.
X_test_scaled = scaler.transform(X_test)

In [12]:
# Score the trained model on the rescaled test data.
svcModel.score(X_test_scaled, y_test)

0.95104895104895104

In [13]:
#=====================================================================
#step 8: 建立 Pipeline 物件整合上面流程

In [14]:
# Build a Pipeline object.
# Pipeline takes a list of tuples; each tuple is ("name", estimator/transformer).
from sklearn.pipeline import Pipeline
# Two-step pipeline: "scaler" (MinMaxScaler) followed by "svm" (SVC).
pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("svm", SVC()),
])

In [15]:
# Pipeline.fit: train the whole pipeline directly on the unscaled training data.
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [16]:
# Pipeline.score: evaluate the fitted pipeline on the unscaled test data.
pipe.score(X_test, y_test)

0.95104895104895104

In [17]:
#===================================================================
#在均格搜尋中使用 Pipelines

In [18]:
# Define the parameter grid 'param_grid' as a dict.
# 'svm__C'     --> candidate values of parameter C for the 'svm' step of Pipeline 'pipe'
# 'svm__gamma' --> candidate values of parameter gamma for the 'svm' step of Pipeline 'pipe'
param_grid = {
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

In [19]:
#以 GridSearchCV(Pipeline object, param_grid) 建立 GridSearchCV 物件
from sklearn.model_selection import GridSearchCV

In [20]:
# GridSearchCV object 'grid':
# the estimator is Pipeline 'pipe', the parameter grid is 'param_grid',
# and cv=5 sets k=5 for cross-validation.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [21]:
# Fit the grid search on the unscaled training data.
# Left as the cell's last expression so the fitted object's repr is displayed.
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100], 'svm__C': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [22]:
# The GridSearchCV.best_score_ attribute holds the best cross-validation score.
print("Best cross-validation accuracy:", grid.best_score_)

Best cross-validation accuracy: 0.981220657277


In [23]:
# Score of the fitted grid search on the test set.
print("Test set score:", grid.score(X_test, y_test))

Test set score: 0.972027972028


In [24]:
# The GridSearchCV.best_params_ attribute holds the best parameter combination.
print("Best parameters:", grid.best_params_)

Best parameters: {'svm__gamma': 1, 'svm__C': 1}
