In [1]:
#A_ch06_002_Pipelines_b.ipynb

In [2]:
#step 0:  載入原始資料集(from UCI machine learning repository)
#         569個樣本,每個樣本32個column(行)
#         第1個column是「unique ID number」, 第2個column是label(M=Malignant,B=Benign)
#         第3-32行是30個實數特徵(feature)

In [3]:
# Step 0.1: read the raw data file 'wdbc.data' with pandas.read_csv() into the
# DataFrame 'cancer_df'.
# Source: https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data
import pandas as pd

url_file = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'

# header=None: the file is read without a header row, so the columns are
# addressed by integer labels in the following cells.
cancer_df = pd.read_csv(filepath_or_buffer=url_file, header=None)

# Offline alternative (local copy of the file):
# cancer_df = pd.read_csv('wdbc.data', header=None)

In [5]:
# Step 0.2: split 'cancer_df' into two ndarrays (taken from DataFrame.values).
#   y <- the column labelled 1: the class label (M = Malignant, B = Benign)
#   X <- the columns labelled 2 onward, sliced with DataFrame.loc: the real-valued features
y = cancer_df[1].values
X = cancer_df.loc[:, 2:].values

In [6]:
# Step 0.3: use LabelEncoder to turn the string labels ('M' / 'B') into integers.
from sklearn.preprocessing import LabelEncoder

lbc = LabelEncoder()    # the encoder object 'lbc'
lbc.fit(y)              # learn the label classes from y
y = lbc.transform(y)    # replace y with its integer encoding

# Cell output: the integers assigned to 'M' and 'B'
# (the recorded output below shows M -> 1, B -> 0).
lbc.transform(['M', 'B'])

array([1, 0], dtype=int64)

In [6]:
# Step 0.4: hold out 20% of the samples as the test set; random_state=0 fixes
# the shuffle so the split is reproducible.
# train_test_split is imported from sklearn.model_selection: the older
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)

In [7]:
#===================================================================================
#建立一個 Pipeline 物件 'pipe_lr', 將下列各動作整合:
#1. preprocessing: StandardScaler
#2. preprocessing: PCA
#3. classification: Logistic regression

#Pipeline 的參數是 list of tuples, 
#每個 tuple由 ('識別字串', transformer / estimator 物件)組成

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

In [9]:
# Build the Pipeline object 'pipe_lr'. Each step is a (name, estimator) tuple,
# listed in the order the steps are applied:
#   'stdScl' -> StandardScaler
#   'pca'    -> PCA keeping 2 components
#   'lrc'    -> LogisticRegression (random_state fixed to 1)
pipe_lr = Pipeline(steps=[
    ('stdScl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('lrc', LogisticRegression(random_state=1)),
])

In [10]:
# Fit pipe_lr on the training split; the fitted pipeline returned by fit()
# is shown as the cell output.
pipe_lr.fit(X=X_train, y=y_train)

Pipeline(memory=None,
     steps=[('stdScl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('lrc', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [11]:
# Score pipe_lr on the held-out test split and print the result
# with three decimal places.
print('Test Accuracy: %.3f' % pipe_lr.score(X=X_test, y=y_test))

Test Accuracy: 0.921


In [7]:
#===================================================================
#動手做--在網格搜尋(grid search)中使用Pipelines 練習

In [None]:
# Define the parameter grid 'param_grid' and build the grid-search object 'grid'.
#
# Grid keys must use the '<step name>__<parameter>' form (DOUBLE underscore) so
# that each value list is routed to the matching step of the Pipeline 'pipe_lr':
#   'pca__n_components' -> parameter n_components of the 'pca' step
#   'lrc__C'            -> parameter C of the 'lrc' step
# With a single underscore the keys are not valid parameters of the pipeline.
from sklearn.model_selection import GridSearchCV

param_grid = {'pca__n_components': [1, 2, 3],
              'lrc__C': [0.01, 0.1, 1, 10]}

# GridSearchCV needs both the estimator to tune and the grid to search;
# calling it with no arguments raises TypeError.
# The search itself only runs once grid.fit(X_train, y_train) is called.
grid = GridSearchCV(estimator=pipe_lr, param_grid=param_grid)