# Scikit-learn 管線測試

## 載入相關套件

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

## 載入資料集

In [2]:
X, y = datasets.load_diabetes(return_X_y=True)

## 資料分割

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

## 建立管線：特徵縮放、特徵萃取、模型訓練

In [4]:
pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=5),
                        Lasso(random_state=0, max_iter=10000))
pipe_lr.fit(X_train, y_train)

## 模型評估

In [5]:
#y_pred = pipe_lr.predict(X_test)
print(f'R2={pipe_lr.score(X_test, y_test)}')

R2=0.4489534934850329


## 管線結合K折交叉驗證

In [6]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=pipe_lr,
                         X=X_test,
                         y=y_test,
                         cv=10,
                         n_jobs=-1)
print(f'K折分數: %s' % scores)
print(f'平均值: {np.mean(scores):.3f}, 標準差: {np.std(scores):.3f}')

K折分數: [ 0.49835011 -0.1414113   0.24826345  0.37295471  0.51573597  0.02635401
  0.17277211  0.71729214  0.37916185  0.10134832]
平均值: 0.289, 標準差: 0.245


## 管線結合K折交叉驗證、效能調校

In [7]:
from sklearn.model_selection import GridSearchCV

# 正則化強度：3種選擇
alphas = np.logspace(-4, -0.5, 30)
# 強迫係數(權重)須為正數
positive = (True, False)
tuned_parameters = [{"lasso__alpha": alphas, 'lasso__positive':positive}]

# 效能調校
clf = GridSearchCV(pipe_lr, tuned_parameters, cv=5, refit=False)
clf.fit(X, y)

scores_mean = clf.cv_results_["mean_test_score"]
scores_std = clf.cv_results_["std_test_score"]
print('平均分數:\n', scores_mean, '\n標準差:\n', scores_std)

平均分數:
 [0.47207195 0.4733645  0.47207199 0.47336459 0.47207203 0.47336471
 0.47207209 0.47336486 0.47207217 0.47336506 0.47207227 0.47336532
 0.4720724  0.47336567 0.47207258 0.47336614 0.47207282 0.47336675
 0.47207313 0.47336755 0.47207354 0.47336861 0.47207407 0.47337002
 0.47207479 0.47337187 0.47207573 0.47337431 0.47207697 0.47337754
 0.4720786  0.4733818  0.47208075 0.47338741 0.47208359 0.47339481
 0.47208732 0.47340457 0.47209222 0.47341742 0.47209865 0.47343434
 0.47210707 0.47345659 0.47211807 0.47348581 0.47213237 0.47352413
 0.47215089 0.47357424 0.47217468 0.47363958 0.47220496 0.47372441
 0.47224297 0.47383391 0.4722897  0.47397411 0.47234539 0.47415161] 
標準差:
 [0.05899342 0.06058938 0.05899334 0.06058934 0.05899322 0.06058928
 0.05899307 0.06058921 0.05899287 0.06058912 0.05899261 0.06058899
 0.05899226 0.06058883 0.05899181 0.06058861 0.0589912  0.06058832
 0.05899041 0.06058794 0.05898935 0.06058743 0.05898796 0.06058677
 0.05898613 0.06058589 0.0589837  0.06058473 0.

In [8]:
# 取得最佳參數組合
clf.best_params_

{'lasso__alpha': 0.31622776601683794, 'lasso__positive': False}

In [9]:
# 驗證
from math import floor
index = np.argmax(clf.cv_results_["mean_test_score"])
index, clf.cv_results_["mean_test_score"][index], clf.best_score_

(59, 0.47415160693556135, 0.47415160693556135)

## 以最佳參數組合重新訓練

In [10]:
pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=5),
                        Lasso(random_state=0, max_iter=10000,
                        alpha=clf.best_params_['lasso__alpha'],
                        positive=clf.best_params_['lasso__positive']))
pipe_lr.fit(X_train, y_train)
pipe_lr.score(X_test, y_test)

0.4471126658437645

In [12]:
from sklearn.pipeline import Pipeline

pipe_lr = Pipeline([('scaler', StandardScaler()),
                    ('pca', PCA(n_components=5)),
                    ('lasso', Lasso(random_state=0, max_iter=10000,
                    alpha=clf.best_params_['lasso__alpha'],
                    positive=clf.best_params_['lasso__positive']))])
pipe_lr.fit(X_train, y_train)
pipe_lr.score(X_test, y_test)

0.4471126658437645