### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [1]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
## 梯度提升樹算法 https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
## 梯度提升用法 補充資料: https://sklearn.apachecn.org/docs/master/12.html 
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# Load scikit-learn's built-in wine dataset as a Bunch object;
# the cells below read its feature matrix (.data) and labels (.target).
wine = datasets.load_wine(return_X_y=False)

In [3]:
# Hold out 25% of the samples as a test set (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.25,
    random_state=42,
)

# Baseline model: gradient boosting regressor with default hyper-parameters.
clf = GradientBoostingRegressor(random_state=7)

# Fit on the training split, then predict the held-out split
# (fit() returns the estimator itself, so the two calls can be chained).
# With the default hyper-parameters the test MSE comes out at about 0.144.
y_pred = clf.fit(x_train, y_train).predict(x_test)
print(y_pred)
print(metrics.mean_squared_error(y_test, y_pred))

[ 4.85157320e-03 -6.11544988e-02  1.03579309e+00  2.47034728e-03
  9.77929980e-01  2.56964822e-03  1.00538514e+00  2.00069171e+00
  8.76379899e-01  2.12887512e-01  2.98206257e-02  1.01516469e+00
 -8.90582704e-02  1.90344021e+00 -1.29149562e-03  1.00381041e+00
  1.00351325e+00  1.01148092e+00 -3.52097118e-04  1.00030087e+00
 -1.03837343e-03  9.35022660e-01  1.00785831e+00  2.00190476e+00
  2.00083148e+00  2.00144442e+00  1.00236765e+00  8.02429601e-01
  9.99729945e-01  6.91969307e-03 -1.31029288e-03  1.00630242e+00
  1.99778275e+00  4.03546776e-03 -8.87717408e-04  4.90931604e-03
  1.85531342e+00  1.94035978e+00  8.56024167e-01  1.96322471e+00
  5.40221304e-04  1.00313198e+00  1.00673704e+00  1.11478224e+00
  1.35440638e+00]
0.14351527250337612


In [4]:
# Candidate values for each hyper-parameter to search over.
n_estimators = list(range(100, 501, 100))  # 100, 200, 300, 400, 500
max_depth = list(range(1, 10, 2))          # 1, 3, 5, 7, 9
param_grid = {"n_estimators": n_estimators, "max_depth": max_depth}

# Build the search object from the baseline model and the parameter grid.
# n_jobs=-1 runs the fits in parallel on all available CPU cores.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=1,
)

# Run the search. With the default 5-fold cross-validation and
# 5 x 5 = 25 parameter combinations, this trains 125 models in total.
grid_result = grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [5]:
# Report the best cross-validated score and the hyper-parameters that achieved it.
# The search is scored with "neg_mean_squared_error", so best_score_ is a negated
# MSE (closer to 0 is better), not an accuracy; the label says so explicitly.
print("Best neg-MSE: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: -0.063167 using {'max_depth': 7, 'n_estimators': 100}


In [6]:
# Rebuild the model with the best hyper-parameters found by the grid search.
# random_state=7 matches the estimator that was searched (clf), so this refit
# is reproducible and comparable with the cross-validation results.
# Unpacking best_params_ keeps this cell correct if the grid gains more keys.
clf_bestparam = GradientBoostingRegressor(random_state=7, **grid_result.best_params_)

# Train on the full training split.
clf_bestparam.fit(x_train, y_train)

# Predict the held-out test split; the bare last expression displays the array.
y_pred = clf_bestparam.predict(x_test)
y_pred

array([2.49637208e-05, 2.49637208e-05, 8.45850418e-01, 2.49637208e-05,
       9.99998402e-01, 2.49637208e-05, 9.99998402e-01, 1.99997184e+00,
       9.99998402e-01, 2.49637208e-05, 2.49637208e-05, 9.99998402e-01,
       2.49637208e-05, 1.99997184e+00, 2.49637208e-05, 9.99998402e-01,
       9.99998402e-01, 9.99998402e-01, 2.49637208e-05, 9.99998402e-01,
       2.49637208e-05, 9.99998402e-01, 9.99998402e-01, 1.99997184e+00,
       1.99997184e+00, 1.99997184e+00, 9.99998402e-01, 8.56404850e-01,
       9.99998402e-01, 2.49637208e-05, 2.49637208e-05, 9.99998402e-01,
       1.99997184e+00, 2.49637208e-05, 2.49637208e-05, 2.49637208e-05,
       1.99997184e+00, 1.99997184e+00, 8.56404850e-01, 1.99997184e+00,
       2.49637208e-05, 9.99998402e-01, 9.99998402e-01, 1.44845050e+00,
       1.44829956e+00])

In [7]:
# Test-set MSE of the tuned model (y_pred comes from the previous cell).
print(metrics.mean_squared_error(y_true=y_test, y_pred=y_pred))

0.17508063676025107
