## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [7]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd

In [19]:
wine=datasets.load_wine()

x_train,x_test,y_train,y_test = train_test_split(wine.data,wine.target, test_size=0.25, random_state=42)

clf = GradientBoostingRegressor(random_state=7)

In [20]:
wine.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [25]:
data=pd.DataFrame(wine['data'],columns=wine['feature_names'])
first_5_columns=data.columns[:5]
data[first_5_columns].head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium
0,14.23,1.71,2.43,15.6,127.0
1,13.2,1.78,2.14,11.2,100.0
2,13.16,2.36,2.67,18.6,101.0
3,14.37,1.95,2.5,16.8,113.0
4,13.24,2.59,2.87,21.0,118.0


In [29]:
print(data.shape)
print(data.columns.values)

(178, 13)
['alcohol' 'malic_acid' 'ash' 'alcalinity_of_ash' 'magnesium'
 'total_phenols' 'flavanoids' 'nonflavanoid_phenols' 'proanthocyanins'
 'color_intensity' 'hue' 'od280/od315_of_diluted_wines' 'proline']


In [31]:
data.describe()[first_5_columns]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium
count,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573
std,0.811827,1.117146,0.274344,3.339564,14.282484
min,11.03,0.74,1.36,10.6,70.0
25%,12.3625,1.6025,2.21,17.2,88.0
50%,13.05,1.865,2.36,19.5,98.0
75%,13.6775,3.0825,2.5575,21.5,107.0
max,14.83,5.8,3.23,30.0,162.0


In [28]:
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
#回歸不能用accuracy score要用 mse ,r2-score, rmse
print(metrics.mean_squared_error(y_test, y_pred))

0.14672894294872418


In [36]:
clf_default=GradientBoostingRegressor()
#看GBR的超參數 例如:n_estimators默認為100
#max_depth限制深度
print(clf_default.get_params)

<bound method BaseEstimator.get_params of GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)>


In [37]:
# 設定要訓練的超參數組合
n_estimators = [100, 200, 300]
max_depth = [1, 3, 5]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)
## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(clf, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)

# 預設會跑 3-fold cross-validadtion，總共 9 種參數組合，總共要 train 27 次模型

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    4.0s finished


In [38]:
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: -0.081166 using {'max_depth': 1, 'n_estimators': 100}


In [39]:
grid_result.best_params_

{'max_depth': 1, 'n_estimators': 100}

In [40]:
# 使用最佳參數重新建立模型
clf_bestparam = GradientBoostingRegressor(max_depth=grid_result.best_params_['max_depth'],
                                           n_estimators=grid_result.best_params_['n_estimators'])

# 訓練模型
clf_bestparam.fit(x_train, y_train)

# 預測測試集
y_pred = clf_bestparam.predict(x_test)

In [41]:
# 調整參數後約可降至 0.052 的 MSE
print(metrics.mean_squared_error(y_test, y_pred))

0.05264350419160253
