## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm import trange

In [2]:
digits = load_digits()
X = digits.data
y = digits.target

X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2)
X_train , X_val , y_train , y_val = train_test_split(X_train , y_train , test_size = 0.2)

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [3]:
# Hyperparameter grid
param_grid = {'class_weight': [None, 'balanced'],
              'boosting_type': ['gbdt', 'goss', 'dart'],
              'num_leaves': list(range(30, 150)),
              'learning_rate': list(np.logspace(np.log(0.005), np.log(0.2), base = np.exp(1), num = 1000)),
              'subsample_for_bin': list(range(20000, 300000, 20000)),
              'min_child_samples': list(range(20, 500, 5)),
              'reg_alpha': list(np.linspace(0, 1)),
              'reg_lambda': list(np.linspace(0, 1)),
              'colsample_bytree': list(np.linspace(0.6, 1, 10))}

# Subsampling (只有'goss'才可用subsample_dist)
subsample_dist = list(np.linspace(0.5, 1, 100))

In [4]:
model = lgb.LGBMClassifier()
train_set = lgb.Dataset(X_train , label = y_train) 

In [5]:
def random_objective(params , iteration , n_folds):
   
    # Perform n_folds cross validation
    cv_results = lgb.cv(params , 
                        train_set ,
                        num_boost_round = 10000 ,
                        nfold = n_folds , 
                        early_stopping_rounds = 100 , 
                        metrics = 'auc' ,
                        seed = 50)

    best_score = np.max(cv_results['auc-mean'])
    loss = 1 - best_score
    
    # Boosting rounds that returned the highest cv score
    n_estimators = int(len(cv_results['auc-mean'])) # 這樣寫也可以
    
    # Return list of results
    return [loss , params , iteration , n_estimators]

In [6]:
# Dataframe to hold cv results
random_results = pd.DataFrame(columns = ['loss' , 'params' , 'iteration' , 'estimators'],
                              index = list(range(0 , 50)))

In [7]:
random.seed(50)

# Iterate through the specified number of evaluations
for i in trange(0 , 50):
    
    # Randomly sample parameters for gbm
    params = {}
    for key in param_grid.keys():
        params[key] = random.sample(param_grid[key] , 1)
    
    if params['boosting_type'] == 'goss':
        # Cannot subsample with goss
        params['subsample'] = 1.0
    else:
        # Subsample supported for gdbt and dart
        params['subsample'] = random.sample(subsample_dist, 1)[0]
        
        
    results_list = random_objective(params = params , iteration = i , n_folds = 10)
    
    # Add results to next row in dataframe
    random_results.iloc[i, :] = results_list

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [05:46<00:00,  3.98s/it]


In [9]:
random_results = random_results.sort_values('loss' , ascending = True)
random_results = random_results.reset_index(drop = True)
random_results.head()

Unnamed: 0,loss,params,iteration,estimators
0,0.0289758,"{'class_weight': ['balanced'], 'boosting_type'...",12,196
1,0.0338931,"{'class_weight': [None], 'boosting_type': ['gb...",32,542
2,0.0351536,"{'class_weight': [None], 'boosting_type': ['gb...",27,1019
3,0.0371429,"{'class_weight': [None], 'boosting_type': ['gb...",2,1217
4,0.0383874,"{'class_weight': [None], 'boosting_type': ['gb...",20,647
