In [3]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, RandomizedSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from scipy.stats import uniform 
from scipy.stats import randint as sp_randint

import lightgbm as lgb
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
import plotly.express as px

### Grid Search

In [2]:
# データロード
dataset = fetch_california_housing()
# データの説明
print(dataset['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [3]:
# データ準備
X = pd.DataFrame(dataset['data'], columns=dataset['feature_names'])
y = dataset['target']

# hold-out
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# モデル
lgbmr = lgb.LGBMRegressor(random_state=0)
param_grid = {'num_leaves': [10, 20, 30, 40, 50, 60],
              'max_depth': [5, 10, 15], 
              'reg_alpha': [0, 0.01, 0.03]}
cv = KFold(n_splits=3, random_state=0, shuffle=True)
gs = GridSearchCV(lgbmr, param_grid=param_grid, cv=cv)

In [4]:
# early_stopping
callbacks = [lgb.early_stopping(stopping_rounds=10)]
eval_set = [(X_val, y_val)]
fit_params = {'callbacks': callbacks, 'eval_set': eval_set}
gs.fit(X_train, y_train, **fit_params)

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.264841
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.256456
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.256456
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.256456
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.256456
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.256456
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.256456
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.255538
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.255538
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.240402
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.240164
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.240164
Did not meet ea

In [5]:
# grid searchの結果確認
cv_results_df = pd.DataFrame(gs.cv_results_)
cv_results_df.head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_num_leaves,param_reg_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.068089,0.002219,0.005699,0.001125,5,10,0.0,"{'max_depth': 5, 'num_leaves': 10, 'reg_alpha'...",0.797096,0.807785,0.805461,0.803448,0.00459,54
1,0.050133,0.007558,0.005216,0.000711,5,10,0.01,"{'max_depth': 5, 'num_leaves': 10, 'reg_alpha'...",0.798894,0.806981,0.80605,0.803975,0.003613,53
2,0.062517,0.016543,0.005569,0.001226,5,10,0.03,"{'max_depth': 5, 'num_leaves': 10, 'reg_alpha'...",0.799277,0.806364,0.806658,0.8041,0.003413,52


In [6]:
# top5の結果を表示
for index, row in cv_results_df[cv_results_df['rank_test_score']<=5].iterrows():
    print(f'{row["rank_test_score"]}: {row["params"]}')
    print(f'{row["mean_test_score"]}')

4: {'max_depth': 10, 'num_leaves': 50, 'reg_alpha': 0.01}
0.8329725903236045
3: {'max_depth': 15, 'num_leaves': 50, 'reg_alpha': 0}
0.8331058990226472
2: {'max_depth': 15, 'num_leaves': 50, 'reg_alpha': 0.03}
0.8332059773155042
5: {'max_depth': 15, 'num_leaves': 60, 'reg_alpha': 0}
0.8327661059213919
1: {'max_depth': 15, 'num_leaves': 60, 'reg_alpha': 0.01}
0.8338051795879502


In [7]:
# 最も良いハイパーパラメータの組み合わせで再度学習し評価
lgbmr = lgb.LGBMRegressor(**gs.best_params_, random_state=0, learning_rate=0.01, n_estimators=1000)
callbacks = [lgb.early_stopping(stopping_rounds=10)]
eval_set = [(X_val, y_val)]
fit_params = {'callbacks': callbacks, 'eval_set': eval_set}
lgbmr.fit(X_train, y_train, **fit_params)

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.195693


In [8]:
lgbmr.score(X_val, y_val)

0.8499232425233268

## Pipeline + Grid Search

In [9]:
df = pd.read_csv('penguins_size.csv')
# データクリーニング
df.loc[df[df['sex']=='.'].index[0], 'sex'] = np.nan
df.dropna(thresh=3, inplace=True)

In [10]:
X = df.drop('species', axis=1)
y = df['species']

In [11]:
# 欠損値代入, カテゴリカルカラムのエンコーディング

# 欠損値代入
cat_cols = X.select_dtypes(exclude=np.number).columns.to_list()
imputer = SimpleImputer(strategy='most_frequent')
ct = ColumnTransformer([('cat_imputer', imputer, cat_cols)], remainder='passthrough')
ct.set_output(transform='pandas') # numpy array で返ってくる結果をpandas形式に変換

# One Hot Encoding
class GetDummies(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.columns = None
        
    def fit(self, X, y=None):
        # trainデータのカテゴリ数より
        # 推論時のtestデータのカテゴリ数が少ないとエラーとなる
        # trainデータのカテゴリでダミー変数化した場合のカラム名を保存しておく
        self.columns = pd.get_dummies(X).columns
        return self
    
    def transform(self, X):
        X_new = pd.get_dummies(X)
        # fit時に保存していた、trainデータでダミー変数化した場合のカラム名でカラムを準備
        return X_new.reindex(columns=self.columns, fill_value=0)

# モデル
lgbmc = lgb.LGBMClassifier(random_state=0)

# pipeline
pipeline = Pipeline([('impute', ct), ('dummy', GetDummies()), ('model', lgbmc)])

# cv
cv = KFold(n_splits=3, random_state=0, shuffle=True)

# grid search
param_grid = {'model__num_leaves': [10, 20, 30, 40, 50, 60],
              'model__max_depth': [5, 10, 15], 
              'model__reg_alpha': [0, 0.01, 0.03]}
gs = GridSearchCV(pipeline, param_grid=param_grid, cv=cv)
gs.fit(X, y)

In [2]:
cv_results_df = pd.DataFrame(gs.cv_results_)
# cv_results_df

NameError: name 'pd' is not defined

## Random Search CV

In [5]:
# データロード
dataset = fetch_california_housing()
# # データの説明
# print(dataset['DESCR'])

In [31]:
# データ準備
X = pd.DataFrame(dataset['data'], columns=dataset['feature_names'])
y = dataset['target']

# hold-out
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# モデル
lgbmr = lgb.LGBMRegressor(random_state=0)
param_dist = {'num_leaves': sp_randint(10, 60),
              'max_depth': sp_randint(5, 15), 
              'reg_alpha': uniform(0, 0.03)}

# early_stopping
eval_set = [(X_val, y_val)]
callbacks = [lgb.early_stopping(stopping_rounds=10)]
fit_params = {'callbacks': callbacks, 'eval_set': eval_set}

# cv
cv = KFold(n_splits=3, random_state=0, shuffle=True)

# random search
rs = RandomizedSearchCV(lgbmr, param_distributions=param_dist, cv=cv, n_iter=36)
rs.fit(X_train, y_train, **fit_params)

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.222354
Did not meet early stopping. Best iteration is:
[98]	valid_0's l2: 0.21749
Did not meet early stopping. Best iteration is:
[98]	valid_0's l2: 0.21749
Did not meet early stopping. Best iteration is:
[98]	valid_0's l2: 0.21749
Did not meet early stopping. Best iteration is:
[98]	valid_0's l2: 0.21749
Did not meet early stopping. Best iteration is:
[98]	valid_0's l2: 0.21749
Did not meet early stopping. Best iteration is:
[98]	valid_0's l2: 0.21749
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.215539
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.215539
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.215539
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.215539
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.215539
Did not meet early stopping

In [32]:
cv_results_df = pd.DataFrame(rs.cv_results_)
cv_results_df.head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_num_leaves,param_reg_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.184874,0.117982,0.007898,0.000741,8,41,0.009865,"{'max_depth': 8, 'num_leaves': 41, 'reg_alpha'...",0.825931,0.83417,0.828054,0.829385,0.003492,10
1,0.081635,0.008035,0.00594,0.000187,10,22,0.007179,"{'max_depth': 10, 'num_leaves': 22, 'reg_alpha...",0.819109,0.826322,0.820391,0.821941,0.003142,27
2,0.096185,0.010123,0.007964,0.000886,12,33,0.02446,"{'max_depth': 12, 'num_leaves': 33, 'reg_alpha...",0.827764,0.832737,0.827092,0.829198,0.002518,12


In [33]:
# top5の結果を表示
for index, row in cv_results_df[cv_results_df['rank_test_score']<=5].iterrows():
    print(f'{row["rank_test_score"]}: {row["params"]}')
    print(f'{row["mean_test_score"]}')

5: {'max_depth': 10, 'num_leaves': 57, 'reg_alpha': 4.265592431303311e-05}
0.8314620721088906
1: {'max_depth': 14, 'num_leaves': 49, 'reg_alpha': 0.008708320326080377}
0.8332766016859434
4: {'max_depth': 12, 'num_leaves': 52, 'reg_alpha': 0.028859294403991865}
0.8326229021887682
2: {'max_depth': 11, 'num_leaves': 59, 'reg_alpha': 0.01658502064437605}
0.8331101235678036
3: {'max_depth': 14, 'num_leaves': 51, 'reg_alpha': 0.02238934084784933}
0.8326890914242048


In [34]:
# 最も良いハイパーパラメータの組み合わせで再度学習し評価
lgbmr = lgb.LGBMRegressor(**rs.best_params_, random_state=0, learning_rate=0.01, n_estimators=1000)
callbacks = [lgb.early_stopping(stopping_rounds=10)]
eval_set = [(X_val, y_val)]
fit_params = {'callbacks': callbacks, 'eval_set': eval_set}
lgbmr.fit(X_train, y_train, **fit_params)

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l2: 0.199471


In [35]:
lgbmr.score(X_val, y_val)

0.8470264823005282