In [1]:
import pandas as pd
import pandas_profiling
import os
import pickle
import gc
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import lightgbm as lgb

import optuna

# Load the data: read the train and test CSVs, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data_EDA/train.csv", "data_EDA/test.csv")
)

ModuleNotFoundError: No module named 'optuna'

In [None]:
# Features kept after EDA
FEATURE_COLUMNS = [
    'DiabetesPedigreeFunction',
    'BMI',
    'Glucose',
    'Age',
    'Pregnancies',
    'Pregnancies_bin',
    'BloodPressure_na',
    'BloodPressure_0',
    'SkinThickness',
    'Insulin_0',
    'Insulin_na',
]

# Model inputs: the selected feature columns of the training frame.
X_train = train[FEATURE_COLUMNS]
# The 'index' column, kept as a single-column DataFrame.
id_train = train[['index']]
# The 'Outcome' column (the target), kept as a single-column DataFrame.
y_train = train[['Outcome']]

In [None]:
# Parameters that are NOT searched: they stay fixed for every trial.
random_state = 123

params_base = dict(
    boosting_type='gbdt',
    objective='binary',
    metrics='auc',
    learning_rate=0.02,
    n_estimators=100000,
    bagging_freq=1,
    seed=123,
)

def objective(trial):
    """Optuna objective: mean cross-validated accuracy of a LightGBM classifier.

    Samples one set of hyperparameters from ``trial``, merges in the fixed
    ``params_base`` settings, trains one ``LGBMClassifier`` per fold of a
    4-fold stratified split of the module-level ``X_train`` / ``y_train``,
    and scores each fold's validation predictions.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to suggest the hyperparameter values.

    Returns
    -------
    float
        Mean validation accuracy over the folds, with predicted
        probabilities thresholded at 0.5.
    """
    # Hyperparameters explored by the study
    params_tuning = {
        'num_leaves': trial.suggest_int('num_leaves', 8, 256),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 200),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-5, 1e-2, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 1e2, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 1e2, log=True),
    }
    # Fixed settings are applied last, so they win on any key clash.
    params_tuning.update(params_base)

    # Train and evaluate the model on each fold
    splitter = StratifiedKFold(n_splits=4, shuffle=True, random_state=random_state)
    list_metrics = []
    for idx_tr, idx_va in splitter.split(X_train, y_train):
        # split() yields POSITIONAL indices, so select rows with .iloc;
        # .loc would only be correct while the frames keep a default RangeIndex.
        x_tr, y_tr = X_train.iloc[idx_tr], y_train.iloc[idx_tr]
        x_va, y_va = X_train.iloc[idx_va], y_train.iloc[idx_va]

        model = lgb.LGBMClassifier(**params_tuning)
        model.fit(
            x_tr,
            y_tr,
            eval_set=[(x_tr, y_tr), (x_va, y_va)],
            # Callbacks replace the fit(early_stopping_rounds=..., verbose=...)
            # arguments, which newer LightGBM releases no longer accept.
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                lgb.log_evaluation(period=0),
            ],
        )

        # Probability of the positive class, thresholded at 0.5
        y_va_pred = model.predict_proba(x_va)[:, 1]
        metric_va = accuracy_score(y_va, np.where(y_va_pred > 0.5, 1, 0))
        list_metrics.append(metric_va)

    # Aggregate the per-fold scores into the value the study optimizes
    metrics = np.mean(list_metrics)

    return metrics

In [25]:
# TPE sampler seeded with the notebook-wide random_state
sampler = optuna.samplers.TPESampler(seed=random_state)

# In-memory study that maximizes the value returned by `objective`
study = optuna.create_study(
    direction='maximize',
    sampler=sampler,
)

# Run 30 trials of the search
study.optimize(objective, n_trials=30)

[32m[I 2023-03-13 15:09:23,293][0m A new study created in memory with name: no-name-485b408d-5441-4e5a-954d-5a8b0201c9b0[0m




[32m[I 2023-03-13 15:09:25,775][0m Trial 0 finished with value: 0.7993333333333333 and parameters: {'num_leaves': 181, 'min_data_in_leaf': 61, 'min_sum_hessian_in_leaf': 4.792414358623587e-05, 'feature_fraction': 0.7756573845414456, 'bagging_fraction': 0.8597344848927815, 'reg_alpha': 0.492522233779106, 'reg_lambda': 83.76388146302445}. Best is trial 0 with value: 0.7993333333333333.[0m




[32m[I 2023-03-13 15:09:26,644][0m Trial 1 finished with value: 0.806 and parameters: {'num_leaves': 178, 'min_data_in_leaf': 99, 'min_sum_hessian_in_leaf': 0.00015009027543233888, 'feature_fraction': 0.6715890080754348, 'bagging_fraction': 0.8645248536920208, 'reg_alpha': 0.567922374174008, 'reg_lambda': 0.01732652966363563}. Best is trial 1 with value: 0.806.[0m




[32m[I 2023-03-13 15:09:27,921][0m Trial 2 finished with value: 0.8056666666666668 and parameters: {'num_leaves': 107, 'min_data_in_leaf': 149, 'min_sum_hessian_in_leaf': 3.52756635172055e-05, 'feature_fraction': 0.5877258780737462, 'bagging_fraction': 0.7657756869209191, 'reg_alpha': 1.3406343673102123, 'reg_lambda': 3.4482904089131434}. Best is trial 1 with value: 0.806.[0m




[32m[I 2023-03-13 15:09:29,020][0m Trial 3 finished with value: 0.8013333333333333 and parameters: {'num_leaves': 219, 'min_data_in_leaf': 146, 'min_sum_hessian_in_leaf': 0.0006808799287054756, 'feature_fraction': 0.8612216912851107, 'bagging_fraction': 0.6614794569265892, 'reg_alpha': 0.2799978022399009, 'reg_lambda': 0.08185645330667264}. Best is trial 1 with value: 0.806.[0m




[32m[I 2023-03-13 15:09:29,888][0m Trial 4 finished with value: 0.8073333333333333 and parameters: {'num_leaves': 81, 'min_data_in_leaf': 128, 'min_sum_hessian_in_leaf': 1.889360449174926e-05, 'feature_fraction': 0.7168505863397641, 'bagging_fraction': 0.7154313816648219, 'reg_alpha': 0.9434967110751797, 'reg_lambda': 0.5050346330980694}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:30,863][0m Trial 5 finished with value: 0.8043333333333333 and parameters: {'num_leaves': 85, 'min_data_in_leaf': 88, 'min_sum_hessian_in_leaf': 0.004788147156768277, 'feature_fraction': 0.9720800091019398, 'bagging_fraction': 0.7509183379421682, 'reg_alpha': 3.1319282717196035, 'reg_lambda': 0.029005047452739414}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:31,213][0m Trial 6 finished with value: 0.7366666666666667 and parameters: {'num_leaves': 87, 'min_data_in_leaf': 86, 'min_sum_hessian_in_leaf': 0.003971252247766701, 'feature_fraction': 0.6252276826982534, 'bagging_fraction': 0.7415171321313522, 'reg_alpha': 87.54657140659076, 'reg_lambda': 1.1965765212602313}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:32,113][0m Trial 7 finished with value: 0.7973333333333333 and parameters: {'num_leaves': 160, 'min_data_in_leaf': 28, 'min_sum_hessian_in_leaf': 0.0030131614432849746, 'feature_fraction': 0.8015300642054637, 'bagging_fraction': 0.7725340032332324, 'reg_alpha': 0.23499322154972468, 'reg_lambda': 0.1646202117975735}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:33,093][0m Trial 8 finished with value: 0.8043333333333333 and parameters: {'num_leaves': 111, 'min_data_in_leaf': 138, 'min_sum_hessian_in_leaf': 0.00423029374725911, 'feature_fraction': 0.7552111687390055, 'bagging_fraction': 0.8346568914811361, 'reg_alpha': 2.206714812711709, 'reg_lambda': 3.1594683442464033}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:34,239][0m Trial 9 finished with value: 0.801 and parameters: {'num_leaves': 175, 'min_data_in_leaf': 170, 'min_sum_hessian_in_leaf': 1.7765808030254076e-05, 'feature_fraction': 0.8818414207216692, 'bagging_fraction': 0.6218331872684371, 'reg_alpha': 0.05982625838323253, 'reg_lambda': 1.9490717640641542}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:35,552][0m Trial 10 finished with value: 0.7896666666666667 and parameters: {'num_leaves': 32, 'min_data_in_leaf': 199, 'min_sum_hessian_in_leaf': 1.3879073485411927e-05, 'feature_fraction': 0.5040305717020102, 'bagging_fraction': 0.5232420679967527, 'reg_alpha': 0.010612397212799423, 'reg_lambda': 0.2712616227691688}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:36,721][0m Trial 11 finished with value: 0.804 and parameters: {'num_leaves': 244, 'min_data_in_leaf': 113, 'min_sum_hessian_in_leaf': 0.00012473136964355074, 'feature_fraction': 0.6730913658889374, 'bagging_fraction': 0.9816139361200471, 'reg_alpha': 6.343590915843685, 'reg_lambda': 0.010211649165953098}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:38,033][0m Trial 12 finished with value: 0.8033333333333335 and parameters: {'num_leaves': 18, 'min_data_in_leaf': 114, 'min_sum_hessian_in_leaf': 0.0001700347841423779, 'feature_fraction': 0.6931854383164288, 'bagging_fraction': 0.9257126652760607, 'reg_alpha': 8.500734420947317, 'reg_lambda': 0.010578283293099854}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:39,017][0m Trial 13 finished with value: 0.802 and parameters: {'num_leaves': 63, 'min_data_in_leaf': 55, 'min_sum_hessian_in_leaf': 5.379941491915366e-05, 'feature_fraction': 0.6920119879687722, 'bagging_fraction': 0.8731035990698028, 'reg_alpha': 0.8451915837483444, 'reg_lambda': 0.38417866772574927}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:40,415][0m Trial 14 finished with value: 0.806 and parameters: {'num_leaves': 148, 'min_data_in_leaf': 7, 'min_sum_hessian_in_leaf': 1.0109437923471968e-05, 'feature_fraction': 0.7161201212015965, 'bagging_fraction': 0.9449350856793115, 'reg_alpha': 0.11194210530145045, 'reg_lambda': 0.04529618491562219}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:41,355][0m Trial 15 finished with value: 0.8053333333333333 and parameters: {'num_leaves': 206, 'min_data_in_leaf': 125, 'min_sum_hessian_in_leaf': 0.000570873649098293, 'feature_fraction': 0.60948287198866, 'bagging_fraction': 0.8236856487189648, 'reg_alpha': 0.775281834910724, 'reg_lambda': 0.0778191240079955}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:42,272][0m Trial 16 finished with value: 0.8053333333333333 and parameters: {'num_leaves': 129, 'min_data_in_leaf': 89, 'min_sum_hessian_in_leaf': 9.007497380495332e-05, 'feature_fraction': 0.6477959610523824, 'bagging_fraction': 0.9091183450956876, 'reg_alpha': 0.07648427867294036, 'reg_lambda': 0.3055770639520798}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:44,119][0m Trial 17 finished with value: 0.798 and parameters: {'num_leaves': 59, 'min_data_in_leaf': 169, 'min_sum_hessian_in_leaf': 0.00023054084963414055, 'feature_fraction': 0.7418786932898618, 'bagging_fraction': 0.7129812895222762, 'reg_alpha': 11.43073067302782, 'reg_lambda': 0.031183188001114515}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:45,102][0m Trial 18 finished with value: 0.803 and parameters: {'num_leaves': 201, 'min_data_in_leaf': 62, 'min_sum_hessian_in_leaf': 2.8193797545311158e-05, 'feature_fraction': 0.5562532923816212, 'bagging_fraction': 0.8094893324492629, 'reg_alpha': 1.697919634013935, 'reg_lambda': 0.7265123426763697}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:45,905][0m Trial 19 finished with value: 0.8033333333333333 and parameters: {'num_leaves': 255, 'min_data_in_leaf': 97, 'min_sum_hessian_in_leaf': 8.128421667349862e-05, 'feature_fraction': 0.6492851364966901, 'bagging_fraction': 0.9938187588498406, 'reg_alpha': 0.2832675564036369, 'reg_lambda': 0.11677834396822391}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:47,610][0m Trial 20 finished with value: 0.803 and parameters: {'num_leaves': 120, 'min_data_in_leaf': 185, 'min_sum_hessian_in_leaf': 2.2567123691109275e-05, 'feature_fraction': 0.7191139014183054, 'bagging_fraction': 0.6977974704856518, 'reg_alpha': 4.012997903592451, 'reg_lambda': 7.506084691793486}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:48,964][0m Trial 21 finished with value: 0.8013333333333333 and parameters: {'num_leaves': 150, 'min_data_in_leaf': 15, 'min_sum_hessian_in_leaf': 1.2542651460282271e-05, 'feature_fraction': 0.7115183856463484, 'bagging_fraction': 0.9212587153898287, 'reg_alpha': 0.11671238991111285, 'reg_lambda': 0.03258280278191472}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:50,381][0m Trial 22 finished with value: 0.8033333333333335 and parameters: {'num_leaves': 147, 'min_data_in_leaf': 7, 'min_sum_hessian_in_leaf': 1.078507679320643e-05, 'feature_fraction': 0.671375748720645, 'bagging_fraction': 0.9546833812183388, 'reg_alpha': 0.6359205742166641, 'reg_lambda': 0.06159344773901362}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:51,185][0m Trial 23 finished with value: 0.8003333333333333 and parameters: {'num_leaves': 179, 'min_data_in_leaf': 43, 'min_sum_hessian_in_leaf': 1.0257826229045205e-05, 'feature_fraction': 0.7329065314871402, 'bagging_fraction': 0.8855625568541348, 'reg_alpha': 0.03948515354373503, 'reg_lambda': 0.023470778251090726}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:52,063][0m Trial 24 finished with value: 0.802 and parameters: {'num_leaves': 85, 'min_data_in_leaf': 34, 'min_sum_hessian_in_leaf': 2.370395004037637e-05, 'feature_fraction': 0.7969525303427156, 'bagging_fraction': 0.949528707322588, 'reg_alpha': 0.13963301854908172, 'reg_lambda': 0.15562933607555834}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:53,017][0m Trial 25 finished with value: 0.8036666666666668 and parameters: {'num_leaves': 142, 'min_data_in_leaf': 78, 'min_sum_hessian_in_leaf': 4.19518762467201e-05, 'feature_fraction': 0.6399983873301289, 'bagging_fraction': 0.8012766517656202, 'reg_alpha': 0.4656289092128395, 'reg_lambda': 0.022846300770785023}. Best is trial 4 with value: 0.8073333333333333.[0m




[32m[I 2023-03-13 15:09:54,045][0m Trial 26 finished with value: 0.8073333333333335 and parameters: {'num_leaves': 55, 'min_data_in_leaf': 128, 'min_sum_hessian_in_leaf': 1.9759304689278844e-05, 'feature_fraction': 0.6739586896533928, 'bagging_fraction': 0.8537266200070116, 'reg_alpha': 1.282793526227369, 'reg_lambda': 0.08400546637715904}. Best is trial 26 with value: 0.8073333333333335.[0m




[32m[I 2023-03-13 15:09:55,071][0m Trial 27 finished with value: 0.8083333333333333 and parameters: {'num_leaves': 48, 'min_data_in_leaf': 126, 'min_sum_hessian_in_leaf': 6.972104745468846e-05, 'feature_fraction': 0.591372148877871, 'bagging_fraction': 0.8492100478235491, 'reg_alpha': 1.196667720289308, 'reg_lambda': 0.10114064587525043}. Best is trial 27 with value: 0.8083333333333333.[0m




[32m[I 2023-03-13 15:09:56,496][0m Trial 28 finished with value: 0.8009999999999999 and parameters: {'num_leaves': 44, 'min_data_in_leaf': 129, 'min_sum_hessian_in_leaf': 2.2546017139609767e-05, 'feature_fraction': 0.5864960275913353, 'bagging_fraction': 0.7882961845828448, 'reg_alpha': 1.274718498587182, 'reg_lambda': 0.12755942719839233}. Best is trial 27 with value: 0.8083333333333333.[0m




[32m[I 2023-03-13 15:09:57,784][0m Trial 29 finished with value: 0.804 and parameters: {'num_leaves': 60, 'min_data_in_leaf': 162, 'min_sum_hessian_in_leaf': 4.834522689773493e-05, 'feature_fraction': 0.6055886806527228, 'bagging_fraction': 0.841048432152907, 'reg_alpha': 2.2213318222687475, 'reg_lambda': 0.5724629864564882}. Best is trial 27 with value: 0.8083333333333333.[0m


In [26]:
# Fetch the best trial, then show its score (4 decimals) and its sampled parameters
trial = study.best_trial
print(f'acc(best)={trial.value:.4f}')
display(trial.params)

acc(best)=0.8083


{'num_leaves': 48,
 'min_data_in_leaf': 126,
 'min_sum_hessian_in_leaf': 6.972104745468846e-05,
 'feature_fraction': 0.591372148877871,
 'bagging_fraction': 0.8492100478235491,
 'reg_alpha': 1.196667720289308,
 'reg_lambda': 0.10114064587525043}

In [None]:
# Final parameter set: the tuned values from the best trial plus the fixed
# base settings (base settings win on any key clash).
# Build a NEW dict instead of calling .update() on the object returned by
# trial.params, so the trial's own parameter dict is not modified and this
# cell gives the same result every time it is re-run.
params_best = {**trial.params, **params_base}
display(params_best)