- label 1 : 불만족
- label 0: 만족

- [Kaggle data set url](https://www.kaggle.com/competitions/santander-customer-satisfaction/data)

In [1]:
!pip uninstall lightgbm
!pip install lightgbm==3.3.2

Found existing installation: lightgbm 4.1.0
Uninstalling lightgbm-4.1.0:
  Would remove:
    /usr/local/lib/python3.10/dist-packages/lightgbm-4.1.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/lightgbm/*
Proceed (Y/n)? y
  Successfully uninstalled lightgbm-4.1.0
Collecting lightgbm==3.3.2
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.2


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import warnings

warnings.filterwarnings('ignore')
cust_df = pd.read_csv('/content/drive/MyDrive/Kaggle - 파이썬 머신러닝 완벽 가이드/kaggleData/santander_customer_satisfaction/train_santander.csv', encoding = 'latin-1')

In [3]:
# Outlier를 가장 많은 값인 2로 변환
cust_df['var3'].replace(-999999, 2, inplace = True)
cust_df.drop('ID', axis = 1, inplace = True)

# feature set과 label set 분리
X_features = cust_df.iloc[: , : -1]
y_labels = cust_df.iloc[: , -1]
print('feature data shape : {0}'.format(X_features.shape))

feature data shape : (76020, 369)


In [4]:
from sklearn.model_selection import train_test_split

y_labels = cust_df.iloc[: , -1]
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size = 0.2, random_state = 0, stratify = y_labels )

train_cnt = y_train.count()
test_cnt = y_test.count()
print('학습 세트 Shape:{0}, 테스트 세트 Shape:{1}'.format(X_train.shape , X_test.shape))

학습 세트 Shape:(60816, 369), 테스트 세트 Shape:(15204, 369)


In [5]:
# data 가 불균형한 분포를 이루고 있기
# target값 분포도가 학습 data와 test data set에 모두 비슷하게 추출되었는 지 확인
print(' 학습 세트 레이블 값 분포 비율')
print(y_train.value_counts()/train_cnt)
print('\n 테스트 세트 레이블 값 분포 비율')
print(y_test.value_counts()/test_cnt)

 학습 세트 레이블 값 분포 비율
0    0.960438
1    0.039562
Name: TARGET, dtype: float64

 테스트 세트 레이블 값 분포 비율
0    0.960405
1    0.039595
Name: TARGET, dtype: float64


In [6]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size = 0.3, random_state= 0, stratify = y_train)

In [7]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

lgbm_clf = LGBMClassifier(n_estimators=500)

eval_set=[(X_tr, y_tr), (X_val, y_val)]
lgbm_clf.fit(X_tr, y_tr, early_stopping_rounds=100, eval_metric="auc", eval_set=eval_set)

lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:,1])
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

[1]	training's auc: 0.824228	training's binary_logloss: 0.156831	valid_1's auc: 0.816744	valid_1's binary_logloss: 0.158547
[2]	training's auc: 0.835661	training's binary_logloss: 0.151145	valid_1's auc: 0.827251	valid_1's binary_logloss: 0.153561
[3]	training's auc: 0.843182	training's binary_logloss: 0.146979	valid_1's auc: 0.828392	valid_1's binary_logloss: 0.150106
[4]	training's auc: 0.8459	training's binary_logloss: 0.143663	valid_1's auc: 0.828847	valid_1's binary_logloss: 0.147372
[5]	training's auc: 0.854004	training's binary_logloss: 0.140824	valid_1's auc: 0.832929	valid_1's binary_logloss: 0.145213
[6]	training's auc: 0.857922	training's binary_logloss: 0.138439	valid_1's auc: 0.835462	valid_1's binary_logloss: 0.143508
[7]	training's auc: 0.859098	training's binary_logloss: 0.136496	valid_1's auc: 0.835801	valid_1's binary_logloss: 0.142097
[8]	training's auc: 0.861779	training's binary_logloss: 0.134831	valid_1's auc: 0.838382	valid_1's binary_logloss: 0.140869
[9]	traini

In [8]:
# 검색 공간 설정
from hyperopt import hp
lgbm_search_space = {'num_leaves': hp.quniform('num_leaves', 32, 64, 1),
                     'max_depth': hp.quniform('max_depth', 100, 160, 1),
                     'min_child_samples': hp.quniform('min_child_samples', 60, 100, 1),
                     'subsample': hp.uniform('subsample', 0.7, 1),
                     'learning_rate': hp.uniform('learning_rate', 0.01, 0.2)
                    }

In [9]:
# 목적 함수 정의
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

def objective_func(search_space):
    lgbm_clf =  LGBMClassifier(n_estimators=100, num_leaves=int(search_space['num_leaves']),
                               max_depth=int(search_space['max_depth']),
                               min_child_samples=int(search_space['min_child_samples']),
                               subsample=search_space['subsample'],
                               learning_rate=search_space['learning_rate'])
    # 3개 k-fold 방식으로 평가된 roc_auc 지표를 담는 list
    roc_auc_list = []

    # 3개 k-fold방식 적용
    kf = KFold(n_splits=3)
    # X_train을 다시 학습과 검증용 데이터로 분리
    for tr_index, val_index in kf.split(X_train):
        # kf.split(X_train)으로 추출된 학습과 검증 index값으로 학습과 검증 데이터 세트 분리
        X_tr, y_tr = X_train.iloc[tr_index], y_train.iloc[tr_index]
        X_val, y_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # early stopping은 30회로 설정하고 추출된 학습과 검증 데이터로 XGBClassifier 학습 수행.
        lgbm_clf.fit(X_tr, y_tr, early_stopping_rounds=30, eval_metric="auc",
           eval_set=[(X_tr, y_tr), (X_val, y_val)])

        # 1로 예측한 확률값 추출후 roc auc 계산하고 평균 roc auc 계산을 위해 list에 결과값 담음.
        score = roc_auc_score(y_val, lgbm_clf.predict_proba(X_val)[:, 1])
        roc_auc_list.append(score)

    # 3개 k-fold로 계산된 roc_auc값의 평균값을 반환하되,
    # HyperOpt는 목적함수의 최소값을 위한 입력값을 찾으므로 -1을 곱한 뒤 반환.
    return -1*np.mean(roc_auc_list)

In [11]:
from hyperopt import fmin, tpe, Trials

trials = Trials()

best = fmin(fn = objective_func, space = lgbm_search_space, algo = tpe.suggest, max_evals = 10,
            trials = trials, rstate = np.random.default_rng(seed = 30))
print('best : ', best)

[1]	training's auc: 0.829117	training's binary_logloss: 0.166068	valid_1's auc: 0.813958	valid_1's binary_logloss: 0.16088
[2]	training's auc: 0.831752	training's binary_logloss: 0.163964	valid_1's auc: 0.814755	valid_1's binary_logloss: 0.159158
[3]	training's auc: 0.834075	training's binary_logloss: 0.162086	valid_1's auc: 0.817047	valid_1's binary_logloss: 0.157587
[4]	training's auc: 0.836349	training's binary_logloss: 0.160373	valid_1's auc: 0.819498	valid_1's binary_logloss: 0.156133
[5]	training's auc: 0.837338	training's binary_logloss: 0.158759	valid_1's auc: 0.819873	valid_1's binary_logloss: 0.154853
[6]	training's auc: 0.839626	training's binary_logloss: 0.157287	valid_1's auc: 0.821021	valid_1's binary_logloss: 0.153685
[7]	training's auc: 0.840867	training's binary_logloss: 0.155944	valid_1's auc: 0.82119	valid_1's binary_logloss: 0.152589
[8]	training's auc: 0.842884	training's binary_logloss: 0.154696	valid_1's auc: 0.822871	valid_1's binary_logloss: 0.15158
[9]	trainin

best: {'learning_rate': 0.08592271133758617, 'max_depth': 121.0, 'min_child_samples': 69.0, 'num_leaves': 41.0, 'subsample': 0.9148958093027029}

In [13]:
lgbm_clf =  LGBMClassifier(n_estimators=500, num_leaves=int(41.0),
                           max_depth=int(121.0),
                           min_child_samples=int(69.0),
                           subsample=round(0.9148958093027029, 5),
                           learning_rate=round(0.08592271133758617, 5)
                          )
lgbm_clf.fit(X_tr, y_tr, early_stopping_rounds = 100, eval_metric = "auc", eval_set = [(X_tr, y_tr), (X_val, y_val)])

lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:,1])
print('ROC AUC : {0:.4f}'.format(lgbm_roc_score))

[1]	training's auc: 0.82986	training's binary_logloss: 0.15805	valid_1's auc: 0.815935	valid_1's binary_logloss: 0.159504
[2]	training's auc: 0.839718	training's binary_logloss: 0.152698	valid_1's auc: 0.822508	valid_1's binary_logloss: 0.154689
[3]	training's auc: 0.846436	training's binary_logloss: 0.148634	valid_1's auc: 0.829075	valid_1's binary_logloss: 0.151322
[4]	training's auc: 0.851872	training's binary_logloss: 0.145408	valid_1's auc: 0.832295	valid_1's binary_logloss: 0.148709
[5]	training's auc: 0.85441	training's binary_logloss: 0.142737	valid_1's auc: 0.835056	valid_1's binary_logloss: 0.146557
[6]	training's auc: 0.859635	training's binary_logloss: 0.140363	valid_1's auc: 0.836698	valid_1's binary_logloss: 0.144798
[7]	training's auc: 0.861981	training's binary_logloss: 0.138341	valid_1's auc: 0.838116	valid_1's binary_logloss: 0.143312
[8]	training's auc: 0.863557	training's binary_logloss: 0.136601	valid_1's auc: 0.838875	valid_1's binary_logloss: 0.142066
[9]	trainin