# JBFG Data Analysis Competition

In [40]:

# #!pip install watermark
# %load_ext watermark
# %watermark -a 'DataLine' -nmv --packages numpy,pandas,sklearn,imblearn,tensorflow,plotly,matplotlib,seaborn,missingno,lightgbm


#### 컬럼 데이터 및 Null 건수 확인

## Machine Learning
***

### Import Library

In [41]:
import pandas as pd
import numpy as np
from itertools import combinations
import time
import datetime
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

### Function Definition

#### encode_onehot()

In [42]:
# 원-핫 인코딩 처리 
# ----------------
def encode_onehot(df):
    '''
        데이터프레임의 object type 컬럼을 원-핫 인코딩하는 함수
        
        Args:
            df (df) : DataFrame
        Return:
            DataFrame
    '''
    catcols = df.select_dtypes(exclude = ['int64','float64']).columns
    df = pd.get_dummies(df, columns = catcols)
    
    return df

#### proc_split_smote()

In [43]:


def proc_split_smote(X_df, y_df):
    #Model Training
    from sklearn.model_selection import train_test_split
    from imblearn.over_sampling import SMOTE

    X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.25, stratify=y_df, random_state=0)

    sm = SMOTE(sampling_strategy='auto', random_state=42)
    X_train, y_train=sm.fit_resample(X_train,y_train)
    
    return X_train, y_train, X_test, y_test

In [44]:
#### proc_feature_split()

In [45]:
def proc_feature_split(X_new, y):
    #Model Training
    from sklearn.model_selection import train_test_split
    from imblearn.over_sampling import SMOTE

    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.25, stratify=y, random_state=0)

    sm = SMOTE(sampling_strategy='auto', random_state=42)
    X_train, y_train=sm.fit_resample(X_train,y_train)
    
    return X_train, y_train, X_test, y_test

#### proc_standardization() - 표준화 함수

In [46]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

def proc_standardization(X_train, X_test, X_eval):
    scaler  = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test  = scaler.transform(X_test)
    X_eval  = scaler.transform(X_eval)
    
    return X_train, X_test, X_eval

#### select_feature()

In [47]:
# 중요 Feature 식별
# ----------------
def select_feature(df, y_labels, chosen_model):

    np.random.seed(42)    
    
    available_models = {
    'ExtraTrees': ExtraTreesClassifier(n_estimators=700),
    'RandomForest': RandomForestClassifier(n_estimators=700),
    'LGBMC': LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS'),
    'LGBMR': LGBMRegressor(),
    'Xg Boost':XGBClassifier(booster='gbtree', importance_type='gain', eval_metric='auc'),
    }

    # Create the selected model
    clf = available_models[chosen_model]

    clf = clf.fit(df, y_labels)                                     # Train

    if chosen_model == 'LGBMC' or chosen_model == 'LGBMR': 
        feature_importances = clf.booster_.feature_importance(importance_type="gain")
    else:        
        feature_importances = clf.feature_importances_


    chosen_model = SelectFromModel(clf, prefit=True)
    X_df = chosen_model.transform(df.values) 
    selected_feature_indices = chosen_model.get_support(indices=True)

    selected_columns = df.columns[selected_feature_indices]         # Get the indices of the selected features
    
    return X_df, selected_columns

#### fit_predict_eval()

In [48]:

# 예측 및 평가
# -----------
def fit_predict_eval(models, model_comparison, X_train, y_train, X_test, y_test):
   
    # 초기화
    # ------
    best_roc_auc = 0
    
    # Define Models
    # ------------- 
    # No: 1 origin
    # models = [
    #     ('LogisticRegression', LogisticRegression()),
    #     ('DecisionTree', DecisionTreeClassifier(criterion='entropy', random_state=0)),
    #     ('ExtraTrees', ExtraTreesClassifier(n_estimators=700)),
    #     ('KNN', KNeighborsClassifier(n_neighbors=5)),
    #     ('NaiveBayes', GaussianNB()),
    #     ('RandomForest', RandomForestClassifier(n_estimators=700, criterion='entropy', random_state=0)),
    #     ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
    #     ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, eval_metric='auc')),
    # ]

    
    # Model Fit and Testing
    # ---------------------
    for model_name, classifier in models:
        start_time = time.time()


        # 학습
        # ----            
        classifier.fit(X_train, y_train)
        
        
        # 학습된 모델 저장하기 - 필요한 경우 향후 사용할 예정
        # -----------------------------------------------
        # file_name = f'./models/{model_name}.pkl'
        # joblib.dump(classifier, file_name)


        # 예측
        # ---- 
        y_pred = classifier.predict(X_test)


        # 평가
        # ---- 
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_class_0 = accuracy_score(y_test[y_test == 0], y_pred[y_test == 0])
        accuracy_class_1 = accuracy_score(y_test[y_test == 1], y_pred[y_test == 1], )

        precision = precision_score(y_test , y_pred)
        recall = recall_score(y_test , y_pred)
        f1 = f1_score(y_test, y_pred)
        
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        auces = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=skf, scoring="roc_auc")
        cv_auc = auces.mean()
        cv_std = auces.std()
        
        pred_proba = classifier.predict_proba(X_test)[:, 1]        
        roc_auc = roc_auc_score(y_test, pred_proba)
        
        
        # 예측 평가 결과 저장
        # -----------------
        model_comparison[f'{model_name}'] = [accuracy, accuracy_class_0, accuracy_class_1, precision, recall, f1, cv_auc, cv_std, roc_auc]
        
        
        # Best ROC_AUC Value Return
        # -------------------------
        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            
        
        # Print Log
        # ---------    
        cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        end_time = time.time()
        delta_time = end_time - start_time
        print(f'Model Name: [{model_name:<18}], {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]},  BEST AUC: {best_roc_auc:0.6f}, AUC: {roc_auc:0.6f}')
        
        return best_roc_auc


#### print_eval_result()

In [49]:
def print_eval_result(model_comparison):

    # # MODEL COMPARISSON
    # Model_com_df=pd.DataFrame(model_comparison).T
    # Model_com_df.columns=['Accuracy','Accuracy-0','Accuracy-1', 'Precision', 'Recall', 'F1-Score','CV AUC','CV std', 'AUC']
    # Model_com_df=Model_com_df.sort_values(by='AUC',ascending=False)
    # # display(Model_com_df.style.format("{:.2%}").background_gradient(cmap='magma'))

    Model_com_df = pd.DataFrame(model_comparison).T
    Model_com_df.columns = ['Accuracy', 'Accuracy-No', 'Accuracy-Yes', 'Precision', 'Recall', 'F1-Score', 'CV AUC', 'CV std', 'AUC']
    Model_com_df = Model_com_df.sort_values(by='AUC', ascending=False)

    def highlight_below_75(s):
        if s.name != 'CV std' and isinstance(s, pd.Series) and s.dtype == 'float64':
            return ['color: red' if value < 0.75 else 'color: black' for value in s]
        else:
            return ['color: black'] * len(s)

    # styled_df = Model_com_df.iloc[:10,:].style.highlight_max(axis=0).apply(highlight_below_75, subset=pd.IndexSlice[:, :'CV AUC']).format("{:.2%}", subset=pd.IndexSlice[:, :'CV AUC'])
    styled_df = Model_com_df.style.highlight_max(axis=0).apply(highlight_below_75, subset=pd.IndexSlice[:, :'CV AUC']).format("{:.2%}", subset=pd.IndexSlice[:, :'CV AUC'])
    display(styled_df)

#### data_transform()

In [50]:

def data_transform(df):

    # 데이터 변환
    # ------------------- 
    df = df.drop('cstno', axis=1)
    df = df.drop('sex', axis=1)
    df['imcome_cat']=df['imcome_cat'].replace({'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000})

   
    # 결측치 처리
    # ----------
    df = df.groupby(['marital_stat']).apply(lambda x: x.fillna(x.mean(numeric_only=True)))
    df = df.reset_index(drop=True)
    df.dropna(axis=0, inplace=True)


    # One-Hot Encoding
    # ----------------
    df = encode_onehot(df)     
    
    return df

#### test_transform()

In [51]:
def test_transform(source_df, eval_df):

    # 데이터 변환
    # -----------
    source_df = data_transform(source_df)
    eval_df = data_transform(eval_df)

        
    # 테스트 및 평가를 위한 데이터 분리
    # -------------------------------
    X_Features = source_df.drop(['is_churned'], axis=1)
    y_labels = source_df['is_churned']

    X_eval=eval_df.drop(['is_churned'], axis=1)
    y_eval=eval_df['is_churned'].values


    # 중요 Feature Column 선택
    # -----------------------
    X_Features_new, selected_columns = select_feature(X_Features, y_labels, 'RandomForest')
    X_eval = X_eval[selected_columns]


    # Train and Test 데이터 생성 및 가공
    # ---------------------------------
    X_train, y_train, X_test, y_test = proc_split_smote(X_Features_new, y_labels.values)
    

    # standardization ~ StandardScaler 적용
    # -----------------------------------
    X_train, X_test, X_eval = proc_standardization(X_train, X_test, X_eval.values) 
    
    return X_train, y_train, X_test, y_test, X_eval, y_eval 


### 학습 및 Test 평가

#### 데이터 로딩 및 초기화

In [52]:
# 경진대회를 위해 주최측에서 제공한 데이터(EDA 및 ML 학습을 위한 데이터)
bank_churner_df = pd.read_csv("./data/bank_churner.csv")


 # 평가를 위한 데이터 로드 - 평가자님 평가데이터 경로를 입력해 주세요!!! - 평가 결과는 평가결과 출력을 확인
eval_churner_df = pd.read_csv("./data/eval_churner.csv")

#### 초기화 및 전처리

In [53]:
# 평가결과 저장소 초기화
# --------------------
model_test_comparison = {}                        
model_eval_comparison = {}                        


# 사용할 모델 정의
models = [
    # ('LogisticRegression', LogisticRegression(random_state=42)),
    # ('DecisionTree', DecisionTreeClassifier(criterion='entropy', random_state=42)),
    # ('ExtraTrees', ExtraTreesClassifier(n_estimators=700, random_state=42)),
    # ('RandomForest', RandomForestClassifier(n_estimators=700, criterion='entropy', random_state=42)),
    # ('KNN', KNeighborsClassifier(n_neighbors=5)),
    # ('NaiveBayes', GaussianNB()),
    ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
    ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, eval_metric='auc')),
    # ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, eval_metric='auc')),
    # ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
    
]

# 전처리 
# -----
X_train, y_train, X_test, y_test, X_eval, y_eval = test_transform(bank_churner_df, eval_churner_df)

#### 예측 및 결과

In [54]:
# Pridict 및 Test 평가
# --------------------
test_auc = fit_predict_eval(models, model_test_comparison, X_train, y_train, X_test, y_test)

# print_eval_result(model_test_comparison)

Model Name: [LightGBM          ], 2023-09-21 00:09:34, 0:00:04,  BEST AUC: 0.973968, AUC: 0.973968


In [55]:
print_eval_result(model_test_comparison)

Unnamed: 0,Accuracy,Accuracy-No,Accuracy-Yes,Precision,Recall,F1-Score,CV AUC,CV std,AUC
LightGBM,94.47%,97.47%,78.77%,85.62%,78.77%,82.05%,99.65%,0.000764,0.973968


#### 평가자 확인 - Hidden Data 적용 후 결과 확인

In [56]:
# 예측 및 평가 수행
eval_auc = fit_predict_eval(models, model_eval_comparison, X_train, y_train, X_eval, y_eval)

# print_eval_result(model_eval_comparison)

Model Name: [LightGBM          ], 2023-09-21 00:09:39, 0:00:04,  BEST AUC: 0.991247, AUC: 0.991247


In [57]:
print_eval_result(model_eval_comparison)

Unnamed: 0,Accuracy,Accuracy-No,Accuracy-Yes,Precision,Recall,F1-Score,CV AUC,CV std,AUC
LightGBM,96.69%,98.00%,89.91%,89.63%,89.91%,89.77%,99.65%,0.000764,0.991247


### (부록) 전처리 방법 및 각종 튜닝값 적용을 위한 상세 테스트

##### Machine Learning시 중요 컬럼 선택 관련 Classifier 선택
- ExtraTrees, RandomForest 사용시 다른 Classifier 보다 좋은 결과를 보여서 RandomForest 선택

In [58]:
def select_feature_model():
    models = ['ExtraTrees', 'RandomForest', 'LGBMC', 'LGBMR', 'Xg Boost']
    for model in models: 
        start_time = time.time()

        bank_churner_df = pd.read_csv("./data/bank_churner.csv")        
        eval_churner_df = pd.read_csv("./data/test_churner.csv")


        # 데이터 변환
        # -----------
        bank_churner_df = data_transform(bank_churner_df)
        eval_churner_df = data_transform(eval_churner_df)


        # 테스트 및 평가를 위한 데이터 분리
        # -------------------------------
        X_Features = bank_churner_df.drop(['is_churned'],axis=1)
        y_labels = bank_churner_df['is_churned']

        X_eval=eval_churner_df.drop(['is_churned'],axis=1)
        y_eval=eval_churner_df['is_churned']


        # 중요 Feature Column 선택
        # -----------------------
        X_Features_new, selected_columns = select_feature(X_Features, y_labels, model)
        X_eval = X_eval[selected_columns]
        

        # 로그 출력
        # --------
        cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        end_time = time.time()
        delta_time = end_time - start_time
        
        # print(f'{cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, model: {model}, X_Features_new: {X_Features_new.shape}, X_eval: {X_eval.shape}, selected_columns_len: {len(selected_columns)}, selected_columns: {selected_columns}')
        print(f'{cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, model: {model}, X_Features_new: {X_Features_new.shape}, X_eval: {X_eval.shape}, selected_columns_len: {len(selected_columns)}')

select_feature_model()    
        

2023-09-21 00:09:45, 0:00:05, model: ExtraTrees, X_Features_new: (8101, 13), X_eval: (840, 13), selected_columns_len: 13
2023-09-21 00:09:56, 0:00:10, model: RandomForest, X_Features_new: (8101, 13), X_eval: (840, 13), selected_columns_len: 13
2023-09-21 00:09:56, 0:00:00, model: LGBMC, X_Features_new: (8101, 11), X_eval: (840, 11), selected_columns_len: 11
2023-09-21 00:09:57, 0:00:00, model: LGBMR, X_Features_new: (8101, 12), X_eval: (840, 12), selected_columns_len: 12
2023-09-21 00:09:57, 0:00:00, model: Xg Boost, X_Features_new: (8101, 9), X_eval: (840, 9), selected_columns_len: 9


#### Light GBM 파라미터 튜닝
- Light GBM을 이용한 경우 좋은 결과가 있어 파라미터 튜닝을 세부적으로 수행 함.
    - 튜닝 파라미터 결과 : learning_rate: 0.14061, max_depth: 106, min_child_samples: 64, num_leaves: 41, subsample: 0.94623
- 튜닝을 진행했지만 이전보다 좋은 결과가 나오지 않음
    - 튜닝 적용후 ROC AUC: 0.9892

##### Best 튜닝 파라미터 적용하여 결과 확인

In [59]:
bank_churner_df = pd.read_csv("./data/bank_churner.csv")
eval_churner_df = pd.read_csv("./data/eval_churner.csv")

# 전처리 
# -----
X_train, y_train, X_test, y_test, X_eval, y_eval = test_transform(bank_churner_df, eval_churner_df)


# 튜닝 파라미터 적용 값
# -------------------
best = {'learning_rate': 0.1406105325029019, 'max_depth': 106.0, 'min_child_samples': 64.0, 'num_leaves': 41.0, 'subsample': 0.9462293554201169}


lgbm_clf =  LGBMClassifier(n_estimators=700, num_leaves=int(best['num_leaves']),
                           max_depth=int(best['max_depth']),
                           min_child_samples=int(best['min_child_samples']), 
                           subsample=round(best['subsample'], 5),
                           learning_rate=round(best['learning_rate'], 5)
                          )


# evaluation metric을 auc로, early stopping은 100 으로 설정하고 학습 수행.
# --------------------------------------------------------------------- 
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, 
            eval_metric="auc",eval_set=[(X_train, y_train), (X_test, y_test)])

lgbm_roc_score = roc_auc_score(y_eval, lgbm_clf.predict_proba(X_eval)[:,1])
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))



[1]	training's auc: 0.947246	training's binary_logloss: 0.606997	valid_1's auc: 0.874619	valid_1's binary_logloss: 0.617885
[2]	training's auc: 0.954415	training's binary_logloss: 0.540712	valid_1's auc: 0.886997	valid_1's binary_logloss: 0.560624
[3]	training's auc: 0.966035	training's binary_logloss: 0.484324	valid_1's auc: 0.896198	valid_1's binary_logloss: 0.51372
[4]	training's auc: 0.972323	training's binary_logloss: 0.43683	valid_1's auc: 0.90156	valid_1's binary_logloss: 0.474998
[5]	training's auc: 0.976108	training's binary_logloss: 0.398234	valid_1's auc: 0.907363	valid_1's binary_logloss: 0.442518
[6]	training's auc: 0.980323	training's binary_logloss: 0.361964	valid_1's auc: 0.914306	valid_1's binary_logloss: 0.411141
[7]	training's auc: 0.981715	training's binary_logloss: 0.33415	valid_1's auc: 0.915475	valid_1's binary_logloss: 0.388344
[8]	training's auc: 0.983032	training's binary_logloss: 0.309046	valid_1's auc: 0.916774	valid_1's binary_logloss: 0.367435
[9]	training

##### Best 파라미터 적용 절차

###### 데이터 로드 및 전처리

In [60]:
bank_churner_df = pd.read_csv("./data/bank_churner.csv") # 학습을 위한 데이터 로드
eval_churner_df = pd.read_csv("./data/eval_churner.csv") # 평가를 위한 데이터 로드 - 평가데이터 경로를 입력해 주세요!!!

# 전처리 
# -----
X_tr, y_tr, X_val, y_val, X_eval, y_eval = test_transform(bank_churner_df, eval_churner_df)

###### Best 파라미터 적용전 AUC 확인 - ROC AUC: 0.9886

In [61]:
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(n_estimators=700)

eval_set=[(X_tr, y_tr), (X_val, y_val)]
lgbm_clf.fit(X_tr, y_tr, early_stopping_rounds=100, eval_metric="auc", eval_set=eval_set)

lgbm_roc_score = roc_auc_score(y_eval, lgbm_clf.predict_proba(X_eval)[:,1])
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))



[1]	training's auc: 0.930081	training's binary_logloss: 0.63125	valid_1's auc: 0.848979	valid_1's binary_logloss: 0.639177
[2]	training's auc: 0.938565	training's binary_logloss: 0.580045	valid_1's auc: 0.85874	valid_1's binary_logloss: 0.595055
[3]	training's auc: 0.944351	training's binary_logloss: 0.537039	valid_1's auc: 0.862536	valid_1's binary_logloss: 0.558517
[4]	training's auc: 0.959692	training's binary_logloss: 0.497042	valid_1's auc: 0.882825	valid_1's binary_logloss: 0.525363
[5]	training's auc: 0.964264	training's binary_logloss: 0.46349	valid_1's auc: 0.892105	valid_1's binary_logloss: 0.497306
[6]	training's auc: 0.969688	training's binary_logloss: 0.432615	valid_1's auc: 0.900963	valid_1's binary_logloss: 0.469992
[7]	training's auc: 0.972911	training's binary_logloss: 0.406289	valid_1's auc: 0.905563	valid_1's binary_logloss: 0.447879
[8]	training's auc: 0.975774	training's binary_logloss: 0.383196	valid_1's auc: 0.909381	valid_1's binary_logloss: 0.428017
[9]	trainin

###### 파라미터 구간 정의

In [62]:
from hyperopt import hp

lgbm_search_space = {'num_leaves': hp.quniform('num_leaves', 32, 64, 1),
                     'max_depth': hp.quniform('max_depth', 100, 160, 1),
                     'min_child_samples': hp.quniform('min_child_samples', 60, 100, 1),
                     'subsample': hp.uniform('subsample', 0.7, 1),
                     'learning_rate': hp.uniform('learning_rate', 0.01, 0.2)
                    }

###### Best 파라미터 찾기위한 학습 진행 

In [63]:
from sklearn.model_selection import KFold
X_Feature = X_tr
y_labels = y_tr
def objective_func(search_space):
    lgbm_clf =  LGBMClassifier(n_estimators=100, num_leaves=int(search_space['num_leaves']),
                               max_depth=int(search_space['max_depth']),
                               min_child_samples=int(search_space['min_child_samples']), 
                               subsample=search_space['subsample'],
                               learning_rate=search_space['learning_rate'])

    roc_auc_list = []
    
    kf = KFold(n_splits=3)
    # X_train을 다시 학습과 검증용 데이터로 분리
    for tr_index, val_index in kf.split(X_Feature):
        # kf.split(X_train)으로 추출된 학습과 검증 index값으로 학습과 검증 데이터 세트 분리 
        X_tr, y_tr = X_Feature[tr_index], y_labels[tr_index]
        X_val, y_val = X_Feature[val_index], y_labels[val_index]


        # early stopping은 30회로 설정하고 추출된 학습과 검증 데이터로 XGBClassifier 학습 수행. 
        # lgbm_clf.fit(X_tr, y_tr, early_stopping_rounds=30, eval_metric="auc",
        lgbm_clf.fit(X_tr, y_tr, eval_metric="auc", eval_set=[(X_tr, y_tr), (X_val, y_val)])

        # 1로 예측한 확률값 추출후 roc auc 계산하고 평균 roc auc 계산을 위해 list에 결과값 담음.
        score = roc_auc_score(y_eval, lgbm_clf.predict_proba(X_eval)[:, 1]) 
        roc_auc_list.append(score)
    
    # 3개 k-fold로 계산된 roc_auc값의 평균값을 반환하되, 
    # HyperOpt는 목적함수의 최소값을 위한 입력값을 찾으므로 -1을 곱한 뒤 반환.
    return -1*np.mean(roc_auc_list)

In [68]:
'''
from hyperopt import fmin, tpe, Trials

trials = Trials()

# fmin()함수를 호출. max_evals지정된 횟수만큼 반복 후 목적함수의 최소값을 가지는 최적 입력값 추출. 
best = fmin(fn=objective_func, space=lgbm_search_space, algo=tpe.suggest,
            max_evals=50, # 최대 반복 횟수를 지정합니다.
            trials=trials, rstate=np.random.default_rng(seed=30))

print('best:', best)

'''

"\nfrom hyperopt import fmin, tpe, Trials\n\ntrials = Trials()\n\n# fmin()함수를 호출. max_evals지정된 횟수만큼 반복 후 목적함수의 최소값을 가지는 최적 입력값 추출. \nbest = fmin(fn=objective_func, space=lgbm_search_space, algo=tpe.suggest,\n            max_evals=50, # 최대 반복 횟수를 지정합니다.\n            trials=trials, rstate=np.random.default_rng(seed=30))\n\nprint('best:', best)\n\n"

#### 전처리 테스트1

In [65]:
models = [
    # ('LogisticRegression', LogisticRegression(random_state=42)),
    # ('DecisionTree', DecisionTreeClassifier(criterion='entropy', random_state=42)),
    # ('ExtraTrees', ExtraTreesClassifier(n_estimators=700, random_state=42)),
    # ('RandomForest', RandomForestClassifier(n_estimators=700, criterion='entropy', random_state=42)),
    # ('KNN', KNeighborsClassifier(n_neighbors=5)),
    # ('NaiveBayes', GaussianNB()),
    ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
    # ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, eval_metric='auc')),
    # ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, eval_metric='auc')),
    # ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
    
]
def test_transform_pre1(df, drop_column, groupby_column):
    df = df.drop('cstno', axis=1)

    for col_name in drop_column:
        df = df.drop(col_name, axis=1)

    if 'imcome_cat' not in drop_column:
        df['imcome_cat']=df['imcome_cat'].replace({'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000})

    df = df.groupby(groupby_column).apply(lambda x: x.fillna(x.mean(numeric_only=True)))
    df = df.reset_index(drop=True)
    df.dropna(axis=0, inplace=True)
        
    df = encode_onehot(df)

    return df


def proc_null_groupby_test():
    
    from itertools import combinations
    model_preproc1_comparison = {}                        

          
    # 전처리 테스트 예측
    # -----------------

    # 데이터 로드 및 고객번호 삭제
    bank_churner_df = pd.read_csv("./data/bank_churner.csv") # 학습을 위한 데이터 로드
    eval_churner_df = pd.read_csv("./data/eval_churner.csv")

    fit_df = bank_churner_df.drop('cstno', axis=1)

    fit_df_columns = fit_df.columns
    best_auc = 0

    # 결측치 및 다중공선성 처리
    # -----------------------
    result_list = []
    drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
    for j in range(1, len(drop_target_columns)+1):
        for i in combinations(drop_target_columns, j):
            result_list.append(list(i))

    # result_list = [['sex'], ['sex', 'age']]

    for drop_no, drop_column in enumerate(result_list):
        for group_no, groupby_column in enumerate(fit_df_columns):
            start_time = time.time()
            if groupby_column == 'is_churned' or groupby_column in drop_column:
                continue

            fit_df = bank_churner_df
            eval_df = eval_churner_df
            tot_cnt = fit_df.shape
            
            # -----------------------------------------------------------------------------------    
            # 평가 for Competition
            # -----------------------------------------------------------------------------------

            # 전처리 단계
            # -----------
            fit_df = test_transform_pre1(fit_df, drop_column, groupby_column)
            eval_df = test_transform_pre1(eval_df, drop_column, groupby_column)
            after_drop_cnt = fit_df.shape
            
            # 평가를 위한 데이터 분리
            # ---------------------
            X_train=fit_df.drop(['is_churned'],axis=1)
            y_train=fit_df['is_churned']
            
            X_eval=eval_df.drop(['is_churned'],axis=1)
            y_eval=eval_df['is_churned']


            # 중요 Feature Column 선택
            # -----------------------
            X_new, selected_columns = select_feature(X_train, y_train, 'RandomForest')
            X_eval = X_eval[selected_columns]


            # Train and Test 데이터 생성 및 가공
            # ---------------------------------
            X_train, y_train, X_test, y_test = proc_split_smote(X_new, y_train)
            after_smote_cnt = X_train.shape


            # Standardization 적용
            # --------------------
            X_train, X_test, X_eval = proc_standardization(X_train, X_test, X_eval.values)   


            # 최종 평가
            # --------
            proc_type='E'
            pre_proc_auc = fit_predict_eval(models, model_preproc1_comparison, X_train, y_train, X_eval, y_eval.values)
            
            if pre_proc_auc > best_auc:
                best_type = f'{proc_type}_{drop_no}_{group_no}'
                best_auc = pre_proc_auc
                


            # 최종 평가 로그 출력
            # ------------------
            cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            end_time = time.time()
            delta_time = end_time - start_time
            # print(f'[평  가] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, AUC: {test_auc:0.6f}, 처리 건수: {len(eval_df)}, 최종 평가 건수: {len(X_eval)}')
            print(f'[전처리 테스트1] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [G{proc_type}_{drop_no}_{group_no}], best-type: [{best_type}], Best-AUC: {best_auc:0.6f}, AUC: {pre_proc_auc:0.6f}, tot_cnt: {tot_cnt}, after_drop_cnt: {after_drop_cnt}, after_smote_cnt: {after_smote_cnt}, groupby_column: {groupby_column}, drop_column: {drop_column}')

            # print_eval_result(model_preproc1_comparison)


# 테스트시 아래의 주석 풀고 실행
# ----------------------------
proc_null_groupby_test()

Model Name: [LightGBM          ], 2023-09-21 00:57:40, 0:00:05,  BEST AUC: 0.986599, AUC: 0.986599
[전처리 테스트1] 2023-09-21 00:57:40, 0:00:18, [GE_0_1], best-type: [E_0_1], Best-AUC: 0.986599, AUC: 0.986599, tot_cnt: (8101, 21), after_drop_cnt: (8096, 31), after_smote_cnt: (10194, 13), groupby_column: age, drop_column: ['sex']
Model Name: [LightGBM          ], 2023-09-21 00:57:54, 0:00:03,  BEST AUC: 0.988057, AUC: 0.988057
[전처리 테스트1] 2023-09-21 00:57:54, 0:00:13, [GE_0_3], best-type: [E_0_3], Best-AUC: 0.988057, AUC: 0.988057, tot_cnt: (8101, 21), after_drop_cnt: (8101, 31), after_smote_cnt: (10200, 12), groupby_column: dependent_num, drop_column: ['sex']
Model Name: [LightGBM          ], 2023-09-21 00:58:08, 0:00:03,  BEST AUC: 0.985348, AUC: 0.985348
[전처리 테스트1] 2023-09-21 00:58:08, 0:00:13, [GE_0_4], best-type: [E_0_3], Best-AUC: 0.988057, AUC: 0.985348, tot_cnt: (8101, 21), after_drop_cnt: (8101, 31), after_smote_cnt: (10200, 12), groupby_column: education, drop_column: ['sex']
Model 

#### 전처리 테스트2

In [69]:
models = [
    # ('LogisticRegression', LogisticRegression(random_state=42)),
    # ('DecisionTree', DecisionTreeClassifier(criterion='entropy', random_state=42)),
    # ('ExtraTrees', ExtraTreesClassifier(n_estimators=700, random_state=42)),
    # ('RandomForest', RandomForestClassifier(n_estimators=700, criterion='entropy', random_state=42)),
    # ('KNN', KNeighborsClassifier(n_neighbors=5)),
    # ('NaiveBayes', GaussianNB()),
    ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
    # ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, eval_metric='auc')),
    # ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, eval_metric='auc')),
    # ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
]


def proc_null_drop_test():
    from itertools import combinations

    model_preproc2_comparison = {}

    def test_transform_pre2(df, drop_list):
        
        # 데이터 변환
        # ---------- 
        df = df.drop('cstno', axis=1)


        # 결측치 처리
        # -----------
        for col_name in drop_list:
            df = df.drop(col_name, axis=1)

        if 'imcome_cat' not in drop_list:
            df['imcome_cat']=df['imcome_cat'].replace({'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000})

        df = df.fillna(df.mean(numeric_only=True))
        df = df.reset_index(drop=True)
        df.dropna(axis=0, inplace=True)
            

        # One-Hot Encoding
        # ----------------
        df = encode_onehot(df)  
    
        return df

            
    # -----------
    # 예측
    # -----------

    # 데이터 로드 및 고객번호 삭제
    bank_churner_df = pd.read_csv("./data/bank_churner.csv") # 학습을 위한 데이터 로드
    eval_churner_df = pd.read_csv("./data/eval_churner.csv")
   
    best_auc = 0

    # Null 처리
    result_list = []
    drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
    for j in range(1, len(drop_target_columns)+1):
        for i in combinations(drop_target_columns, j):
            result_list.append(list(i))

    # result_list = [['sex'], ['sex', 'age', 'imcome_cat']]

    for drop_no, drop_column in enumerate(result_list):
        start_time = time.time()

        fit_df = bank_churner_df
        eval_df = eval_churner_df
        tot_cnt = fit_df.shape
        
        # -----------------------------------------------------------------------------------    
        # 평가 for Competition
        # -----------------------------------------------------------------------------------

        # 전처리 단계
        # -----------
        fit_df = test_transform_pre2(fit_df, drop_column)
        eval_df = test_transform_pre2(eval_df, drop_column)
        after_drop_cnt =fit_df.shape
        
        
        # 평가를 위한 데이터 분리
        # ---------------------
        X_train=fit_df.drop(['is_churned'],axis=1)
        y_train=fit_df['is_churned']
        X_train_cnt = X_train.shape
        
        X_eval=eval_df.drop(['is_churned'],axis=1)
        y_eval=eval_df['is_churned']


        # 중요 Feature Column 선택
        # -----------------------
        X_new, selected_columns = select_feature(X_train, y_train, 'RandomForest')
        X_eval = X_eval[selected_columns]


        # Train and Test 데이터 생성 및 가공
        # ---------------------------------
        X_train, y_train, X_val, y_val = proc_split_smote(X_new, y_train)
        after_smote_cnt = X_train.shape

        # Evaluation 데이터 생성 및 가공
        # ---------------------------------
        X_train, X_val, X_eval = proc_standardization(X_train, X_val, X_eval.values)   
        X_train_cnt = X_train.shape

        # 최종 평가
        # --------
        proc_type='E'
        group_no=1
        # eval_auc = fit_predict(proc_type, drop_no, model_eval_comparison, X_train_for_evaluation, y_train_for_evaluation, X_eval, y_eval)
        pre_proc_auc = fit_predict_eval(models, model_preproc2_comparison, X_train, y_train, X_eval, y_eval.values)
        
        
        if pre_proc_auc > best_auc:
            best_type = f'{proc_type}_{drop_no}_{group_no}'
            best_auc = pre_proc_auc
            


        # 최종 평가 로그 출력
        # ------------------
        cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        end_time = time.time()
        delta_time = end_time - start_time
        # print(f'[평  가] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, AUC: {test_auc:0.6f}, 처리 건수: {len(eval_df)}, 최종 평가 건수: {len(X_eval)}')
        print(f'[전처리 테스트2] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [{proc_type}_{drop_no}_{group_no}], best-type: [{best_type}], Best-AUC: {best_auc:0.6f}, AUC: {pre_proc_auc:0.6f}, tot_cnt: {tot_cnt}, after_drop_cnt: {after_drop_cnt}, after_smote_cnt: {after_smote_cnt}, X_train_cnt: {X_train_cnt}, drop_column: {drop_column}')


        # print_eval_result(model_preproc2_comparison)


# 테스트시 아래의 주석 풀고 실행
# ----------------------------
proc_null_drop_test()        

Model Name: [LightGBM          ], 2023-09-21 08:51:59, 0:00:03,  BEST AUC: 0.989888, AUC: 0.989888
[전처리 테스트2] 2023-09-21 08:51:59, 0:00:13, [E_0_1], best-type: [E_0_1], Best-AUC: 0.989888, AUC: 0.989888, tot_cnt: (8101, 21), after_drop_cnt: (8101, 31), after_smote_cnt: (10200, 13), X_train_cnt: (10200, 13), drop_column: ['sex']
Model Name: [LightGBM          ], 2023-09-21 08:52:11, 0:00:03,  BEST AUC: 0.989874, AUC: 0.989874
[전처리 테스트2] 2023-09-21 08:52:11, 0:00:12, [E_1_1], best-type: [E_0_1], Best-AUC: 0.989888, AUC: 0.989874, tot_cnt: (8101, 21), after_drop_cnt: (7293, 32), after_smote_cnt: (9188, 13), X_train_cnt: (9188, 13), drop_column: ['imcome_cat']
Model Name: [LightGBM          ], 2023-09-21 08:52:25, 0:00:05,  BEST AUC: 0.983849, AUC: 0.983849
[전처리 테스트2] 2023-09-21 08:52:25, 0:00:13, [E_2_1], best-type: [E_0_1], Best-AUC: 0.989888, AUC: 0.983849, tot_cnt: (8101, 21), after_drop_cnt: (7293, 32), after_smote_cnt: (9188, 12), X_train_cnt: (9188, 12), drop_column: ['tot_amt_ratio