# JBFG Data Analysis Competition

In [89]:

# #!pip install watermark
# %load_ext watermark
# %watermark -a 'DataLine' -nmv --packages numpy,pandas,sklearn,imblearn,tensorflow,plotly,matplotlib,seaborn,missingno,lightgbm


#### 컬럼 데이터 및 Null 건수 확인

## Machine Learning
***

### Import Library

In [90]:
import pandas as pd
import numpy as np
from itertools import combinations
import time
import datetime
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

from imblearn.over_sampling import SMOTE

### Function Definition

#### encode_onehot()

In [91]:
# 원-핫 인코딩 처리 
# ----------------
def encode_onehot(df):
    '''
        데이터프레임의 object type 컬럼을 원-핫 인코딩하는 함수
        
        Args:
            df (df) : DataFrame
        Return:
            DataFrame
    '''
    catcols = df.select_dtypes(exclude = ['int64','float64']).columns
    df = pd.get_dummies(df, columns = catcols)
    
    return df

#### proc_split_smote()

In [92]:
def proc_split_smote(X_new, y):
    #Model Training
    from sklearn.model_selection import train_test_split
    from imblearn.over_sampling import SMOTE

    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.25, stratify=y, random_state=0)

    sm = SMOTE(sampling_strategy='auto', random_state=42)
    X_train, y_train=sm.fit_resample(X_train,y_train)
    
    return X_train, y_train, X_test, y_test

#### proc_feature_split()

In [93]:
def proc_feature_split(X_data, y_data):
    #Model Training
    from sklearn.model_selection import train_test_split
    from imblearn.over_sampling import SMOTE

    # 전체 데이터 중 80%는 학습용 데이터, 20%는 테스트용 데이터 추출
    print(f'[proc_feature_split()] X_data: {X_data.shape}, y_data: {y_data.shape}')
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, stratify=y_data, random_state=0)


    # 위에서 만든 X_train, y_train을 다시 쪼개서 90%는 학습과 10%는 검증용 데이터로 분리  
    X_tr, X_val, y_tr, y_val= train_test_split(X_train, y_train, test_size=0.1, stratify=y_train, random_state=0)

    print(f'[proc_feature_split()] X_tr: {X_tr.shape}, y_tr: {y_tr.shape}, X_val: {X_val.shape}, y_val: {y_val.shape}, X_test: {X_test.shape}, y_test: {y_test.shape}')
    return X_tr, y_tr, X_val, y_val, X_test, y_test



#### proc_smote()

In [94]:
def proc_smote(X_data, y_data):

    sm = SMOTE(sampling_strategy='auto', random_state=42)
    X_data, y_data = sm.fit_resample(X_data, y_data)
    
    return X_data, y_data

#### proc_normalization()

In [95]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

def proc_normalization(X_train, X_val, X_test, X_eval):
    scaler  = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val  = scaler.transform(X_val)
    X_test  = scaler.transform(X_test)
    X_eval  = scaler.transform(X_eval)
    
    return X_train, X_val, X_test, X_eval

#### select_feature()

In [96]:
# 중요 Feature 식별
# ----------------
def select_feature(df, y_labels, chosen_model):

    np.random.seed(42)    
    
    available_models = {
    'ExtraTrees': ExtraTreesClassifier(n_estimators=700),
    'RandomForest': RandomForestClassifier(n_estimators=700),
    'LGBMC': LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS'),
    'LGBMR': LGBMRegressor(),
    'Xg Boost':XGBClassifier(booster='gbtree', importance_type='gain', eval_metric='auc'),
    }

    # Create the selected model
    clf = available_models[chosen_model]

    clf = clf.fit(df, y_labels)                                     # Train

    if chosen_model == 'LGBMC' or chosen_model == 'LGBMR': 
        feature_importances = clf.booster_.feature_importance(importance_type="gain")
    else:        
        feature_importances = clf.feature_importances_


    chosen_model = SelectFromModel(clf, prefit=True)
    X_df = chosen_model.transform(df.values) 
    selected_feature_indices = chosen_model.get_support(indices=True)

    selected_columns = df.columns[selected_feature_indices]         # Get the indices of the selected features
    
    return X_df, selected_columns

#### fit_predict_eval()

In [97]:

# 예측 및 평가
# -----------
def fit_predict_eval(models, model_comparison, X_train, y_train, X_val, y_val, X_test, y_test):
   
    # 초기화
    # ------
    best_roc_auc = 0
    
    # Define Models
    # ------------- 
    # No: 1 origin
    # models = [
    #     ('LogisticRegression', LogisticRegression()),
    #     ('DecisionTree', DecisionTreeClassifier(criterion='entropy', random_state=0)),
    #     ('ExtraTrees', ExtraTreesClassifier(n_estimators=700)),
    #     ('KNN', KNeighborsClassifier(n_neighbors=5)),
    #     ('NaiveBayes', GaussianNB()),
    #     ('RandomForest', RandomForestClassifier(n_estimators=700, criterion='entropy', random_state=0)),
    #     ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
    #     ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, eval_metric='auc')),
    # ]

    
    # Model Fit and Testing
    # ---------------------
    for model_name, classifier in models:
        start_time = time.time()


        # 학습
        # ----            
        classifier.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)])
        
        
        # 학습된 모델 저장하기 - 필요한 경우 향후 사용할 예정
        # -----------------------------------------------
        # file_name = f'./models/{model_name}.pkl'
        # joblib.dump(classifier, file_name)


        # 예측
        # ---- 
        y_pred = classifier.predict(X_test)


        # 평가
        # ---- 
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_class_0 = accuracy_score(y_test[y_test == 0], y_pred[y_test == 0])
        accuracy_class_1 = accuracy_score(y_test[y_test == 1], y_pred[y_test == 1], )

        precision = precision_score(y_test , y_pred)
        recall = recall_score(y_test , y_pred)
        f1 = f1_score(y_test, y_pred)
        
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        auces = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=skf, scoring="roc_auc")
        cv_auc = auces.mean()
        cv_std = auces.std()
        
        pred_proba = classifier.predict_proba(X_test)[:, 1]        
        roc_auc = roc_auc_score(y_test, pred_proba)
        
        
        # 예측 평가 결과 저장
        # -----------------
        model_comparison[f'{model_name}'] = [accuracy, accuracy_class_0, accuracy_class_1, precision, recall, f1, cv_auc, cv_std, roc_auc]
        
        
        # Best ROC_AUC Value Return
        # -------------------------
        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            
        
        # Print Log
        # ---------    
        cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        end_time = time.time()
        delta_time = end_time - start_time
        print(f'Model Name: [{model_name:<18}], {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]},  BEST AUC: {best_roc_auc:0.6f}, AUC: {roc_auc:0.6f}')


#### print_eval_result()

In [98]:
def print_eval_result(model_comparison):

    # # MODEL COMPARISSON
    # Model_com_df=pd.DataFrame(model_comparison).T
    # Model_com_df.columns=['Accuracy','Accuracy-0','Accuracy-1', 'Precision', 'Recall', 'F1-Score','CV AUC','CV std', 'AUC']
    # Model_com_df=Model_com_df.sort_values(by='AUC',ascending=False)
    # # display(Model_com_df.style.format("{:.2%}").background_gradient(cmap='magma'))

    Model_com_df = pd.DataFrame(model_comparison).T
    Model_com_df.columns = ['Accuracy', 'Accuracy-No', 'Accuracy-Yes', 'Precision', 'Recall', 'F1-Score', 'CV AUC', 'CV std', 'AUC']
    Model_com_df = Model_com_df.sort_values(by='AUC', ascending=False)

    def highlight_below_75(s):
        if s.name != 'CV std' and isinstance(s, pd.Series) and s.dtype == 'float64':
            return ['color: red' if value < 0.75 else 'color: black' for value in s]
        else:
            return ['color: black'] * len(s)

    # styled_df = Model_com_df.iloc[:10,:].style.highlight_max(axis=0).apply(highlight_below_75, subset=pd.IndexSlice[:, :'CV AUC']).format("{:.2%}", subset=pd.IndexSlice[:, :'CV AUC'])
    styled_df = Model_com_df.style.highlight_max(axis=0).apply(highlight_below_75, subset=pd.IndexSlice[:, :'CV AUC']).format("{:.2%}", subset=pd.IndexSlice[:, :'CV AUC'])
    display(styled_df)

#### data_transform()

In [99]:

def data_transform(df):

    # 데이터 변환
    # ------------------- 
    df = df.drop('cstno', axis=1)
    df = df.drop('sex', axis=1)
    df['imcome_cat']=df['imcome_cat'].replace({'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000})

   
    # 결측치 처리
    # ----------
    # df = df.fillna(df.mean(numeric_only=True))
    df = df.groupby(['marital_stat']).apply(lambda x: x.fillna(x.mean(numeric_only=True)))
    df = df.reset_index(drop=True)
    df.dropna(axis=0, inplace=True)


    # One-Hot Encoding
    # ----------------
    df = encode_onehot(df)     
    
    return df

#### test_transform()

In [100]:
def test_transform(source_df, eval_df):

    # 데이터 변환
    # -----------
    source_df = data_transform(source_df)
    eval_df = data_transform(eval_df)

        
    # 테스트 및 평가를 위한 데이터 분리
    # -------------------------------
    X_Features = source_df.drop(['is_churned'], axis=1)
    y_labels = source_df['is_churned']

    X_eval=eval_df.drop(['is_churned'], axis=1)
    y_eval=eval_df['is_churned']


    # 중요 Feature Column 선택
    # -----------------------
    X_Features_new, selected_columns = select_feature(X_Features, y_labels, 'RandomForest')
    X_eval = X_eval[selected_columns]


    # Train, Validation, Test 데이터 Split
    # ------------------------------------
    # X_train, y_train, X_test, y_test = proc_split_smote(X_Features_new, y_labels.values)
    X_train, y_train, X_val, y_val, X_test, y_test = proc_feature_split(X_Features_new, y_labels.values)


    # SMOTE 적용
    # ---------------------------------
    X_train, y_train = proc_smote(X_train, y_train)


    # Normalization ~ StandardScaler 적용
    # -----------------------------------
    X_train, X_val, X_test, X_eval = proc_normalization(X_train, X_val, X_test, X_eval.values) 
    
    return X_train, y_train, X_val, y_val, X_test, y_test, X_eval, y_eval     


### 학습 및 Test 평가

#### 데이터 로딩 및 초기화

In [101]:
# 경진대회를 위해 주최측에서 제공한 데이터(EDA 및 ML 학습을 위한 데이터)
bank_churner_df = pd.read_csv("./data/bank_churner.csv")


 # 평가를 위한 데이터 로드 - 평가자님 평가데이터 경로를 입력해 주세요!!! - 평가 결과는 평가결과 출력을 확인
eval_churner_df = pd.read_csv("./data/eval_churner.csv")

#### 초기화 및 전처리

In [102]:
# 평가결과 저장소 초기화
# --------------------
model_test_comparison = {}                        
model_eval_comparison = {}                        


# 사용할 모델 정의
models = [
    # ('LogisticRegression', LogisticRegression(random_state=42)),
    # ('DecisionTree', DecisionTreeClassifier(criterion='entropy', random_state=42)),
    # ('ExtraTrees', ExtraTreesClassifier(n_estimators=700, random_state=42)),
    # ('RandomForest', RandomForestClassifier(n_estimators=700, criterion='entropy', random_state=42)),
    # ('KNN', KNeighborsClassifier(n_neighbors=5)),
    # ('NaiveBayes', GaussianNB()),
    ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, eval_metric='auc')),
    ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
    # ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, eval_metric='auc')),
    # ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
    
]

# 전처리 
# -----
X_train, y_train, X_val, y_val, X_test, y_test, X_eval, y_eval = test_transform(bank_churner_df, eval_churner_df)

[proc_feature_split()] X_data: (8101, 13), y_data: (8101,)
[proc_feature_split()] X_tr: (5832, 13), y_tr: (5832,), X_val: (648, 13), y_val: (648,), X_test: (1621, 13), y_test: (1621,)


#### 예측 및 결과

In [103]:
# Pridict 및 Test 평가
# --------------------
fit_predict_eval(models, model_test_comparison, X_train, y_train, X_val, y_val, X_test, y_test)

# print_eval_result(model_test_comparison)

[0]	validation_0-auc:0.94274	validation_1-auc:0.90440
[1]	validation_0-auc:0.96669	validation_1-auc:0.91648
[2]	validation_0-auc:0.97349	validation_1-auc:0.91967
[3]	validation_0-auc:0.98001	validation_1-auc:0.93116
[4]	validation_0-auc:0.98526	validation_1-auc:0.93675
[5]	validation_0-auc:0.98938	validation_1-auc:0.94309
[6]	validation_0-auc:0.99051	validation_1-auc:0.94785
[7]	validation_0-auc:0.99197	validation_1-auc:0.94842
[8]	validation_0-auc:0.99262	validation_1-auc:0.95158
[9]	validation_0-auc:0.99350	validation_1-auc:0.95417
[10]	validation_0-auc:0.99460	validation_1-auc:0.95583
[11]	validation_0-auc:0.99525	validation_1-auc:0.95770
[12]	validation_0-auc:0.99539	validation_1-auc:0.95993
[13]	validation_0-auc:0.99568	validation_1-auc:0.96046
[14]	validation_0-auc:0.99612	validation_1-auc:0.96400
[15]	validation_0-auc:0.99643	validation_1-auc:0.96603
[16]	validation_0-auc:0.99673	validation_1-auc:0.96651
[17]	validation_0-auc:0.99703	validation_1-auc:0.96769
[18]	validation_0-au

In [104]:
print_eval_result(model_test_comparison)

Unnamed: 0,Accuracy,Accuracy-No,Accuracy-Yes,Precision,Recall,F1-Score,CV AUC,CV std,AUC
LightGBM,94.02%,97.35%,76.54%,84.68%,76.54%,80.40%,99.66%,0.000522,0.973001
XgBoost,93.95%,97.35%,76.15%,84.62%,76.15%,80.16%,99.57%,0.000776,0.966306


In [105]:
print_eval_result(model_test_comparison)

Unnamed: 0,Accuracy,Accuracy-No,Accuracy-Yes,Precision,Recall,F1-Score,CV AUC,CV std,AUC
LightGBM,94.02%,97.35%,76.54%,84.68%,76.54%,80.40%,99.66%,0.000522,0.973001
XgBoost,93.95%,97.35%,76.15%,84.62%,76.15%,80.16%,99.57%,0.000776,0.966306


#### 평가자 확인 - Hidden Data 적용 후 결과 확인

In [106]:
# 예측 및 평가 수행
# fit_predict_eval(models, model_eval_comparison, X_train, y_train, X_eval, y_eval)
fit_predict_eval(models, model_eval_comparison, X_train, y_train, X_val, y_val, X_eval, y_eval)

#print_eval_result(model_eval_comparison)

[0]	validation_0-auc:0.94274	validation_1-auc:0.90440
[1]	validation_0-auc:0.96669	validation_1-auc:0.91648
[2]	validation_0-auc:0.97349	validation_1-auc:0.91967
[3]	validation_0-auc:0.98001	validation_1-auc:0.93116
[4]	validation_0-auc:0.98526	validation_1-auc:0.93675
[5]	validation_0-auc:0.98938	validation_1-auc:0.94309
[6]	validation_0-auc:0.99051	validation_1-auc:0.94785
[7]	validation_0-auc:0.99197	validation_1-auc:0.94842
[8]	validation_0-auc:0.99262	validation_1-auc:0.95158
[9]	validation_0-auc:0.99350	validation_1-auc:0.95417
[10]	validation_0-auc:0.99460	validation_1-auc:0.95583
[11]	validation_0-auc:0.99525	validation_1-auc:0.95770
[12]	validation_0-auc:0.99539	validation_1-auc:0.95993
[13]	validation_0-auc:0.99568	validation_1-auc:0.96046
[14]	validation_0-auc:0.99612	validation_1-auc:0.96400
[15]	validation_0-auc:0.99643	validation_1-auc:0.96603
[16]	validation_0-auc:0.99673	validation_1-auc:0.96651
[17]	validation_0-auc:0.99703	validation_1-auc:0.96769
[18]	validation_0-au

In [None]:
print_eval_result(model_eval_comparison)

Unnamed: 0,Accuracy,Accuracy-No,Accuracy-Yes,Precision,Recall,F1-Score,CV AUC,CV std,AUC
LightGBM,96.45%,98.06%,88.07%,89.72%,88.07%,88.89%,99.66%,0.000522,0.989892
XgBoost,95.80%,96.82%,90.52%,84.57%,90.52%,87.44%,99.57%,0.000776,0.987188


### 전처리 방법 및 각종 튜닝값 적용을 위한 상세 테스트

##### Machine Learning시 중요 컬럼 선택 관련 Classifier 선택
- ExtraTrees, RandomForest 사용시 다른 Classifier 보다 좋은 결과를 보여서 RandomForest 선택

In [108]:
def select_feature_model():
    models = ['ExtraTrees', 'RandomForest', 'LGBMC', 'LGBMR', 'Xg Boost']
    for model in models: 
        start_time = time.time()

        bank_churner_df = pd.read_csv("./data/bank_churner.csv")        
        eval_churner_df = pd.read_csv("./data/test_churner.csv")


        # 데이터 변환
        # -----------
        bank_churner_df = data_transform(bank_churner_df)
        eval_churner_df = data_transform(eval_churner_df)


        # 테스트 및 평가를 위한 데이터 분리
        # -------------------------------
        X_Features = bank_churner_df.drop(['is_churned'],axis=1)
        y_labels = bank_churner_df['is_churned']

        X_eval=eval_churner_df.drop(['is_churned'],axis=1)
        y_eval=eval_churner_df['is_churned']


        # 중요 Feature Column 선택
        # -----------------------
        X_Features_new, selected_columns = select_feature(X_Features, y_labels, model)
        X_eval = X_eval[selected_columns]
        

        # 로그 출력
        # --------
        cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        end_time = time.time()
        delta_time = end_time - start_time
        
        # print(f'{cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, model: {model}, X_Features_new: {X_Features_new.shape}, X_eval: {X_eval.shape}, selected_columns_len: {len(selected_columns)}, selected_columns: {selected_columns}')
        print(f'{cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, model: {model}, X_Features_new: {X_Features_new.shape}, X_eval: {X_eval.shape}, selected_columns_len: {len(selected_columns)}')

select_feature_model()    
        

2023-09-20 16:08:27, 0:00:05, model: ExtraTrees, X_Features_new: (8101, 13), X_eval: (840, 13), selected_columns_len: 13
2023-09-20 16:08:38, 0:00:10, model: RandomForest, X_Features_new: (8101, 13), X_eval: (840, 13), selected_columns_len: 13
2023-09-20 16:08:39, 0:00:00, model: LGBMC, X_Features_new: (8101, 11), X_eval: (840, 11), selected_columns_len: 11
2023-09-20 16:08:39, 0:00:00, model: LGBMR, X_Features_new: (8101, 12), X_eval: (840, 12), selected_columns_len: 12
2023-09-20 16:08:39, 0:00:00, model: Xg Boost, X_Features_new: (8101, 9), X_eval: (840, 9), selected_columns_len: 9


#### Light GBM 파라미터 튜닝
- Light GBM을 이용한 경우 좋은 결과가 있어 파라미터 튜닝을 세부적으로 수행 함.
    - 튜닝 파라미터 결과 : learning_rate: 0.14061, max_depth: 106, min_child_samples: 64, num_leaves: 41, 'subsample': 0.94623
- 튜닝을 진행했지만 이전보다 좋은 결과가 나오지 않음
    - 튜닝 적용후 ROC AUC: 0.9892

##### Best 튜닝 파라미터 적용하여 결과 확인

In [109]:
# 튜닝 파라미터 적용 값
# -------------------
best = {'learning_rate': 0.1406105325029019, 'max_depth': 106.0, 'min_child_samples': 64.0, 'num_leaves': 41.0, 'subsample': 0.9462293554201169}


lgbm_clf =  LGBMClassifier(n_estimators=700, num_leaves=int(best['num_leaves']),
                           max_depth=int(best['max_depth']),
                           min_child_samples=int(best['min_child_samples']), 
                           subsample=round(best['subsample'], 5),
                           learning_rate=round(best['learning_rate'], 5)
                          )


# evaluation metric을 auc로, early stopping은 100 으로 설정하고 학습 수행.
# --------------------------------------------------------------------- 
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, 
            eval_metric="auc",eval_set=[(X_train, y_train), (X_test, y_test)])

lgbm_roc_score = roc_auc_score(y_eval, lgbm_clf.predict_proba(X_eval)[:,1])
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

[1]	training's auc: 0.94588	training's binary_logloss: 0.607012	valid_1's auc: 0.857291	valid_1's binary_logloss: 0.619282
[2]	training's auc: 0.963884	training's binary_logloss: 0.539007	valid_1's auc: 0.87605	valid_1's binary_logloss: 0.561484
[3]	training's auc: 0.966936	training's binary_logloss: 0.48501	valid_1's auc: 0.888617	valid_1's binary_logloss: 0.515248
[4]	training's auc: 0.972102	training's binary_logloss: 0.438911	valid_1's auc: 0.892627	valid_1's binary_logloss: 0.475665
[5]	training's auc: 0.975532	training's binary_logloss: 0.3996	valid_1's auc: 0.893993	valid_1's binary_logloss: 0.445122
[6]	training's auc: 0.978245	training's binary_logloss: 0.36582	valid_1's auc: 0.896436	valid_1's binary_logloss: 0.416254
[7]	training's auc: 0.980964	training's binary_logloss: 0.337229	valid_1's auc: 0.906429	valid_1's binary_logloss: 0.392575
[8]	training's auc: 0.98273	training's binary_logloss: 0.313159	valid_1's auc: 0.910621	valid_1's binary_logloss: 0.373133
[9]	training's 



[58]	training's auc: 0.999846	training's binary_logloss: 0.0366085	valid_1's auc: 0.968976	valid_1's binary_logloss: 0.159879
[59]	training's auc: 0.999863	training's binary_logloss: 0.0355083	valid_1's auc: 0.968863	valid_1's binary_logloss: 0.159945
[60]	training's auc: 0.999879	training's binary_logloss: 0.0345254	valid_1's auc: 0.969056	valid_1's binary_logloss: 0.159145
[61]	training's auc: 0.999894	training's binary_logloss: 0.0336383	valid_1's auc: 0.969287	valid_1's binary_logloss: 0.158673
[62]	training's auc: 0.999903	training's binary_logloss: 0.0328285	valid_1's auc: 0.969378	valid_1's binary_logloss: 0.158461
[63]	training's auc: 0.999917	training's binary_logloss: 0.0319397	valid_1's auc: 0.969412	valid_1's binary_logloss: 0.158527
[64]	training's auc: 0.999923	training's binary_logloss: 0.0311673	valid_1's auc: 0.969316	valid_1's binary_logloss: 0.158621
[65]	training's auc: 0.999936	training's binary_logloss: 0.0303873	valid_1's auc: 0.969451	valid_1's binary_logloss: 0

In [112]:
fit_df = pd.read_csv("./data/bank_churner.csv") # 학습을 위한 데이터 로드
eval_df = pd.read_csv("./data/eval_churner.csv") # 평가를 위한 데이터 로드 - 평가데이터 경로를 입력해 주세요!!!

# 전처리 
# -----
X_train, y_train, X_val, y_val, X_test, y_test, X_eval, y_eval = test_transform(bank_churner_df, eval_churner_df)
# X_tr, y_tr, X_val, y_val, X_eval, y_eval = test_transform(bank_churner_df, eval_churner_df)

# print(X_tr, y_tr, X_val, y_val, X_eval, y_eval)

[proc_feature_split()] X_data: (8101, 13), y_data: (8101,)
[proc_feature_split()] X_tr: (5832, 13), y_tr: (5832,), X_val: (648, 13), y_val: (648,), X_test: (1621, 13), y_test: (1621,)


In [119]:
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(n_estimators=700)

# eval_set=[(X_tr, y_tr), (X_val, y_val)]
# lgbm_clf.fit(X_tr, y_tr, early_stopping_rounds=100, eval_metric="auc", eval_set=eval_set)

eval_set=[(X_train, y_train), (X_val, y_val)]
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc", eval_set=eval_set)

lgbm_roc_score = roc_auc_score(y_eval, lgbm_clf.predict_proba(X_eval)[:,1])
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))



[1]	training's auc: 0.933513	training's binary_logloss: 0.631645	valid_1's auc: 0.879887	valid_1's binary_logloss: 0.636219
[2]	training's auc: 0.959314	training's binary_logloss: 0.579096	valid_1's auc: 0.900673	valid_1's binary_logloss: 0.590489
[3]	training's auc: 0.96131	training's binary_logloss: 0.534514	valid_1's auc: 0.908486	valid_1's binary_logloss: 0.548616
[4]	training's auc: 0.966823	training's binary_logloss: 0.495037	valid_1's auc: 0.912472	valid_1's binary_logloss: 0.514227
[5]	training's auc: 0.971551	training's binary_logloss: 0.461619	valid_1's auc: 0.914275	valid_1's binary_logloss: 0.486265
[6]	training's auc: 0.973584	training's binary_logloss: 0.432438	valid_1's auc: 0.919851	valid_1's binary_logloss: 0.46028
[7]	training's auc: 0.977224	training's binary_logloss: 0.405636	valid_1's auc: 0.920258	valid_1's binary_logloss: 0.438045
[8]	training's auc: 0.978906	training's binary_logloss: 0.381892	valid_1's auc: 0.922326	valid_1's binary_logloss: 0.416585
[9]	traini

In [120]:
from hyperopt import hp

lgbm_search_space = {'num_leaves': hp.quniform('num_leaves', 32, 64, 1),
                     'max_depth': hp.quniform('max_depth', 100, 160, 1),
                     'min_child_samples': hp.quniform('min_child_samples', 60, 100, 1),
                     'subsample': hp.uniform('subsample', 0.7, 1),
                     'learning_rate': hp.uniform('learning_rate', 0.01, 0.2)
                    }

In [123]:
from sklearn.model_selection import KFold
# X_Features = X_tr
# y_labels = y_tr
# print(X_Features, y_labels)

def objective_func(search_space):
    lgbm_clf =  LGBMClassifier(n_estimators=100, num_leaves=int(search_space['num_leaves']),
                               max_depth=int(search_space['max_depth']),
                               min_child_samples=int(search_space['min_child_samples']), 
                               subsample=search_space['subsample'],
                               learning_rate=search_space['learning_rate'])

    roc_auc_list = []
    
    kf = KFold(n_splits=3)
    # X_train을 다시 학습과 검증용 데이터로 분리
    # for tr_index, val_index in kf.split(X_Features):
    #     # kf.split(X_train)으로 추출된 학습과 검증 index값으로 학습과 검증 데이터 세트 분리 
    #     X_tr, y_tr = X_Features[tr_index], y_labels[tr_index]
    #     X_val, y_val = X_Features[val_index], y_labels[val_index]

    for tr_index, val_index in kf.split(X_train):
        # kf.split(X_train)으로 추출된 학습과 검증 index값으로 학습과 검증 데이터 세트 분리 
        X_tr, y_tr = X_train[tr_index], y_train[tr_index]
        X_val, y_val = X_train[val_index], y_train[val_index]


        # SMOTE 적용
        # ----------
        # sm = SMOTE(sampling_strategy='auto', random_state=42)
        # X_tr, y_tr=sm.fit_resample(X_tr,y_tr)

        # early stopping은 30회로 설정하고 추출된 학습과 검증 데이터로 XGBClassifier 학습 수행. 
        # lgbm_clf.fit(X_tr, y_tr, early_stopping_rounds=30, eval_metric="auc",
        lgbm_clf.fit(X_tr, y_tr, eval_metric="auc", eval_set=[(X_tr, y_tr), (X_val, y_val)])

        # 1로 예측한 확률값 추출후 roc auc 계산하고 평균 roc auc 계산을 위해 list에 결과값 담음.
        score = roc_auc_score(y_eval, lgbm_clf.predict_proba(X_eval)[:, 1]) 
        roc_auc_list.append(score)
    
    # 3개 k-fold로 계산된 roc_auc값의 평균값을 반환하되, 
    # HyperOpt는 목적함수의 최소값을 위한 입력값을 찾으므로 -1을 곱한 뒤 반환.
    return -1*np.mean(roc_auc_list)

In [124]:
from hyperopt import fmin, tpe, Trials

trials = Trials()

# fmin()함수를 호출. max_evals지정된 횟수만큼 반복 후 목적함수의 최소값을 가지는 최적 입력값 추출. 
best = fmin(fn=objective_func, space=lgbm_search_space, algo=tpe.suggest,
            max_evals=50, # 최대 반복 횟수를 지정합니다.
            trials=trials, rstate=np.random.default_rng(seed=30))

print('best:', best)

[1]	training's auc: 0.943504	training's binary_logloss: 0.618982	valid_1's auc: 0.842872	valid_1's binary_logloss: 0.976606
[2]	training's auc: 0.954962	training's binary_logloss: 0.605175	valid_1's auc: 0.861091	valid_1's binary_logloss: 0.953254
[3]	training's auc: 0.956844	training's binary_logloss: 0.592302	valid_1's auc: 0.865379	valid_1's binary_logloss: 0.931703
[4]	training's auc: 0.956996	training's binary_logloss: 0.58005	valid_1's auc: 0.866222	valid_1's binary_logloss: 0.91113
[5]	training's auc: 0.958359	training's binary_logloss: 0.568078	valid_1's auc: 0.869261	valid_1's binary_logloss: 0.891713
[6]	training's auc: 0.962108	training's binary_logloss: 0.555967	valid_1's auc: 0.872135	valid_1's binary_logloss: 0.872018
[7]	training's auc: 0.962644	training's binary_logloss: 0.545036	valid_1's auc: 0.873407	valid_1's binary_logloss: 0.854698
[8]	training's auc: 0.964182	training's binary_logloss: 0.53391	valid_1's auc: 0.874588	valid_1's binary_logloss: 0.83674
[9]	training

#### 전처리 테스트1

In [None]:
def test_transform_pre1(df, drop_column, groupby_column):
    df = df.drop('cstno', axis=1)

    for col_name in drop_column:
        df = df.drop(col_name, axis=1)

    if 'imcome_cat' not in drop_column:
        df['imcome_cat']=df['imcome_cat'].replace({'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000})

    df = df.groupby(groupby_column).apply(lambda x: x.fillna(x.mean(numeric_only=True)))
    df = df.reset_index(drop=True)
    df.dropna(axis=0, inplace=True)
        
    df = encode_onehot(df)

    return df


def proc_null_groupby_test():
    from itertools import combinations

    model_comparison = {}  #Dictionary to store the comparison metrics of models
    model_eval_comparison = {}                        


    # 전처리 테스트 함수
    # -----------------
    # def drop_null_column_pre(df, drop_list):
    #     for col_name in drop_list:
    #         df = df.drop(col_name, axis=1)

    #     return df


            
    # 전처리 테스트 예측
    # -----------------

    # 데이터 로드 및 고객번호 삭제
    fit_df = pd.read_csv("./data/bank_churner.csv") # 학습을 위한 데이터 로드
    eval_df = pd.read_csv("./data/test_churner.csv")
    tot_cnt = fit_df.shape

    fit_df_org = fit_df.copy()
    eval_df_org = eval_df.copy()
    fit_df = fit_df.drop('cstno', axis=1)

    fit_df_columns = fit_df.columns
    best_auc = 0

    # 결측치 및 다중공선성 처리
    # -----------------------
    result_list = []
    drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
    for j in range(1, len(drop_target_columns)+1):
        for i in combinations(drop_target_columns, j):
            result_list.append(list(i))

    # result_list = [['sex'], ['sex', 'age']]

    for drop_no, drop_column in enumerate(result_list):
        for group_no, groupby_column in enumerate(fit_df_columns):
            start_time = time.time()
            if groupby_column == 'is_churned' or groupby_column in drop_column:
                continue

            fit_df = fit_df_org
            eval_df = eval_df_org
            tot_cnt = fit_df.shape
            
            # print(f'drop_column: {drop_column}, groupby_column: {groupby_column}')
            
        # -----------------------------------------------------------------------------------    
            # 평가 for Competition
            # -----------------------------------------------------------------------------------

            # 전처리 단계
            # -----------
            fit_df = test_transform_pre1(fit_df, drop_column, groupby_column)
            eval_df = test_transform_pre1(eval_df, drop_column, groupby_column)
            after_drop_cnt = len(fit_df)
            
            
            # 평가를 위한 데이터 분리
            # ---------------------
            X_train=fit_df.drop(['is_churned'],axis=1)
            y_train=fit_df['is_churned']
            
            X_eval=eval_df.drop(['is_churned'],axis=1)
            y_eval=eval_df['is_churned']


            # 중요 Feature Column 선택
            # -----------------------
            X_new, selected_columns = select_feature(X_train, y_train, 'ExtraTrees')
            X_eval = X_eval[selected_columns]


            # Train and Test 데이터 생성 및 가공
            # ---------------------------------
            X_train, y_train, X_test_temp, y_test_temp = proc_smote(X_new, y_train)
            after_smote_cnt = X_train.shape

            # Evaluation 데이터 생성 및 가공
            # ---------------------------------
            X_train, X_eval = proc_normalization(X_train, X_eval.values)   


            # 최종 평가
            # --------
            proc_type='E'
            # eval_auc = fit_predict(proc_type, drop_no, model_eval_comparison, X_train_for_evaluation, y_train_for_evaluation, X_eval, y_eval)
            eval_auc = fit_predict_eval(proc_type, drop_no, group_no, model_eval_comparison, X_train, y_train, X_eval, y_eval)
            
            if eval_auc > best_auc:
                best_type = f'{proc_type}_{drop_no}_{group_no}'
                best_auc = eval_auc
                


            # 최종 평가 로그 출력
            # ------------------
            cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            end_time = time.time()
            delta_time = end_time - start_time
            # print(f'[평  가] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, AUC: {test_auc:0.6f}, 처리 건수: {len(eval_df)}, 최종 평가 건수: {len(X_eval)}')
            print(f'[테스트] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [G{proc_type}_{drop_no}_{group_no}], best-type: [{best_type}], Best-AUC: {best_auc:0.6f}, AUC: {eval_auc:0.6f}, tot_cnt: {tot_cnt}, after_drop_cnt: {after_drop_cnt}, after_smote_cnt: {after_smote_cnt}, groupby_column: {groupby_column}, drop_column: {drop_column}')

            # print_eval_result(model_eval_comparison)


# 테스트시 아래의 주석 풀고 실행
# ----------------------------
# proc_null_groupby_test()

#### 전처리 테스트2

In [None]:
def proc_null_drop_test():
    from itertools import combinations

    model_comparison = {}  #Dictionary to store the comparison metrics of models
    model_eval_comparison = {}                        

    def test_transform_pre2(df, drop_list):
        
        # 데이터 변환
        # ---------- 
        df = df.drop('cstno', axis=1)


        # 결측치 처리
        # -----------
        for col_name in drop_list:
            df = df.drop(col_name, axis=1)

        if 'imcome_cat' not in drop_list:
            df['imcome_cat']=df['imcome_cat'].replace({'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000})

        df = df.fillna(df.mean(numeric_only=True))
        df = df.reset_index(drop=True)
        df.dropna(axis=0, inplace=True)
            

        # One-Hot Encoding
        # ----------------
        df = encode_onehot(df)  
    
        return df

            
    # -----------
    # 예측
    # -----------

    # 데이터 로드 및 고객번호 삭제
    fit_df = pd.read_csv("./data/bank_churner.csv") # 학습을 위한 데이터 로드
    eval_df = pd.read_csv("./data/test_churner.csv")
    tot_cnt = fit_df.shape

    fit_df_org = fit_df.copy()
    eval_df_org = eval_df.copy()

    best_auc = 0

    # Null 처리
    result_list = []
    drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
    for j in range(1, len(drop_target_columns)+1):
        for i in combinations(drop_target_columns, j):
            result_list.append(list(i))

    # result_list = [['sex'], ['sex', 'age', 'imcome_cat']]

    for drop_no, drop_column in enumerate(result_list):
        start_time = time.time()

        fit_df = fit_df_org
        eval_df = eval_df_org
        
        # -----------------------------------------------------------------------------------    
        # 평가 for Competition
        # -----------------------------------------------------------------------------------

        # 전처리 단계
        # -----------
        fit_df = test_transform_pre2(fit_df, drop_column)
        eval_df = test_transform_pre2(eval_df, drop_column)
        after_drop_cnt = len(fit_df)
        
        
        # 평가를 위한 데이터 분리
        # ---------------------
        X_train=fit_df.drop(['is_churned'],axis=1)
        y_train=fit_df['is_churned']
        X_train_cnt = X_train.shape
        
        X_eval=eval_df.drop(['is_churned'],axis=1)
        y_eval=eval_df['is_churned']


        # 중요 Feature Column 선택
        # -----------------------
        X_new, selected_columns = select_feature(X_train, y_train, 'ExtraTrees')
        X_eval = X_eval[selected_columns]


        # Train and Test 데이터 생성 및 가공
        # ---------------------------------
        X_train, y_train, X_test_temp, y_test_temp = proc_smote(X_new, y_train)
        after_smote_cnt = X_train.shape

        # Evaluation 데이터 생성 및 가공
        # ---------------------------------
        X_train, X_eval = proc_normalization(X_train, X_eval.values)   


        # 최종 평가
        # --------
        proc_type='E'
        group_no=1
        # eval_auc = fit_predict(proc_type, drop_no, model_eval_comparison, X_train_for_evaluation, y_train_for_evaluation, X_eval, y_eval)
        eval_auc = fit_predict_eval(proc_type, drop_no, group_no, model_eval_comparison, X_train, y_train, X_eval, y_eval)
        
        if eval_auc > best_auc:
            best_type = f'{proc_type}_{drop_no}_{group_no}'
            best_auc = eval_auc
            


        # 최종 평가 로그 출력
        # ------------------
        cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        end_time = time.time()
        delta_time = end_time - start_time
        # print(f'[평  가] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, AUC: {test_auc:0.6f}, 처리 건수: {len(eval_df)}, 최종 평가 건수: {len(X_eval)}')
        print(f'[테스트] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [{proc_type}_{drop_no}_{group_no}], best-type: [{best_type}], Best-AUC: {best_auc:0.6f}, AUC: {eval_auc:0.6f}, tot_cnt: {tot_cnt}, after_drop_cnt: {after_drop_cnt}, after_smote_cnt: {after_smote_cnt}, X_train_cnt: {X_train_cnt}, drop_column: {drop_column}')


        # print_eval_result(model_eval_comparison)


# 테스트시 아래의 주석 풀고 실행
# ----------------------------
# proc_null_drop_test()        