# JBFG Data Analysis Competition

In [144]:

# #!pip install watermark
# %load_ext watermark
# %watermark -a 'DataLine' -nmv --packages numpy,pandas,sklearn,imblearn,tensorflow,plotly,matplotlib,seaborn,missingno,lightgbm


#### 컬럼 데이터 및 Null 건수 확인

## Machine Learning
***

### Import Library

In [146]:
import pandas as pd
import numpy as np
from itertools import combinations
import time
import datetime
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

### Function Definition

#### encode_onehot()

In [149]:
# 원-핫 인코딩 처리 
# ----------------
def encode_onehot(df):
    '''
        데이터프레임의 object type 컬럼을 원-핫 인코딩하는 함수
        
        Args:
            df (df) : DataFrame
        Return:
            DataFrame
    '''
    catcols = df.select_dtypes(exclude = ['int64','float64']).columns
    df = pd.get_dummies(df, columns = catcols)
    
    return df

#### select_feature()

In [150]:
# 중요 Feature 식별
# ----------------
def select_feature(df, y_labels, chosen_model):

    np.random.seed(42)    
    
    available_models = {
    'ExtraTrees': ExtraTreesClassifier(n_estimators=100),
    'RandomForest': RandomForestClassifier(n_estimators=100),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'RFE': RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=13),
    # 'LGBMC': LGBMClassifier(),
    'LGBMC': LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS'),
    'LGBMR': LGBMRegressor(),
    'Xg Boost':XGBClassifier(booster='gbtree', importance_type='gain', eval_metric='auc'),
    }

    # Create the selected model
    clf = available_models[chosen_model]

    clf = clf.fit(df.values, y_labels)                                     # Train

    if chosen_model == 'LGBMC' or chosen_model == 'LGBMR': 
        feature_importances = clf.booster_.feature_importance(importance_type="gain")
    else:        
        feature_importances = clf.feature_importances_


    chosen_model = SelectFromModel(clf, prefit=True)
    X_df = chosen_model.transform(df.values) 
    selected_feature_indices = chosen_model.get_support(indices=True)

    selected_columns = df.columns[selected_feature_indices]         # Get the indices of the selected features
    
    return X_df, selected_columns

#### proc_smote()

In [151]:
def proc_smote(X_new, y):
    #Model Training
    from sklearn.model_selection import train_test_split
    from imblearn.over_sampling import SMOTE

    X_train,X_test,y_train,y_test=train_test_split(X_new, y, test_size=0.25, stratify=y, random_state=0)

    sm = SMOTE(sampling_strategy='auto', random_state=42)
    X_train, y_train=sm.fit_resample(X_train,y_train)
    
    return X_train, y_train, X_test, y_test


#### proc_normalization()

In [152]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

def proc_normalization(X_train, X_test):
    scaler=StandardScaler()
    # scaler = QuantileTransformer()
    # scaler = PowerTransformer()
    X_train=scaler.fit_transform(X_train)
    X_test=scaler.transform(X_test)
    
    return X_train, X_test

#### fit_predict_eval()

In [153]:

# 예측 및 평가
# -----------
def fit_predict_eval(proc_type, drop_no, group_no, model_comparison, X_train, y_train, X_test, y_test):
   
    # 초기화
    # ------
    best_roc_auc = 0
    
    # Define Models
    # ------------- 
    # No: 1 origin
    # models = [
    #     # ('LogisticRegression', LogisticRegression()),
    #     # ('DecisionTree', DecisionTreeClassifier(criterion='entropy', random_state=0)),
    #     # ('KNN', KNeighborsClassifier(n_neighbors=5)),
    #     # ('NaiveBayes', GaussianNB()),
    #     # ('RandomForest', RandomForestClassifier(n_estimators=700, criterion='entropy', random_state=0)),
    #     ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
    #     ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, eval_metric='auc')),
    #     # ('ExtraTrees', ExtraTreesClassifier(n_estimators=700)),
    # ]

    # No: 2
    # models = [
    #     ('LightGBM', LGBMClassifier()),
    #     ('XgBoost', XGBClassifier()),
    # ]

    # No: 3
    # models = [
    #     ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42)),
    #     ('XgBoost', XGBClassifier(n_estimators=700, random_state=42,)),
    # ]

    # No: 4
    best = {'learning_rate': 0.1406105325029019, 'max_depth': 106.0, 'min_child_samples': 64.0, 'num_leaves': 41.0, 'subsample': 0.9462293554201169}
    models = [
        ('LightGBM', LGBMClassifier(n_estimators=700, num_leaves=int(best['num_leaves']),
                           max_depth=int(best['max_depth']),
                           min_child_samples=int(best['min_child_samples']), 
                           subsample=round(best['subsample'], 5),
                           learning_rate=round(best['learning_rate'], 5))),
        ('XgBoost', XGBClassifier(n_estimators=700, random_state=42,)),
    ]


    # Model Fit and Testing
    # ---------------------
    for model_name, classifier in models:
        start_time = time.time()

        # 학습
        # ----            
        classifier.fit(X_train, y_train)            # Fit
        
        # 학습된 모델 저장
        # ---------------
        # file_name = f'./models/{model_name}.pkl'
        # print
        # joblib.dump(classifier, file_name)

        # 평가
        # ---- 
        y_pred = classifier.predict(X_test)         # Test
        pred_proba = classifier.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test , y_pred)
        recall = recall_score(y_test , y_pred)
        # f1 = f1_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred)
        
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        auces = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=skf, scoring="roc_auc")
        cv_auc = auces.mean()
        cv_std = auces.std()
        
        accuracy_class_0 = accuracy_score(y_test[y_test == 0], y_pred[y_test == 0])
        accuracy_class_1 = accuracy_score(y_test[y_test == 1], y_pred[y_test == 1], )
        
        roc_auc = roc_auc_score(y_test, pred_proba)
        
        
        # Collect Result
        # --------------
        model_comparison[f'{model_name}_{proc_type}_{drop_no}_{group_no}'] = [accuracy, accuracy_class_0, accuracy_class_1, precision, recall, f1, cv_auc, cv_std, roc_auc]
        
        
        # Best ROC_AUC Value Return
        # -------------------------
        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            
        
        # Print Log
        # ---------    
        cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        end_time = time.time()
        delta_time = end_time - start_time
        print(f'[모델별] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [{proc_type}_{drop_no}_{group_no}], Model Name: {model_name:<18}, BEST AUC: {best_roc_auc:0.6f}, AUC: {roc_auc:0.6f}')

    return best_roc_auc



#### print_eval_result()

In [154]:
def print_eval_result(model_comparison):

    # # MODEL COMPARISSON
    # Model_com_df=pd.DataFrame(model_comparison).T
    # Model_com_df.columns=['Accuracy','Accuracy-0','Accuracy-1', 'Precision', 'Recall', 'F1-Score','CV AUC','CV std', 'AUC']
    # Model_com_df=Model_com_df.sort_values(by='AUC',ascending=False)
    # # display(Model_com_df.style.format("{:.2%}").background_gradient(cmap='magma'))

    Model_com_df = pd.DataFrame(model_comparison).T
    Model_com_df.columns = ['Accuracy', 'Accuracy-No', 'Accuracy-Yes', 'Precision', 'Recall', 'F1-Score', 'CV AUC', 'CV std', 'AUC']
    Model_com_df = Model_com_df.sort_values(by='AUC', ascending=False)

    def highlight_below_75(s):
        if s.name != 'CV std' and isinstance(s, pd.Series) and s.dtype == 'float64':
            return ['color: red' if value < 0.75 else 'color: black' for value in s]
        else:
            return ['color: black'] * len(s)

    # styled_df = Model_com_df.iloc[:10,:].style.highlight_max(axis=0).apply(highlight_below_75, subset=pd.IndexSlice[:, :'CV AUC']).format("{:.2%}", subset=pd.IndexSlice[:, :'CV AUC'])
    styled_df = Model_com_df.style.highlight_max(axis=0).apply(highlight_below_75, subset=pd.IndexSlice[:, :'CV AUC']).format("{:.2%}", subset=pd.IndexSlice[:, :'CV AUC'])
    display(styled_df)

#### test_transform()

In [155]:
def test_transform(df):
    
    # 데이터 변환
    # ------------------- 
    df = df.drop('cstno', axis=1)
    df = df.drop('sex', axis=1)
    # after_drop_cnt=len(df)
    df['imcome_cat']=df['imcome_cat'].replace({'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000})
   

    # 결측치 처리
    # ----------
    # df = df.fillna(df.mean(numeric_only=True))
    df = df.groupby(['marital_stat']).apply(lambda x: x.fillna(x.mean(numeric_only=True)))
    df.dropna(axis=0, inplace=True)
    # after_drop_cnt=len(df)


    # One-Hot Encoding
    # ----------------
    df = encode_onehot(df)     
    
    return df

### 전처리 테스트 내용

#### 전처리 테스트1

In [156]:
def test_transform_pre1(df, drop_column, groupby_column):
    df = df.drop('cstno', axis=1)

    for col_name in drop_column:
        df = df.drop(col_name, axis=1)

    if 'imcome_cat' not in drop_column:
        df['imcome_cat']=df['imcome_cat'].replace({'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000})

    df = df.groupby(groupby_column).apply(lambda x: x.fillna(x.mean(numeric_only=True)))
    df.dropna(axis=0, inplace=True)
        
    df = encode_onehot(df)  

    return df


def proc_null_groupby_test():
    from itertools import combinations

    model_comparison = {}  #Dictionary to store the comparison metrics of models
    model_eval_comparison = {}                        


    # 전처리 테스트 함수
    # -----------------
    # def drop_null_column_pre(df, drop_list):
    #     for col_name in drop_list:
    #         df = df.drop(col_name, axis=1)

    #     return df


            
    # 전처리 테스트 예측
    # -----------------

    # 데이터 로드 및 고객번호 삭제
    fit_df = pd.read_csv("./data/bank_churner.csv") # 학습을 위한 데이터 로드
    eval_df = pd.read_csv("./data/test_churner.csv")
    tot_cnt = fit_df.shape

    fit_df_org = fit_df.copy()
    eval_df_org = eval_df.copy()
    fit_df = fit_df.drop('cstno', axis=1)

    fit_df_columns = fit_df.columns
    best_auc = 0

    # 결측치 및 다중공선성 처리
    # -----------------------
    result_list = []
    drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
    for j in range(1, len(drop_target_columns)+1):
        for i in combinations(drop_target_columns, j):
            result_list.append(list(i))

    # result_list = [['sex'], ['sex', 'age']]

    for drop_no, drop_column in enumerate(result_list):
        for group_no, groupby_column in enumerate(fit_df_columns):
            start_time = time.time()
            if groupby_column == 'is_churned' or groupby_column in drop_column:
                continue

            fit_df = fit_df_org
            eval_df = eval_df_org
            tot_cnt = fit_df.shape
            
            # print(f'drop_column: {drop_column}, groupby_column: {groupby_column}')
            
        # -----------------------------------------------------------------------------------    
            # 평가 for Competition
            # -----------------------------------------------------------------------------------

            # 전처리 단계
            # -----------
            fit_df = test_transform_pre1(fit_df, drop_column, groupby_column)
            eval_df = test_transform_pre1(eval_df, drop_column, groupby_column)
            after_drop_cnt = len(fit_df)
            
            
            # 평가를 위한 데이터 분리
            # ---------------------
            X_train=fit_df.drop(['is_churned'],axis=1)
            y_train=fit_df['is_churned']
            
            X_eval=eval_df.drop(['is_churned'],axis=1)
            y_eval=eval_df['is_churned']


            # 중요 Feature Column 선택
            # -----------------------
            X_new, selected_columns = select_feature(X_train, y_train, 'ExtraTrees')
            X_eval = X_eval[selected_columns]


            # Train and Test 데이터 생성 및 가공
            # ---------------------------------
            X_train, y_train, X_test_temp, y_test_temp = proc_smote(X_new, y_train)
            after_smote_cnt = X_train.shape

            # Evaluation 데이터 생성 및 가공
            # ---------------------------------
            X_train, X_eval = proc_normalization(X_train, X_eval.values)   


            # 최종 평가
            # --------
            proc_type='E'
            # eval_auc = fit_predict(proc_type, drop_no, model_eval_comparison, X_train_for_evaluation, y_train_for_evaluation, X_eval, y_eval)
            eval_auc = fit_predict_eval(proc_type, drop_no, group_no, model_eval_comparison, X_train, y_train, X_eval, y_eval)
            
            if eval_auc > best_auc:
                best_type = f'{proc_type}_{drop_no}_{group_no}'
                best_auc = eval_auc
                


            # 최종 평가 로그 출력
            # ------------------
            cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            end_time = time.time()
            delta_time = end_time - start_time
            # print(f'[평  가] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, AUC: {test_auc:0.6f}, 처리 건수: {len(eval_df)}, 최종 평가 건수: {len(X_eval)}')
            print(f'[테스트] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [G{proc_type}_{drop_no}_{group_no}], best-type: [{best_type}], Best-AUC: {best_auc:0.6f}, AUC: {eval_auc:0.6f}, tot_cnt: {tot_cnt}, after_drop_cnt: {after_drop_cnt}, after_smote_cnt: {after_smote_cnt}, groupby_column: {groupby_column}, drop_column: {drop_column}')

            # print_eval_result(model_eval_comparison)


# 테스트시 아래의 주석 풀고 실행
# ----------------------------
# proc_null_groupby_test()

#### 전처리 테스트2

In [157]:
def proc_null_drop_test():
    from itertools import combinations

    model_comparison = {}  #Dictionary to store the comparison metrics of models
    model_eval_comparison = {}                        

    def test_transform_pre2(df, drop_list):
        
        # 데이터 변환
        # ---------- 
        df = df.drop('cstno', axis=1)


        # 결측치 처리
        # -----------
        for col_name in drop_list:
            df = df.drop(col_name, axis=1)

        if 'imcome_cat' not in drop_list:
            df['imcome_cat']=df['imcome_cat'].replace({'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000})

        df = df.fillna(df.mean(numeric_only=True))
        df.dropna(axis=0, inplace=True)
            

        # One-Hot Encoding
        # ----------------
        df = encode_onehot(df)  
    
        return df

            
    # -----------
    # 예측
    # -----------

    # 데이터 로드 및 고객번호 삭제
    fit_df = pd.read_csv("./data/bank_churner.csv") # 학습을 위한 데이터 로드
    eval_df = pd.read_csv("./data/test_churner.csv")
    tot_cnt = fit_df.shape

    fit_df_org = fit_df.copy()
    eval_df_org = eval_df.copy()

    best_auc = 0

    # Null 처리
    result_list = []
    drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
    for j in range(1, len(drop_target_columns)+1):
        for i in combinations(drop_target_columns, j):
            result_list.append(list(i))

    # result_list = [['sex'], ['sex', 'age', 'imcome_cat']]

    for drop_no, drop_column in enumerate(result_list):
        start_time = time.time()

        fit_df = fit_df_org
        eval_df = eval_df_org
        
        # -----------------------------------------------------------------------------------    
        # 평가 for Competition
        # -----------------------------------------------------------------------------------

        # 전처리 단계
        # -----------
        fit_df = test_transform_pre2(fit_df, drop_column)
        eval_df = test_transform_pre2(eval_df, drop_column)
        after_drop_cnt = len(fit_df)
        
        
        # 평가를 위한 데이터 분리
        # ---------------------
        X_train=fit_df.drop(['is_churned'],axis=1)
        y_train=fit_df['is_churned']
        X_train_cnt = X_train.shape
        
        X_eval=eval_df.drop(['is_churned'],axis=1)
        y_eval=eval_df['is_churned']


        # 중요 Feature Column 선택
        # -----------------------
        X_new, selected_columns = select_feature(X_train, y_train, 'ExtraTrees')
        X_eval = X_eval[selected_columns]


        # Train and Test 데이터 생성 및 가공
        # ---------------------------------
        X_train, y_train, X_test_temp, y_test_temp = proc_smote(X_new, y_train)
        after_smote_cnt = X_train.shape

        # Evaluation 데이터 생성 및 가공
        # ---------------------------------
        X_train, X_eval = proc_normalization(X_train, X_eval.values)   


        # 최종 평가
        # --------
        proc_type='E'
        group_no=1
        # eval_auc = fit_predict(proc_type, drop_no, model_eval_comparison, X_train_for_evaluation, y_train_for_evaluation, X_eval, y_eval)
        eval_auc = fit_predict_eval(proc_type, drop_no, group_no, model_eval_comparison, X_train, y_train, X_eval, y_eval)
        
        if eval_auc > best_auc:
            best_type = f'{proc_type}_{drop_no}_{group_no}'
            best_auc = eval_auc
            


        # 최종 평가 로그 출력
        # ------------------
        cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        end_time = time.time()
        delta_time = end_time - start_time
        # print(f'[평  가] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, AUC: {test_auc:0.6f}, 처리 건수: {len(eval_df)}, 최종 평가 건수: {len(X_eval)}')
        print(f'[테스트] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [{proc_type}_{drop_no}_{group_no}], best-type: [{best_type}], Best-AUC: {best_auc:0.6f}, AUC: {eval_auc:0.6f}, tot_cnt: {tot_cnt}, after_drop_cnt: {after_drop_cnt}, after_smote_cnt: {after_smote_cnt}, X_train_cnt: {X_train_cnt}, drop_column: {drop_column}')


        # print_eval_result(model_eval_comparison)


# 테스트시 아래의 주석 풀고 실행
# ----------------------------
# proc_null_drop_test()        

### 파라미터 튜닝 단계

### 학습 및 Test 단계

#### 예측 및 결과

In [158]:
# 데이터 로딩
# ----------
ml_churner_df = pd.read_csv("./data/bank_churner.csv")
tot_cnt = len(ml_churner_df)

# 결과 저장소 초기화
# -----------------
model_test_comparison = {}  #Dictionary to store the comparison metrics of models
# model_eval_comparison = {}                        
start_time = time.time()


ml_churner_df = test_transform(ml_churner_df)
after_drop_cnt = len(ml_churner_df)


# ML 데이터 분리
# --------------
X_Features = ml_churner_df.drop(['is_churned'],axis=1)
y_labels = ml_churner_df['is_churned']


# 중요 Feature Column 선택
# -----------------------
# X_new, selected_columns = select_feature(X, y, 'Xg Boost')
X_Features_new, selected_columns = select_feature(X_Features, y_labels, 'ExtraTrees')


# Train and Test 데이터 생성 및 가공
# ---------------------------------
X_train, y_train, X_test, y_test = proc_smote(X_Features_new, y_labels)
# X_train_for_normalization = X_train.copy()
after_smote_cnt = len(X_train)


# Normalization
# -------------
X_train, X_test = proc_normalization(X_train, X_test)    


# Pridict 및 Test 평가
# --------------------
proc_type='T'
drop_no = 1
group_no = 1
best_auc = 0
test_auc = fit_predict_eval(proc_type, drop_no, group_no, model_test_comparison, X_train, y_train, X_test, y_test)
# test_auc = fit_predict_eval_tunning(proc_type, drop_no, group_no, model_test_comparison, X_train, y_train, X_test, y_test)

if test_auc > best_auc:
    best_type = f'{proc_type}_{drop_no}_{group_no}'
    best_auc = test_auc


# 예측 및 테스트 로그 출력
# ----------------------
cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
end_time = time.time()
delta_time = end_time - start_time
groupby_column = None
drop_column = None
# print(f'[테스트] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [{proc_type}_{drop_no}], AUC: {test_auc:0.6f}, tot_cnt: {tot_cnt:<6}, after_drop_cnt : {after_drop_cnt:<6}, after_smote_cnt: {after_smote_cnt:<6}, X_train:{X_train.shape}, y_train:{y_train.shape}, X_test:{X_test.shape}, y_test:{y_test.shape}')
print(f'[테스트] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [G{proc_type}_{drop_no}_{group_no}], best-type: [{best_type}], Best-AUC: {best_auc:0.6f}, AUC: {test_auc:0.6f}, tot_cnt: {tot_cnt}, after_drop_cnt: {after_drop_cnt}, after_smote_cnt: {after_smote_cnt}, groupby_column: {groupby_column}, drop_column: {drop_column}')


# print_eval_result(model_comparison)


[모델별] 2023-09-19 13:12:42, 0:00:03, [T_1_1], Model Name: LightGBM          , BEST AUC: 0.974129, AUC: 0.974129
[모델별] 2023-09-19 13:12:52, 0:00:09, [T_1_1], Model Name: XgBoost           , BEST AUC: 0.974129, AUC: 0.969663
[테스트] 2023-09-19 13:12:52, 0:00:13, [GT_1_1], best-type: [T_1_1], Best-AUC: 0.974129, AUC: 0.974129, tot_cnt: 8101, after_drop_cnt: 8101, after_smote_cnt: 10200, groupby_column: None, drop_column: None


#### 예측결과 출력

In [159]:
print_eval_result(model_test_comparison)

Unnamed: 0,Accuracy,Accuracy-No,Accuracy-Yes,Precision,Recall,F1-Score,CV AUC,CV std,AUC
LightGBM_T_1_1,94.37%,97.77%,76.62%,86.76%,76.62%,81.37%,99.64%,0.000567,0.974129
XgBoost_T_1_1,93.78%,97.30%,75.38%,84.19%,75.38%,79.55%,99.56%,0.000744,0.969663


### 평가 단계 ~ 평가자가 Competition 평가를 위해 사용 하는 단계

#### 데이터 로딩

In [160]:
eval_df = pd.read_csv("./data/test_churner.csv") # 평가를 위한 데이터 로드 - 평가데이터 경로를 입력해 주세요!!!
# eval_df = pd.read_csv("./data/test_churner_kaggle_all.csv") # 평가를 위한 데이터 로드 - 평가데이터 경로를 입력해 주세요!!!

fit_df = pd.read_csv("./data/bank_churner.csv") # 학습을 위한 데이터 로드

#### 예측 및 결과

In [161]:
# -----------------------------------------------------------------------------------    
# 평가 for Competition
# -----------------------------------------------------------------------------------
start_time = time.time()
model_eval_comparison = {}                        
fit_tot_cnt = fit_df.shape
eval_tot_cnt = eval_df.shape


# 전처리 단계
# -----------
fit_df = test_transform(fit_df)
eval_df = test_transform(eval_df)
fit_drop_cnt = fit_df.shape
eval_drop_cnt = eval_df.shape

    
# 평가를 위한 데이터 분리
# ---------------------
X_Features=fit_df.drop(['is_churned'],axis=1)
y_labels=fit_df['is_churned']

X_eval=eval_df.drop(['is_churned'],axis=1)
y_eval=eval_df['is_churned']


# 중요 Feature Column 선택
# -----------------------
X_Features_new, selected_columns = select_feature(X_Features, y_labels, 'ExtraTrees')
# X_new, selected_columns = select_feature(X_train, y_train, 'LGBMC')
X_eval = X_eval[selected_columns]


# Train and Test 데이터 생성 및 가공
# ---------------------------------
X_train, y_train, X_test_temp, y_test_temp = proc_smote(X_Features_new, y_labels)


# Evaluation 데이터 생성 및 가공
# ---------------------------------
X_train, X_eval = proc_normalization(X_train, X_eval.values)   


# 최종 평가
# --------
proc_type='E'
drop_no = 1
group_no =1


# eval_auc = fit_predict_eval(proc_type, drop_no, group_no, model_eval_comparison, X_train, y_train, X_eval, y_eval)
eval_auc = fit_predict_eval(proc_type, drop_no, group_no, model_eval_comparison, X_train, y_train, X_eval, y_eval)
if eval_auc > best_auc:
    best_type = f'{proc_type}_{drop_no}_{group_no}'
    best_auc = eval_auc


# 최종 평가 로그 출력
# ------------------
cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
end_time = time.time()
delta_time = end_time - start_time
print(f'[평  가] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [{proc_type}_{drop_no}], best_auc: {best_auc:0.6f}, fit_tot_cnt: {fit_tot_cnt}, eval_tot_cnt: {eval_tot_cnt}, fit_drop_cnt : {fit_drop_cnt}, eval_drop_cnt : {eval_drop_cnt}, X_train: {X_train.shape}, y_train: {y_train.shape}, X_eval: {X_eval.shape}, y_eval:{y_eval.shape}')


# print_eval_result(model_eval_comparison)

[모델별] 2023-09-19 13:12:56, 0:00:03, [E_1_1], Model Name: LightGBM          , BEST AUC: 0.990255, AUC: 0.990255
[모델별] 2023-09-19 13:13:06, 0:00:09, [E_1_1], Model Name: XgBoost           , BEST AUC: 0.990255, AUC: 0.988054
[평  가] 2023-09-19 13:13:06, 0:00:14, [E_1], best_auc: 0.990255, fit_tot_cnt: (8101, 21), eval_tot_cnt: (2026, 21), fit_drop_cnt : (8101, 31), eval_drop_cnt : (2026, 31), X_train: (10200, 13), y_train: (10200,), X_eval: (2026, 13), y_eval:(2026,)


#### 평가결과 출력

In [162]:
print_eval_result(model_eval_comparison)


Unnamed: 0,Accuracy,Accuracy-No,Accuracy-Yes,Precision,Recall,F1-Score,CV AUC,CV std,AUC
LightGBM_E_1_1,96.15%,97.59%,88.69%,87.61%,88.69%,88.15%,99.64%,0.000567,0.990255
XgBoost_E_1_1,95.90%,97.29%,88.69%,86.31%,88.69%,87.48%,99.56%,0.000744,0.988054


In [163]:
eval_df = pd.read_csv("./data/test_churner.csv") # 평가를 위한 데이터 로드 - 평가데이터 경로를 입력해 주세요!!!
# eval_df = pd.read_csv("./data/test_churner_kaggle_all.csv") # 평가를 위한 데이터 로드 - 평가데이터 경로를 입력해 주세요!!!

fit_df = pd.read_csv("./data/bank_churner.csv") # 학습을 위한 데이터 로드

In [164]:
# -----------------------------------------------------------------------------------    
# 평가 for Competition
# -----------------------------------------------------------------------------------
start_time = time.time()
model_eval_comparison = {}                        
fit_tot_cnt = fit_df.shape
eval_tot_cnt = eval_df.shape


# 전처리 단계
# -----------
fit_df = test_transform(fit_df)
eval_df = test_transform(eval_df)
fit_drop_cnt = fit_df.shape
eval_drop_cnt = eval_df.shape

    
# 평가를 위한 데이터 분리
# ---------------------
X_Features=fit_df.drop(['is_churned'],axis=1)
y_labels=fit_df['is_churned']

X_eval=eval_df.drop(['is_churned'],axis=1)
y_eval=eval_df['is_churned']


# 중요 Feature Column 선택
# -----------------------
X_Features_new, selected_columns = select_feature(X_Features, y_labels, 'ExtraTrees')
# X_new, selected_columns = select_feature(X_train, y_train, 'LGBMC')
X_eval = X_eval[selected_columns]


# Train and Test 데이터 생성 및 가공
# ---------------------------------
X_train, y_train, X_test, y_test = proc_smote(X_Features_new, y_labels)


# Evaluation 데이터 생성 및 가공
# ---------------------------------
X_train, X_eval = proc_normalization(X_train, X_eval.values)   


# 최종 평가
# --------
proc_type='E'
drop_no = 1
group_no =1


# eval_auc = fit_predict_eval(proc_type, drop_no, group_no, model_eval_comparison, X_train, y_train, X_eval, y_eval)
eval_auc = fit_predict_eval(proc_type, drop_no, group_no, model_eval_comparison, X_train, y_train, X_eval, y_eval)
if eval_auc > best_auc:
    best_type = f'{proc_type}_{drop_no}_{group_no}'
    best_auc = eval_auc


# 최종 평가 로그 출력
# ------------------
cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
end_time = time.time()
delta_time = end_time - start_time
print(f'[평  가] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [{proc_type}_{drop_no}], best_auc: {best_auc:0.6f}, fit_tot_cnt: {fit_tot_cnt}, eval_tot_cnt: {eval_tot_cnt}, fit_drop_cnt : {fit_drop_cnt}, eval_drop_cnt : {eval_drop_cnt}, X_train: {X_train.shape}, y_train: {y_train.shape}, X_eval: {X_eval.shape}, y_eval:{y_eval.shape}')


# print_eval_result(model_eval_comparison)
best = {'learning_rate': 0.1406105325029019, 'max_depth': 106.0, 'min_child_samples': 64.0, 'num_leaves': 41.0, 'subsample': 0.9462293554201169}
lgbm_clf =  LGBMClassifier(n_estimators=700, num_leaves=int(best['num_leaves']),
                           max_depth=int(best['max_depth']),
                           min_child_samples=int(best['min_child_samples']), 
                           subsample=round(best['subsample'], 5),
                           learning_rate=round(best['learning_rate'], 5)
                          )


# evaluation metric을 auc로, early stopping은 100 으로 설정하고 학습 수행. 
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, 
            eval_metric="auc",eval_set=[(X_train, y_train), (X_test, y_test)])

lgbm_roc_score = roc_auc_score(y_eval, lgbm_clf.predict_proba(X_eval)[:,1])
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

[모델별] 2023-09-19 13:13:11, 0:00:03, [E_1_1], Model Name: LightGBM          , BEST AUC: 0.990255, AUC: 0.990255
[모델별] 2023-09-19 13:13:21, 0:00:09, [E_1_1], Model Name: XgBoost           , BEST AUC: 0.990255, AUC: 0.988054
[평  가] 2023-09-19 13:13:21, 0:00:14, [E_1], best_auc: 0.990255, fit_tot_cnt: (8101, 21), eval_tot_cnt: (2026, 21), fit_drop_cnt : (8101, 31), eval_drop_cnt : (2026, 31), X_train: (10200, 13), y_train: (10200,), X_eval: (2026, 13), y_eval:(2026,)
[1]	training's auc: 0.947246	training's binary_logloss: 0.606997	valid_1's auc: 0.5	valid_1's binary_logloss: 0.607502
[2]	training's auc: 0.954415	training's binary_logloss: 0.540712	valid_1's auc: 0.5	valid_1's binary_logloss: 0.548303
[3]	training's auc: 0.966035	training's binary_logloss: 0.484324	valid_1's auc: 0.504978	valid_1's binary_logloss: 0.506701
[4]	training's auc: 0.972323	training's binary_logloss: 0.43683	valid_1's auc: 0.483494	valid_1's binary_logloss: 0.482528
[5]	training's auc: 0.976108	training's binary_



[124]	training's auc: 1	training's binary_logloss: 0.00760645	valid_1's auc: 0.590793	valid_1's binary_logloss: 0.967188
[125]	training's auc: 1	training's binary_logloss: 0.00745037	valid_1's auc: 0.592142	valid_1's binary_logloss: 0.951965
[126]	training's auc: 1	training's binary_logloss: 0.00728707	valid_1's auc: 0.592142	valid_1's binary_logloss: 0.974316
[127]	training's auc: 1	training's binary_logloss: 0.00714225	valid_1's auc: 0.592142	valid_1's binary_logloss: 0.974649
[128]	training's auc: 1	training's binary_logloss: 0.00699682	valid_1's auc: 0.592142	valid_1's binary_logloss: 0.97495
ROC AUC: 0.9802
