# JBFG Data Analysis Competition

In [3]:
#!pip install watermark
%load_ext watermark
%watermark -a 'DataLine' -nmv --packages numpy,pandas,sklearn,imblearn,tensorflow,plotly,matplotlib,seaborn,missingno,lightgbm

Author: DataLine

Python implementation: CPython
Python version       : 3.9.18
IPython version      : 8.15.0

numpy     : 1.24.3
pandas    : 2.0.3
sklearn   : 1.3.0
imblearn  : 0.0
tensorflow: 2.10.0
plotly    : 5.9.0
matplotlib: 3.7.2
seaborn   : 0.12.2
missingno : 0.5.1
lightgbm  : 3.3.5

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 165 Stepping 3, GenuineIntel
CPU cores   : 12
Architecture: 64bit



In [4]:
import imblearn

#### 컬럼 데이터 및 Null 건수 확인

In [5]:
"""
# 데이터 포맷 함수
# ---------------
def change_format(df, column, format):
    '''
    데이터프레임의 지정된 컬럼에 컴마, 백분율로 변경하여 데이터프레임을 반환하는 함수
        Args:
            df (df) : DataFrame
            column (str) : column of DataFrame
            format (str) : 'comma' | 'percent'
        Return:
            DataFrame
    '''
    if format == 'comma':
        df[column] = df[column].apply(lambda x: f"{x:,}")
    elif format == 'percent':
        df[column] = df[column].apply(lambda x: f"{x:.2%}")
        
    return df


# 데이터프레임의 특정 컬럼에 대한 건수, Null, Percent 표시
# -----------------------------------------------------
def count_column_na_count(df, column):
    '''
    데이터프레임의 특정 컬럼에 대한 건수, Null, Percent를 출력하는 함수
        Args:
            df (df) : DataFrame
            column (str) : column of DataFrame
        Return:
            None
    '''
    column_na_counts = df[column].size, df[column].count(), df[column].isnull().sum()
    column_na_counts_df = pd.Series(column_na_counts).to_frame().T
    column_na_counts_df.columns = ['tot_counts', 'data_counts', 'null_counts']
    column_na_counts_df['data_percents'] = column_na_counts_df['data_counts'].values/column_na_counts_df['tot_counts'].values
    column_na_counts_df['null_percents'] = column_na_counts_df['null_counts'].values/column_na_counts_df['tot_counts'].values


    column_na_counts_df = change_format(column_na_counts_df, 'tot_counts', 'comma')
    column_na_counts_df = change_format(column_na_counts_df, 'data_counts','comma')
    column_na_counts_df = change_format(column_na_counts_df, 'null_counts','comma')
    column_na_counts_df = change_format(column_na_counts_df, 'data_percents', 'percent')
    column_na_counts_df = change_format(column_na_counts_df, 'null_percents', 'percent')

    print(column_na_counts_df.to_string(index=False))
    print('-'*70)


def count_column_data_count(df, column):
    ''' 
    '''
    # column_data_countcounts = df.groupby(column)['is_churned'].value_counts().unstack()


    column_counts = df.groupby(column)['is_churned'].value_counts().unstack()
    column_counts = column_counts.rename(columns={0: 'exist_counts', 1: 'churned_counts'})
    column_counts['total_counts'] =  column_counts['exist_counts'] + column_counts['churned_counts']
    column_counts = column_counts.fillna(0)

    column_percents = df.groupby(column)['is_churned'].value_counts(normalize=True).unstack()
    column_percents = column_percents.rename(columns={0: 'exist_percents', 1: 'churned_percents'})
    column_percents = column_percents.fillna(0)


    column_count_percent = pd.concat([column_counts, column_percents], axis=1)
    column_count_percent = column_count_percent.reset_index()
    column_count_percent = column_count_percent.sort_values(by='churned_percents', ascending=False)

    
    column_count_percent = change_format(column_count_percent, 'exist_counts', 'comma')
    column_count_percent = change_format(column_count_percent, 'churned_counts', 'comma')
    column_count_percent = change_format(column_count_percent, 'total_counts', 'comma')
    column_count_percent = change_format(column_count_percent, 'exist_percents', 'percent')
    column_count_percent = change_format(column_count_percent, 'churned_percents', 'percent')
    

    print(column_count_percent.to_string(index=False))

"""    

'\n# 데이터 포맷 함수\n# ---------------\ndef change_format(df, column, format):\n    \'\'\'\n    데이터프레임의 지정된 컬럼에 컴마, 백분율로 변경하여 데이터프레임을 반환하는 함수\n        Args:\n            df (df) : DataFrame\n            column (str) : column of DataFrame\n            format (str) : \'comma\' | \'percent\'\n        Return:\n            DataFrame\n    \'\'\'\n    if format == \'comma\':\n        df[column] = df[column].apply(lambda x: f"{x:,}")\n    elif format == \'percent\':\n        df[column] = df[column].apply(lambda x: f"{x:.2%}")\n        \n    return df\n\n\n# 데이터프레임의 특정 컬럼에 대한 건수, Null, Percent 표시\n# -----------------------------------------------------\ndef count_column_na_count(df, column):\n    \'\'\'\n    데이터프레임의 특정 컬럼에 대한 건수, Null, Percent를 출력하는 함수\n        Args:\n            df (df) : DataFrame\n            column (str) : column of DataFrame\n        Return:\n            None\n    \'\'\'\n    column_na_counts = df[column].size, df[column].count(), df[column].isnull().sum()\n    column_na_counts

## Machine Learning
***

### Import Library

In [6]:
import pandas as pd
import numpy as np
from itertools import combinations
import time
import datetime
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score

### Function Definition

#### tot_column_counts()

In [7]:
"""
def tot_column_counts(df):
    ''' 
    '''
    data_counts = df.count()
    null_counts = df.isnull().sum()
    tot_counts_df = pd.concat([data_counts, null_counts], axis=1)
    tot_counts_df = tot_counts_df.rename(columns={0: 'data_counts', 1: 'null_counts'})
    tot_counts_df.insert(0,'tot_counts', tot_counts_df['data_counts'] + tot_counts_df['null_counts'])
    tot_counts_df['data_percents'] = tot_counts_df['data_counts'].values / tot_counts_df['tot_counts'].values
    tot_counts_df['null_percents'] = tot_counts_df['null_counts'].values / tot_counts_df['tot_counts'].values
    tot_counts_df = tot_counts_df.sort_values(by='null_percents', ascending=False)

    tot_counts_df = change_format(tot_counts_df, 'tot_counts', 'comma')
    tot_counts_df = change_format(tot_counts_df, 'data_counts','comma')
    tot_counts_df = change_format(tot_counts_df, 'null_counts','comma')
    tot_counts_df = change_format(tot_counts_df, 'data_percents', 'percent')
    tot_counts_df = change_format(tot_counts_df, 'null_percents', 'percent')

    tot_counts_df = tot_counts_df.reset_index()

    print(tot_counts_df.to_string(index=False))
    
"""    

"\ndef tot_column_counts(df):\n    ''' \n    '''\n    data_counts = df.count()\n    null_counts = df.isnull().sum()\n    tot_counts_df = pd.concat([data_counts, null_counts], axis=1)\n    tot_counts_df = tot_counts_df.rename(columns={0: 'data_counts', 1: 'null_counts'})\n    tot_counts_df.insert(0,'tot_counts', tot_counts_df['data_counts'] + tot_counts_df['null_counts'])\n    tot_counts_df['data_percents'] = tot_counts_df['data_counts'].values / tot_counts_df['tot_counts'].values\n    tot_counts_df['null_percents'] = tot_counts_df['null_counts'].values / tot_counts_df['tot_counts'].values\n    tot_counts_df = tot_counts_df.sort_values(by='null_percents', ascending=False)\n\n    tot_counts_df = change_format(tot_counts_df, 'tot_counts', 'comma')\n    tot_counts_df = change_format(tot_counts_df, 'data_counts','comma')\n    tot_counts_df = change_format(tot_counts_df, 'null_counts','comma')\n    tot_counts_df = change_format(tot_counts_df, 'data_percents', 'percent')\n    tot_counts_df = 

#### drop_null_column()

In [8]:
# Drop the listed columns from a DataFrame, then drop the rows that still contain nulls
def drop_null_column(df, drop_list):
    '''
        Remove the listed columns and then every row that still holds a missing value.

        The caller's frame is never modified: a new DataFrame is always returned.
        (Previously the row drop was done in place, which mutated the caller's
        frame whenever drop_list was empty because no column drop had produced
        a new object first.)

        Args:
            df (df) : DataFrame
            drop_list (list) : names of the columns to remove
        Return:
            DataFrame without the listed columns and without rows containing nulls
        Raises:
            KeyError : if a name in drop_list is not a column of df
    '''
    for col_name in drop_list:
        # drop() without inplace returns a new frame, so the input stays intact
        df = df.drop(col_name, axis=1)

    # Non-inplace dropna: safe even when no column was dropped above
    return df.dropna(axis=0)

#### encode_onehot()

In [9]:
# One-hot encoding
# ----------------
def encode_onehot(df):
    '''
        One-hot encode every column whose dtype is neither int64 nor float64.

        Args:
            df (df) : DataFrame
        Return:
            DataFrame in which the selected columns are replaced by their
            dummy (indicator) columns; int64/float64 columns are kept as they are
    '''
    # Everything that is not int64/float64 is treated as categorical
    non_numeric = df.select_dtypes(exclude=['int64', 'float64'])

    return pd.get_dummies(df, columns=non_numeric.columns)

#### select_feature()

In [10]:
# Identify the important features
# -------------------------------
def select_feature(df, y, chosen_model):
    '''
        Fit the estimator registered under `chosen_model` and keep only the
        columns that SelectFromModel retains for that fitted estimator.

        Args:
            df (df) : feature DataFrame (its .values are used for fitting)
            y : target values handed to the estimator's fit()
            chosen_model (str) : key of the estimator table defined below
        Return:
            tuple of (array with the selected feature values,
                      Index with the names of the selected columns)
        Raises:
            KeyError : if chosen_model is not a key of the estimator table
    '''
    # Seed NumPy's global RNG before any estimator is created
    # (presumably so that estimators built without random_state are reproducible - TODO confirm)
    np.random.seed(42)

    estimators_by_name = {
        'ExtraTrees': ExtraTreesClassifier(n_estimators=100),
        'RandomForest': RandomForestClassifier(n_estimators=100),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'RFE': RFE(estimator=RandomForestClassifier(n_estimators=100), n_features_to_select=13),
        'LGBMC': LGBMClassifier(),
        'LGBMR': LGBMRegressor(),
        'Xg Boost': XGBClassifier(booster='gbtree', importance_type='gain', eval_metric='auc'),
    }

    # Pick the requested estimator and train it on the raw feature matrix
    estimator = estimators_by_name[chosen_model]
    estimator = estimator.fit(df.values, y)

    # NOTE(review): the importances read here are not used afterwards;
    # the selection below relies on SelectFromModel only.
    if chosen_model in ('LGBMC', 'LGBMR'):
        feature_importances = estimator.booster_.feature_importance(importance_type="gain")
    else:
        feature_importances = estimator.feature_importances_

    # Wrap the already-fitted estimator and reduce the matrix to the retained features
    selector = SelectFromModel(estimator, prefit=True)
    X_df = selector.transform(df.values)
    kept_positions = selector.get_support(indices=True)

    # Map the positional indices of the retained features back to column names
    selected_columns = df.columns[kept_positions]

    return X_df, selected_columns

#### proc_smote()

In [11]:
def proc_smote(X_new, y):
    '''
        Split the data into train/test parts and oversample the training part with SMOTE.

        Args:
            X_new : feature matrix
            y : target values (also used to stratify the split)
        Return:
            tuple of (resampled X_train, resampled y_train, X_test, y_test);
            the test part is returned exactly as produced by the split
    '''
    from sklearn.model_selection import train_test_split
    from imblearn.over_sampling import SMOTE

    # Stratified 75/25 split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X_new, y, test_size=0.25, stratify=y, random_state=0
    )

    # Only the training part is resampled
    oversampler = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    return X_resampled, y_resampled, X_test, y_test


#### proc_normalization()

In [12]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer

def proc_normalization(X_train, X_test):
    '''
        Scale both matrices with a StandardScaler fitted on the training matrix only.

        Args:
            X_train : training feature matrix (used to fit the scaler)
            X_test : feature matrix transformed with the already-fitted scaler
        Return:
            tuple of (scaled X_train, scaled X_test)
    '''
    # Alternatives tried earlier: QuantileTransformer(), PowerTransformer()
    std_scaler = StandardScaler()

    scaled_train = std_scaler.fit_transform(X_train)
    scaled_test = std_scaler.transform(X_test)

    return scaled_train, scaled_test

#### fit_predict_eval()

In [13]:
# Fit, predict and evaluate
# -------------------------
def fit_predict_eval(proc_type, drop_no, group_no, model_comparison, X_train, y_train, X_test, y_test):
    '''
        Train every configured model, score it on the test data and record the metrics.

        Args:
            proc_type, drop_no, group_no : only used to build the result key
                '{model_name}_{proc_type}_{drop_no}_{group_no}'
            model_comparison (dict) : updated in place; each key receives the list
                [accuracy, accuracy on class 0, accuracy on class 1, weighted F1,
                 mean CV ROC-AUC, std of CV ROC-AUC, test ROC-AUC]
            X_train, y_train : data the model is fitted (and cross-validated) on
            X_test, y_test : data the model is scored on
        Return:
            the highest test ROC-AUC among the configured models
    '''
    # Highest test ROC-AUC seen so far
    top_auc = 0

    # Candidate models; only LightGBM is active at the moment
    candidates = [
        # ('LogisticRegression', LogisticRegression()),
        # ('DecisionTree', DecisionTreeClassifier(criterion='entropy', random_state=0)),
        # ('KNN', KNeighborsClassifier(n_neighbors=5)),
        # ('NaiveBayes', GaussianNB()),
        # ('RandomForest', RandomForestClassifier(n_estimators=700, criterion='entropy', random_state=0)),
        ('LightGBM', LGBMClassifier(n_estimators=700, random_state=42, boosting_type='GOSS')),
        # ('XgBoost', XGBClassifier(n_estimators=700, random_state=42, use_label_encoder=False,  eval_metric='auc')),
        # ('Xg Boost', XGBClassifier(n_estimators=700, random_state=42, use_label_encoder=False, eval_metric='logloss')),
        # ('ExtraTrees', ExtraTreesClassifier(n_estimators=700)),
        # ('SVM', SVC(kernel='linear')),
        # ('LASSO', Lasso(alpha=0.01)),
    ]

    for model_name, classifier in candidates:
        start_time = time.time()

        # Training
        classifier.fit(X_train, y_train)

        # Persisting the fitted model is currently disabled:
        # joblib.dump(classifier, f'./models/{model_name}.pkl')

        # Prediction: hard labels plus the probability of the positive class
        y_pred = classifier.predict(X_test)
        pred_proba = classifier.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # 5-fold cross-validated ROC-AUC on the training data
        cv_scores = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=5, scoring="roc_auc")
        cv_auc = cv_scores.mean()
        cv_std = cv_scores.std()

        # Accuracy restricted to the samples whose true label is 0 / 1
        is_class_0 = y_test == 0
        is_class_1 = y_test == 1
        accuracy_class_0 = accuracy_score(y_test[is_class_0], y_pred[is_class_0])
        accuracy_class_1 = accuracy_score(y_test[is_class_1], y_pred[is_class_1])

        roc_auc = roc_auc_score(y_test, pred_proba)

        # Store the metrics under a key that identifies model and run
        result_key = f'{model_name}_{proc_type}_{drop_no}_{group_no}'
        model_comparison[result_key] = [accuracy, accuracy_class_0, accuracy_class_1, f1, cv_auc, cv_std, roc_auc]

        # Track the best test ROC-AUC
        top_auc = max(top_auc, roc_auc)

        # Timing values for the per-model progress log (the log line itself is disabled)
        cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        end_time = time.time()
        delta_time = end_time - start_time

    return top_auc



#### print_eval_result()

In [14]:
def print_eval_result(model_comparison):
    '''
        Display the ten best runs (by AUC) of a metrics dictionary as a styled table.

        Args:
            model_comparison (dict) : run key -> list of seven metrics, in the
                order of the column names assigned below
        Return:
            None (the styled table is shown with display())
    '''
    metric_names = ['Accuracy', 'Accuracy-No', 'Accuracy-Yes', 'F1-Score', 'CV AUC', 'CV std', 'AUC']

    # One row per run, best AUC first
    results = pd.DataFrame(model_comparison).T
    results.columns = metric_names
    results = results.sort_values(by='AUC', ascending=False)

    def highlight_below_75(s):
        # Red text for float values under 0.75; everything else stays black
        is_float_metric = s.name != 'CV std' and isinstance(s, pd.Series) and s.dtype == 'float64'
        if not is_float_metric:
            return ['color: black'] * len(s)
        return ['color: red' if value < 0.75 else 'color: black' for value in s]

    # Columns from the first one up to and including 'CV AUC' are colored and shown as percentages
    pct_columns = pd.IndexSlice[:, :'CV AUC']

    top_runs = results.iloc[:10, :]
    styler = top_runs.style.highlight_max(axis=0)
    styler = styler.apply(highlight_below_75, subset=pct_columns)
    styler = styler.format("{:.2%}", subset=pct_columns)

    display(styler)

#### test_transform()

In [15]:
def test_transform(df):
    '''
        Prepare a raw churn DataFrame for modelling.

        Steps: drop the 'cstno' and 'sex' columns, replace the 'imcome_cat'
        labels with numbers, fill missing numeric values with the column mean,
        drop rows that still contain nulls, and one-hot encode via encode_onehot().

        Args:
            df (df) : DataFrame containing 'cstno', 'sex' and 'imcome_cat'
        Return:
            transformed DataFrame
    '''
    # Numeric value substituted for each income category label
    income_values = {'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000}

    # Data transformation
    # -------------------
    for unused_col in ('cstno', 'sex'):
        df = df.drop(unused_col, axis=1)
    df['imcome_cat'] = df['imcome_cat'].replace(income_values)

    # Missing values: mean-fill numeric columns, then drop rows that are still incomplete
    # -----------------------------------------------------------------------------------
    column_means = df.mean(numeric_only=True)
    df = df.fillna(column_means).dropna(axis=0)

    # One-Hot Encoding
    # ----------------
    return encode_onehot(df)

### 학습 및 Test 단계

#### 데이터 로딩

In [16]:
# Load the training data; tot_cnt keeps the number of rows that were read
ml_churner_df = pd.read_csv("./data/bank_churner.csv")
tot_cnt = len(ml_churner_df)

#### 예측 및 결과

In [17]:
# Initialize the result stores
# ----------------------------
model_comparison = {}  # Dictionary to store the comparison metrics of models
model_eval_comparison = {}  # same kind of store, passed to fit_predict_eval in the evaluation stage
drop_no = 1  # run number that becomes part of the result keys

In [18]:
'''
# 결과 저장소 초기화
# -----------------
model_comparison = {}  #Dictionary to store the comparison metrics of models
model_eval_comparison = {}                        
drop_no = 1
start_time = time.time()


ml_churner_df = test_transform(ml_churner_df)
after_drop_cnt = len(ml_churner_df)

# ML 데이터 분리
# --------------
X=ml_churner_df.drop(['is_churned'],axis=1)
y=ml_churner_df['is_churned']


# 중요 Feature Column 선택
# -----------------------
# X_new, selected_columns = select_feature(X, y, 'Xg Boost')
X_new, selected_columns = select_feature(X, y, 'ExtraTrees')


# Train and Test 데이터 생성 및 가공
# ---------------------------------
X_train, y_train, X_test, y_test = proc_smote(X_new, y)
X_train_for_normalization = X_train.copy()
after_smote_cnt = len(X_train)


# Normalization
# -------------
X_train, X_test = proc_normalization(X_train, X_test)    


# Pridict 및 Test 평가
# --------------------
proc_type='T'
test_auc = fit_predict_eval(proc_type, drop_no, model_comparison, X_train, y_train, X_test, y_test)


# 예측 및 테스트 로그 출력
# ----------------------
cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
end_time = time.time()
delta_time = end_time - start_time
print(f'[테스트] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [{proc_type}_{drop_no}], AUC: {test_auc:0.6f}, tot_cnt: {tot_cnt:<6}, after_drop_cnt : {after_drop_cnt:<6}, after_smote_cnt: {after_smote_cnt:<6}, X_train:{X_train.shape}, y_train:{y_train.shape}, X_test:{X_test.shape}, y_test:{y_test.shape}')


# print_eval_result(model_comparison)

'''

'\n# 결과 저장소 초기화\n# -----------------\nmodel_comparison = {}  #Dictionary to store the comparison metrics of models\nmodel_eval_comparison = {}                        \ndrop_no = 1\nstart_time = time.time()\n\n\nml_churner_df = test_transform(ml_churner_df)\nafter_drop_cnt = len(ml_churner_df)\n\n# ML 데이터 분리\n# --------------\nX=ml_churner_df.drop([\'is_churned\'],axis=1)\ny=ml_churner_df[\'is_churned\']\n\n\n# 중요 Feature Column 선택\n# -----------------------\n# X_new, selected_columns = select_feature(X, y, \'Xg Boost\')\nX_new, selected_columns = select_feature(X, y, \'ExtraTrees\')\n\n\n# Train and Test 데이터 생성 및 가공\n# ---------------------------------\nX_train, y_train, X_test, y_test = proc_smote(X_new, y)\nX_train_for_normalization = X_train.copy()\nafter_smote_cnt = len(X_train)\n\n\n# Normalization\n# -------------\nX_train, X_test = proc_normalization(X_train, X_test)    \n\n\n# Pridict 및 Test 평가\n# --------------------\nproc_type=\'T\'\ntest_auc = fit_predict_eval(proc_type, dro

### 평가 단계 ~ 평가자가 Competition 평가를 위해 사용하는 단계

#### 데이터 로딩

In [19]:
eval_df = pd.read_csv("./data/test_churner.csv") # load the evaluation data - please set the path of the evaluation data here!!!
# eval_df = pd.read_csv("./data/test_churner_kaggle_all.csv") # alternative evaluation file

fit_df = pd.read_csv("./data/bank_churner.csv") # load the training data
tot_cnt = len(eval_df)  # number of evaluation rows

#### 평가 및 결과

In [61]:
# Reload both data sets from disk
fit_df = pd.read_csv("./data/bank_churner.csv") # load the training data
eval_df = pd.read_csv("./data/test_churner.csv")

# Manual preprocessing of the training data: drop 'sex', fill missing numeric
# values with the column mean, then drop the rows that still contain nulls
fit_df = fit_df.drop('sex', axis=1)
fit_df = fit_df.fillna(fit_df.mean(numeric_only=True))
fit_df.dropna(axis=0, inplace=True)
# fit_df = test_transform(fit_df)

# The remaining pipeline steps are disabled in this cell:
# X_train=fit_df.drop(['is_churned'],axis=1)
# y_train=fit_df['is_churned']
# X_eval=eval_df.drop(['is_churned'],axis=1)
# y_eval=eval_df['is_churned']


# X_new, selected_columns = select_feature(X_train, y_train, 'ExtraTrees')
# X_eval = X_eval[selected_columns]

# X_train, y_train, X_test_temp, y_test_temp = proc_smote(X_new, y_train)

# X_train, X_eval = proc_normalization(X_train, X_eval.values)   

In [63]:
fit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8101 entries, 0 to 8100
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   cstno                  8101 non-null   int64  
 1   is_churned             8101 non-null   int64  
 2   age                    8101 non-null   int64  
 3   dependent_num          8101 non-null   int64  
 4   education              8101 non-null   object 
 5   marital_stat           8101 non-null   object 
 6   imcome_cat             6482 non-null   object 
 7   card_type              8101 non-null   object 
 8   mon_on_book            8101 non-null   int64  
 9   tot_product_count      8101 non-null   int64  
 10  months_inact_for_12m   8101 non-null   int64  
 11  contact_cnt_for_12m    8101 non-null   int64  
 12  credit_line            8101 non-null   float64
 13  tot_revol_balance      8101 non-null   float64
 14  mean_open_to_buy       8101 non-null   float64
 15  tot_

In [62]:
fit_df.shape
# fit_df.shape, X_train.shape

(8101, 20)

In [66]:
from itertools import combinations

model_comparison = {}  # Dictionary to store the comparison metrics of models
model_eval_comparison = {}  # metrics of the evaluation runs, filled by fit_predict_eval in the loop below

def drop_null_column(df, drop_list):
    '''
        Return the DataFrame without the listed columns (rows are left untouched).

        Args:
            df (df) : DataFrame
            drop_list (list) : names of the columns to remove
        Return:
            DataFrame without the listed columns; the input object itself
            when drop_list is empty
    '''
    remaining = df
    for name in drop_list:
        # Columns are removed one at a time; each drop yields a new frame
        remaining = remaining.drop(columns=name)

    return remaining

def test_transform(df, drop_list):
    '''
        Prepare a raw churn DataFrame for modelling, dropping a configurable set of columns.

        Steps: drop 'cstno' and every column in drop_list, replace the
        'imcome_cat' labels with numbers (unless that column was dropped),
        fill missing numeric values with the column mean, drop rows that still
        contain nulls, and one-hot encode via encode_onehot().

        Args:
            df (df) : DataFrame containing 'cstno' and the columns in drop_list
            drop_list (list) : names of the additional columns to remove
        Return:
            transformed DataFrame
    '''
    # Numeric value substituted for each income category label
    income_values = {'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000}

    # Data transformation: the customer number first, then the requested columns
    # --------------------------------------------------------------------------
    for col_name in ['cstno', *drop_list]:
        df = df.drop(col_name, axis=1)

    if 'imcome_cat' not in drop_list:
        df['imcome_cat'] = df['imcome_cat'].replace(income_values)

    # Missing values: mean-fill numeric columns, then drop rows that are still incomplete
    # -----------------------------------------------------------------------------------
    df = df.fillna(df.mean(numeric_only=True)).dropna(axis=0)

    # One-Hot Encoding
    # ----------------
    return encode_onehot(df)

        
# -----------
# Prediction
# -----------
# For every non-empty combination of the candidate columns below, this script
# drops that combination, rebuilds the features, trains the model and scores
# it on the evaluation file, printing one log line per combination.

# Load the data (the customer number column is removed later, inside test_transform)
fit_df = pd.read_csv("./data/bank_churner.csv") # load the training data
eval_df = pd.read_csv("./data/test_churner.csv")
tot_cnt = fit_df.shape  # (rows, columns) of the raw training data

# Keep pristine copies so every iteration starts from the raw data
fit_df_org = fit_df.copy()
eval_df_org = eval_df.copy()

best_auc = 0

# Build the list of column combinations to drop
# (8 candidate columns -> 2**8 - 1 = 255 non-empty combinations)
result_list = []
drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
for j in range(1, len(drop_target_columns)+1):
    for i in combinations(drop_target_columns, j):
        result_list.append(list(i))

# result_list = [['sex'], ['sex', 'age', 'imcome_cat']]

for drop_no, drop_column in enumerate(result_list):
    start_time = time.time()

    # Rebind to the pristine frames (no copy is made here; test_transform
    # reassigns its local frame through drop(), which returns a new object)
    fit_df = fit_df_org
    eval_df = eval_df_org
    
    # -----------------------------------------------------------------------------------    
    # Evaluation for the competition
    # -----------------------------------------------------------------------------------

    # Preprocessing: the same transformation is applied to both data sets independently
    # ---------------------------------------------------------------------------------
    fit_df = test_transform(fit_df, drop_column)
    eval_df = test_transform(eval_df, drop_column)
    after_drop_cnt = len(fit_df)
    
    
    # Split features and target for training and evaluation
    # -----------------------------------------------------
    X_train=fit_df.drop(['is_churned'],axis=1)
    y_train=fit_df['is_churned']
    X_train_cnt = X_train.shape
    
    X_eval=eval_df.drop(['is_churned'],axis=1)
    y_eval=eval_df['is_churned']


    # Select the important feature columns
    # ------------------------------------
    X_new, selected_columns = select_feature(X_train, y_train, 'ExtraTrees')
    # NOTE(review): raises KeyError if the evaluation frame lacks one of the selected
    # columns (e.g. a one-hot column for a category absent from the evaluation file)
    X_eval = X_eval[selected_columns]


    # Create the training data (split + SMOTE)
    # ----------------------------------------
    # NOTE(review): the hold-out part returned by proc_smote (X_test_temp, y_test_temp)
    # is not used afterwards, so the model is trained on the resampled 75% only
    X_train, y_train, X_test_temp, y_test_temp = proc_smote(X_new, y_train)
    after_smote_cnt = X_train.shape

    # Scale the training and evaluation data with the scaler fitted on the training data
    # ----------------------------------------------------------------------------------
    X_train, X_eval = proc_normalization(X_train, X_eval.values)   


    # Final evaluation
    # ----------------
    proc_type='E'
    group_no=1
    # eval_auc = fit_predict(proc_type, drop_no, model_eval_comparison, X_train_for_evaluation, y_train_for_evaluation, X_eval, y_eval)
    eval_auc = fit_predict_eval(proc_type, drop_no, group_no, model_eval_comparison, X_train, y_train, X_eval, y_eval)
    
    # NOTE(review): best_type is only assigned here, so the print below raises
    # NameError if the first eval_auc is not greater than 0.
    # NOTE(review): the best combination is chosen by its score on the evaluation data itself.
    if eval_auc > best_auc:
        best_type = f'{proc_type}_{drop_no}_{group_no}'
        best_auc = eval_auc
        


    # Print the log line for this combination
    # ---------------------------------------
    cur_datetime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    end_time = time.time()
    delta_time = end_time - start_time
    print(f'[평  가] {cur_datetime}, {str(datetime.timedelta(seconds=delta_time)).split(".")[0]}, [{proc_type}_{drop_no}_{group_no}], best-type: [{best_type}], Best-AUC: {best_auc:0.6f}, AUC: {eval_auc:0.6f}, tot_cnt: {tot_cnt}, after_drop_cnt: {after_drop_cnt}, after_smote_cnt: {after_smote_cnt}, X_train_cnt: {X_train_cnt}, drop_column: {drop_column}')


    # print_eval_result(model_eval_comparison)

[평  가] 2023-09-17 17:42:30, 0:00:06, [E_0_1], best-type: [E_0_1], Best-AUC: 0.989888, AUC: 0.989888, tot_cnt: (8101, 21), after_drop_cnt: 8101, after_smote_cnt: (10200, 13), X_train_cnt: (8101, 30), drop_column: ['sex']
[평  가] 2023-09-17 17:42:37, 0:00:07, [E_1_1], best-type: [E_0_1], Best-AUC: 0.989888, AUC: 0.989881, tot_cnt: (8101, 21), after_drop_cnt: 7293, after_smote_cnt: (9188, 14), X_train_cnt: (7293, 31), drop_column: ['imcome_cat']
[평  가] 2023-09-17 17:42:44, 0:00:07, [E_2_1], best-type: [E_0_1], Best-AUC: 0.989888, AUC: 0.983981, tot_cnt: (8101, 21), after_drop_cnt: 7293, after_smote_cnt: (9188, 13), X_train_cnt: (7293, 31), drop_column: ['tot_amt_ratio_q4_q1']
[평  가] 2023-09-17 17:42:50, 0:00:06, [E_3_1], best-type: [E_3_1], Best-AUC: 0.990320, AUC: 0.990320, tot_cnt: (8101, 21), after_drop_cnt: 7293, after_smote_cnt: (9188, 13), X_train_cnt: (7293, 31), drop_column: ['mean_util_pct']
[평  가] 2023-09-17 17:42:58, 0:00:07, [E_4_1], best-type: [E_3_1], Best-AUC: 0.990320, AUC:

In [67]:
print_eval_result(model_eval_comparison)


Unnamed: 0,Accuracy,Accuracy-No,Accuracy-Yes,F1-Score,CV AUC,CV std,AUC
LightGBM_E_85_1,96.25%,97.88%,87.77%,96.24%,99.22%,0.014537,0.990331
LightGBM_E_140_1,96.25%,97.88%,87.77%,96.24%,99.22%,0.014537,0.990331
LightGBM_E_3_1,96.25%,97.76%,88.38%,96.25%,99.27%,0.013735,0.99032
LightGBM_E_16_1,96.25%,97.76%,88.38%,96.25%,99.27%,0.013735,0.99032
LightGBM_E_6_1,96.30%,97.70%,88.99%,96.30%,99.20%,0.014922,0.989935
LightGBM_E_19_1,96.30%,97.70%,88.99%,96.30%,99.20%,0.014922,0.989935
LightGBM_E_27_1,96.54%,97.94%,89.30%,96.54%,99.23%,0.014449,0.989902
LightGBM_E_54_1,96.30%,97.47%,90.21%,96.32%,99.18%,0.01549,0.989901
LightGBM_E_8_1,96.25%,97.53%,89.60%,96.27%,99.39%,0.011514,0.989888
LightGBM_E_0_1,96.25%,97.53%,89.60%,96.27%,99.39%,0.011514,0.989888
