# JBFG Data Analysis Competition
- Null 처리방식 : 2번

## PC Environment and Library Version
***

In [None]:
# Environment/version report cell, disabled by wrapping it in a string literal.
# Remove the surrounding triple quotes to run the IPython magics below.
'''
#!pip install watermark
%load_ext watermark
%watermark -a 'DataLine' -nmv --packages numpy,pandas,sklearn,imblearn,tensorflow,plotly,matplotlib,seaborn,missingno,lightgbm
'''

## 탐색적 데이터 분석
***

### Library and Data Loading, Function Definition

#### Import Library for Data Analysis

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl  
import missingno as msno
import warnings

import plotly.express as px
import plotly.graph_objs as go
# import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()


# Notebook-wide display configuration: warning filter, seaborn theme, inline plots, fonts.
warnings.filterwarnings('ignore')
sns.set_theme(style="whitegrid")
%matplotlib inline

mpl.rc('font', family='Malgun Gothic')  # Korean font setting
                                        # Windows font location - C:\Windows\Fonts
plt.figure(figsize=(10,6))              # graph size setting; NOTE(review): this opens a new figure rather than changing a default - confirm intent
sns.set(font='Malgun Gothic', rc={'axes.unicode_minus':False}, style='darkgrid') # minus-sign handling


### Function Definition

#### 연속형 데이터 그래프 함수

In [None]:
def print_continuous_graphs(df, column, column_desc):
    """Show a 3x2 Plotly figure comparing retained and churned rows for one column.

    Rows with ``is_churned == 0`` are treated as retained and rows with
    ``is_churned != 0`` as churned. The churn rate per distinct value is the
    count of rows with ``is_churned == 1`` divided by the count of all rows.

    Args:
        df: DataFrame containing ``column`` and ``is_churned``.
        column: Name of the column to plot.
        column_desc: Accepted for symmetry with ``print_category_graphs``;
            not used by this function.
    """
    retained_values = df[df['is_churned'] == 0][column]
    churned_values = df[df['is_churned'] != 0][column]

    # Per-value counts and churn rate, ordered by value for plotting.
    retained_by_value = retained_values.value_counts().sort_index()
    churned_by_value = churned_values.value_counts().sort_index()
    rate_by_value = (
        df[df['is_churned'] == 1][column].value_counts() / df[column].value_counts()
    ).sort_index()

    def churn_rate_line():
        # A fresh trace is needed for every subplot that shows the rate.
        return go.Scatter(x=rate_by_value.index, y=rate_by_value,
                          marker_color="green", name='이탈율', line_shape='linear')

    fig = make_subplots(
        rows=3,
        cols=2,
        subplot_titles=('전체 건수 분포', '유지/이탈별 사분위', '유지/이탈별 분포'),
        horizontal_spacing=0.1,
        vertical_spacing=0.1,
        specs=[
            [{"secondary_y": True}, {}],
            [{}, {"secondary_y": True}],
            [{"secondary_y": True}, {}],
        ],
    )

    # Row 1, col 1: stacked histograms with the churn rate on a secondary axis.
    fig.add_trace(go.Histogram(x=churned_values, marker_color="red", name='이탈'),
                  row=1, col=1, secondary_y=False)
    fig.add_trace(go.Histogram(x=retained_values, marker_color="blue", name='유지'),
                  row=1, col=1, secondary_y=False)
    fig.add_trace(churn_rate_line(), row=1, col=1, secondary_y=True)
    fig.update_yaxes(secondary_y=True, range=[0, 1], row=1, col=1)

    # Row 1, col 2: box plots per group.
    fig.add_trace(go.Box(x=retained_values.sort_values(), name='유지', marker_color="blue"),
                  row=1, col=2)
    fig.add_trace(go.Box(x=churned_values.sort_values(), name='이탈', marker_color="red"),
                  row=1, col=2)

    # Row 2, col 1: histograms per group (unnamed traces).
    fig.add_trace(go.Histogram(x=retained_values, marker_color="blue"), row=2, col=1)
    fig.add_trace(go.Histogram(x=churned_values, marker_color="red"), row=2, col=1)

    # Row 2, col 2: per-value counts as lines, churn rate on the secondary axis.
    fig.add_trace(go.Scatter(x=churned_by_value.index, y=churned_by_value,
                             mode='lines+markers', marker_color="red", name='이탈'),
                  row=2, col=2, secondary_y=False)
    fig.add_trace(go.Scatter(x=retained_by_value.index, y=retained_by_value,
                             mode='lines+markers', marker_color='blue', name='유지'),
                  row=2, col=2, secondary_y=False)
    fig.add_trace(churn_rate_line(), row=2, col=2, secondary_y=True)

    # Row 3, col 1: histogram of the churn-rate values themselves.
    fig.add_trace(go.Histogram(x=rate_by_value), row=3, col=1)
    fig.update_yaxes(secondary_y=True, range=[0, 1], row=3, col=1)

    fig.update_layout(
        width=1200,
        height=1200,
        showlegend=False,
        barmode='stack',
        hovermode="x",
        template='plotly_dark',
    )
    fig.show()


In [None]:
def category_func(df, column, start_value, units):
    """Add a binned ``<column>_category`` column to ``df`` and return ``df``.

    Bin edges run from ``start_value`` in steps of ``units`` up to the first
    edge at or beyond the column maximum. Intervals are right-closed, the
    first one also includes its left edge, and each bin is labelled with its
    left edge rounded to two decimals.

    Args:
        df: DataFrame to modify in place.
        column: Name of the numeric column to bin.
        start_value: Left edge of the first bin.
        units: Width of each bin.

    Returns:
        The same ``df`` object with the new categorical column.
    """
    edges = np.arange(start_value, df[column].max() + units, units)
    left_edge_labels = [str(round(edge, 2)) for edge in edges[:-1]]
    df[f"{column}_category"] = pd.cut(
        df[column],
        bins=edges,
        labels=left_edge_labels,
        right=True,
        include_lowest=True,
    )
    return df

#### 범주형 데이터 그래프 함수

In [None]:
def print_category_graphs(df, column, column_desc):
    """Show a 3x2 Plotly figure summarising churn for a categorical column.

    Panels: stacked churned/retained bars with the churn rate on a secondary
    axis, a churn-rate bar chart, box plots per group, and three donut charts
    (all rows, retained rows, churned rows).

    Args:
        df: DataFrame containing ``column`` and ``is_churned``.
        column: Name of the column to analyse.
        column_desc: Human-readable column description used in the titles.
    """
    is_churned = df['is_churned']

    # Per-category counts, all ordered by category value.
    all_counts = df[column].value_counts().sort_index()
    retained_counts = df[is_churned == 0][column].value_counts().sort_index()
    churned_counts = df[is_churned != 0][column].value_counts().sort_index()
    churn_rate = (
        df[is_churned == 1][column].value_counts() / df[column].value_counts()
    ).sort_index()

    fig = make_subplots(
        rows=3,
        cols=2,
        subplot_titles=('【 전체 현황 】', '【 이탈율 】', '【 사분위 】', f'【 {column_desc} 중 전체 현황 】', f'【 {column_desc} 중 유지 현황 】', f'【 {column_desc} 중 이탈 현황 】'),
        horizontal_spacing=0.1,
        vertical_spacing=0.1,
        specs=[
            [{"secondary_y": True}, {}],
            [{}, {'type': 'domain'}],
            [{'type': 'domain'}, {'type': 'domain'}],
        ],
    )

    # Overall status: churned bars first, retained bars based on top of them.
    fig.add_trace(
        go.Bar(x=churned_counts.index, y=churned_counts, marker_color="red",
               offsetgroup=0, name='이탈',
               text=churned_counts,
               hovertemplate='%{label}: %{value:,}',
               textposition='auto'),
        row=1, col=1, secondary_y=False)
    fig.add_trace(
        go.Bar(x=retained_counts.index, y=retained_counts, marker_color="blue",
               offsetgroup=0, name='유지',
               texttemplate='%{value:,}',
               hovertemplate='%{label}: %{value:,}',
               textposition='auto', base=churned_counts),
        row=1, col=1, secondary_y=False)
    fig.add_trace(
        go.Scatter(x=churn_rate.index, y=churn_rate, marker_color="green",
                   name='이탈율', line_shape='linear'),
        row=1, col=1, secondary_y=True)
    fig.update_yaxes(secondary_y=True, range=[0, 1], row=1, col=1)

    # Churn rate per category.
    fig.add_trace(
        go.Bar(x=churn_rate.index, y=churn_rate, marker_color="red", name='이탈율'),
        row=1, col=2)

    # Box plots per group.
    fig.add_trace(
        go.Box(x=df[is_churned != 0][column].sort_values(), marker_color="red", name='이탈'),
        row=2, col=1)
    fig.add_trace(
        go.Box(x=df[is_churned == 0][column].sort_values(), marker_color="blue", name='유지'),
        row=2, col=1)

    def add_donut(sorted_counts, trace_name, row, col):
        # Add the pie, then turn the traces of that cell into donuts.
        fig.add_trace(
            go.Pie(labels=sorted_counts.index, values=sorted_counts, name=trace_name,
                   texttemplate="%{label}: %{value:,} <br>(%{percent})",
                   textposition="inside"),
            row=row, col=col)
        fig.update_traces(hole=.4, hoverinfo="label+percent+name", row=row, col=col)

    add_donut(all_counts, f'{column_desc} 분표 현황', 2, 2)
    add_donut(retained_counts, "유지", 3, 1)
    add_donut(churned_counts, "이탈", 3, 2)

    # Centre captions for the three donuts, in paper coordinates.
    for x_pos, y_pos, caption in (
        (0.73, 0.5, "<b>전체</b>"),
        (0.21, 0.13, "<b>유지</b>"),
        (0.73, 0.13, "<b>이탈</b>"),
    ):
        fig.add_annotation(dict(x=x_pos, y=y_pos, ax=0, ay=0,
                                xref="paper", yref="paper",
                                text=caption,
                                font_size=20))

    fig.update_layout(
        width=1200,
        height=1200,
        showlegend=False,
        title_text=f'『 {column_desc} 』에 따른 분석 그래프',
        hovermode="x",
        template='plotly_dark',
    )

    fig.show()

#### 컬럼 데이터 및 Null 건수 확인

In [None]:

def change_format(df, column, format):
    """Replace a column's values with formatted strings, in place.

    Args:
        df: DataFrame to modify.
        column: Name of the column to format.
        format: ``'comma'`` for thousands separators or ``'percent'`` for a
            two-decimal percentage. Any other value leaves ``df`` untouched.

    Returns:
        The same ``df`` object.
    """
    if format == 'comma':
        spec = ','
    elif format == 'percent':
        spec = '.2%'
    else:
        return df

    df[column] = df[column].apply(lambda value: f"{value:{spec}}")
    return df


def count_column_na_count(df, column):
    """Print total, non-null and null counts (and their shares) for one column.

    The one-row summary is formatted with ``change_format`` and printed,
    followed by a 70-character dashed separator.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to summarise.
    """
    values = df[column]
    summary = pd.Series((values.size, values.count(), values.isnull().sum())).to_frame().T
    summary.columns = ['tot_counts', 'data_counts', 'null_counts']

    # Shares are computed from the raw numbers before anything becomes a string.
    totals = summary['tot_counts'].values
    for count_col, share_col in (('data_counts', 'data_percents'),
                                 ('null_counts', 'null_percents')):
        summary[share_col] = summary[count_col].values / totals

    for col_name, style in (
        ('tot_counts', 'comma'),
        ('data_counts', 'comma'),
        ('null_counts', 'comma'),
        ('data_percents', 'percent'),
        ('null_percents', 'percent'),
    ):
        summary = change_format(summary, col_name, style)

    print(summary.to_string(index=False))
    print('-' * 70)


def count_column_data_count(df, column):
    """Print retained/churned counts and shares for every value of ``column``.

    Rows are grouped by ``column`` and split on ``is_churned`` (0 = retained,
    1 = churned). The table is sorted by churn share, highest first, formatted
    with ``change_format`` and printed.

    Missing value/outcome combinations are filled with 0 *before* the total is
    computed, so a value with only one outcome gets a correct total, and a
    data set in which one outcome never occurs no longer raises ``KeyError``.
    Counts stay integers, so they print as ``5`` rather than ``5.0``.

    Args:
        df: DataFrame containing ``column`` and ``is_churned``.
        column: Name of the column to group by.
    """
    grouped = df.groupby(column)['is_churned']

    def pivot_outcomes(normalize, names):
        # One row per column value, one column per outcome; absent pairs become 0.
        table = grouped.value_counts(normalize=normalize).unstack(fill_value=0)
        # Make sure both outcome columns exist even if one never occurs.
        table = table.reindex(columns=table.columns.union([0, 1]), fill_value=0)
        return table.rename(columns=names)

    column_counts = pivot_outcomes(False, {0: 'exist_counts', 1: 'churned_counts'})
    column_counts['total_counts'] = column_counts['exist_counts'] + column_counts['churned_counts']

    column_percents = pivot_outcomes(True, {0: 'exist_percents', 1: 'churned_percents'})

    column_count_percent = pd.concat([column_counts, column_percents], axis=1)
    column_count_percent = column_count_percent.reset_index()
    # Sort while the shares are still numeric.
    column_count_percent = column_count_percent.sort_values(by='churned_percents', ascending=False)

    for col_name, style in (
        ('exist_counts', 'comma'),
        ('churned_counts', 'comma'),
        ('total_counts', 'comma'),
        ('exist_percents', 'percent'),
        ('churned_percents', 'percent'),
    ):
        column_count_percent = change_format(column_count_percent, col_name, style)

    print(column_count_percent.to_string(index=False))

## Machine Learning
***

### Import Library and Data Loading, Function Definition for Machine Learning

#### Import Library

In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

# import scikitplot as skplt
from imblearn.over_sampling import SMOTE

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFE

#### Function Definition

In [None]:
# 평가 함수 출력
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print accuracy, precision, recall, F1, weighted F1 and ROC AUC on one line.

    Args:
        y_test: True labels.
        pred: Predicted labels.
        pred_proba: Scores passed to ``roc_auc_score``.
    """
    # Computed but not included in the printed output.
    confusion = confusion_matrix(y_test, pred)

    scores = (
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
        f1_score(y_test, pred),
        f1_score(y_test, pred, average='weighted'),
        roc_auc_score(y_test, pred_proba),
    )

    template = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},          F1: {3:.4f}, F11: {4:.4f}, AUC:{5:.4f}'
    print(template.format(*scores))

In [None]:
def precision_recall_curve_plot(y_test=None, pred_proba=None):
    """Plot precision and recall against the decision threshold.

    Args:
        y_test: True labels.
        pred_proba: Scores passed to ``precision_recall_curve``.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba)

    # The metric arrays are trimmed to the number of thresholds before plotting.
    n_thresholds = thresholds.shape[0]

    plt.figure(figsize=(8, 6))
    # Precision is drawn dashed, recall solid.
    plt.plot(thresholds, precisions[:n_thresholds], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[:n_thresholds], label='recall')

    # Put an x tick every 0.1 across the current axis range.
    x_low, x_high = plt.xlim()
    plt.xticks(np.round(np.arange(x_low, x_high, 0.1), 2))

    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall value')
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
def precision_recall_curve_plot(y_test=None, pred_proba=None):
    """Draw precision (dashed) and recall (solid) as functions of the threshold.

    This cell redefines the function with the same behaviour as the
    definition in the preceding cell.

    Args:
        y_test: True labels.
        pred_proba: Scores passed to ``precision_recall_curve``.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba)
    threshold_count = thresholds.shape[0]

    plt.figure(figsize=(8, 6))
    curves = (
        (precisions, {'linestyle': '--', 'label': 'precision'}),
        (recalls, {'label': 'recall'}),
    )
    for metric_values, line_options in curves:
        # Only the first ``threshold_count`` metric values are plotted.
        plt.plot(thresholds, metric_values[0:threshold_count], **line_options)

    # X ticks in steps of 0.1 over the current axis range.
    axis_start, axis_end = plt.xlim()
    tick_positions = np.arange(axis_start, axis_end, 0.1)
    plt.xticks(np.round(tick_positions, 2))

    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall value')
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
def roc_curve_plot(y_test, pred_proba):
    """Plot the ROC curve together with the diagonal of a random classifier.

    Args:
        y_test: True labels.
        pred_proba: Scores passed to ``roc_curve``.
    """
    # FPR and TPR values for each threshold.
    fprs, tprs, thresholds = roc_curve(y_test, pred_proba)

    plt.plot(fprs, tprs, label='ROC')
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], 'k--', label='Random')

    # X ticks every 0.1 over the current range, then clamp both axes to [0, 1].
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1), 2))
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    # The false positive rate equals 1 - specificity (the label previously
    # said "1 - Sensitivity", which would be the false negative rate).
    plt.xlabel('FPR( 1 - Specificity )')
    plt.ylabel('TPR( Recall )')
    plt.legend()
    plt.show()
    

#### Data Loading

In [None]:
# Load the training data from a path relative to the working directory.
ml_churner_df = pd.read_csv("./data/bank_churner.csv")

### 전처리(Pre-Processing)

#### 불필요한 컬럼 삭제

In [None]:
# Remove the 'cstno' column (presumably the customer number - verify against the data dictionary).
ml_churner_df = ml_churner_df.drop('cstno', axis=1)

#### 전처리 함수 정의

#### 결측(Null) 데이터 확인 및 처리

In [None]:
def tot_column_counts(df):
    """Print, for every column, its total / non-null / null counts and shares.

    Rows are ordered by null share, highest first. Numbers are formatted with
    ``change_format`` before printing.

    Args:
        df: DataFrame to summarise.
    """
    non_null = df.count()
    nulls = df.isnull().sum()

    # Two unnamed Series concatenate into columns 0 and 1.
    report = pd.concat([non_null, nulls], axis=1).rename(
        columns={0: 'data_counts', 1: 'null_counts'}
    )
    report.insert(0, 'tot_counts', report['data_counts'] + report['null_counts'])

    totals = report['tot_counts'].values
    report['data_percents'] = report['data_counts'].values / totals
    report['null_percents'] = report['null_counts'].values / totals
    # Sort while the shares are still numeric.
    report = report.sort_values(by='null_percents', ascending=False)

    for col_name, style in (
        ('tot_counts', 'comma'),
        ('data_counts', 'comma'),
        ('null_counts', 'comma'),
        ('data_percents', 'percent'),
        ('null_percents', 'percent'),
    ):
        report = change_format(report, col_name, style)

    # Turn the column names (the index) into a regular column for printing.
    report = report.reset_index()

    print(report.to_string(index=False))

In [None]:
# tot_trans_cnt_for_12m      8,101       4,851       3,250        59.88%        40.12% : 중요도 : 1순위
#         mean_util_pct      8,101       5,575       2,526        68.82%        31.18% : 중요도 : 6순위
#   tot_amt_ratio_q4_q1      8,101       5,666       2,435        69.94%        30.06% : 중요도 : 8순위   Drop - 3순위
# tot_trans_amt_for_12m      8,101       6,432       1,669        79.40%        20.60% : 중요도 : 3순위
#   tot_cnt_ratio_q4_q1      8,101       6,472       1,629        79.89%        20.11% : 중요도 : 5순위
#            imcome_cat      8,101       6,482       1,619        80.01%        19.99% : 중요도 : X,     Drop - 2순위
#     tot_revol_balance      8,101       6,580       1,521        81.22%        18.78% : 중요도 : 2순위
#                   sex      8,101       7,293         808        90.03%         9.97%  : 중요도 : X,    Drop - 1순위

# 공선성
# ------ 
# age와 mon_on_book, 
# credit_line과 mean_open_to_buy, 
# tot_trans_cnt_for_12m와 tot_trans_amt_for_12m 

# Feature 중요도
# --------------
# tot_trans_cnt_for_12m - tot_revol_balance - tot_trans_amt_for_12m - tot_product_count - tot_cnt_ratio_q4_q1 - 
# mean_util_pct - contact_cnt_for_12m
# tot_amt_ratio_q4_q1 - months_inact_for_12m - credit_line - mean_open_to_buy - mon_on_book - age



def drop_null_column(df, drop_list):
    """Drop a fixed set of columns, then drop every row that still has a null.

    Args:
        df: Input DataFrame; it is not modified.
        drop_list: Ignored by this version, which uses a hard-coded list.

    Returns:
        A new DataFrame without the listed columns and without null rows.
    """
    # Columns removed because they contain nulls.
    null_columns = ['sex', 'imcome_cat', 'tot_amt_ratio_q4_q1']
    # Columns removed to handle multicollinearity.
    collinear_columns = ['age', 'mean_open_to_buy']

    trimmed = df.drop(columns=null_columns + collinear_columns)
    return trimmed.dropna(axis=0)

#### 원-핫(One-Hot) 인코딩

In [None]:

def encode_onehot(df):
    """One-hot encode every column whose dtype is not ``int64`` or ``float64``.

    Args:
        df: Input DataFrame.

    Returns:
        A tuple ``(encoded_df, catcols)``: the frame with the selected columns
        replaced by dummy columns, and the Index of the encoded column names.
    """
    catcols = df.select_dtypes(exclude=['int64', 'float64']).columns
    encoded = pd.get_dummies(df, columns=catcols)
    return encoded, catcols

# ml_churner_df = pd.read_csv("./data/bank_churner.csv")
# ml_churner_df = ml_churner_df.drop('cstno', axis=1)

# ddf, catcols = encode_onehot(ml_churner_df)

In [None]:
# Notebook display cell: show the categorical column names returned by encode_onehot.
# NOTE(review): requires an earlier cell to have assigned `catcols`.
catcols

#### 학습에 사용될 중요 Feature 식별

In [None]:
def select_feature(df, model, target=None):
    """Fit one estimator and keep the features ``SelectFromModel`` selects.

    Changes from the earlier version: only the requested estimator is
    instantiated, the labels can be passed explicitly, and the unused
    ``feature_importances_`` read (which raised ``AttributeError`` for
    estimators without that attribute) is gone.

    Args:
        df: Feature DataFrame.
        model: Key naming the estimator: ``'ExtraTrees'``, ``'RandomForest'``,
            ``'SVM'``, ``'KNN'``, ``'LASSO'`` or ``'RFE'``.
        target: Labels to fit against. When omitted, the module-level ``y`` is
            used, as before.

    Returns:
        A tuple ``(X_df, selected_columns)``: the reduced feature array and
        the names of the selected columns.

    Raises:
        KeyError: If ``model`` is not one of the supported keys.
    """
    # Seed NumPy's global RNG for reproducibility.
    np.random.seed(42)

    # Factories, so that only the chosen estimator is constructed.
    model_factories = {
        'ExtraTrees': lambda: ExtraTreesClassifier(n_estimators=100),
        'RandomForest': lambda: RandomForestClassifier(n_estimators=100),
        'SVM': lambda: SVC(kernel='linear'),
        'KNN': lambda: KNeighborsClassifier(n_neighbors=5),
        'LASSO': lambda: Lasso(alpha=0.01),
        'RFE': lambda: RFE(estimator=RandomForestClassifier(n_estimators=100),
                           n_features_to_select=13),
    }

    labels = y if target is None else target

    clf = model_factories[model]().fit(df.values, labels)

    # NOTE(review): SelectFromModel derives importances from the fitted
    # estimator; confirm that 'KNN' and 'RFE' expose what it needs.
    selector = SelectFromModel(clf, prefit=True)
    X_df = selector.transform(df.values)

    selected_feature_indices = selector.get_support(indices=True)
    selected_columns = df.columns[selected_feature_indices]

    return X_df, selected_columns

In [None]:
def proc_smote(X_new, y):
    """Split into train/test sets and oversample the training set with SMOTE.

    The split is stratified on ``y`` with 25% held out and a fixed seed. Only
    the training part is resampled; the test part is returned untouched. The
    split used to be computed twice with identical arguments; it now runs once.

    Args:
        X_new: Feature matrix.
        y: Labels.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` — note the order.
    """
    from sklearn.model_selection import train_test_split
    from imblearn.over_sampling import SMOTE

    X_train, X_test, y_train, y_test = train_test_split(
        X_new, y, test_size=0.25, stratify=y, random_state=0
    )

    sm = SMOTE(sampling_strategy='auto', random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    return X_train, y_train, X_test, y_test


#### Normalization

In [None]:
def proc_normalization(X_train, X_test):
    """Standardise both sets with a scaler fitted on the training set only.

    Args:
        X_train: Training features; used to fit the scaler.
        X_test: Test features; transformed with the fitted scaler.

    Returns:
        ``(X_train_scaled, X_test_scaled)``.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(X_train)
    scaled_test = scaler.transform(X_test)
    return scaled_train, scaled_test

### 모델별 학습 및 평가

In [None]:
def fit_predict(proc_type, drop_no, model_comparison, X_train, y_train, X_test, y_test):
    """Fit LightGBM and XGBoost classifiers and record their test metrics.

    For each model, one entry is written to ``model_comparison`` under the key
    ``'<model name>_<proc_type>_<drop_no>'`` holding, in order: ``drop_no``,
    accuracy, accuracy on rows with label 0, accuracy on rows with label 1,
    weighted F1, mean and standard deviation of the cross-validation scores,
    and ROC AUC.

    Args:
        proc_type: Tag included in the result key.
        drop_no: Index of the dropped-column combination; stored in the result.
        model_comparison: Dict that receives the results (mutated in place).
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
    """
    from sklearn.metrics import accuracy_score, f1_score
    from sklearn.model_selection import cross_val_score
    from lightgbm import LGBMClassifier
    from xgboost import XGBClassifier

    candidates = [
        ('LightGBM', LGBMClassifier(n_estimators=500, random_state=42, boosting_type='GOSS')),
        ('Xg Boost', XGBClassifier()),
    ]

    # Row masks for the per-class accuracies; y_test does not change in the loop.
    label_is_0 = y_test == 0
    label_is_1 = y_test == 1

    for model_name, classifier in candidates:
        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_test)
        pred_proba = classifier.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # NOTE(review): flagged "needs checking" by the author. Cross-validation
        # runs on the test split and scores recall, although the results are
        # stored under accuracy-style names.
        cv_scores = cross_val_score(estimator=classifier, X=X_test, y=y_test, cv=5, scoring="recall")
        cv_accuracy = cv_scores.mean()
        cv_std = cv_scores.std()

        accuracy_class_0 = accuracy_score(y_test[label_is_0], y_pred[label_is_0])
        accuracy_class_1 = accuracy_score(y_test[label_is_1], y_pred[label_is_1])
        roc_auc = roc_auc_score(y_test, pred_proba)

        result_key = f'{model_name}_{proc_type}_{drop_no}'
        model_comparison[result_key] = [
            drop_no,
            accuracy,
            accuracy_class_0,
            accuracy_class_1,
            f1,
            cv_accuracy,
            cv_std,
            roc_auc,
        ]

        print("-" * 100)



### 학습 및 예측 결과

In [None]:
def print_eval_result(model_comparison):
    """Display the model comparison as a styled table, best AUC first.

    Columns 'Model Accuracy' through 'CV Accuracy' are shown as percentages
    and coloured red when below 75%. 'Drop No' is an index, so it is shown as
    a plain integer and excluded from the threshold colouring (it used to be
    rendered as a percentage). The maximum of every column is highlighted.

    Args:
        model_comparison: Dict mapping a result key to the eight-element
            metric list written by ``fit_predict``.
    """
    import pandas as pd

    if not model_comparison:
        # Nothing to tabulate; the column assignment below would fail.
        print('No model results to display.')
        return

    result_df = pd.DataFrame(model_comparison).T
    result_df.columns = ['Drop No', 'Model Accuracy', 'Model Accuracy-No', 'Model Accuracy-Yes',
                         'Model F1-Score', 'CV Accuracy', 'CV std', 'AUC']
    result_df = result_df.sort_values(by='AUC', ascending=False)

    # Only the ratio-valued metric columns get percentage treatment.
    percent_columns = pd.IndexSlice[:, 'Model Accuracy':'CV Accuracy']

    def highlight_below_75(s):
        if s.name != 'CV std' and isinstance(s, pd.Series) and s.dtype == 'float64':
            return ['color: red' if value < 0.75 else 'color: black' for value in s]
        else:
            return ['color: black'] * len(s)

    styled_df = (
        result_df.style
        .highlight_max(axis=0)
        .apply(highlight_below_75, subset=percent_columns)
        .format("{:.2%}", subset=percent_columns)
        .format("{:.0f}", subset=['Drop No'])
    )
    display(styled_df)

In [None]:
# Notebook display cell: show the type of the results container.
# NOTE(review): `model_comparison` must already be assigned by another cell.
type(model_comparison)

# 메인 처리 for 예측

In [None]:
# ml_churner_df['credit_line']
# Scratch cell: take a column name out of a list and use it to index the frame.
bb = ['credit_line']
for col in bb:
    cc = col

aa = 'credit_line'  # not used by the lookup below
ml_churner_df[cc]

In [None]:

def drop_null_column(df, drop_list):
    """Drop the given columns, then drop every row that still has a null.

    Null rows are removed once, after all requested columns are gone. The
    earlier version ran ``dropna`` inside the loop, so rows were discarded
    because of nulls in columns that were about to be dropped anyway.

    Args:
        df: Input DataFrame; it is not modified.
        drop_list: Names of the columns to remove.

    Returns:
        A new DataFrame without the listed columns and without null rows.

    Raises:
        KeyError: If a name in ``drop_list`` is not a column of ``df``.
    """
    remaining = df.drop(columns=list(drop_list))
    return remaining.dropna(axis=0)


from itertools import combinations

# Every non-empty combination of candidate columns to drop.
drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
result_list = [
    list(combo)
    for size in range(1, len(drop_target_columns) + 1)
    for combo in combinations(drop_target_columns, size)
]

# Read the file once; each combination starts from this unmodified frame.
base_churner_df = pd.read_csv("./data/bank_churner.csv").drop('cstno', axis=1)

for no, drop_column in enumerate(result_list):
    ml_churner_df = drop_null_column(base_churner_df, drop_column)
    print(f'구분 : {no}, 남은 갯수: {len(ml_churner_df)}, Drop Col:{drop_column}')

In [None]:
def drop_null_column(df, drop_list):
    """Drop the given columns one at a time, printing each name and its type.

    This version does not remove rows containing nulls.

    Args:
        df: Input DataFrame; it is not modified.
        drop_list: Names of the columns to remove.

    Returns:
        The frame without the listed columns (``df`` itself if the list is empty).
    """
    remaining = df
    for name in drop_list:
        print(name, type(name))
        remaining = remaining.drop(columns=name)

    return remaining
    

from itertools import combinations

# Every non-empty combination of candidate columns to drop.
result_list = []
drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
for j in range(1, len(drop_target_columns)+1):
    for i in combinations(drop_target_columns, j):
        result_list.append(list(i))

for drop_column in result_list:
    # Keep the result in a separate name: rebinding ml_churner_df made the
    # drops cumulative, so a later combination hit an already-removed column.
    remaining_df = drop_null_column(ml_churner_df, drop_column)
    print(f'남은 갯수: {len(remaining_df)}, Drop Col:{drop_column}')


In [None]:
# Notebook display cell: show the generated drop-column combinations.
result_list

In [None]:
# Number of candidate columns; Python lists have no .length() method.
len(drop_target_columns)

In [None]:
# df = df.drop('sex', axis=1)
# df = df.drop('imcome_cat', axis=1)
# df = df.drop('tot_amt_ratio_q4_q1', axis=1)
# # df = df.drop('mean_util_pct', axis=1)
# # df = df.drop('tot_trans_cnt_for_12m', axis=1)

# # 다중공선성 처리
# # ----------
# df = df.drop('age', axis=1)
# df = df.drop('mean_open_to_buy', axis=1)
# # df = df.drop('tot_trans_amt_for_12m', axis=1)
# Scratch cell: print each candidate column name, then repeat with a small test list.
drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']

for i in drop_target_columns:
    print(i)
    
# drop_target_columns    
# ml_churner_df = drop_null_column(ml_churner_df, ['tot_trans_cnt_for_12m'])


test_list = ['one', 'two', 'three'] 

for i in test_list:
    print(i)

In [None]:
# Notebook display cell: show the feature names kept by select_feature.
# NOTE(review): `selected_columns` must already be assigned by another cell.
selected_columns

# 메인 처리 함수

In [None]:
def drop_null_column(df, drop_list):
    """Drop the given columns, then drop every row that still has a null.

    ``dropna`` runs once, after all requested columns are removed. It used to
    run inside the loop, which discarded rows because of nulls in columns that
    a later iteration was going to drop.

    Args:
        df: Input DataFrame; it is not modified.
        drop_list: Names of the columns to remove.

    Returns:
        A new DataFrame without the listed columns and without null rows.

    Raises:
        KeyError: If a name in ``drop_list`` is not a column of ``df``.
    """
    without_columns = df.drop(columns=list(drop_list))
    return without_columns.dropna(axis=0)


from itertools import combinations

# result_list = []
# drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
# for j in range(1, len(drop_target_columns)+1):
#     for i in combinations(drop_target_columns, j):
#         result_list.append(list(i))

# for no, drop_column in enumerate(result_list):
#     ml_churner_df = pd.read_csv("./data/bank_churner.csv")
#     ml_churner_df = ml_churner_df.drop('cstno', axis=1)

#     ml_churner_df = drop_null_column(ml_churner_df, drop_column)
#     print(f'구분 : {no}, 남은 갯수: {len(ml_churner_df)}, Drop Col:{drop_column}')
    
    
    
    
# Experiment sweep: for every non-empty subset of drop_target_columns, rebuild
# the training data with that subset dropped, run the preprocessing helpers
# (encode_onehot, select_feature, proc_smote, proc_normalization), call
# fit_predict, and then call fit_predict again against ./data/test_churner.csv.
# Every helper except drop_null_column is defined elsewhere in the notebook.
model_comparison = {}  # Dictionary to store the comparison metrics of models
model_eval_comparison = {}  # passed to fit_predict in the evaluation phase (proc_type='E')

# -----------
# Prediction
# -----------

# Load the data and drop the customer number
# NOTE(review): this load is overwritten at the top of every loop iteration
# below before it is read, so it looks redundant.
ml_churner_df = pd.read_csv("./data/bank_churner.csv")
ml_churner_df = ml_churner_df.drop('cstno', axis=1)

# Null handling: list every non-empty combination of the candidate columns
result_list = []
drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
#drop_target_columns = ['sex','imcome_cat']
for j in range(1, len(drop_target_columns)+1):
    for i in combinations(drop_target_columns, j):
        result_list.append(list(i))


for drop_no, drop_column in enumerate(result_list):
    # Start again from the raw CSV for each combination.
    ml_churner_df = pd.read_csv("./data/bank_churner.csv")
    ml_churner_df = ml_churner_df.drop('cstno', axis=1)

    ml_churner_df = drop_null_column(ml_churner_df, drop_column)
    # Row count after the column/null drop, reported in the progress line below.
    after_null_drop_cnt = len(ml_churner_df)
       

    # encode_onehot's result is unpacked into two names here; catcols is not
    # used again in this cell.
    ml_churner_df, catcols = encode_onehot(ml_churner_df)  


    #We create our feature matrix and our target variable vector.
    X=ml_churner_df.drop(['is_churned'],axis=1)
    y=ml_churner_df['is_churned']

    # selected_columns is reused in the evaluation phase to index the test frame.
    X_new, selected_columns = select_feature(X, 'ExtraTrees')
    # display(selected_columns)

    X_train, y_train, X_test, y_test = proc_smote(X_new, y)
    # Keep the pre-normalisation training features for the evaluation phase.
    X_train_org = X_train.copy()

    X_train, X_test = proc_normalization(X_train, X_test)    

    print(f'구분: {drop_no}, X_train 건수: {len(X_new)}, X_train_SMOTE 건수: {len(X_train)}, After Drop 건수: {after_null_drop_cnt}, Drop Col:{drop_column}')
    
    # display(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    proc_type='P'
    fit_predict(proc_type, drop_no, model_comparison, X_train, y_train, X_test, y_test)

    # Called on every iteration with the same model_comparison dict.
    print_eval_result(model_comparison)



    # -----------
    # Evaluation
    # -----------
    # NOTE(review): the test CSV is re-read on every iteration although the
    # read itself does not depend on drop_column.
    test_df = pd.read_csv("./data/test_churner.csv")
    test_df = test_df.drop('cstno', axis=1)


    test_df, catcols = encode_onehot(test_df)  
    print(f'Test df: {type(test_df)}')
    
    #We create our feature matrix and our target variable vector.
    X=test_df.drop(['is_churned'],axis=1)
    y=test_df['is_churned']
    y_test = y

    # display(selected_columns)
    # Keep only the columns chosen by select_feature in the training phase.
    X_new = X[selected_columns]


    # Normalise the test features together with the saved pre-normalisation
    # training features; X_train_temp is not used afterwards.
    X_train_temp, X_test = proc_normalization(X_train_org, X_new.values)   


    # display(X_test.shape)

    # display(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    proc_type='E'
    # X_train / y_train still hold the training-phase values; only X_test and
    # y_test now come from the test file.
    fit_predict(proc_type, drop_no, model_eval_comparison, X_train, y_train, X_test, y_test)

    print_eval_result(model_eval_comparison)

In [None]:
# Display X as last assigned (the feature frame built in the preceding cell).
X

In [None]:
def drop_null_column(df, drop_list):
    """Drop the given columns, then drop every row that still holds a null.

    All columns in ``drop_list`` are removed first and ``dropna`` runs once
    afterwards, so a null that only occurs in a dropped column never costs a
    row. (Running ``dropna`` after each individual column drop discards rows
    whose only nulls sit in columns that are still waiting to be dropped.)

    Args:
        df: Source DataFrame. It is not modified.
        drop_list: A column name, or an iterable of column names, to remove.
            An empty iterable removes no column; rows with nulls are still
            dropped.

    Returns:
        A new DataFrame without the listed columns and without any row that
        has a null in the remaining columns.

    Raises:
        KeyError: If a listed column is not present in ``df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(drop_list, str):
        drop_list = [drop_list]

    remaining = df.drop(columns=list(drop_list))
    return remaining.dropna(axis=0)


from itertools import combinations

# Report how many rows survive null handling for every non-empty subset of the
# candidate columns (2**8 - 1 = 255 subsets), ordered by subset size.
drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
result_list = [
    list(subset)
    for size in range(1, len(drop_target_columns) + 1)
    for subset in combinations(drop_target_columns, size)
]

# Read the CSV once: drop_null_column works on a new frame returned by
# DataFrame.drop and leaves its argument untouched, so every combination can
# start from the same raw frame instead of re-reading the file each time.
raw_churner_df = pd.read_csv("./data/bank_churner.csv").drop('cstno', axis=1)

for no, drop_column in enumerate(result_list):
    ml_churner_df = drop_null_column(raw_churner_df, drop_column)
    print(f'구분 : {no}, 남은 갯수: {len(ml_churner_df)}, Drop Col:{drop_column}')
    
    
    
    
    
# -----------
# Prediction
# -----------
# Single-run variant of the train/evaluate pipeline: only
# 'tot_trans_cnt_for_12m' is dropped before null rows are removed.
# NOTE(review): elsewhere in this notebook encode_onehot's result is unpacked
# into two names and fit_predict is called with two extra leading arguments
# (proc_type, drop_no); confirm which signatures the helpers currently have.

# Load the data and drop the customer number
ml_churner_df = pd.read_csv("./data/bank_churner.csv")
ml_churner_df = ml_churner_df.drop('cstno', axis=1)

# Null handling

ml_churner_df = drop_null_column(ml_churner_df, ['tot_trans_cnt_for_12m'])
# Bare expression in the middle of the cell; under default notebook settings
# it produces no output.
ml_churner_df.shape

ml_churner_df = encode_onehot(ml_churner_df)  


#We create our feature matrix and our target variable vector.
X=ml_churner_df.drop(['is_churned'],axis=1)
y=ml_churner_df['is_churned']

# selected_columns is reused below to index the test frame.
X_new, selected_columns = select_feature(X, 'ExtraTrees')
display(selected_columns)

display(X_new.shape)

X_train, y_train, X_test, y_test = proc_smote(X_new, y)
# Keep the pre-normalisation training features for the evaluation phase.
X_train_org = X_train.copy()

display(X_train.shape)

X_train, X_test = proc_normalization(X_train, X_test)    


model_comparison = {}  # Dictionary to store the comparison metrics of models
                        # (the same note was repeated here in Spanish)
model_eval_comparison = {}                        

display(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
fit_predict(model_comparison, X_train, y_train, X_test, y_test)

print_eval_result(model_comparison)



# -----------
# Evaluation
# -----------
test_df = pd.read_csv("./data/test_churner.csv")
test_df = test_df.drop('cstno', axis=1)


test_df = encode_onehot(test_df)  


#We create our feature matrix and our target variable vector.
X=test_df.drop(['is_churned'],axis=1)
y=test_df['is_churned']
y_test = y


display(selected_columns)
# Keep only the columns chosen by select_feature on the training data.
X_new = X[selected_columns]


# Normalise the test features together with the saved pre-normalisation
# training features; X_train_temp is not used afterwards.
X_train_temp, X_test = proc_normalization(X_train_org, X_new.values)   


display(X_test.shape)

display(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# X_train / y_train still hold the training-phase values; only X_test and
# y_test now come from the test file.
fit_predict(model_eval_comparison, X_train, y_train, X_test, y_test)

print_eval_result(model_eval_comparison)

## 최종 테스트 데이터로 평가
***

### 평가(학습 및 예측 결과)

# 메인 처리 for 평가

In [None]:
# Call print_eval_result on model_comparison, the dict that the training
# cells pass to fit_predict.
print_eval_result(model_comparison)

In [None]:
# Evaluation against the test file. This cell relies on selected_columns,
# X_train_org, X_train, y_train and model_eval_comparison already existing
# from earlier cells.
# NOTE(review): elsewhere in this notebook encode_onehot's result is unpacked
# into two names and fit_predict is called with two extra leading arguments
# (proc_type, drop_no); confirm which signatures the helpers currently have.
test_df = pd.read_csv("./data/test_churner.csv")
test_df = test_df.drop('cstno', axis=1)


test_df = encode_onehot(test_df)  


#We create our feature matrix and our target variable vector.
X=test_df.drop(['is_churned'],axis=1)
y=test_df['is_churned']
y_test = y


display(selected_columns)
# Keep only the columns chosen by select_feature on the training data.
X_new = X[selected_columns]


# Normalise the test features together with the saved pre-normalisation
# training features; X_train_temp is not used afterwards.
X_train_temp, X_test = proc_normalization(X_train_org, X_new.values)   


display(X_test.shape)

display(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# X_train / y_train are whatever the training cells left behind; only X_test
# and y_test come from the test file.
fit_predict(model_eval_comparison, X_train, y_train, X_test, y_test)

print_eval_result(model_eval_comparison)

Index(['mon_on_book', 'tot_product_count', 'months_inact_for_12m',
       'contact_cnt_for_12m', 'credit_line', 'tot_revol_balance',
       'tot_trans_amt_for_12m', 'tot_trans_cnt_for_12m',
       'tot_cnt_ratio_q4_q1'],
      dtype='object')

## Deep Learning
***

In [None]:
#!pip install tensorflow

### Import Library and Data Loading, Function Definition for Machine Learning

#### Import Library

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.metrics import auc

#### Data Loading

In [None]:
# Reload bank_churner.csv from disk for the Deep Learning section.
bank_churner_df = pd.read_csv("./data/bank_churner.csv")

### 전처리(Pre-Processing)

In [None]:
# 테스트 데이터 전처리

def test_transform(x_test):
    ''' 전처리 함수 정의'''
    
    # 불필요 컬럼 제거(고객번호)
    # -------------------------
    x_test = x_test.drop('cstno', axis=1)
    
    
    # 성별 변환('F':0, 'M':1)
    # -------------------------
    x_test['sex']=x_test['sex'].replace({'F':0,'M':1})
    x_test['is_churned']=x_test['is_churned'].replace({'Existing Customer':0,'Attrited Customer':1})
    
    
    # 다중공선성 컬럼 제거
    # -------------------
    x_test = x_test.drop('mon_on_book', axis = 1)
    x_test = x_test.drop('mean_open_to_buy', axis = 1)
    x_test = x_test.drop('tot_trans_cnt_for_12m', axis = 1)


    # 범주형 데이터 One-Hot 인코딩
    # --------------------------
    x_test = pd.concat([x_test,pd.get_dummies(x_test['education']).drop(columns=['Unknown'])],axis=1)
    x_test = pd.concat([x_test,pd.get_dummies(x_test['imcome_cat']).drop(columns=['Unknown'])],axis=1)
    x_test = pd.concat([x_test,pd.get_dummies(x_test['marital_stat']).drop(columns=['Unknown'])],axis=1)
    x_test = pd.concat([x_test,pd.get_dummies(x_test['card_type']).drop(columns=['Platinum'])],axis=1)
    x_test.drop(columns = ['education','imcome_cat','marital_stat','card_type'],inplace=True)


    # Null 처리 1 방식
    # ---------------
    # x_test.dropna(axis=0, inplace=True)

    
    # Null 처리 2 방식
    # ---------------
    # x_test.drop(columns = ['sex'], inplace=True)
    # x_test.drop(columns = ['tot_revol_balance'], inplace=True)
    # x_test.drop(columns = ['tot_amt_ratio_q4_q1'], inplace=True)        
    # x_test.drop(columns = ['tot_trans_amt_for_12m'], inplace=True)        
    # x_test.drop(columns = ['tot_cnt_ratio_q4_q1'], inplace=True)        
    # x_test.drop(columns = ['mean_util_pct'], inplace=True)


    # # Null 처리 3 방식
    # # ----------------
    x_test.drop(columns = ['mean_util_pct'], inplace=True)
    x_test.dropna(axis=0, inplace=True)
        
    return x_test

In [None]:
# Imported here because this cell runs before the cell that otherwise
# imports train_test_split.
from sklearn.model_selection import train_test_split

# Preprocess the raw frame loaded above (test_transform drops 'cstno', so this
# cell cannot be re-run without reloading bank_churner_df first).
bank_churner_df = test_transform(bank_churner_df)

# Target vector.
y = bank_churner_df['is_churned']

# Feature matrix restricted to the ten columns used by the models below.
feature_columns = [
    'age', 'dependent_num', 'tot_product_count', 'months_inact_for_12m',
    'contact_cnt_for_12m', 'credit_line', 'tot_revol_balance',
    'tot_amt_ratio_q4_q1', 'tot_trans_amt_for_12m', 'tot_cnt_ratio_q4_q1',
]
X_new = bank_churner_df[feature_columns]

# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.25, stratify=y, random_state=0)

# Normalization: fit the scaler on the training split only, then apply the
# same transform to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.model_selection import train_test_split
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.25, stratify=y, random_state=0)

# This split starts again from the raw X_new, so standardise the result here.
# Otherwise the network cell, whose own scaling lines are commented out,
# would be trained on unscaled feature ranges.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
#DEEP LEARNING
# Train a small dense network and a shallow random forest on the same split,
# then compare them by ROC curve / AUC.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_curve

# Scaling is left disabled in this cell; X_train / X_test are used exactly as
# the earlier cells left them.
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

# Build the model: two ReLU hidden layers and a single sigmoid output unit.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Predict on the test data once and reuse the scores. y_pred keeps the
# flattened sigmoid outputs (not 0/1 labels), which is what roc_curve needs.
y_pred_proba = model.predict(X_test)
y_pred = y_pred_proba.ravel()

# --------------------------------
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_pred)
auc_keras = auc(fpr_keras, tpr_keras)
print(f'auc_keras: {auc_keras:.3f}')
# --------------------------------

# Label-based metrics (disabled). They need thresholded predictions, e.g.
# (y_pred >= 0.5).astype(int), rather than the raw scores held in y_pred.
# print(f"Model Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
# print(f"Model F1-Score: {f1_score(y_test, y_pred, average='weighted') * 100:.2f}%")
# print(classification_report(y_test, y_pred, zero_division=1))

# Calculate accuracies per class
# accuracy_class_0 = accuracy_score(y_test[y_test == 0], y_pred[y_test == 0])
# accuracy_class_1 = accuracy_score(y_test[y_test == 1], y_pred[y_test == 1])

#pred_proba_proba = model.predict_proba(X_test)[:, 1]
# Baseline for comparison: a small random forest fitted on the same data.
rf = RandomForestClassifier(max_depth=3, n_estimators=10)
rf.fit(X_train, y_train)

# Use the predicted probability of class 1 as the ranking score.
y_pred_rf = rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_pred_rf)
auc_rf = auc(fpr_rf, tpr_rf)

#get_clf_eval(y_test, y_pred, y_pred_proba)

# Figure 1 shows the full ROC curves; figure 2 zooms in on the upper-left
# corner. Both figures draw the same three lines.
roc_views = (
    (1, 'ROC curve', None, None),
    (2, 'ROC curve (zoomed in at top left)', (0, 0.2), (0.8, 1)),
)
for figure_no, figure_title, x_limits, y_limits in roc_views:
    plt.figure(figure_no)
    if x_limits is not None:
        plt.xlim(*x_limits)
    if y_limits is not None:
        plt.ylim(*y_limits)
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
    plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(figure_title)
    plt.legend(loc='best')
    plt.show()


In [None]:
# Display the ROC AUC computed for the Keras model in the previous cell.
auc_keras

In [None]:
'''
# Compute confusion matrix for the Deep learning model
from sklearn.metrics import classification_report, confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Reds):
    """
    This function prints and plots the confusion matrix. Normalization can be applied by setting normalize=True.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix without normalization')

    print(cm)

    #Plot the confusion matrix.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.grid(False)  # <-- Agregar esta línea para evitar el aviso de deprecación
    plt.title(title,fontsize=18)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    # Add labels to the cells of the confusion matrix.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",fontsize=16,
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label',fontsize=15)
    plt.xlabel('Predicted label',fontsize=15)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

print(confusion_matrix(y_test, y_pred, labels=[0,1]))


cnf_matrix = confusion_matrix(y_test, y_pred, labels=[0,1])
np.set_printoptions(precision=2)

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['y=0','y=1'],normalize= True,  title='Confusion matrix')

'''

# 결론

## 고객 이탈 예측 분석
LightGBM은 91%의 가장 높은 Attrited Customer Recall과 89%의 정밀도를 가지고 있음
고객 이탈을 사전에 방지하기 위해서 LightGBM 모델을 사용하는 것이 적합함