# JBFG Data Analysis Competition
- Null 처리방식 : 2번

#### Import Library for Data Analysis

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl  
import missingno as msno
import warnings

import plotly.express as px
import plotly.graph_objs as go
# import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()


warnings.filterwarnings('ignore')
sns.set_theme(style="whitegrid")
%matplotlib inline

mpl.rc('font', family='Malgun Gothic')  # 한글 폰트 설정
                                        # 윈도우 폰트 위치 - C:\Windows\Fonts
plt.figure(figsize=(10,6))              # 그래프 사이즈 설정
sns.set(font='Malgun Gothic', rc={'axes.unicode_minus':False}, style='darkgrid') # 마이너스 처리


#### 컬럼 데이터 및 Null 건수 확인

In [None]:

def change_format(df, column, format):
    """Rewrite one column of ``df`` as display strings, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    column : str
        Name of the column to render.
    format : str
        ``'comma'`` renders each value with thousands separators
        (``f"{x:,}"``); ``'percent'`` renders it as a percentage with two
        decimals (``f"{x:.2%}"``). Any other value leaves the column as is.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Format label -> format-spec applied to every cell of the column.
    known_specs = (('comma', ','), ('percent', '.2%'))

    for label, spec in known_specs:
        if format == label:
            df[column] = df[column].apply(
                lambda value, spec=spec: f"{value:{spec}}"
            )
            break

    return df


def count_column_na_count(df, column):
    """Print how many values of ``df[column]`` are present and how many are null.

    One row is printed with the total size, the non-null count, the null
    count and the two corresponding shares of the total, followed by a
    70-character dashed separator line. Counts are rendered with thousands
    separators and shares as percentages via ``change_format``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Column to summarise.

    Returns
    -------
    None
        The summary is only printed.
    """
    series = df[column]
    total, present, missing = series.size, series.count(), series.isnull().sum()

    summary = pd.DataFrame(
        [[total, present, missing]],
        columns=['tot_counts', 'data_counts', 'null_counts'],
    )
    summary['data_percents'] = summary['data_counts'].values / summary['tot_counts'].values
    summary['null_percents'] = summary['null_counts'].values / summary['tot_counts'].values

    # Convert to display strings only after all arithmetic is finished.
    display_formats = (
        ('tot_counts', 'comma'),
        ('data_counts', 'comma'),
        ('null_counts', 'comma'),
        ('data_percents', 'percent'),
        ('null_percents', 'percent'),
    )
    for name, kind in display_formats:
        summary = change_format(summary, name, kind)

    print(summary.to_string(index=False))
    print('-' * 70)


def count_column_data_count(df, column):
    """Print retained / churned counts and shares for each value of ``column``.

    Rows of ``df`` are grouped by ``column`` and the ``is_churned`` flag
    (coded 0 = retained, 1 = churned, as implied by the column renames below)
    is tallied per group. The printed table has one row per group with
    ``exist_counts``, ``churned_counts``, ``total_counts``, ``exist_percents``
    and ``churned_percents``, sorted by descending churn share.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column`` and ``is_churned``.
    column : str
        Grouping column.

    Returns
    -------
    None
        The table is only printed.
    """
    churn_by_group = df.groupby(column)['is_churned']

    # fill_value=0 keeps the counts integral when a group has no rows of one
    # class. reindex guarantees both class columns exist even if one class
    # never occurs anywhere in the data.
    column_counts = (
        churn_by_group.value_counts()
        .unstack(fill_value=0)
        .reindex(columns=[0, 1], fill_value=0)
        .rename(columns={0: 'exist_counts', 1: 'churned_counts'})
    )
    # Summed after the gaps are filled, so a group missing one class still
    # gets its real total instead of NaN (which used to be reset to 0).
    column_counts['total_counts'] = (
        column_counts['exist_counts'] + column_counts['churned_counts']
    )

    column_percents = (
        churn_by_group.value_counts(normalize=True)
        .unstack(fill_value=0.0)
        .reindex(columns=[0, 1], fill_value=0.0)
        .rename(columns={0: 'exist_percents', 1: 'churned_percents'})
    )

    column_count_percent = pd.concat([column_counts, column_percents], axis=1)
    column_count_percent = column_count_percent.reset_index()
    # Sort while the shares are still numeric; they become strings below.
    column_count_percent = column_count_percent.sort_values(
        by='churned_percents', ascending=False
    )

    for name, kind in (
        ('exist_counts', 'comma'),
        ('churned_counts', 'comma'),
        ('total_counts', 'comma'),
        ('exist_percents', 'percent'),
        ('churned_percents', 'percent'),
    ):
        column_count_percent = change_format(column_count_percent, name, kind)

    print(column_count_percent.to_string(index=False))

## Machine Learning
***

### Import Library and Data Loading, Function Definition for Machine Learning

#### Import Library

In [None]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve

# import scikitplot as skplt
from imblearn.over_sampling import SMOTE

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFE

#### Function Definition

#### Data Loading

In [None]:
# Load the raw churn dataset; the path is relative to the notebook's working directory.
ml_churner_df = pd.read_csv("./data/bank_churner.csv")

### 전처리(Pre-Processing)

#### 불필요한 컬럼 삭제

In [None]:
# Remove the customer-number column ('cstno') before modelling.
ml_churner_df = ml_churner_df.drop(columns='cstno')

#### 전처리 함수 정의

#### 결측(Null) 데이터 확인 및 처리

In [None]:
def tot_column_counts(df):
    """Print a per-column summary of present and missing values in ``df``.

    For every column the table lists the total number of rows, the non-null
    count, the null count and both shares of the total. Rows are ordered by
    descending null share. Counts are rendered with thousands separators and
    shares as percentages via ``change_format``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The table is only printed.
    """
    present = df.count()
    missing = df.isnull().sum()

    summary = pd.DataFrame({
        'tot_counts': present + missing,
        'data_counts': present,
        'null_counts': missing,
    })
    summary['data_percents'] = summary['data_counts'].values / summary['tot_counts'].values
    summary['null_percents'] = summary['null_counts'].values / summary['tot_counts'].values

    # Order by the numeric null share before the values are turned into text.
    summary = summary.sort_values(by='null_percents', ascending=False)

    for name, kind in (
        ('tot_counts', 'comma'),
        ('data_counts', 'comma'),
        ('null_counts', 'comma'),
        ('data_percents', 'percent'),
        ('null_percents', 'percent'),
    ):
        summary = change_format(summary, name, kind)

    # Turn the column names (currently the index) into a regular column.
    summary = summary.reset_index()

    print(summary.to_string(index=False))

#### 원-핫(One-Hot) 인코딩

In [None]:

def encode_onehot(df):
    """One-hot encode every column of ``df`` that is not int64 or float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    tuple
        ``(encoded, catcols)`` where ``encoded`` is the frame returned by
        ``pd.get_dummies`` and ``catcols`` is the Index of column names that
        were encoded.
    """
    # Everything that is not int64 / float64 is treated as categorical.
    non_numeric = df.select_dtypes(exclude=['int64', 'float64'])
    catcols = non_numeric.columns

    encoded = pd.get_dummies(df, columns=catcols)
    return encoded, catcols

# ml_churner_df = pd.read_csv("./data/bank_churner.csv")
# ml_churner_df = ml_churner_df.drop('cstno', axis=1)

# ddf, catcols = encode_onehot(ml_churner_df)

#### 학습에 사용될 중요 Feature 식별

In [None]:
def select_feature(df, model, target=None):
    """Select a subset of the columns of ``df`` using a fitted estimator.

    The chosen estimator is fitted on ``df.values`` and the labels. Columns
    are then kept according to ``SelectFromModel`` with its default
    threshold, or according to the estimator's own support mask when the
    estimator is already a selector (``'RFE'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix. ``df.columns`` is used to name the selected features.
    model : str
        One of ``'ExtraTrees'``, ``'RandomForest'``, ``'SVM'``, ``'KNN'``,
        ``'LASSO'`` or ``'RFE'``.
    target : array-like, optional
        Labels aligned with the rows of ``df``. When omitted, the
        module-level variable ``y`` is used, which is what this function
        always did before the parameter existed.

    Returns
    -------
    tuple
        ``(X_df, selected_columns)``: the reduced feature array and the
        matching column labels taken from ``df.columns``.

    Raises
    ------
    KeyError
        If ``model`` is not one of the supported names.
    """
    # Seed the global NumPy RNG so the unseeded estimators below are repeatable.
    np.random.seed(42)

    if target is None:
        # Legacy behaviour: read the labels from the notebook-level `y`.
        target = y

    # Factories instead of instances, so only the requested estimator is built.
    factories = {
        'ExtraTrees': lambda: ExtraTreesClassifier(n_estimators=100),
        'RandomForest': lambda: RandomForestClassifier(n_estimators=100),
        'SVM': lambda: SVC(kernel='linear'),
        # NOTE(review): KNN exposes neither coef_ nor feature_importances_, so
        # SelectFromModel is expected to reject it; kept for compatibility.
        'KNN': lambda: KNeighborsClassifier(n_neighbors=5),
        'LASSO': lambda: Lasso(alpha=0.01),
        'RFE': lambda: RFE(
            estimator=RandomForestClassifier(n_estimators=100),
            n_features_to_select=13,
        ),
    }

    try:
        make_estimator = factories[model]
    except KeyError:
        raise KeyError(
            f"Unknown feature-selection model {model!r}; "
            f"choose one of {sorted(factories)}"
        ) from None

    clf = make_estimator().fit(df.values, target)

    # The old unconditional read of clf.feature_importances_ was unused and
    # raised AttributeError for every estimator that only exposes coef_.
    if isinstance(clf, RFE):
        # RFE is already a feature selector; use its own mask directly.
        selector = clf
    else:
        selector = SelectFromModel(clf, prefit=True)

    X_df = selector.transform(df.values)
    selected_columns = df.columns[selector.get_support(indices=True)]

    return X_df, selected_columns

In [None]:
def proc_smote(X_new, y):
    """Split the data 75/25 and oversample the training part with SMOTE.

    The split is stratified on ``y`` with ``random_state=0``. SMOTE
    (``sampling_strategy='auto'``, ``random_state=42``) is applied to the
    training part only; the test part is returned exactly as split.

    Parameters
    ----------
    X_new : array-like
        Feature matrix.
    y : array-like
        Labels aligned with the rows of ``X_new``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. Note the order: both training
        objects come first, unlike the order ``train_test_split`` returns.
    """
    from sklearn.model_selection import train_test_split
    from imblearn.over_sampling import SMOTE

    # The split used to be computed twice with identical arguments; with a
    # fixed random_state one call yields exactly the same partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X_new, y, test_size=0.25, stratify=y, random_state=0
    )

    sm = SMOTE(sampling_strategy='auto', random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    return X_train, y_train, X_test, y_test


#### Normalization

In [None]:
def proc_normalization(X_train, X_test):
    """Standardise both feature sets with statistics learned from ``X_train``.

    A fresh ``StandardScaler`` is fitted on ``X_train`` only and then applied
    to both inputs, so the test data never influences the scaling.

    Parameters
    ----------
    X_train : array-like
        Training features; used to fit the scaler.
    X_test : array-like
        Features to transform with the fitted scaler.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled)``.
    """
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)

### 모델별 학습 및 평가

In [None]:
def fit_predict(proc_type, drop_no, model_comparison, X_train, y_train, X_test, y_test):
    """Fit each candidate model and record its metrics in ``model_comparison``.

    Every model is fitted on the training data and scored on the test data.
    The metrics are stored under the key
    ``f'{model_name}_{proc_type}_{drop_no}'`` as a list in this order:
    ``[accuracy, accuracy on class-0 rows, accuracy on class-1 rows,
    weighted F1, CV mean, CV std, ROC AUC]``.

    Parameters
    ----------
    proc_type : str
        Tag embedded in the result key.
    drop_no : int
        Run number embedded in the result key.
    model_comparison : dict
        Mutated in place; one entry is added (or overwritten) per model.
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Evaluation features and labels.

    Returns
    -------
    None
    """
    # All dependencies are imported locally so the function does not rely on
    # another notebook cell; roc_auc_score used to be the only exception.
    from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
    from sklearn.model_selection import cross_val_score
    from lightgbm import LGBMClassifier
    from xgboost import XGBClassifier

    # (display name, estimator) pairs. Other candidates (logistic regression,
    # decision tree, KNN, naive Bayes, random forest) were disabled in the
    # original notebook.
    models = [
        ('LightGBM', LGBMClassifier(n_estimators=500, random_state=42, boosting_type='GOSS')),
        ('Xg Boost', XGBClassifier(n_estimators=500, random_state=42, use_label_encoder=False, eval_metric='logloss')),
    ]

    for model_name, classifier in models:
        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_test)
        # Probability of the positive class (column 1), needed for ROC AUC.
        pred_proba = classifier.predict_proba(X_test)[:, 1]

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # NOTE(review): this cross-validates on the *test* split, and the
        # scoring is recall although the result table labels it
        # 'CV Accuracy'. The original author also flagged this line as
        # needing confirmation. Behaviour is left unchanged.
        cv_recalls = cross_val_score(estimator=classifier, X=X_test, y=y_test, cv=5, scoring="recall")
        cv_accuracy = cv_recalls.mean()
        cv_std = cv_recalls.std()

        # Accuracy restricted to the rows of one true class, i.e. the share
        # of that class that was predicted correctly.
        accuracy_class_0 = accuracy_score(y_test[y_test == 0], y_pred[y_test == 0])
        accuracy_class_1 = accuracy_score(y_test[y_test == 1], y_pred[y_test == 1])
        roc_auc = roc_auc_score(y_test, pred_proba)

        model_comparison[f'{model_name}_{proc_type}_{drop_no}'] = [
            accuracy, accuracy_class_0, accuracy_class_1, f1, cv_accuracy, cv_std, roc_auc,
        ]



### 학습 및 예측 결과

In [None]:
def print_eval_result(model_comparison):
    """Display the collected model metrics as a styled table.

    Parameters
    ----------
    model_comparison : dict
        Maps a run label to a list of seven metrics in the order
        ``[accuracy, accuracy on class 0, accuracy on class 1, F1, CV mean,
        CV std, AUC]``.

    Returns
    -------
    None
        The ten rows with the highest AUC are rendered with ``display``. The
        column-wise maximum is highlighted, and values below 0.75 in the
        columns up to and including 'CV Accuracy' are shown in red. Those
        columns are formatted as percentages.
    """
    import pandas as pd

    if not model_comparison:
        # Without this guard an empty dict fails with an opaque
        # "Length mismatch" error when the column names are assigned.
        print('No model results to display.')
        return

    # The table used to be built twice; the first copy was overwritten
    # before it was ever used, so only this build remains.
    Model_com_df = pd.DataFrame(model_comparison).T
    Model_com_df.columns = ['Model Accuracy', 'Model Accuracy-No', 'Model Accuracy-Yes', 'Model F1-Score', 'CV Accuracy', 'CV std', 'AUC']
    Model_com_df = Model_com_df.sort_values(by='AUC', ascending=False)

    def highlight_below_75(s):
        # Column-wise styler callback: one CSS string per cell.
        if s.name != 'CV std' and isinstance(s, pd.Series) and s.dtype == 'float64':
            return ['color: red' if value < 0.75 else 'color: black' for value in s]
        else:
            return ['color: black'] * len(s)

    # Columns from the first one through 'CV Accuracy' (label slice, inclusive).
    score_columns = pd.IndexSlice[:, :'CV Accuracy']
    styled_df = (
        Model_com_df.iloc[:10, :]
        .style.highlight_max(axis=0)
        .apply(highlight_below_75, subset=score_columns)
        .format("{:.2%}", subset=score_columns)
    )
    display(styled_df)

# 메인 처리 for 예측

# 메인 처리 함수

# 메인 처리 함수

In [None]:
def drop_null_column(df, drop_list):
    """Drop the given columns, then drop every row that still has a null.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    drop_list : str or iterable of str
        Column name(s) to remove. A single name may be passed as a plain
        string.

    Returns
    -------
    pandas.DataFrame
        A new frame without the listed columns and without any row that
        contains a null in the remaining columns.

    Raises
    ------
    KeyError
        If a listed column does not exist in ``df``.
    """
    if isinstance(drop_list, str):
        # A bare string would otherwise be iterated character by character.
        drop_list = [drop_list]

    # Remove all columns first and only then the incomplete rows. Dropping
    # rows after each single column discarded rows whose only nulls were in
    # columns that were about to be removed anyway.
    return df.drop(columns=list(drop_list)).dropna(axis=0)


from itertools import combinations

# result_list = []
# drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
# for j in range(1, len(drop_target_columns)+1):
#     for i in combinations(drop_target_columns, j):
#         result_list.append(list(i))

# for no, drop_column in enumerate(result_list):
#     ml_churner_df = pd.read_csv("./data/bank_churner.csv")
#     ml_churner_df = ml_churner_df.drop('cstno', axis=1)

#     ml_churner_df = drop_null_column(ml_churner_df, drop_column)
#     print(f'구분 : {no}, 남은 갯수: {len(ml_churner_df)}, Drop Col:{drop_column}')
    
    
    
    
# Metrics collected by fit_predict for the hold-out split of the training file
# (label -> list of metrics).
model_comparison = {}  #Dictionary to store the comparison metrics of models
# Metrics collected by fit_predict for the separate test file.
model_eval_comparison = {}                        
    
# -----------
# Prediction
# -----------

# Load the data and drop the customer number (disabled; done further below)
# ml_churner_df = pd.read_csv("./data/bank_churner.csv")
# ml_churner_df = ml_churner_df.drop('cstno', axis=1)

# Null handling
# The string literal below is a disabled experiment kept as a no-op expression:
# it enumerated every non-empty combination of candidate columns to drop.
'''
result_list = []
drop_target_columns = ['sex','imcome_cat', 'tot_amt_ratio_q4_q1', 'mean_util_pct', 'tot_trans_cnt_for_12m','age','mean_open_to_buy','tot_trans_amt_for_12m']
# drop_target_columns = ['sex','imcome_cat']
#drop_target_columns = ['sex','imcome_cat']
for j in range(1, len(drop_target_columns)+1):
    for i in combinations(drop_target_columns, j):
        result_list.append(list(i))
'''

# result_list = [['tot_amt_ratio_q4_q1']
# ,['mean_util_pct']
# ,['sex', 'age']
# ,['sex', 'mean_open_to_buy']
# ,['tot_amt_ratio_q4_q1', 'mean_util_pct']
# ,['tot_amt_ratio_q4_q1', 'age']
# ,['tot_amt_ratio_q4_q1', 'mean_open_to_buy']
# ,['mean_util_pct', 'age']
# ,['sex', 'imcome_cat', 'mean_open_to_buy']
# ,['imcome_cat', 'mean_util_pct', 'mean_open_to_buy']
# ,['tot_amt_ratio_q4_q1', 'mean_util_pct', 'age']
# ,['tot_amt_ratio_q4_q1', 'mean_util_pct', 'mean_open_to_buy']
# ,['tot_amt_ratio_q4_q1', 'tot_trans_cnt_for_12m', 'mean_open_to_buy']
# ,['mean_util_pct', 'age', 'mean_open_to_buy']
# ,['sex', 'imcome_cat', 'age', 'mean_open_to_buy']
# ,['sex', 'mean_util_pct', 'age', 'mean_open_to_buy']
# ,['tot_amt_ratio_q4_q1', 'mean_util_pct', 'age', 'mean_open_to_buy']
# ,['sex', 'imcome_cat', 'mean_util_pct', 'age', 'mean_open_to_buy']
# ]

# Training run: reload the raw data, then drop the customer number and 'sex'.
ml_churner_df = pd.read_csv("./data/bank_churner.csv")
ml_churner_df = ml_churner_df.drop('cstno', axis=1)
ml_churner_df = ml_churner_df.drop('sex', axis=1)
# Replace each income bracket label with a single representative amount;
# 'Unknown' is mapped to 63000.
ml_churner_df['imcome_cat']=ml_churner_df['imcome_cat'].replace({'Less than $40K':40000, '$40K - $60K':50000, '$60K - $80K':70000, '$80K - $120K':100000, '$120K +':120000, 'Unknown':63000})

# Alternative that was tried: impute with the mean within each age group.
# ml_churner_df = ml_churner_df.groupby('age').apply(lambda x: x.fillna(x.mean(numeric_only=True)))
# Fill nulls in numeric columns with the column mean, then drop any row that
# still contains a null (these can only be in non-numeric columns).
ml_churner_df = ml_churner_df.fillna(ml_churner_df.mean(numeric_only=True))
ml_churner_df.dropna(axis=0, inplace=True)

# ml_churner_df = drop_null_column(ml_churner_df, 'aaa')
# Row count after null handling; only used in the log line below.
after_null_drop_cnt = len(ml_churner_df)

ml_churner_df, catcols = encode_onehot(ml_churner_df)  


# Build the feature matrix and the target vector.
X=ml_churner_df.drop(['is_churned'],axis=1)
y=ml_churner_df['is_churned']

# NOTE(review): feature selection is fitted on all rows, before the
# train/test split performed inside proc_smote.
X_new, selected_columns = select_feature(X, 'ExtraTrees')
# display(selected_columns)

X_train, y_train, X_test, y_test = proc_smote(X_new, y)
# Keep the resampled but unscaled training features; the evaluation section
# below refits the scaler on this copy.
X_train_org = X_train.copy()

X_train, X_test = proc_normalization(X_train, X_test)    

# Placeholders left over from the disabled drop-combination experiment.
drop_no = 1
drop_column = 'abc'
# NOTE(review): the field labelled "X_train" prints len(X_new), i.e. the row
# count before the split, not the size of the training part.
print(f'구분: {drop_no}, X_train 건수: {len(X_new)}, X_train_SMOTE 건수: {len(X_train)}, After Drop 건수: {after_null_drop_cnt}, Drop Col:{drop_column}')

# display(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# 'P' tags these results in the keys of model_comparison.
proc_type='P'
fit_predict(proc_type, drop_no, model_comparison, X_train, y_train, X_test, y_test)

# print_eval_result(model_comparison)



# -----------
# Evaluation
# -----------
# Score the models on the separate test file.
test_df = pd.read_csv("./data/test_churner.csv")
test_df = test_df.drop('cstno', axis=1)


# NOTE(review): unlike the training data, the test data is one-hot encoded
# without first dropping 'sex', mapping 'imcome_cat' to amounts or filling
# nulls. If the test file has the same raw schema as the training file, the
# encoded columns will differ from the training columns - confirm.
test_df, catcols = encode_onehot(test_df)  

    
# Build the feature matrix and the target vector. This rebinds the
# notebook-level X, y and y_test that were set by the training section.
X=test_df.drop(['is_churned'],axis=1)
y=test_df['is_churned']
y_test = y

# display(selected_columns)
# Keep only the columns chosen during training; presumably raises KeyError if
# any of them is missing from the encoded test frame - verify.
X_new = X[selected_columns]


# Refit the scaler on the unscaled (post-SMOTE) training features saved
# earlier and apply it to the test features. The rescaled training copy
# (X_train_temp) is not used afterwards.
X_train_temp, X_test = proc_normalization(X_train_org, X_new.values)   


# display(X_test.shape)

# display(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# 'E' tags these results in the keys of model_eval_comparison. fit_predict
# refits every model on the already scaled X_train / y_train from the
# training section before scoring on the test file.
proc_type='E'
fit_predict(proc_type, drop_no, model_eval_comparison, X_train, y_train, X_test, y_test)

# print_eval_result(model_eval_comparison)
# Completion marker followed by a separator line.
print('종료')
print('-'*70)

In [None]:
# Show the metrics recorded by the proc_type='P' run (hold-out split of the training file).
print_eval_result(model_comparison)

In [None]:
# Show the metrics recorded by the proc_type='E' run (separate test file).
print_eval_result(model_eval_comparison)

## 최종 테스트 데이터로 평가
***

### 평가(학습 및 예측 결과)

# 메인 처리 for 평가

## 고객 이탈 예측 분석
- LightGBM은 이탈 고객(Attrited Customer)에 대한 재현율(Recall)이 91%로 가장 높고, 정밀도(Precision)는 89%이다.
- 따라서 고객 이탈을 사전에 방지하기 위한 예측 모델로는 LightGBM을 사용하는 것이 적합하다.