In [None]:
!pip install -q feature_engine

In [None]:
!pip install shap

In [None]:
!pip install catboost

In [None]:
# import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings 
warnings.filterwarnings("ignore")

import shap
import matplotlib.pyplot as plt
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from feature_engine.encoding import RareLabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import ast

pd.set_option('display.max_rows', 1000)

In [None]:
# Read the churn dataset from the relative data directory and keep the freshly
# loaded frame untouched; all further processing happens on a deep copy.
bank_churner_df = pd.read_csv(filepath_or_buffer="./data/bank_churner.csv")
bank_churner_df_org = bank_churner_df.copy(deep=True)

def test_transform(x_test):
    ''' 전처리 함수 정의'''
    
    # 불필요 컬럼 제거(고객번호)
    # -------------------------
    x_test = x_test.drop('cstno', axis=1)
    
    
    # 성별 변환('F':0, 'M':1)
    # -------------------------
    #x_test['sex']=x_test['sex'].replace({'F':0,'M':1})
    
    
    # # 다중공선성 컬럼 제거
    # # -------------------------
    # x_test = x_test.drop('mon_on_book', axis = 1)
    # x_test = x_test.drop('mean_open_to_buy', axis = 1)
    # x_test = x_test.drop('tot_trans_cnt_for_12m', axis = 1)
    
    
    # Null 처리
    x_test.drop(columns = ['mean_util_pct'], inplace=True)
    x_test.dropna(axis=0, inplace=True)
    
    return x_test

# Preprocess the working copy, then branch off `df`, the frame the
# following feature-engineering cells operate on.
bank_churner_df_org = test_transform(x_test=bank_churner_df_org)
df = bank_churner_df_org.copy(deep=True)

In [None]:
# Print column dtypes and non-null counts of the preprocessed frame.
df.info()

In [None]:
# Scratch check: int(False) evaluates to 0. Nothing in the visible notebook reads this result.
int(False)

In [None]:
# select main label (kept as stored in the data; no re-encoding is applied here)
main_label = 'is_churned'

# Start from the preprocessed frame every time this cell runs. The steps below
# overwrite columns and drop the raw amount columns, so re-running the cell on
# an already-transformed `df` would fail with a KeyError.
df = bank_churner_df_org.copy()

# group columns by larger bins: age to the nearest 5, months on book to the
# nearest 6, and the Q4/Q1 ratios to the nearest 0.1
df['age'] = df['age'].apply(lambda x: 5*round(1/5*x))
df['mon_on_book'] = df['mon_on_book'].apply(lambda x: 6*round(1/6*x))
df['tot_amt_ratio_q4_q1'] = df['tot_amt_ratio_q4_q1'].apply(lambda x: 1/10*round(10*x))
df['tot_cnt_ratio_q4_q1'] = df['tot_cnt_ratio_q4_q1'].apply(lambda x: 1/10*round(10*x))

# log10(1 + x) transform the amount/count columns, bin the result to the
# nearest 0.2, and replace each raw column by its `log10_` counterpart
log_cols = ['credit_line', 'tot_revol_balance', 'mean_open_to_buy', 'tot_trans_amt_for_12m', 'tot_trans_cnt_for_12m']
for col in log_cols:
    df[f'log10_{col}'] = df[col].apply(lambda x: 1/5*round(5*np.log10(1+x)))
df = df.drop(columns=log_cols)

# set up the rare label encoder limiting number of categories to max_n_categories;
# tol is passed as 20 / number of rows; values falling below it are replaced with 'Other'
for col in ['sex', 'education', 'marital_stat', 'imcome_cat', 'card_type']:
    encoder = RareLabelEncoder(n_categories=1, max_n_categories=50, replace_with='Other', tol=20/df.shape[0])
    df[col] = encoder.fit_transform(df[[col]])
print(df.shape)
# fixed seed so the preview shows the same rows on every run
df.sample(5, random_state=0).T

In [None]:
# Summary statistics of df, transposed so that each described column becomes a row.
df.describe().T

## 머신 러닝

In [None]:
# initialize data: target vector and feature matrix
y = df[main_label].values.reshape(-1,)
X = df.drop([main_label], axis=1)
# Detect categorical columns on the feature matrix X, not on df: df still
# contains the target, and an object-typed target would not be found in
# X.columns, making the index lookup below raise ValueError.
cat_cols = X.select_dtypes(include=['object']).columns
# positional indices of the categorical columns, as passed to catboost.Pool later
cat_cols_idx = [list(X.columns).index(c) for c in cat_cols]
# 50/50 train/test split with a fixed seed
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.5, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Wrap each split in a CatBoost Pool so the categorical column positions
# travel together with the data.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols_idx)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_cols_idx)

# Training hyper-parameters, collected in one place.
catboost_params = dict(
    iterations=1200,
    depth=5,
    border_count=22,
    l2_leaf_reg=0.3,
    learning_rate=3e-2,
    verbose=0,
)
model = CatBoostClassifier(**catboost_params)

# Fit on the training pool only.
model.fit(train_pool)

# Column index 1 of predict_proba is used as the churn score for both splits.
y_train_pred, y_test_pred = (
    model.predict_proba(pool)[:, 1] for pool in (train_pool, test_pool)
)
roc_auc_train = roc_auc_score(y_train, y_train_pred)
roc_auc_test = roc_auc_score(y_test, y_test_pred)
print(f"ROC AUC score for train {round(roc_auc_train,4)}, and for test {round(roc_auc_test,4)}")

In [None]:
# Baseline ROC AUC: score every test row with one constant, the mean of the
# training labels, and evaluate that against the test labels.
constant_score = np.mean(y_train)
roc_auc_baseline = roc_auc_score(y_test, np.full(len(y_test), constant_score))
print(roc_auc_baseline)

## Explanations with SHAP values

In [None]:
# Load the SHAP JavaScript helpers and build a tree explainer for the fitted model.
shap.initjs()
ex = shap.TreeExplainer(model)

# Mean of the test labels, reported as the average churn probability.
mean_test_label = np.mean(y_test)
print(f"Average churn probability is {round(mean_test_label,4)}")

# SHAP values for the test features, summarised over up to 30 features.
shap_values = ex.shap_values(X_test)
shap.summary_plot(shap_values, features=X_test, max_display=30)

In [None]:
def show_shap(col, shap_values=shap_values, label=main_label, X_test=X_test, ylabel='SHAP value'):
    """Plot and print the mean SHAP value (with std) per distinct value of one feature.

    Parameters
    ----------
    col : str
        Name of a column of ``X_test``. Its position in ``X_test.columns``
        selects the matching column of ``shap_values``.
    shap_values : array indexable as ``shap_values[:, j]``
        SHAP values with one row per row of ``X_test``. The default is the
        notebook-level ``shap_values`` as it existed when this function was
        defined.
    label : str
        Only used in the plot title.
    X_test : pandas.DataFrame
        Feature frame the SHAP values were computed for. It is not modified.
    ylabel : str
        Y-axis label of the plot.

    Returns
    -------
    None
        The function draws an error-bar plot and prints the summary table
        (columns ``gain``, ``gain_std``, ``count``; index named ``col``),
        sorted by ``gain`` in descending order.
    """
    col_pos = X_test.columns.tolist().index(col)

    # Work on a two-column frame: the feature itself plus its SHAP values.
    df_infl = X_test[[col]].copy()
    df_infl['shap_'] = shap_values[:, col_pos]

    # Aggregate only the 'shap_' column. Calling .mean()/.std() on the whole
    # grouped frame would also try to average the string-valued feature
    # columns, which raises TypeError on pandas >= 2.0.
    grouped = df_infl.groupby(col)['shap_']
    df_res = (
        pd.DataFrame({
            'gain': grouped.mean().round(4),
            'gain_std': grouped.std().round(4),
            'count': grouped.count(),
        })
        .rename_axis('col')
        .sort_values('gain', ascending=False)
    )

    plt.figure(figsize=(8,6))
    plt.errorbar(df_res.index, df_res['gain'], yerr=df_res['gain_std'], fmt="o", color="r")
    plt.title(f'SHAP values for column {col}, label {label}')
    plt.ylabel(ylabel)
    plt.tick_params(axis="x", rotation=90)
    plt.show()
    print(df_res)

# Show the per-value SHAP breakdown for every feature column, with the
# column name printed between blank lines as a header.
for col in X_test.columns:
    print(f"\n{col}\n")
    show_shap(col, shap_values=shap_values, label=main_label, X_test=X_test)