# 데이터 탐색 과정
---

## 데이터 분석을 위한 Library 데이터 준비 과정

In [None]:
#!pip install watermark
%load_ext watermark
%watermark -a 'DataLine' -nmv --packages numpy,pandas,sklearn,matplotlib,seaborn,plotly,imblearn,missingno

### 분석을 위한 Library 로딩

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# import matplotlib as mpl
# import missingno as msno

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix

import scikitplot as skplt
from imblearn.over_sampling import SMOTE

warnings.filterwarnings('ignore')
# sns.set_theme(style="whitegrid")
# %matplotlib inline

# mpl.rc('font', family='Malgun Gothic')  # 한글 폰트 설정
#                                         # 윈도우 폰트 위치 - C:\Windows\Fonts
# plt.figure(figsize=(10,6))              # 그래프 사이즈 설정
# sns.set(font='Malgun Gothic', rc={'axes.unicode_minus':False}, style='darkgrid') # 마이너스 처리


### 데이터 세트 로딩
"data/bank_churner.csv"를 판다스 데이터프레임으로 로딩(pd.read_csv)

In [None]:
bank_churner_df = pd.read_csv("./data/bank_churner.csv")
bank_churner_df_org = bank_churner_df.copy()

### 데이터 세트 정보 확인
- 일부 Feature에 Null 값 존재함을 확인 함
- 향후 모델학습시 Null 값 처리에 대한 필요성 확인 함
- 모델학습을 위해 Object 항목을 적절하게 변형할 필요성을 확인 함 - sex, education, marital_stat, imcome_cat, card_type (5개 Features) 

In [None]:
bank_churner_df.info()

### 수치 데이터의 분포값 개략 확인

In [None]:
bank_churner_df.describe()

### 결측치 확인 및 시각화

In [None]:
# Per-column count of missing values.
null_counts = bank_churner_df.isnull().sum()
print(null_counts)

# Per-column share of missing values, in percent. The denominator is the
# actual row count instead of a hard-coded 8101, so the rates stay correct
# when the CSV changes.
null_rates = null_counts / len(bank_churner_df) * 100
null_rates


### 그래프 함수 정의

In [None]:
    
def print_category_graphs(df, column, column_desc):
    """Show a 3x2 Plotly dashboard relating a categorical column to churn.

    Panels, in row-major order:
      1. Stacked churned/retained counts per category, with the churn rate
         drawn on a secondary y-axis.
      2. Churn rate per category.
      3. Box plots of the column for churned and for retained rows.
      4-6. Donut charts of the category mix for all rows, retained rows and
         churned rows.

    Args:
        df: DataFrame containing ``column`` and an ``is_churned`` column
            (0 = retained, 1 = churned).
        column: Name of the column whose values are treated as categories.
        column_desc: Display label for the column, used in titles.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    retained_values = df.loc[df['is_churned'] == 0, column]
    churned_values = df.loc[df['is_churned'] == 1, column]

    # Every per-category series is aligned to one sorted category index.
    # Plotly consumes ``y`` and ``base`` positionally, so a category missing
    # from only one of the groups would otherwise shift the stacked bars.
    counts = df[column].value_counts().sort_index()
    exist_counts = retained_values.value_counts().reindex(counts.index, fill_value=0)
    churn_counts = churned_values.value_counts().reindex(counts.index, fill_value=0)
    # A category nobody churned from has a rate of 0, not NaN.
    churn_rates = (churn_counts / counts).fillna(0)

    fig = make_subplots(rows=3,
                        cols=2,
                        subplot_titles=('【 전체 현황 】', '【 이탈율 】', '【 사분위 】', f'【 {column_desc} 중 전체 현황 】', f'【 {column_desc} 중 유지 현황 】', f'【 {column_desc} 중 이탈 현황 】'),
                        horizontal_spacing=0.1,
                        vertical_spacing=0.1,
                        specs=[[{"secondary_y": True}, {}],
                               [{}, {'type': 'domain'}],
                               [{'type': 'domain'}, {'type': 'domain'}]]
                        )

    # Overview: churned bar at the bottom, retained bar stacked on top of it
    # (via ``base``), churn rate as a line on the secondary axis.
    fig.add_trace(go.Bar(x=counts.index, y=churn_counts, marker_color="red", offsetgroup=0, name='이탈',
                         text=churn_counts,
                         hovertemplate='%{label}: %{value:,}',
                         textposition='auto'), row=1, col=1, secondary_y=False)
    fig.add_trace(go.Bar(x=counts.index, y=exist_counts, marker_color="blue", offsetgroup=0, name='유지',
                         texttemplate='%{value:,}',
                         hovertemplate='%{label}: %{value:,}',
                         textposition='auto', base=churn_counts), row=1, col=1, secondary_y=False)
    fig.add_trace(go.Scatter(x=churn_rates.index, y=churn_rates, marker_color="green", name='이탈율',
                             line_shape='linear'), row=1, col=1, secondary_y=True)
    # The rate is a fraction, so pin the secondary axis to [0, 1].
    fig.update_yaxes(secondary_y=True, range=[0, 1], row=1, col=1)

    # Churn rate per category.
    fig.add_trace(go.Bar(x=churn_rates.index, y=churn_rates, marker_color="red", name='이탈율'),
                  row=1, col=2)

    # Quartiles of the column for churned vs. retained rows.
    fig.add_trace(go.Box(x=churned_values, marker_color="red", name='이탈'), row=2, col=1)
    fig.add_trace(go.Box(x=retained_values, marker_color="blue", name='유지'), row=2, col=1)

    def donut(values, name, title):
        """Build one donut chart of the category mix in ``values``."""
        return go.Pie(labels=values.index, values=values, name=name, title=title,
                      texttemplate="%{label}: %{value:,} <br>(%{percent})",
                      textposition="inside",
                      hole=.4, hoverinfo="label+percent+name")

    # Category mix: all rows, retained rows, churned rows.
    fig.add_trace(donut(counts, f'{column_desc} 분표 현황', '전체'), row=2, col=2)
    fig.add_trace(donut(exist_counts, "유지", '유지'), row=3, col=1)
    fig.add_trace(donut(churn_counts, "이탈", '이탈'), row=3, col=2)

    # Bold captions placed in paper coordinates at the donut centres.
    for x_pos, y_pos, caption in ((0.73, 0.5, "<b>전체</b>"),
                                  (0.21, 0.13, "<b>유지</b>"),
                                  (0.73, 0.13, "<b>이탈</b>")):
        fig.add_annotation(dict(x=x_pos, y=y_pos, ax=0, ay=0,
                                xref="paper", yref="paper",
                                text=caption,
                                font_size=20,
                                ))

    fig.update_layout(width=1200,
                      height=1200,
                      showlegend=False,
                      title_text=f'『 {column_desc} 』에 따른 분석 그래프',
                      hovermode="x",
                      )

    fig.show()

In [None]:
def print_continuous_graphs(df, column, column_desc):
    """Show a Plotly dashboard relating a continuous column to churn.

    Panels: stacked histograms of churned/retained rows with the per-value
    churn rate on a secondary axis (row 1, col 1); box plots per group
    (row 1, col 2); stacked histograms per group (row 2, col 1); per-value
    counts for each group plus the churn rate (row 2, col 2); and a histogram
    of the per-value churn rates (row 3, col 1).

    Args:
        df: DataFrame containing ``column`` and an ``is_churned`` column
            (0 = retained, 1 = churned).
        column: Name of the numeric column to analyse.
        column_desc: Display label for the column, used in the figure title.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    retained_values = df.loc[df['is_churned'] == 0, column]
    churned_values = df.loc[df['is_churned'] == 1, column]

    exist_counts = retained_values.value_counts().sort_index()
    churn_counts = churned_values.value_counts().sort_index()
    counts = df[column].value_counts().sort_index()
    # Churn rate per distinct value; a value with no churned rows gets 0
    # rather than NaN, so the rate line has no gaps.
    churn_rates = churn_counts.reindex(counts.index, fill_value=0) / counts

    fig = make_subplots(rows=3,
                        cols=2,
                        subplot_titles=('전체 건수 분포', '유지/이탈별 사분위', '유지/이탈별 분포'),
                        horizontal_spacing=0.1,
                        vertical_spacing=0.1,
                        specs=[[{"secondary_y": True}, {}],
                               [{"secondary_y": True}, {"secondary_y": True}],
                               [{"secondary_y": True}, {}],
                               ]
                        )

    def churn_rate_line():
        """Build the green churn-rate line shared by two panels."""
        return go.Scatter(x=churn_rates.index, y=churn_rates, marker_color="green",
                          name='이탈율', line_shape='linear')

    # Row 1, col 1: overall distribution with the churn rate overlaid.
    fig.add_trace(go.Histogram(x=churned_values, marker_color="red"), row=1, col=1, secondary_y=False)
    fig.add_trace(go.Histogram(x=retained_values, marker_color="blue"), row=1, col=1, secondary_y=False)
    fig.add_trace(churn_rate_line(), row=1, col=1, secondary_y=True)
    # The rate is a fraction, so pin the secondary axis to [0, 1].
    fig.update_yaxes(secondary_y=True, range=[0, 1], row=1, col=1)

    # Row 1, col 2: quartiles per group.
    fig.add_trace(go.Box(x=retained_values, name='유지', marker_color="blue"), row=1, col=2)
    fig.add_trace(go.Box(x=churned_values, name='이탈', marker_color="red"), row=1, col=2)

    # Row 2, col 1: distribution per group.
    fig.add_trace(go.Histogram(x=retained_values, marker_color="blue"), row=2, col=1)
    fig.add_trace(go.Histogram(x=churned_values, marker_color="red"), row=2, col=1)

    # Row 2, col 2: counts per distinct value for each group, plus the rate.
    fig.add_trace(go.Scatter(x=churn_counts.index, y=churn_counts, mode='lines+markers', marker_color="red", name='이탈'), row=2, col=2, secondary_y=False)
    fig.add_trace(go.Scatter(x=exist_counts.index, y=exist_counts, mode='lines+markers', marker_color='blue', name='유지'), row=2, col=2, secondary_y=False)
    fig.add_trace(churn_rate_line(), row=2, col=2, secondary_y=True)

    # Row 3, col 1: distribution of the per-value churn rates.
    fig.add_trace(go.Histogram(x=churn_rates), row=3, col=1)
    # NOTE(review): no trace is drawn on this panel's secondary axis; the
    # call is kept as in the original — confirm whether row=2, col=2 was meant.
    fig.update_yaxes(secondary_y=True, range=[0, 1], row=3, col=1)

    fig.update_layout(width=1200,
                      height=1200,
                      showlegend=False,
                      title_text=f'『 {column_desc} 』에 따른 분석 그래프',
                      barmode='stack'
                      )

    fig.show()


## Feature별 특징 확인

### is_churned : 이탈  여부
- 0 : 유지, 1 : 이탈

In [None]:
bank_churner_df["is_churned"].value_counts()

In [None]:
# Null share of is_churned. The total is the full row count (len); count()
# skips nulls, which made the old ratio null/non-null instead of null/total.
tot_cnt = len(bank_churner_df['is_churned'])
tot_null_cnt = bank_churner_df['is_churned'].isnull().sum()
null_ratio = tot_null_cnt / tot_cnt if tot_cnt else 0  # guard an empty frame
print(f'전체 데이터 건수 = {tot_cnt:,} Null 건수 = {tot_null_cnt:,} 전체 데이터 중 널 비율 =  {round(null_ratio,2)}') 

In [None]:
# Donut chart of retained vs. churned customers.
churn_dist = bank_churner_df['is_churned'].value_counts().sort_index()
# Labels are derived from the actual codes instead of assuming their order.
churn_labels = [{0: '유지', 1: '이탈'}.get(code, str(code)) for code in churn_dist.index]

fig = go.Figure()
fig.add_trace(go.Pie(labels=churn_labels, values=churn_dist, name='이탈별 분포',
                     texttemplate="%{value:,}명 <br><b>(%{percent})</b>",
                     # Total computed from the data rather than hard-coded.
                     title=f'<b>전체<br> {churn_dist.sum():,}</b>',
                     textposition="inside"))
fig.update_traces(hole=.4, hoverinfo="label+percent+name", pull=[0, 0.1])
fig.update_layout(width=500,
                  height=500,
                  showlegend=True,
                  title_text="<b>유지/이탈 분포 현황</b>",
                  title_x=0.5,
                  title_y=0.9,
                  title_xanchor="center",
                  title_yanchor="middle")


fig.show()



### age : 나이

In [None]:
print_continuous_graphs(bank_churner_df, 'age', '나이')

- 나이에 대해 일부 이상치가 있지만, 대체적으로 유지 고객과 이탈 고객의 분포가 일치

In [None]:
# Bucket ages into decades
def age_categorize(age):
    """Return ``age`` floored to a multiple of 10 (e.g. 47 -> 40)."""
    decade = age // 10
    return decade * 10

bank_churner_df['age_category'] = bank_churner_df.age.apply(age_categorize)

In [None]:
print_category_graphs(bank_churner_df, 'age_category', '나이 범주')

### sex : 성별

In [None]:
print_category_graphs(bank_churner_df, 'sex', '성별')

남성과 여성의 이탈율에는 큰 차이가 없음

### dependent_num : 부양가족수

In [None]:
print_category_graphs(bank_churner_df, 'dependent_num', '부양가족수')

- 부양가족수별 고객 이탈율은 비슷한 형태를 보이고 있음

### education : 교육수준
- Graduate : 대졸
- High School : 고졸
- Unknown : 미상
- Uneducated : 미교육
- College : 단과대학
- Post-Graduate : 대학원(석사)
- Doctorate : 박사

In [None]:
print_category_graphs(bank_churner_df, 'education', '교육수준')

In [None]:
# 박사 학위를 받은 고객의 경우 이탈율이 더 높게 나타나고, 고등학교 및 대학 교육을 받은 고객은 이탈율이 더 낮은 것으로 보이나,
# 박사 학위 고객 수는 가장 적음
# 박사 학위의 경우 카드사의 혜택을 더 많이 고려하여 이동하는 것으로 보임

### marital_stat : 결혼상태
- Married  : 결혼
- Single   : 미혼
- Divorced : 이혼

In [None]:
print_category_graphs(bank_churner_df, 'marital_stat', '결혼상태')

- 결혼 상태에 따른 고객 이탈율의 차이는 없는 것으로 보임

### imcome_cat : 수입규모
- Less than $40K    2277
- $40K - $60K       1151
- $60K - $80K        891
- $80K - $120K       988
- $120K +            473
- Unknown            702


In [None]:
print_category_graphs(bank_churner_df, 'imcome_cat', '수입규모')

수입규모가 12만 달러 이상인 고객과 4만 달러 미만인 고객의 이탈율이 높게 나타나고 있으며,
전체 고객 대비 수입규모가 4만 달러 미만인 고객이 차지하는 점유율이 높으므로 이 범주의 고객을 중점적으로 관리할 필요가
있음
전반적으로 수입규모별 이탈율은 큰 차이가 없음

### card_type : 카드종류
- Blue       
- Silver     
- Gold       
- Platinum   

In [None]:
print_category_graphs(bank_churner_df, 'card_type', '카드종류')

플래티넘 카드 소유자의 건수는 매우 낮음에도 이탈율은 상당히 높게 나타나고 있음.
이는 플래티넘 카드를 사용하는 고객의 만족도가 매우 낮은 것으로 보이며 플래티넘 카드의 혜택을 다른 카드사와 비교하여 
부족한 부문을 찾아내어 개선하거나 고객이 만족할만한 혜택을 제공하여 이탈율을 낮출 필요가 있음 

### mon_on_book : 은행 거래 기간
- 은행 거래 개월 수

In [None]:
print_continuous_graphs(bank_churner_df, 'mon_on_book', '은행 거래 기간')

In [None]:
# Bucket the bank relationship length (months on book) into 10-month bands,
# e.g. 30, 40, 50
def calcUseMonth(mon_on_book):
    """Return ``mon_on_book`` floored to a multiple of 10 (e.g. 36 -> 30)."""
    band = mon_on_book // 10
    return band * 10


bank_churner_df['mon_on_book_category'] = bank_churner_df.mon_on_book.apply(calcUseMonth)


In [None]:
print_category_graphs(bank_churner_df, 'mon_on_book_category', '은행거래기간 범주')

은행거래 기간에 따른 이탈율은 큰 차이가 없음

### tot_product_count : 현재 보유 상품 개수

In [None]:
print_category_graphs(bank_churner_df, 'tot_product_count', '현재 보유 상품 개수')

- 이탈하지 않은 고객의 카드 보유 갯수의 중앙값은 4개이고 이탈한 고객의 중앙값은 3개임.
- 카드의 개수가 적을수록 이탈율이 높아지므로 4개 이상의 카드를 보유할 수 있도록 노력 필요

### months_inact_for_12m : 최근 12개월 동안 카드 거래가 없었던 개월 수

In [None]:
print_category_graphs(bank_churner_df, 'months_inact_for_12m', '최근 12개월 동안 카드 거래가 없었던 개월 수')

- 카드 거래가 없었던 기간이 4개월일 때 이탈율이 정점을 찍고 이후 감소하는 경향을 보이고 있음
- 이탈한 고객의 평균 기간은 3개월이고 유지하는 고객의 경우 2개월 임
- 1개월 이상 카드 거래가 없는 고객은 잠재적으로 이탈할 확률이 높을 것으로 예상되므로 주기적으로 관리할 필요가 있음


### contact_cnt_for_12m : 최근 12개월 동안 연락 횟수

In [None]:
print_category_graphs(bank_churner_df, 'contact_cnt_for_12m', '최근 12개월 동안 연락 횟수')

- 고객 접촉 건수와 이탈율 사이에는 명확한 관계가 있어 보임
- 접촉 건수가 많을 수록 이탈율이 높아지는 경향을 보임

- 데이터상 12개월 동안의 접촉 건수가 고객이 접촉한 건수인지, 은행에서 접촉한 건수인지를 구분하여 분석할 필요가 있음
   - 은행이 고객을 접촉한 건수라면 연체 등으로 인한 잦은 독촉으로 이탈율이 높아질 가능성이 있어 보이며
   - 고객이 은행을 접촉한 건수라면 카드 관련 서비스의 불만을 원활하게 해결하지 못해 건수가 증가하고 이로 인해 이탈율이 상승.<br>
   따라서, 카드 관련 부서에서는 대고객 접촉 서비스를 세부적으로 분석할 필요가 있으며 대고객 대응 가이드를 점검해 볼 필요가 있음


- 대고객 접촉 채널은 고객 행동을 분석하는데 있어 중요한 항목이므로 세부적으로 관리할 필요가 있으며 정기적으로 분석할 필요가 있음 
- 접촉 채널, 접촉 주체, 접촉 내용 구분, 접촉 세부 내용, 문제해결 여부, 접촉에 대한 만족도 등 

### credit_line : 카드 한도

In [None]:
bank_churner_df['credit_line']

In [None]:
print_continuous_graphs(bank_churner_df, 'credit_line', '카드 한도')


In [None]:
plt.title('카드 이탈 여부별 한도별 분포')
sns.histplot(x='credit_line', data=bank_churner_df, kde=True, hue='is_churned')

In [None]:
def calcCreditLine(credit_line):
    """Return ``credit_line`` floored to a multiple of 1000 (e.g. 3450 -> 3000)."""
    thousands = credit_line // 1000
    return thousands * 1000

bank_churner_df['credit_line_category'] = bank_churner_df.credit_line.apply(calcCreditLine)
print_category_graphs(bank_churner_df, 'credit_line_category', '카드 한도 범주')


신용한도에 따른 이탈율은 큰 특징은 없음

### tot_revol_balance : 리볼빙 잔액

In [None]:
print_continuous_graphs(bank_churner_df, 'tot_revol_balance', '리볼빙 잔액')


In [None]:
plt.figure(figsize=(12, 6))
plt.title('리볼빙 잔액 분포 및 이탈여부 현황')
sns.histplot(x='tot_revol_balance', data=bank_churner_df, kde=True, hue='is_churned')

In [None]:
def calcRevolBal(tot_revol_balance):
    """Return ``tot_revol_balance`` floored to a multiple of 1000 (e.g. 2517 -> 2000)."""
    thousands = tot_revol_balance // 1000
    return thousands * 1000

bank_churner_df['tot_revol_balance_category'] = bank_churner_df.tot_revol_balance.apply(calcRevolBal)

In [None]:
print_category_graphs(bank_churner_df, 'tot_revol_balance_category', '리볼빙 잔액 범주')

- 신용카드 결제대상 잔액이 적을수록 이탈에 대한 비율이 커지므로 약 $1,000 이하 잔액이 있는 고객을 대상으로 마케팅할 필요가 있음  

### mean_open_to_buy : 평균 사용가능 신용한도

In [None]:
print_continuous_graphs(bank_churner_df, 'mean_open_to_buy', '평균 사용가능 신용한도')


In [None]:
plt.figure(figsize=(12, 6))
plt.title('평균 사용가능 신용한도 분포 및 이탈 현황')
sns.histplot(x='mean_open_to_buy', data=bank_churner_df, kde=True, hue='is_churned')

In [None]:
def calcMeanOpenBuy(mean_open_to_buy):
    """Return ``mean_open_to_buy`` floored to a multiple of 1000 (e.g. 12691.5 -> 12000.0)."""
    thousands = mean_open_to_buy // 1000
    return thousands * 1000

bank_churner_df['mean_open_to_buy_category'] = bank_churner_df.mean_open_to_buy.apply(calcMeanOpenBuy)

# 범주화를 줄일 필요가 있음 ??

In [None]:
bank_churner_df['mean_open_to_buy_category'].value_counts()

In [None]:
print_category_graphs(bank_churner_df, 'mean_open_to_buy_category', '평균 사용가능 신용한도 범주')


### tot_amt_ratio_q4_q1 : 1분기 대비 4분기의 거래 금액 비율

In [None]:
print_continuous_graphs(bank_churner_df, 'tot_amt_ratio_q4_q1', '1분기 대비 4분기의 거래 금액 비율')


In [None]:
# Null share of tot_amt_ratio_q4_q1. The total is the full row count (len);
# count() skips nulls, which made the old ratio null/non-null instead of
# null/total.
tot_cnt = len(bank_churner_df['tot_amt_ratio_q4_q1'])
tot_null_cnt = bank_churner_df['tot_amt_ratio_q4_q1'].isnull().sum()
null_ratio = tot_null_cnt / tot_cnt if tot_cnt else 0  # guard an empty frame
print(f'전체 데이터 건수 = {tot_cnt:,} Null 건수 = {tot_null_cnt:,} 전체 데이터 중 널 비율 =  {round(null_ratio,2)}') 

In [None]:
plt.figure(figsize=(12, 6))
plt.title('1분기 대비 4분기의 거래 금액 분포 및 이탈 현황')
sns.histplot(x='tot_amt_ratio_q4_q1', data=bank_churner_df, kde=True, hue='is_churned')

In [None]:
np.ceil(bank_churner_df.tot_amt_ratio_q4_q1 * 10) / 10


In [None]:
# Bin the Q4/Q1 amount ratio by flooring to one decimal place.
# Missing ratios stay NaN in the derived column.
bank_churner_df['tot_amt_ratio_q4_q1_category'] = np.floor(bank_churner_df['tot_amt_ratio_q4_q1'] * 10) / 10

In [None]:
print_category_graphs(bank_churner_df, 'tot_amt_ratio_q4_q1_category', '1분기 대비 4분기의 거래 금액 비율 범주')


- 1분기 대비 4분기의 거래 금액의 변화는 비슷하나 이탈하지 않은 고객의 분포를 보면 이상치가 많음. 

### tot_trans_amt_for_12m : 최근 12개월 동안의 거래 금액

In [None]:
print_continuous_graphs(bank_churner_df, 'tot_trans_amt_for_12m', '최근 12개월 동안의 거래 금액')


In [None]:
# Null share of tot_trans_amt_for_12m. The total is the full row count (len);
# count() skips nulls, which made the old ratio null/non-null instead of
# null/total.
tot_cnt = len(bank_churner_df['tot_trans_amt_for_12m'])
tot_null_cnt = bank_churner_df['tot_trans_amt_for_12m'].isnull().sum()
null_ratio = tot_null_cnt / tot_cnt if tot_cnt else 0  # guard an empty frame
print(f'전체 데이터 건수 = {tot_cnt:,} Null 건수 = {tot_null_cnt:,} 전체 데이터 중 널 비율 =  {round(null_ratio,2)}') 

In [None]:
plt.figure(figsize=(12, 6))
plt.title('최근 12개월 동안의 거래 금액 분포 및 이탈 현황')
sns.histplot(x='tot_trans_amt_for_12m', data=bank_churner_df, kde=True, hue='is_churned')

In [None]:
# Bin the 12-month transaction amount into $1,000-wide buckets (floor).
# Computed directly on the column instead of through calcMeanOpenBuy, a
# helper named for a different feature and defined in an unrelated cell.
bank_churner_df['tot_trans_amt_for_12m_category'] = bank_churner_df['tot_trans_amt_for_12m'] // 1000 * 1000

In [None]:
print_category_graphs(bank_churner_df, 'tot_trans_amt_for_12m_category', '최근 12개월 동안의 거래 금액 범주')


In [None]:
plt.figure(figsize=(12, 6))
plt.title('최근 12개월 동안의 거래 금액 분포 및 이탈 현황')
sns.countplot(x='tot_trans_amt_for_12m_category', data=bank_churner_df, hue='is_churned')

# 범주화할 필요가 있음???

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
sns.boxplot(data=bank_churner_df, x='tot_trans_amt_for_12m', y='is_churned', orient='h', ax=axs[0])
sns.kdeplot(data=bank_churner_df, x='tot_trans_amt_for_12m', hue='is_churned', common_norm=False, ax=axs[1])
axs[0].set_ylabel('')
axs[1].set_ylabel('')
plt.show()

이탈하지 않은 고객의 거래 금액이 더 높은 것으로 나타나고 있으나, 거래 금액별로 이탈율이 다르게 나타나고 있음

### tot_trans_cnt_for_12m : 최근 12개월 동안의 거래 횟수

In [None]:
print_continuous_graphs(bank_churner_df, 'tot_trans_cnt_for_12m', '최근 12개월 동안의 거래 횟수')


In [None]:
# Null share of tot_trans_cnt_for_12m. The total is the full row count (len);
# count() skips nulls, which made the old ratio null/non-null instead of
# null/total.
tot_cnt = len(bank_churner_df['tot_trans_cnt_for_12m'])
tot_null_cnt = bank_churner_df['tot_trans_cnt_for_12m'].isnull().sum()
null_ratio = tot_null_cnt / tot_cnt if tot_cnt else 0  # guard an empty frame
print(f'전체 데이터 건수 = {tot_cnt:,} Null 건수 = {tot_null_cnt:,} 전체 데이터 중 널 비율 =  {round(null_ratio,2)}') 

In [None]:
plt.figure(figsize=(12, 6))
plt.title('최근 12개월 동안의 거래 횟수 분포 및 이탈 현황')
sns.histplot(x='tot_trans_cnt_for_12m', data=bank_churner_df, kde=True, hue='is_churned')

In [None]:
# 거래 횟수 범주화 : 10 단위
bank_churner_df['tot_trans_cnt_for_12m_category'] = bank_churner_df['tot_trans_cnt_for_12m'] // 10 * 10

In [None]:
print_category_graphs(bank_churner_df, 'tot_trans_cnt_for_12m_category', '최근 12개월 동안의 거래 횟수 범주')


In [None]:
plt.figure(figsize=(12, 6))
plt.title('최근 12개월 동안의 거래 금액 분포 및 이탈 현황')
sns.countplot(x='tot_trans_cnt_for_12m_category', data=bank_churner_df, hue='is_churned')

In [None]:
bank_churner_df['tot_trans_cnt_for_12m_category'].value_counts()

In [None]:
# 거래건수가 많을수록 유지되는 고객이 많으며 최근 12개월 동안 거래건수가 50번 이하인 고객의 이탈율이 높아지므로 정기적으로 거래건수를 분석하여 대응할 필요가 있음

### tot_cnt_ratio_q4_q1 : 1분기 대비 4분기의 거래 횟수 비율

In [None]:
print_continuous_graphs(bank_churner_df, 'tot_cnt_ratio_q4_q1', '1분기 대비 4분기의 거래 횟수 비율')


In [None]:
# Null share of tot_cnt_ratio_q4_q1. The total is the full row count (len);
# count() skips nulls, which made the old ratio null/non-null instead of
# null/total.
tot_cnt = len(bank_churner_df['tot_cnt_ratio_q4_q1'])
tot_null_cnt = bank_churner_df['tot_cnt_ratio_q4_q1'].isnull().sum()
null_ratio = tot_null_cnt / tot_cnt if tot_cnt else 0  # guard an empty frame
print(f'전체 데이터 건수 = {tot_cnt:,} Null 건수 = {tot_null_cnt:,} 전체 데이터 중 널 비율 =  {round(null_ratio,2)}') 

In [None]:
plt.figure(figsize=(12, 6))
plt.title('1분기 대비 4분기의 거래 횟수 분포 및 이탈 현황')
sns.histplot(x='tot_cnt_ratio_q4_q1', data=bank_churner_df, kde=True, hue='is_churned')

In [None]:
# Preview the raw ratio column. The derived 'tot_cnt_ratio_q4_q1_category'
# column does not exist yet at this point of a top-to-bottom run, so reading
# it here raised KeyError.
bank_churner_df['tot_cnt_ratio_q4_q1']

In [None]:
# Bin the Q4/Q1 transaction-count ratio by flooring to one decimal place.
# Missing ratios stay NaN in the derived column.
bank_churner_df['tot_cnt_ratio_q4_q1_category'] = np.floor(bank_churner_df['tot_cnt_ratio_q4_q1'] * 10) / 10

In [None]:
print_category_graphs(bank_churner_df, 'tot_cnt_ratio_q4_q1_category', '1분기 대비 4분기의 거래 횟수 비율 범주')


In [None]:
plt.figure(figsize=(12, 6))
plt.title('1분기 대비 4분기의 거래 횟수 분포 및 이탈 현황')
sns.countplot(x='tot_cnt_ratio_q4_q1_category', data=bank_churner_df, hue='is_churned')


In [None]:
bank_churner_df_cnt_ratio_churn = bank_churner_df[bank_churner_df['is_churned'] == 1][['tot_cnt_ratio_q4_q1_category', 'tot_cnt_ratio_q4_q1']]
bank_churner_df_cnt_ratio_churn

In [None]:
plt.figure(figsize=(12, 6))
plt.title('이탈 고객중 1분기 대비 4분기의 거래 횟수 분포')
sns.histplot(x='tot_cnt_ratio_q4_q1', data=bank_churner_df_cnt_ratio_churn)

In [None]:
plt.figure(figsize=(12, 6))
plt.title('이탈 고객중 1분기 대비 4분기의 거래 횟수 분포')
sns.countplot(x='tot_cnt_ratio_q4_q1_category', data=bank_churner_df_cnt_ratio_churn)

### mean_util_pct : 평균 한도 소진율

In [None]:
print_continuous_graphs(bank_churner_df, 'mean_util_pct', '평균 한도 소진율')


In [None]:
bank_churner_df['mean_util_pct'].describe()

In [None]:
# Null share of mean_util_pct. The total is the full row count (len);
# count() skips nulls, which made the old ratio null/non-null instead of
# null/total.
tot_cnt = len(bank_churner_df['mean_util_pct'])
tot_null_cnt = bank_churner_df['mean_util_pct'].isnull().sum()
null_ratio = tot_null_cnt / tot_cnt if tot_cnt else 0  # guard an empty frame
print(f'전체 데이터 건수 = {tot_cnt:,} Null 건수 = {tot_null_cnt:,} 전체 데이터 중 널 비율 =  {round(null_ratio,2)}') 

In [None]:
plt.figure(figsize=(12, 6))
plt.title('평균 한도 소진율 분포 및 이탈 현황')
sns.histplot(x='mean_util_pct', data=bank_churner_df, kde=True, hue='is_churned')

In [None]:
# Bin the mean utilisation ratio by flooring to one decimal place.
# Missing values stay NaN in the derived column.
bank_churner_df['mean_util_pct_category'] = np.floor(bank_churner_df['mean_util_pct'] * 10) / 10

In [None]:
print_category_graphs(bank_churner_df, 'mean_util_pct_category', '평균 한도 소진율 범주')


In [None]:
plt.figure(figsize=(12, 6))
plt.title('평균 한도 소진율 분포 및 이탈 현황')
sns.countplot(x='mean_util_pct_category', data=bank_churner_df, hue='is_churned')


In [None]:
bank_churner_df['mean_util_pct_category'].value_counts()

In [None]:
bank_churner_df_mean_util_pct_churn = bank_churner_df[bank_churner_df['is_churned'] == 1][['mean_util_pct_category', 'mean_util_pct']]
bank_churner_df_mean_util_pct_churn

In [None]:
plt.figure(figsize=(12, 6))
plt.title('이탈 고객중 평균 한도 소진율 분포')
sns.histplot(x='mean_util_pct', data=bank_churner_df_mean_util_pct_churn)

In [None]:
plt.figure(figsize=(12, 6))
plt.title('이탈 고객중 평균 한도 소진율 분포')
sns.countplot(x='mean_util_pct_category', data=bank_churner_df_mean_util_pct_churn)

한도 소진율이 낮을수록 이탈율은 올라가는 경향이 있음

In [None]:
# Histogram with a KDE overlay for each continuous feature, laid out on one
# shared 4x4 grid.
plt.figure(figsize=(18, 18))
plt.rcParams['axes.facecolor'] = 'white'  # white panel background for every subplot

for i, col in enumerate([
    'age', 'mon_on_book', 'credit_line', 'tot_revol_balance',
    'mean_open_to_buy', 'tot_amt_ratio_q4_q1', 'tot_trans_amt_for_12m',
    'tot_trans_cnt_for_12m', 'mean_util_pct',
]):
    ax = plt.subplot(4, 4, i + 1)  # subplot slots are 1-based
    sns.histplot(data=bank_churner_df, x=col, ax=ax, color='red', kde=True)
    ax.tick_params(axis='both', labelsize=14)
    ax.set_xlabel(col, fontsize=18)
    ax.set_ylabel('Count', fontsize=18)

plt.suptitle('Data distribution of continuous variables', fontsize=18)
plt.tight_layout()

In [None]:
# for i, col in enumerate(['age', 'mon_on_book', 'credit_line', tot_revol_balance','mean_open_to_buy','tot_amt_ratio_q4_q1','tot_trans_amt_for_12m','tot_trans_cnt_for_12m','mean_util_pct']):    

## 다변량 분석

### 성별, 카드 종류별 비율
- Blue       
- Silver     
- Gold       
- Platinum   

In [None]:
# Gender split overall (large pie, left) and among Platinum and Silver card
# holders (two pies, right).
fig = make_subplots(
    rows=2, cols=2,
    # One title per drawn subplot; the bottom-left cell is merged into the
    # top-left pie through rowspan.
    subplot_titles=('', '<b>Platinum Card Holders</b>', '<b>Silver Card Holders</b>'),
    vertical_spacing=0.09,
    specs=[[{"type": "pie", "rowspan": 2}, {"type": "pie"}],
           [None, {"type": "pie"}]],
)

# Labels are derived from the codes actually present in each count series.
# value_counts() orders by frequency, so hard-coded positional labels could
# end up attached to the wrong slice.
sex_counts = bank_churner_df.sex.value_counts().sort_index()
sex_names_ko = {'F': '여자', 'M': '남자'}
fig.add_trace(
    go.Pie(values=sex_counts.values,
           labels=[f"<b>{sex_names_ko.get(code, code)}</b>" for code in sex_counts.index],
           hole=0.3,
           pull=[0, 0.3]),
    row=1, col=1
)

sex_names_en = {'F': 'Female', 'M': 'Male'}
for grid_row, card_type, pull_out in ((1, 'Platinum', 0.05), (2, 'Silver', 0.2)):
    card_sex_counts = (
        bank_churner_df.loc[bank_churner_df['card_type'] == card_type, 'sex']
        .value_counts()
        .sort_index()
    )
    fig.add_trace(
        go.Pie(
            labels=[f"{sex_names_en.get(code, code)} {card_type} Card Holders"
                    for code in card_sex_counts.index],
            values=card_sex_counts.values,
            pull=[0, pull_out],
            hole=0.3
        ),
        row=grid_row, col=2
    )

fig.update_layout(
    height=800,
    showlegend=True,
    title_text="<b>Distribution Of Gender And Different Card Statuses</b>",
)

fig.show()

# 전처리 과정

## 유사도, 상관도 분석

In [None]:
# Set the seaborn theme and the 20x20 figure size before drawing; setting
# them after the heatmap only affected later figures.
sns.set(rc={'figure.figsize': (20, 20)})

# Correlation matrix over the numeric columns only. The frame also holds
# text columns, on which DataFrame.corr() raises in pandas 2.x.
corr = bank_churner_df_org.select_dtypes(include='number').corr()
sns.heatmap(corr.round(1),
            annot=True,         # print the coefficient in each cell
            fmt='.1f',          # one decimal place
            cmap='coolwarm',    # diverging colour palette
            vmax=1.0,           # upper bound of the colour scale
            vmin=-1.0,          # lower bound of the colour scale
            linecolor='white',  # cell border colour
            linewidths=.05)     # cell border width

In [None]:
sns.pairplot(data = bank_churner_df_org)
plt.show()

다중공선성: 다중공선성은 회귀 모델에서 두 개 이상의 독립변수가 서로 높은 상관관계를 가질 때 발생합니다. 이로 인해 회귀 계수의 해석이 어려워지고 모델의 안정성과 신뢰성이 낮아질 수 있습니다.

age와 mon_on_book, credit_line과 mean_open_to_buy, tot_trans_cnt_for_12m와 tot_trans_amt_for_12m 사이에도 강한 상관관계가 있음을 관찰
이로 인해 mon_on_book, mean_open_to_buy, tot_trans_cnt_for_12m 열을 제거할 예정입니다.

컬럼명               Null 건수
------------------- ---------  
age                         0 - 삭제1
sex                       808
imcome_cat               1619
mon_on_book                 0 - 삭제1 : 삭제
credit_line                 0 - 삭제2
tot_revol_balance        1521                 - 공선성1 
mean_open_to_buy            0 - 삭제2 : 삭제 
tot_amt_ratio_q4_q1      2435
tot_trans_amt_for_12m    1669 - 삭제3
tot_trans_cnt_for_12m    3250 - 삭제3 : 삭제
tot_cnt_ratio_q4_q1      1629
mean_util_pct            2526                 - 공선성1 

## 정규화, 표준화, 불필요 컬럼 삭제, 다중공선성 컬럼 제거 

In [None]:
bank_churner_df = pd.read_csv("./data/bank_churner.csv")
bank_churner_df_org = bank_churner_df.copy()

In [None]:
bank_churner_df = pd.read_csv("./data/bank_churner.csv")
bank_churner_df_org = bank_churner_df.copy()

def test_transform(x_test):
    """Return a preprocessed copy of ``x_test``; the input frame is not modified.

    Removes the customer-number column and the three columns dropped for
    multicollinearity, and encodes ``sex`` as 0 for 'F' and 1 for 'M'.
    """
    columns_to_remove = [
        'cstno',                  # customer number, not a feature
        'mon_on_book',            # the rest are dropped for multicollinearity
        'mean_open_to_buy',
        'tot_trans_cnt_for_12m',
    ]
    cleaned = x_test.drop(columns_to_remove, axis=1)

    # Encode sex: 'F' -> 0, 'M' -> 1; any other value is left as is.
    cleaned['sex'] = cleaned['sex'].replace({'F': 0, 'M': 1})

    return cleaned

bank_churner_df_org = test_transform(bank_churner_df_org)


In [None]:
# Set the seaborn theme and the 20x20 figure size before drawing; setting
# them after the heatmap only affected later figures.
sns.set(rc={'figure.figsize': (20, 20)})

# Correlation matrix over the numeric columns only. The frame still holds
# text columns, on which DataFrame.corr() raises in pandas 2.x.
corr = bank_churner_df_org.select_dtypes(include='number').corr()
sns.heatmap(corr.round(1),
            annot=True,         # print the coefficient in each cell
            fmt='.1f',          # one decimal place
            cmap='coolwarm',    # diverging colour palette
            vmax=1.0,           # upper bound of the colour scale
            vmin=-1.0,          # lower bound of the colour scale
            linecolor='white',  # cell border colour
            linewidths=.05)     # cell border width

In [None]:
sns.pairplot(data = bank_churner_df_org)
plt.show()

In [None]:
sns.pairplot(hue = 'is_churned', data = bank_churner_df_org)
plt.show()

이 그래프를 시각적으로 분석하면 특정 변수에 대한 명확한 클러스터링 패턴을 관찰할 수 있습니다. 패턴이 다음과 같을 때 관심이 갑니다.
신용 한도의 대각선 그래프를 보면 은행을 떠나기로 결정한 사람들은 신용 한도가 낮은 사람들입니다.
신용한도가 높은 사람들은 은행을 떠나지 않기로 결정합니다(파란색 그래프의 정점). 신용 한도와 다른 변수를 그래프로 표시할 때,
흥미로운 클러스터링 패턴을 찾을 수 있습니다.
얼핏 보면 고객의 체류 여부를 결정하는 데 다음과 같은 변수가 중요한 영향을 미치는 것으로 보입니다.

age, credit_line, tot_revol_balance, tot_amt_ratio_q4_q1, tot_trans_amt_for_12m, tot_cnt_ratio_q4_q1, mean_util_pct

In [None]:
bank_churner_df_org.head()

In [None]:
# Customers per education level, split by churn flag.
plt.figure(figsize=(18, 6))
sns.set(style="whitegrid")
sns.set_palette("Set1")
sns.countplot(x="education", data=bank_churner_df_org, hue='is_churned', edgecolor='black')

plt.title("education vs Customer churn", fontsize=18)
plt.xlabel("education", fontsize=18)
plt.ylabel("Customers", fontsize=18)

# Light grey, slightly transparent legend box.
legend = plt.legend(title="Response", labels=["No", "Yes"], fontsize=14)
legend.get_frame().set(facecolor='0.9', alpha=0.8)

plt.xticks(rotation=90, fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
# Share of retained (0) and churned (1) customers within each education
# level; a level with no rows for one outcome gets 0.
education_percents = (
    bank_churner_df_org.groupby('education')['is_churned']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
    .reset_index()
)

education_percent_table = education_percents.rename(columns={0: 'No_Percentage', 1: 'Yes_Percentage'})

# Sort while the values are still numeric. Sorting the formatted strings
# compared them lexicographically (e.g. "9.50%" ranked above "16.00%").
education_percent_table = education_percent_table.sort_values(by='Yes_Percentage', ascending=False)

# Format as percentages for display only, after sorting.
for pct_col in ('No_Percentage', 'Yes_Percentage'):
    education_percent_table[pct_col] = education_percent_table[pct_col].apply(lambda x: f"{x:.2%}")

# Print the table without the index column.
print(education_percent_table[['education', 'No_Percentage', 'Yes_Percentage']].to_string(index=False))

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Share of retained ("No", is_churned == 0) and churned ("Yes", is_churned == 1)
# customers within each education level.
education_percents = bank_churner_df_org.groupby('education')['is_churned'].value_counts(normalize=True).unstack()
education_percents.reset_index(inplace=True)
education_percents.fillna(0, inplace=True)  # a class missing from a level counts as 0 %

education_percent_table = education_percents.rename(columns={0: 'No_Percentage', 1: 'Yes_Percentage'})

# Fractions -> percent values.
pct_columns = ['No_Percentage', 'Yes_Percentage']
education_percent_table[pct_columns] = education_percent_table[pct_columns] * 100

# Highest churn share first.
education_percent_table = education_percent_table.sort_values(by='Yes_Percentage', ascending=False)

custom_palette = sns.color_palette("pastel")

# Four charts per row: the original 2x4 grid, growing by rows if there are
# more than eight categories.
n_charts = len(education_percent_table)
n_rows = max(2, -(-n_charts // 4))
fig, axes = plt.subplots(nrows=n_rows, ncols=4, figsize=(20, 5 * n_rows))
axes = axes.flatten()

# One pie per education level. Axes are taken by position in the sorted table;
# indexing by the row's index label would ignore the sort above.
for ax, (_, row) in zip(axes, education_percent_table.iterrows()):
    ax.pie(
        [row['No_Percentage'], row['Yes_Percentage']],
        explode=(0.1, 0),  # pull the "No" slice out
        labels=['No', 'Yes'],
        colors=custom_palette,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
        textprops={'fontsize': 20},
    )
    ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    ax.set_title(row['education'], fontsize=25)

# Remove the axes that received no chart.
for unused_ax in axes[n_charts:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

#교육 수준이 높은 사람들이 은행을 더 자주 떠나는 경향이 있음을 관찰할 수 있습니다.

In [None]:
# Marital status: customer counts per category, split by the churn flag.
plt.figure(figsize=(18, 6))
sns.set(style="whitegrid")
sns.set_palette("Set1")
sns.countplot(data=bank_churner_df_org, x="marital_stat", hue='is_churned', edgecolor='black')

plt.title("Marital status vs Customer churn", fontsize=18)
plt.xlabel("Marital Status", fontsize=18)
plt.ylabel("Customers", fontsize=18)

# Legend on a light, slightly transparent frame.
legend = plt.legend(title="Response", labels=["No", "Yes"], fontsize=14)
legend_frame = legend.get_frame()
legend_frame.set_facecolor('0.9')
legend_frame.set_alpha(0.8)

# Vertical category labels so long names do not overlap.
plt.xticks(rotation=90, fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Share of retained ("No", is_churned == 0) and churned ("Yes", is_churned == 1)
# customers within each marital status.
marital_percents = bank_churner_df_org.groupby('marital_stat')['is_churned'].value_counts(normalize=True).unstack()
marital_percents.reset_index(inplace=True)
marital_percents.fillna(0, inplace=True)  # a class missing from a status counts as 0 %

marital_percent_table = marital_percents.rename(columns={0: 'No_Percentage', 1: 'Yes_Percentage'})

# Fractions -> percent values.
pct_columns = ['No_Percentage', 'Yes_Percentage']
marital_percent_table[pct_columns] = marital_percent_table[pct_columns] * 100

# Highest churn share first.
marital_percent_table = marital_percent_table.sort_values(by='Yes_Percentage', ascending=False)

custom_palette = sns.color_palette("pastel")

# Four charts per row: the original 2x4 grid, growing by rows if there are
# more than eight categories.
n_charts = len(marital_percent_table)
n_rows = max(2, -(-n_charts // 4))
fig, axes = plt.subplots(nrows=n_rows, ncols=4, figsize=(20, 5 * n_rows))
axes = axes.flatten()

# One pie per marital status. Axes are taken by position in the sorted table;
# indexing by the row's index label would ignore the sort above.
for ax, (_, row) in zip(axes, marital_percent_table.iterrows()):
    ax.pie(
        [row['No_Percentage'], row['Yes_Percentage']],
        explode=(0.1, 0),  # pull the "No" slice out
        labels=['No', 'Yes'],
        colors=custom_palette,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
        textprops={'fontsize': 20},
    )
    ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    ax.set_title(row['marital_stat'], fontsize=25)

# Remove the axes that received no chart.
for unused_ax in axes[n_charts:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Income category: customer counts per category, split by the churn flag.
plt.figure(figsize=(18, 6))
sns.set(style="whitegrid")
sns.set_palette("Set1")
sns.countplot(data=bank_churner_df_org, x="imcome_cat", hue='is_churned', edgecolor='black')

plt.title("Income vs Customer churn", fontsize=18)
plt.xlabel("Income", fontsize=18)
plt.ylabel("Customers", fontsize=18)

# Legend on a light, slightly transparent frame.
legend = plt.legend(title="Response", labels=["No", "Yes"], fontsize=14)
legend_frame = legend.get_frame()
legend_frame.set_facecolor('0.9')
legend_frame.set_alpha(0.8)

# Vertical category labels so long names do not overlap.
plt.xticks(rotation=90, fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Share of retained ("No", is_churned == 0) and churned ("Yes", is_churned == 1)
# customers within each income category.
income_percents = bank_churner_df_org.groupby('imcome_cat')['is_churned'].value_counts(normalize=True).unstack()
income_percents.reset_index(inplace=True)
income_percents.fillna(0, inplace=True)  # a class missing from a category counts as 0 %

income_percent_table = income_percents.rename(columns={0: 'No_Percentage', 1: 'Yes_Percentage'})

# Fractions -> percent values.
pct_columns = ['No_Percentage', 'Yes_Percentage']
income_percent_table[pct_columns] = income_percent_table[pct_columns] * 100

# Highest churn share first.
income_percent_table = income_percent_table.sort_values(by='Yes_Percentage', ascending=False)

custom_palette = sns.color_palette("pastel")

# Four charts per row: the original 2x4 grid, growing by rows if there are
# more than eight categories.
n_charts = len(income_percent_table)
n_rows = max(2, -(-n_charts // 4))
fig, axes = plt.subplots(nrows=n_rows, ncols=4, figsize=(20, 5 * n_rows))
axes = axes.flatten()

# One pie per income category. Axes are taken by position in the sorted table;
# indexing by the row's index label would ignore the sort above.
for ax, (_, row) in zip(axes, income_percent_table.iterrows()):
    ax.pie(
        [row['No_Percentage'], row['Yes_Percentage']],
        explode=(0.1, 0),  # pull the "No" slice out
        labels=['No', 'Yes'],
        colors=custom_palette,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
        textprops={'fontsize': 20},
    )
    ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    ax.set_title(row['imcome_cat'], fontsize=25)

# Remove the axes that received no chart.
for unused_ax in axes[n_charts:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Card type: customer counts per category, split by the churn flag.
plt.figure(figsize=(18, 6))
sns.set(style="whitegrid")
sns.set_palette("Set1")
sns.countplot(data=bank_churner_df_org, x="card_type", hue='is_churned', edgecolor='black')

plt.title("card vs Customer churn", fontsize=18)
plt.xlabel("Card", fontsize=18)
plt.ylabel("Customers", fontsize=18)

# Legend on a light, slightly transparent frame.
legend = plt.legend(title="Response", labels=["No", "Yes"], fontsize=14)
legend_frame = legend.get_frame()
legend_frame.set_facecolor('0.9')
legend_frame.set_alpha(0.8)

# Vertical category labels so long names do not overlap.
plt.xticks(rotation=90, fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Share of retained ("No", is_churned == 0) and churned ("Yes", is_churned == 1)
# customers within each card type.
card_percents = bank_churner_df_org.groupby('card_type')['is_churned'].value_counts(normalize=True).unstack()
card_percents.reset_index(inplace=True)
card_percents.fillna(0, inplace=True)  # a class missing from a card type counts as 0 %

card_percent_table = card_percents.rename(columns={0: 'No_Percentage', 1: 'Yes_Percentage'})

# Fractions -> percent values.
pct_columns = ['No_Percentage', 'Yes_Percentage']
card_percent_table[pct_columns] = card_percent_table[pct_columns] * 100

# Highest churn share first.
card_percent_table = card_percent_table.sort_values(by='Yes_Percentage', ascending=False)

custom_palette = sns.color_palette("pastel")

# Four charts per row: the original 2x4 grid, growing by rows if there are
# more than eight categories.
n_charts = len(card_percent_table)
n_rows = max(2, -(-n_charts // 4))
fig, axes = plt.subplots(nrows=n_rows, ncols=4, figsize=(20, 5 * n_rows))
axes = axes.flatten()

# One pie per card type. Axes are taken by position in the sorted table;
# indexing by the row's index label would ignore the sort above.
for ax, (_, row) in zip(axes, card_percent_table.iterrows()):
    ax.pie(
        [row['No_Percentage'], row['Yes_Percentage']],
        explode=(0.1, 0),  # pull the "No" slice out
        labels=['No', 'Yes'],
        colors=custom_palette,
        autopct='%1.1f%%',
        shadow=True,
        startangle=90,
        textprops={'fontsize': 20},
    )
    ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    ax.set_title(row['card_type'], fontsize=25)

# Remove the axes that received no chart.
for unused_ax in axes[n_charts:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

플래티넘 카드 소지자는 은행 서비스 이용을 중단하는 경향이 있습니다. 수수료가 너무 높거나 연간 서비스가 부족합니까?
#수수료 결제됐나요?

# EDA 분석 결과

저소득층에 집중: 구매력은 크지 않지만 대부분의 고객은 저소득층입니다. 저소득층을 위한 프로모션을 시행하는 것은 해당 클러스터 그룹의 고객 이탈을 줄이는 좋은 대안이 될 수 있습니다.
활동 수준이 낮을 때 조치: 활동 수준이 낮은 고객(트랜잭션 45개 미만)이 조직을 떠날 확률이 더 높다는 것을 확인했습니다. 직원이 활동 수준이 낮은 고객에게 전화를 걸어 그들의 요구 사항에 맞는 새로운 제품을 제안하거나, 고객이 우리가 제공하는 서비스에 만족하는지, 개선할 점이 있는지 묻는다면, 활동 수준을 높이기 위해 우리가 무엇을 할 수 있는지에 대해 더 나은 통찰력을 얻을 수 있을 것입니다.
회전 잔액이 적은 사람들에게 신용 한도를 늘리시겠습니까? 우리 모델에 따르면 회전 잔액이 낮은 고객은 조직을 떠날 가능성이 더 높습니다. 어쩌면 해당 고객에게 더 높은 신용 잔액을 구현함으로써 해당 세그먼트 그룹이 조직을 떠날 확률이 낮아질 수 있습니다.

# ------------------------------------ 여기서부터 시작

# 예측 실행 - Test Point

## 라이브러리 로딩

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


## 평가함수 정의

In [None]:
# get_clf_eval() 함수 
# -------------------
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Args:
        y_test: True labels.
        pred: Predicted class labels. The metric functions are called with
            this value directly, so in practice it must be supplied.
        pred_proba: Positive-class scores used only for ROC-AUC. When None,
            the AUC figure is left out instead of failing inside
            roc_auc_score.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1_value = f1_score(y_test, pred)

    # Same text as before (including the four spaces ahead of "F1").
    summary = '정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},    F1: {3:.4f}'.format(
        accuracy, precision, recall, f1_value)
    # ROC-AUC needs scores; append it only when they were provided.
    if pred_proba is not None:
        summary += ', AUC:{0:.4f}'.format(roc_auc_score(y_test, pred_proba))

    print('오차 행렬')
    print(confusion)
    print(summary)


def precision_recall_curve_plot(y_test=None, pred_proba_c1=None):
    """Plot precision (dashed) and recall against the decision threshold.

    Args:
        y_test: True binary labels.
        pred_proba_c1: Scores or probabilities for the positive class.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_c1)

    # Both metric arrays are cut to the number of thresholds so that the x and
    # y series have matching lengths.
    n_thresholds = thresholds.shape[0]

    plt.figure(figsize=(8, 6))
    plt.plot(thresholds, precisions[:n_thresholds], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[:n_thresholds], label='recall')

    # Threshold axis ticks every 0.1 across the current x range.
    x_min, x_max = plt.xlim()
    plt.xticks(np.round(np.arange(x_min, x_max, 0.1), 2))

    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall value')
    plt.legend()
    plt.grid()
    plt.show()

def roc_curve_plot(y_test , pred_proba_c1):
    """Plot the ROC curve for positive-class scores with a chance diagonal.

    Args:
        y_test: True binary labels.
        pred_proba_c1: Scores or probabilities for the positive class.
    """
    # FPR and TPR at every threshold; the thresholds themselves are not plotted.
    fprs, tprs, _ = roc_curve(y_test, pred_proba_c1)

    plt.plot(fprs, tprs, label='ROC')
    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1], 'k--', label='Random')

    # Fix both axes to [0, 1] and place x ticks every 0.1 on that range. Ticks
    # were previously derived from plt.xlim() before the limits were fixed, so
    # they followed the auto-scaled range rather than 0.0, 0.1, ..., 1.0.
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xticks(np.round(np.arange(0, 1.1, 0.1), 2))

    # FPR = FP / (FP + TN) = 1 - specificity (not 1 - sensitivity).
    plt.xlabel('FPR( 1 - Specificity )')
    plt.ylabel('TPR( Recall )')
    plt.legend()
    plt.show()

# roc_curve_plot(y_test, lr_clf.predict_proba(X_test)[:, 1] )    

In [None]:
bank_churner_df = pd.read_csv("./data/bank_churner.csv")
bank_churner_df_org = bank_churner_df.copy()

def test_transform(x_test):
    ''' 전처리 함수 정의'''
    
    # 불필요 컬럼 제거(고객번호)
    # -------------------------
    x_test = x_test.drop('cstno', axis=1)
    
    
    # 성별 변환('F':0, 'M':1)
    # -------------------------
    x_test['sex']=x_test['sex'].replace({'F':0,'M':1})
    
    
    # 다중공선성 컬럼 제거
    # -------------------
    x_test = x_test.drop('mon_on_book', axis = 1)
    x_test = x_test.drop('mean_open_to_buy', axis = 1)
    x_test = x_test.drop('tot_trans_cnt_for_12m', axis = 1)


    # 범주형 데이터 One-Hot 인코딩
    # --------------------------
    x_test = pd.concat([x_test,pd.get_dummies(x_test['education']).drop(columns=['Unknown'])],axis=1)
    x_test = pd.concat([x_test,pd.get_dummies(x_test['imcome_cat']).drop(columns=['Unknown'])],axis=1)
    x_test = pd.concat([x_test,pd.get_dummies(x_test['marital_stat']).drop(columns=['Unknown'])],axis=1)
    x_test = pd.concat([x_test,pd.get_dummies(x_test['card_type']).drop(columns=['Platinum'])],axis=1)
    x_test.drop(columns = ['education','imcome_cat','marital_stat','card_type'],inplace=True)


    # Null 처리 1 방식
    # ---------------
    # x_test.dropna(axis=0, inplace=True)

    
    # Null 처리 2 방식
    # ---------------
    # x_test.drop(columns = ['sex'], inplace=True)
    # x_test.drop(columns = ['tot_revol_balance'], inplace=True)
    # x_test.drop(columns = ['tot_amt_ratio_q4_q1'], inplace=True)        
    # x_test.drop(columns = ['tot_trans_amt_for_12m'], inplace=True)        
    # x_test.drop(columns = ['tot_cnt_ratio_q4_q1'], inplace=True)        
    # x_test.drop(columns = ['mean_util_pct'], inplace=True)


    # # Null 처리 3 방식
    # # ----------------
    x_test.drop(columns = ['mean_util_pct'], inplace=True)
    x_test.dropna(axis=0, inplace=True)
        
    return x_test

bank_churner_df_org = test_transform(bank_churner_df_org)


In [None]:
# Shape
# -----
# Null 처리 1 방식 : (1768, 30)

bank_churner_df_org.shape

## Logistic Regression으로 학습 및 예측 수행

In [None]:
# Separate the label ('is_churned') from the feature columns.
y = bank_churner_df_org['is_churned']
X = bank_churner_df_org.drop(columns=['is_churned'])

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=156
)

# Fit a logistic regression, then score the hold-out set with both hard
# predictions and positive-class probabilities.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)
pred_proba = lr_clf.predict_proba(X_test)[:, 1]
pred = lr_clf.predict(X_test)

get_clf_eval(y_test, pred, pred_proba)

## precision recall 곡선 그림

In [None]:
pred_proba_c1 = lr_clf.predict_proba(X_test)[:, 1]
precision_recall_curve_plot(y_test, pred_proba_c1)

## 분류결정 임곗값을 변경하면서 성능 측정

In [None]:
from sklearn.preprocessing import Binarizer

def get_eval_by_threshold(y_test , pred_proba_c1, thresholds):
    """Print the evaluation metrics once for every decision threshold.

    Args:
        y_test: True labels.
        pred_proba_c1: Positive-class probabilities in the 2-D layout that
            Binarizer accepts.
        thresholds: Iterable of cut-off values to evaluate in order.
    """
    for cutoff in thresholds:
        # Probabilities above the cut-off become 1, all others 0.
        custom_predict = Binarizer(threshold=cutoff).fit_transform(pred_proba_c1)
        print('임곗값:', cutoff)
        get_clf_eval(y_test, custom_predict, pred_proba_c1)

In [None]:
thresholds = [0.3 , 0.33 ,0.36,0.39, 0.42 , 0.45 ,0.48, 0.50]
pred_proba = lr_clf.predict_proba(X_test)
get_eval_by_threshold(y_test, pred_proba[:,1].reshape(-1,1), thresholds )

In [None]:
# Create a Binarizer with the decision threshold set to 0.42
binarizer = Binarizer(threshold=0.42)

# Binarize the class-1 column of the predict_proba() output obtained from lr_clf above
pred_th_042 = binarizer.fit_transform(pred_proba[:, 1].reshape(-1,1)) 

# Evaluate the thresholded predictions; the raw class-1 probabilities feed the AUC
get_clf_eval(y_test , pred_th_042, pred_proba[:, 1])

# ---------------------------------------------  여기까지 한세트임 - 테스트

In [None]:
#We create our feature matrix and our target variable vector.
X=bank_churner_df_org.drop(['is_churned'],axis=1)
y=bank_churner_df_org['is_churned']

In [None]:
X.shape

In [None]:
#Selection of the most important features to conduct the training
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFE
import pandas as pd


# Seed NumPy's global random state for reproducibility.
np.random.seed(42)

# Estimators that can drive the feature selection. Other candidates (SVM, KNN,
# LASSO, RFE) were listed here but left disabled.
available_models = {
    'ExtraTrees': ExtraTreesClassifier(n_estimators=100),
    'RandomForest': RandomForestClassifier(n_estimators=100),
}

# Estimator used for this run.
chosen_model = 'ExtraTrees'

# Fit the chosen estimator on every row of X.
# NOTE(review): this happens before the train/test split performed further
# down, so the selection also sees rows that later become test data - confirm
# that this is intended.
clf = available_models[chosen_model].fit(X.values, y)

# Per-feature importance scores of the fitted estimator.
feature_importances = clf.feature_importances_

# Wrap the already-fitted estimator and keep only the columns it selects.
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X.values)

# Positions and names of the selected columns.
selected_feature_indices = model.get_support(indices=True)
selected_columns = X.columns[selected_feature_indices]

print("Selected columns:")
print(selected_columns)

In [None]:
#Based on the analysis of the graphs, we had predicted that:
#At first glance, the following variables seem to have a significant influence on the determination of whether customers stay or not: Customer_Age, Credit_Limit,
#Total_Revolving_Bal, Total_Amt_Chng_Q4_Q1, Total_Trans_Amt, Total_Ct_Chng_Q4_Q1, Avg_Utilization_Ratio
#It seems that our intuition was correct.


#Con el análisis de las gráficas habíamos predicho que :
#A simple vista parecen tener gran peso para la determinación de la permanencia o no las siguientes variables; Customer_Age,Credit_Limit,
#Total_Recovering_Bal, Total_Amt_Chng_Q4_Q1, Total_Trans_Amt,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
#Al parecer nuestra intuición fue correcta

import matplotlib.pyplot as plt
import seaborn as sns

# Column positions ordered from most to least important.
sorted_indices = feature_importances.argsort()[::-1]

# Column names and importance values in that same order.
sorted_columns = X.columns[sorted_indices]
sorted_importances = feature_importances[sorted_indices]

plt.figure(figsize=(10, 6))

# Horizontal bars for every column: blue if the column was kept by the
# feature selection, light grey otherwise.
selected_positions = set(selected_feature_indices)
bar_colors = ['blue' if i in selected_positions else 'lightgrey' for i in sorted_indices]
sns.barplot(x=sorted_importances, y=sorted_columns, palette=bar_colors)

plt.xlabel("Importance", fontsize=14)
plt.ylabel("Features", fontsize=14)  # was misspelled "Feattures"
plt.title("Feature Importance", fontsize=16)
plt.yticks(rotation=0, fontsize=12)
plt.show()

In [None]:
X_new.shape

In [None]:
#Model Training
#Entrenamiento del modelo
from sklearn.model_selection import train_test_split
# Split the data into training and test sets
X_train,X_test,y_train,y_test=train_test_split(X_new,y,test_size=0.25,stratify=y,random_state=0)

#stratify=y: It is used to ensure that the distribution of classes in the training and test sets is similar to the original distribution 
#of the target variable y. This is particularly useful when dealing with imbalanced classes, as it ensures that both parts of the split have
#a similar proportion of each class.

#stratify=y: Se utiliza para garantizar
#que la distribución de las clases en el conjunto de entrenamiento y prueba sea similar a la distribución original de la variable objetivo y.
#Esto es especialmente útil cuando tienes clases desequilibradas, ya que asegura que ambas partes de la división tengan una proporción similar
#de cada clase.

In [None]:
#Verifying the size of the training and testing sets
#verificamos el tamaño de los set de entrenamiento y testeo
# Report the shape of each part of the train/test split.
for split_label, split_part in (
    ("Training X", X_train),
    ("Training y", y_train),
    ("Test X", X_test),
    ("Test y", y_test),
):
    print(f"{split_label} size: ", split_part.shape)

In [None]:
import matplotlib.pyplot as plt

# Class counts ordered by label so that 0 maps to "No" and 1 to "Yes".
# value_counts() on its own orders by frequency, which would put the wrong
# count under each label whenever class 1 is the more frequent one.
class_counts = y_train.value_counts().sort_index()

plt.bar(['No', 'Yes'], class_counts, color=['blue', 'orange'])
plt.xlabel('Response', fontsize=18)
plt.ylabel('Counts', fontsize=18)  # was misspelled "Couts"
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.title('Target Variable Distribution', fontsize=16)

plt.show()

In [None]:
#SMOTE(Synthetic Minority Oversampling Technique)
#SMOTE is a technique for oversampling the minority class. Simply adding duplicate records of the minority class often does not add new
#information to the model. In SMOTE, new instances are generated from the existing data. To put it simply, SMOTE examines instances of 
#the minority class and uses the k-nearest neighbors method to select a randomly close neighbor, and a new synthetic instance is created 
#in the feature space.

#Class imbalance, where one or more classes are significantly less frequent than others, is a common challenge in machine learning.
#The presence of minority classes can cause the model to be biased towards the majority classes and have difficulty learning patterns from 
#the minority classes. This is where techniques like SMOTE (Synthetic Minority Oversampling Technique) can help by generating synthetic
#instances to balance the classes and improve the model's performance in predicting minority classes.


#SMOTE (Técnica de Sobremuestreo Sintético de la Clase Minoritaria) es una técnica para sobremuestrear la clase minoritaria. 
#Simplemente agregar registros duplicados de la clase minoritaria a menudo no agrega información nueva al modelo.
#En SMOTE, se generan nuevas instancias a partir de los datos existentes. Si lo explicamos en palabras sencillas,
#SMOTE examina las instancias de la clase minoritaria y utiliza el método de los k vecinos más cercanos para seleccionar
#un vecino cercano al azar, y se crea una nueva instancia sintética aleatoria en el espacio de características.

#El desequilibrio de clases, donde una o más clases son significativamente menos frecuentes que otras,
#es un desafío común en el aprendizaje automático. La presencia de clases minoritarias puede hacer que el
#modelo sea sesgado hacia las clases mayoritarias y que tenga dificultades para aprender patrones de las clases minoritarias.
#Es aquí donde técnicas como SMOTE (Técnica de Sobremuestreo Sintético de la Clase Minoritaria) pueden ayudar al generar instancias
#sintéticas para equilibrar las clases y mejorar el rendimiento del modelo en la predicción de clases minoritarias.


from imblearn.over_sampling import SMOTE
# Redo the stratified 75/25 split, then oversample the training part only;
# the test part is left untouched.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.25, random_state=0, stratify=y
)
sm = SMOTE(sampling_strategy='auto', random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
import matplotlib.pyplot as plt

# Class counts ordered by label so that 0 maps to "No" and 1 to "Yes".
# value_counts() on its own orders by frequency, which does not guarantee
# label order (in particular when both classes have the same count).
class_counts = y_train.value_counts().sort_index()

plt.bar(['No', 'Yes'], class_counts, color=['blue', 'orange'])
plt.xlabel('Response', fontsize=18)
plt.ylabel('Counts', fontsize=18)  # was misspelled "Couts"
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.title('Target Variable Distribution', fontsize=16)

plt.show()

In [None]:
#Normalization
from sklearn.preprocessing import StandardScaler
# Learn the scaling parameters from the training features only, then apply
# the same fitted scaler to both parts.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
model_comparison={}

In [None]:
#Training with different models
#entrenamiento con distintos modelos
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
#Create a list of tuples with the model name and the classifier instance
# Crear una lista de tuplas con el nombre del modelo y la instancia del clasificador
# Candidate classifiers as (display name, estimator) pairs.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier(criterion='entropy', random_state=0)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Naive Bayes', GaussianNB()),
    ('Random Forest', RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
    ('Xg Boost', XGBClassifier())
]

# model name -> [accuracy, class-0 accuracy, class-1 accuracy, weighted F1,
#                mean CV recall, std of CV recall]
model_comparison = {}

# Plain array view of the true labels so boolean masks line up with y_pred.
y_true = np.asarray(y_test)

for model_name, classifier in models:
    # Fit on the training set and predict the test set.
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
    # precision and recall trade places and the weighted F1 is weighted by
    # predicted instead of true class counts.
    accuracy = accuracy_score(y_true, y_pred)
    f1_weighted = f1_score(y_true, y_pred, average='weighted')

    # 5-fold cross-validation on the training data; the score is recall.
    cv_scores = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=5, scoring="recall")
    cv_recall = cv_scores.mean()
    cv_std = cv_scores.std()

    # Accuracy restricted to the samples of each true class.
    is_class_0 = y_true == 0
    is_class_1 = y_true == 1
    accuracy_class_0 = accuracy_score(y_true[is_class_0], y_pred[is_class_0])
    accuracy_class_1 = accuracy_score(y_true[is_class_1], y_pred[is_class_1])

    print(f"Modelo: {model_name}")
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    print(f"Model F1-Score: {f1_weighted * 100:.2f}%")
    print(f"Cross Val Recall: {cv_recall * 100:.2f}%")
    print(f"Cross Val Standard Deviation: {cv_std * 100:.2f}%")

    # Store the metrics for the comparison cells.
    model_comparison[model_name] = [accuracy, accuracy_class_0, accuracy_class_1, f1_weighted, cv_recall, cv_std]
    print(classification_report(y_true, y_pred, zero_division=1))
    print("-" * 60)

In [None]:
#Ensemble methods in machine learning involve combining multiple models (often weaker models or base models) to create a stronger,
#more robust predictive model. The idea behind ensembling is that by combining the predictions of multiple models, the strengths 
#of each individual model can compensate for the weaknesses of others, leading to improved overall performance.
from sklearn.ensemble import VotingClassifier

# Base estimators of the ensemble as (name, estimator) pairs.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier(criterion='entropy', random_state=0)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Naive Bayes', GaussianNB()),
    ('Random Forest', RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
    ('Xg Boost', XGBClassifier())
]

# Soft voting combines the predicted probabilities; 'hard' is the alternative.
voting_classifier = VotingClassifier(estimators=models, voting='soft')
voting_classifier.fit(X_train, y_train)
y_pred = voting_classifier.predict(X_test)

# Plain array view of the true labels so boolean masks line up with y_pred.
y_true = np.asarray(y_test)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# precision and recall trade places and the weighted F1 is weighted by
# predicted instead of true class counts.
accuracy = accuracy_score(y_true, y_pred)
f1_weighted = f1_score(y_true, y_pred, average='weighted')

# 5-fold cross-validation on the training data; the score is recall.
cv_scores = cross_val_score(estimator=voting_classifier, X=X_train, y=y_train, cv=5, scoring="recall")
cv_recall = cv_scores.mean()
cv_std = cv_scores.std()

# Accuracy restricted to the samples of each true class.
is_class_0 = y_true == 0
is_class_1 = y_true == 1
accuracy_class_0 = accuracy_score(y_true[is_class_0], y_pred[is_class_0])
accuracy_class_1 = accuracy_score(y_true[is_class_1], y_pred[is_class_1])

print("Modelo: Voting Classifier")
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"Model F1-Score: {f1_weighted * 100:.2f}%")
print(f"Cross Val Recall: {cv_recall * 100:.2f}%")
print(f"Cross Val Standard Deviation: {cv_std * 100:.2f}%")

# Same six-metric layout as the entries of the individual models.
model_comparison['Voting Classifier'] = [accuracy, accuracy_class_0, accuracy_class_1, f1_weighted, cv_recall, cv_std]
print(classification_report(y_true, y_pred, zero_division=1))
print("-" * 60)

In [None]:
# Model comparison: one summary per model. Each entry of model_comparison is
# [accuracy, class-0 accuracy, class-1 accuracy, weighted F1, CV mean, CV std];
# the CV figures come from cross_val_score(..., scoring="recall").
for model_name, metrics in model_comparison.items():
    accuracy, _, _, f1_weighted, cv_recall, cv_std = metrics
    print(f"Modelo: {model_name}")
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    print(f"Model F1-Score: {f1_weighted * 100:.2f}%")
    print(f"Cross Val Recall: {cv_recall * 100:.2f}%")
    print(f"Cross Val Standard Deviation: {cv_std * 100:.2f}%")
    print("-" * 60)

# Difference between the Voting Classifier and each individual model. The
# metrics are fractions in [0, 1], so four signed decimals are printed; two
# decimals would round away any difference below one percentage point.
voting_metrics = model_comparison['Voting Classifier']
for model_name, individual_metrics in model_comparison.items():
    if model_name == 'Voting Classifier':
        continue
    print(f"Comparando con {model_name}:")
    print(f"Mejora en Accuracy: {voting_metrics[0] - individual_metrics[0]:+.4f}")
    print(f"Mejora en F1-Score: {voting_metrics[3] - individual_metrics[3]:+.4f}")
    print("-" * 40)

In [None]:
#MODEL COMPARISSON
#COMPARACIÓN DE MODELOS

# One row per model, one column per stored metric. The 'CV Accuracy' and
# 'CV std' columns hold the mean and standard deviation of the cross-validation
# scores, which were collected with scoring="recall".
Model_com_df = pd.DataFrame.from_dict(
    model_comparison,
    orient='index',
    columns=['Model Accuracy', 'Model Accuracy-0', 'Model Accuracy-1', 'Model F1-Score', 'CV Accuracy', 'CV std'],
)

# Best weighted F1 first, shown as percentages on a colour gradient.
Model_com_df = Model_com_df.sort_values(by='Model F1-Score', ascending=False)
Model_com_df.style.format("{:.2%}").background_gradient(cmap='magma')

In [None]:
import pandas as pd

# One row per model, one column per stored metric. The 'CV Accuracy' and
# 'CV std' columns hold the mean and standard deviation of the cross-validation
# scores, which were collected with scoring="recall".
Model_com_df = pd.DataFrame.from_dict(
    model_comparison,
    orient='index',
    columns=['Model Accuracy', 'Model Accuracy-No', 'Model Accuracy-Yes', 'Model F1-Score', 'CV Accuracy', 'CV std'],
)
# Best weighted F1 first.
Model_com_df = Model_com_df.sort_values(by='Model F1-Score', ascending=False)

def highlight_below_75(s):
    """Return one CSS colour rule per element of the Series ``s``.

    Values below 0.75 get 'color: red' and all others 'color: black'.
    The threshold is only applied to float64 Series whose name is not
    'CV std'; any other input is coloured black throughout.
    """
    black = 'color: black'
    threshold_applies = (
        s.name != 'CV std' and isinstance(s, pd.Series) and s.dtype == 'float64'
    )
    if not threshold_applies:
        return [black] * len(s)
    return ['color: red' if value < 0.75 else black for value in s]

# Highlight each column's maximum, colour scores below 75% red and show values as percentages.
# The red/black colouring and the percentage format cover the columns up to and including
# 'CV Accuracy'; 'CV std' is left unformatted.
# NOTE(review): highlight_max also marks the largest 'CV std', where a larger value is not better.
styled_df = (
    Model_com_df.style
    .highlight_max(axis=0)
    .apply(highlight_below_75, subset=pd.IndexSlice[:, :'CV Accuracy'])
    .format("{:.2%}", subset=pd.IndexSlice[:, :'CV Accuracy'])
)
styled_df

In [None]:
#As we can see, the XGBoost algorithm has the best values for Model Accuracy and F1-Score, making it a viable option to implement the model.
#However, the Voting Classifier has parameters very similar to XGBoost but with significantly better CV Accuracy, making it a more robust model.
#Ensemble methods in machine learning involve combining multiple models (often weaker models or base models) to create a stronger,
#more robust predictive model. The idea behind ensembling is that by combining the predictions of multiple models, the strengths 
#of each individual model can compensate for the weaknesses of others, leading to improved overall performance.
from sklearn.ensemble import VotingClassifier

# Base estimators combined by the ensemble, as (name, estimator) pairs.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier(criterion='entropy', random_state=0)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Naive Bayes', GaussianNB()),
    ('Random Forest', RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
    ('Xg Boost', XGBClassifier())
]

voting_classifier = VotingClassifier(estimators=models, voting='soft')  # voting may be 'hard' or 'soft'

voting_classifier.fit(X_train, y_train)

y_pred = voting_classifier.predict(X_test)

# NOTE(review): scikit-learn metrics are declared as (y_true, y_pred) but are called here as
# (y_pred, y_test). Accuracy is symmetric, but the weighted F1 is then weighted by the predicted
# class counts, and the classification report below has precision and recall swapped.
# Left unchanged because these values feed the shared model_comparison table; confirm how the
# other rows were computed and correct them together.
accuracy = accuracy_score(y_pred, y_test)
# NOTE: this rebinds `f1`, which the top-of-file import uses as an alias for sklearn's f1_score.
f1 = f1_score(y_pred, y_test, average='weighted')
# NOTE(review): scoring="recall" means cv_accuracy / cv_std hold cross-validated recall,
# although they are printed and stored as "Cross Val Accuracy" - confirm the intended metric.
accuracies = cross_val_score(estimator=voting_classifier, X=X_train, y=y_train, cv=5, scoring="recall")
cv_accuracy = accuracies.mean()
cv_std = accuracies.std()
# Accuracy restricted to the test rows whose true label is 0 / 1 respectively.
accuracy_class_0 = accuracy_score(y_pred[y_test == 0], y_test[y_test == 0])
accuracy_class_1 = accuracy_score(y_pred[y_test == 1], y_test[y_test == 1])

print("Modelo: Voting Classifier")
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"Model F1-Score: {f1 * 100:.2f}%")
print(f"Cross Val Accuracy: {cv_accuracy * 100:.2f}%")
print(f"Cross Val Standard Deviation: {cv_std * 100:.2f}%")

# Store the metrics in the order read by the comparison cells:
# [accuracy, class-0 accuracy, class-1 accuracy, F1, CV mean, CV std].
model_comparison['Voting Classifier'] = [accuracy, accuracy_class_0, accuracy_class_1, f1, cv_accuracy, cv_std]
print(classification_report(y_pred, y_test, zero_division=1))
print("-" * 60)

In [None]:
# Compute confusion matrix for the best model
from sklearn.metrics import classification_report, confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- 2-D array, indexed as cm[row, column].
    classes   -- tick labels used on both axes.
    normalize -- when True, every row is divided by its row sum before printing and plotting.
    title     -- plot title.
    cmap      -- Matplotlib colormap for the heat map.

    Draws through the pyplot interface; nothing is returned.
    """
    if not normalize:
        print('Confusion matrix without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    # Heat map with a colour bar and one tick per class on each axis.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.grid(False)
    plt.title(title, fontsize=18)
    plt.colorbar()
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    # Write each cell's value; cells above half the maximum get white text, the rest black.
    value_format = '.2f' if normalize else 'd'
    contrast_cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            cell = cm[row, col]
            plt.text(col, row, format(cell, value_format),
                     horizontalalignment="center", fontsize=16,
                     color="white" if cell > contrast_cutoff else "black")

    plt.tight_layout()
    plt.ylabel('True label', fontsize=15)
    plt.xlabel('Predicted label', fontsize=15)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

# Raw counts for labels 0 and 1, computed once and reused for the plot.
cnf_matrix = confusion_matrix(y_test, y_pred, labels=[0,1])
print(cnf_matrix)

np.set_printoptions(precision=2)

# Plot the row-normalized confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['y=0','y=1'], normalize=True, title='Confusion matrix')

In [None]:
#DEEP LEARNING
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Scale the data with StandardScaler: fitted on the training split, then applied to the test split.
# NOTE: X_train / X_test are overwritten, so re-running this cell scales already-scaled data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Build the model: two ReLU hidden layers (128 and 64 units) and a single sigmoid output unit.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Predict on the test data. The single output unit yields an (n_samples, 1) array, so flatten
# the thresholded labels to 1-D before comparing them with y_test or masking by it.
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba >= 0.5).astype(int).ravel()


# Calculate metrics. scikit-learn expects (y_true, y_pred); with the arguments reversed the
# weighted F1 is weighted by predicted counts and the report swaps precision and recall.
print(f"Model Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(f"Model F1-Score: {f1_score(y_test, y_pred, average='weighted') * 100:.2f}%")
print(classification_report(y_test, y_pred, zero_division=1))

# Calculate accuracies per class: accuracy over the test rows whose true label is 0 / 1.
accuracy_class_0 = accuracy_score(y_test[y_test == 0], y_pred[y_test == 0])
accuracy_class_1 = accuracy_score(y_test[y_test == 1], y_pred[y_test == 1])

In [None]:
# Compute confusion matrix for the Deep learning model
from sklearn.metrics import classification_report, confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Reds):
    """
    Print the confusion matrix ``cm`` and plot it as a heat map with a value in every cell.

    With normalize=True each row is first divided by its row sum. ``classes`` supplies the
    tick labels for both axes, ``title`` the plot title and ``cmap`` the colormap.
    Drawing goes through pyplot; the function returns nothing.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print("Normalized confusion matrix" if normalize
          else 'Confusion matrix without normalization')

    print(cm)

    # Draw the matrix itself.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.grid(False)  # per the original author's note, added to avoid a deprecation warning
    plt.title(title, fontsize=18)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Annotate every cell; text is white on cells above half the maximum, black otherwise.
    number_format = '.2f' if normalize else 'd'
    half_of_max = cm.max() / 2.
    for r, c in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        text_colour = "white" if cm[r, c] > half_of_max else "black"
        plt.text(c, r, format(cm[r, c], number_format),
                 horizontalalignment="center", fontsize=16,
                 color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label', fontsize=15)
    plt.xlabel('Predicted label', fontsize=15)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

# Confusion-matrix counts for labels 0 and 1; computed once, printed, then plotted.
cnf_matrix = confusion_matrix(y_test, y_pred, labels=[0,1])
print(cnf_matrix)

np.set_printoptions(precision=2)

# Plot the row-normalized confusion matrix on a new figure.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['y=0','y=1'], normalize=True, title='Confusion matrix')

# 이탈고객 예측 과정

## 이탈고객 예측을 위한 라이브러리 로딩

In [None]:
# models & cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import cross_val_score

# data preprocessing
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# results and reports
from sklearn.metrics import classification_report, precision_score, recall_score
from sklearn.metrics import accuracy_score

In [None]:
# Creates a DataFrame from the per-class rows of the text classification report.
def report(y_true, predictions):
    """Return the per-class rows of ``classification_report`` as a DataFrame.

    Args:
        y_true: True labels.
        predictions: Predicted labels.

    Returns:
        DataFrame indexed by 'class' with the columns 'precision', 'recall',
        'f1-score' and 'support'. The values are the strings printed in the
        report; they are not converted to numbers.
    """
    report_string = classification_report(y_true, predictions)

    # Drop the two leading lines and the five trailing entries of the text report,
    # keeping only the per-class rows.
    class_lines = report_string.split('\n')[2:-5]

    # Split from the right into exactly five fields so that a class label containing
    # spaces stays in one piece; a plain split() would produce extra fields and make
    # the DataFrame constructor fail.
    report_list = [line.strip().rsplit(maxsplit=4) for line in class_lines if line.strip()]
    report_df = pd.DataFrame(report_list, columns=['class', 'precision', 'recall', 'f1-score', 'support'])
    report_df.set_index('class', inplace=True)
    return report_df

## 데이터 세트 로딩

In [None]:
# Load the Kaggle CSV, discard its last two columns and the CLIENTNUM column.
bank_churner_df = (
    pd.read_csv("./data/BankChurner_kaggle.csv")
    .iloc[:, :-2]
    .drop('CLIENTNUM', axis=1)
)

In [None]:
# Print the column summary of the trimmed frame.
bank_churner_df.info()

## 이탈고객 예측 모델 생성을 위한 전처리 

In [None]:
# Separate the predictors from the churn label.
X = bank_churner_df.drop(['Attrition_Flag'], axis=1)
y = bank_churner_df['Attrition_Flag']

# One-hot encode BEFORE splitting so the train and test frames share one column layout.
# Encoding each split separately yields different columns whenever a category value is
# missing from one split, which breaks prediction on the test frame.
X_encoded = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
# Kept under its original name for the SMOTE call below.
X_dummies = X_train

# Oversample with SMOTE on the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)

X_resampled, y_resampled = smote.fit_resample(X_dummies, y_train)

## 모델별 학습 및 평가

### Random Forest

In [None]:
# Random forest with 500 trees and a fixed random seed.
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)

In [None]:
# 5-fold cross-validation on the full one-hot encoded data (X, y), not on the SMOTE-resampled
# training set. No `scoring` argument is passed, so the estimator's default scorer is used.
scores = cross_val_score(rf_model, pd.get_dummies(X), y, cv=5)

# The 'cv' parameter is the number of folds, 'scores' will contain the accuracy of the model on each fold
print("Score per fold: ", scores)
print("Average score: ", scores.mean())

In [None]:
# Train on the SMOTE-resampled training data.
rf_model.fit(X_resampled, y_resampled)

In [None]:
# Align the encoded test frame with the columns the model was trained on: a category
# missing from the test split becomes an all-zero column, and a category unseen during
# training is dropped. Without this, separately encoded splits can differ in columns.
X_test_encoded = pd.get_dummies(X_test).reindex(columns=X_resampled.columns, fill_value=0)
predictions = rf_model.predict(X_test_encoded)

In [None]:
# Quick look at the predicted labels for the test split.
print(predictions)

In [None]:
# Scratch cell: keep the raw text report in `aa` and display it as the cell output.
aa = classification_report(y_test, predictions)
aa
# print(aa.split())
#pd.read(aa)

In [None]:
# Scratch exploration of the report parsing that report() performs.
# Here the sliced report lines are wrapped in one outer list and are not split into fields,
# so the resulting DataFrame has a single row.
report_list = [classification_report(y_test, predictions).split('\n')[2:-5]]
report_list
report_df = pd.DataFrame(report_list)
report_df
#report_df = pd.DataFrame(report_list, columns=['class', 'precision', 'recall', 'f1-score', 'support'])
    # report_df.set_index('class', inplace=True)

In [None]:
# Per-class report for the random forest; combined with the other models in final_report.
rf_report = report(y_test, predictions)
rf_report

In [None]:
# Importance of each training column according to the fitted random forest.
feature_importance = rf_model.feature_importances_

# One row per feature, most important first.
feature_df = pd.DataFrame({
    'Feature_Name': X_resampled.columns,
    'Importance': feature_importance,
}).sort_values(by='Importance', ascending=False)

# Top 10 Features
feature_df.head(10)

### Lightgbm

In [None]:
# LightGBM classifier with 500 estimators and a fixed random seed.
# NOTE(review): LightGBM documents this boosting type in lowercase ('goss'); confirm the
# installed version accepts the uppercase spelling.
lgbm_model = lgb.LGBMClassifier(n_estimators=500, random_state=42, boosting_type='GOSS')

In [None]:
# 5-fold cross-validation on the full one-hot encoded data (X, y), not on the SMOTE-resampled
# training set. No `scoring` argument is passed, so the estimator's default scorer is used.
scores = cross_val_score(lgbm_model, pd.get_dummies(X), y, cv=5)

# The 'cv' parameter is the number of folds, 'scores' will contain the accuracy of the model on each fold
print("Score per fold: ", scores)
print("Average score: ", scores.mean())

In [None]:
# Train on the SMOTE-resampled training data.
lgbm_model.fit(X_resampled, y_resampled)

In [None]:
# Align the encoded test frame with the columns the model was trained on: a category
# missing from the test split becomes an all-zero column, and a category unseen during
# training is dropped. Without this, separately encoded splits can differ in columns.
X_test_encoded = pd.get_dummies(X_test).reindex(columns=X_resampled.columns, fill_value=0)
predictions = lgbm_model.predict(X_test_encoded)

In [None]:
# Text report of the LightGBM predictions against the true test labels.
print(classification_report(y_test, predictions))

In [None]:
# Per-class report for LightGBM; combined with the other models in final_report.
lgbm_report = report(y_test, predictions)
lgbm_report

In [None]:
# Importance of each training column according to the fitted LightGBM model.
feature_importance = lgbm_model.feature_importances_

# Pair every column name with its importance and order the rows from most to least important.
feature_df = (
    pd.DataFrame(
        list(zip(X_resampled.columns, feature_importance)),
        columns=['Feature_Name', 'Importance'],
    )
    .sort_values(by='Importance', ascending=False)
)

# Top 10 Features
feature_df.head(10)

### Logistic Regression

In [None]:
# Preprocessing for logistic regression: min-max scale the non-object columns and
# one-hot encode the object columns.
# NOTE(review): the scaler is fitted on the whole data set before the train/test split,
# so test-set statistics influence the scaling.
scaler = MinMaxScaler()

X = bank_churner_df.drop(['Attrition_Flag'], axis=1)
y = bank_churner_df['Attrition_Flag']

# Scaled copy of the non-object columns (the result gets a fresh default index).
X_num_data = [col for col in X.columns if X[col].dtype != 'object']
X_num_data = pd.DataFrame(scaler.fit_transform(X[X_num_data]), columns=X_num_data)

# One-hot encoding of the object columns.
X_dum_data = pd.get_dummies(X[[col for col in X.columns if X[col].dtype == 'object']])

# Join the scaled and the encoded parts on their index.
X = pd.merge(X_num_data, X_dum_data, left_index=True, right_index=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training split only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Logistic regression with the iteration limit set to 5000.
logistic_model = LogisticRegression(max_iter=5000)

In [None]:
# Train on the SMOTE-resampled, scaled training data.
logistic_model.fit(X_resampled, y_resampled)

In [None]:
# X_test was scaled and encoded together with the training data, so it is used directly.
predictions = logistic_model.predict(X_test)

In [None]:
# Text report of the logistic regression predictions against the true test labels.
print(classification_report(y_test, predictions))

In [None]:
# Per-class report for logistic regression; combined with the other models in final_report.
logistic_report = report(y_test, predictions)

### Tensorflow

In [None]:
# Preprocessing for the neural network: robust-scale the non-object columns, one-hot encode
# the object columns and turn the label into 1 (Existing_Customer) / 0 (Attrited_Customer).
# NOTE(review): the scaler is fitted on the whole data set before the train/test split,
# so test-set statistics influence the scaling.
scaler = RobustScaler()

X = bank_churner_df.drop(['Attrition_Flag'], axis=1)
y = bank_churner_df['Attrition_Flag'].map({'Existing_Customer': 1, 'Attrited_Customer': 0})

# Scaled copy of the non-object columns (the result gets a fresh default index).
X_num_data = [col for col in X.columns if X[col].dtype != 'object']
X_num_data = pd.DataFrame(scaler.fit_transform(X[X_num_data]), columns=X_num_data)

# One-hot encoding of the object columns.
X_dum_data = pd.get_dummies(X[[col for col in X.columns if X[col].dtype == 'object']])

# Join the scaled and the encoded parts on their index.
X = pd.merge(X_num_data, X_dum_data, left_index=True, right_index=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training split only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Feed-forward binary classifier: 64 (ReLU) -> 16 (sigmoid) -> 4 (sigmoid) -> 1 (sigmoid) units,
# taking one input per column of the resampled training data.
tf_model = keras.Sequential()
tf_model.add(keras.layers.Dense(64, activation='relu', input_shape=(X_resampled.shape[1],)))
tf_model.add(keras.layers.Dense(16, activation='sigmoid'))
tf_model.add(keras.layers.Dense(4, activation='sigmoid'))
tf_model.add(keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Compile the model.
# Pass the configured Adam instance to compile(). The string 'adam' builds a fresh optimizer
# with the default learning rate, which left `optimizer` (learning_rate=0.00005) unused.
optimizer = Adam(learning_rate=0.00005)
# Stop training once val_loss has not improved for 3 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=3)
tf_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model.
# Keras' validation_split takes the LAST fraction of the data, before any shuffling, and
# imbalanced-learn's over-samplers return the original rows followed by the synthetic ones.
# The last 20% of the SMOTE output is therefore dominated by synthetic minority-class rows,
# which makes val_loss (and early stopping) unrepresentative. Hold out a shuffled,
# stratified 20% explicitly instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)
history = tf_model.fit(X_fit, y_fit, validation_data=(X_val, y_val), epochs=500, batch_size=128, verbose=0, callbacks=[early_stop])

In [None]:
# Training and validation loss per epoch, each labelled with its history key.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve], label=curve)
plt.legend()
plt.show()

In [None]:
# Predict on the test split, threshold the probabilities at 0.5 into int32 labels,
# squeeze the result to one dimension and wrap it in a Series.
predictions = pd.Series(
    np.squeeze((tf_model.predict(X_test) > 0.5).astype("int32"))
)

In [None]:
# Map the 0/1 labels back to the class names for both the predictions and the true test labels.
predictions, y_true = (
    labels.map({1: 'Existing_Customer', 0: 'Attrited_Customer'})
    for labels in (predictions, y_test)
)

In [None]:
# Text report of the neural-network predictions against the true labels (class names).
print(classification_report(y_true, predictions))

In [None]:
# Per-class report for the neural network; combined with the other models in final_report.
tf_report = report(y_true, predictions)
tf_report

In [None]:
# Stack the four per-model reports; the dictionary keys become the outer index level.
final_report = pd.concat({
    'LightGBM': lgbm_report,
    'Random Forest': rf_report,
    'Logistic Regression': logistic_report,
    'Tensorflow': tf_report,
})

In [None]:
# 결론

## 기술적 분석
거래금액과 거래횟수가 많을수록 이탈 가능성이 낮아짐
플래티넘 카드는 보유자 수가 적고 회원 탈퇴율이 높아 좀 더 관심을 가져야 할 분야임
이탈률은 비활성화된 지 4개월이 지나면 최고조에 달하므로, 이러한 현상이 발생한 이유를 좀 더 세부적으로 분석할 필요가 있음

카드의 종류가 많을수록 이탈 가능성이 줄어들며
접촉 건수에 따른 이탈률을 좀 더 분석할 필요가 있음

In [None]:
## 고객 이탈 예측 분석
LightGBM은 91%의 가장 높은 Attrited Customer Recall과 89%의 정밀도를 가지고 있음
고객 이탈을 사전에 방지하기 위해서 LightGBM 모델을 사용하는 것이 적합함

In [None]:
# Display the combined per-model classification reports.
final_report