# 7 - Credit card churn prediction: ROC AUC 0.9836 (ML + SHAP)

## Importing libraries and loading data

In [1]:
# import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings 
warnings.filterwarnings("ignore")

import shap
import matplotlib.pyplot as plt
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from feature_engine.encoding import RareLabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import ast

# Let pandas render up to 1000 rows before truncating displayed output.
pd.set_option('display.max_rows', 1000)

In [4]:
# Load the raw CSV, drop the client identifier and the two
# Naive_Bayes_Classifier_* columns, then remove exact duplicate rows.
df = (
    pd.read_csv("./data/BankChurner_kaggle.csv")
    .drop(
        columns=[
            'CLIENTNUM',
            'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
            'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
        ]
    )
    .drop_duplicates()
)
print(df.shape)
# Five random rows, transposed so each column of the frame is shown as a row.
df.sample(5).T

(10127, 20)


Unnamed: 0,10005,1397,2750,465,2963
Attrition_Flag,Existing Customer,Existing Customer,Existing Customer,Existing Customer,Existing Customer
Customer_Age,43,46,43,53,48
Gender,M,M,F,M,M
Dependent_count,5,5,3,2,3
Education_Level,Graduate,Uneducated,College,Post-Graduate,Graduate
Marital_Status,Married,Divorced,Married,Unknown,Single
Income_Category,$120K +,$60K - $80K,Unknown,$80K - $120K,$80K - $120K
Card_Category,Blue,Blue,Blue,Blue,Blue
Months_on_book,36,36,36,36,35
Total_Relationship_Count,6,3,3,4,3


In [5]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Attrition_Flag            10127 non-null  object 
 1   Customer_Age              10127 non-null  int64  
 2   Gender                    10127 non-null  object 
 3   Dependent_count           10127 non-null  int64  
 4   Education_Level           10127 non-null  object 
 5   Marital_Status            10127 non-null  object 
 6   Income_Category           10127 non-null  object 
 7   Card_Category             10127 non-null  object 
 8   Months_on_book            10127 non-null  int64  
 9   Total_Relationship_Count  10127 non-null  int64  
 10  Months_Inactive_12_mon    10127 non-null  int64  
 11  Contacts_Count_12_mon     10127 non-null  int64  
 12  Credit_Limit              10127 non-null  float64
 13  Total_Revolving_Bal       10127 non-null  int64  
 14  Avg_Op

In [6]:
# Summary statistics of the numeric columns, transposed to one row per column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Customer_Age,10127.0,46.32596,8.016814,26.0,41.0,46.0,52.0,73.0
Dependent_count,10127.0,2.346203,1.298908,0.0,1.0,2.0,3.0,5.0
Months_on_book,10127.0,35.928409,7.986416,13.0,31.0,36.0,40.0,56.0
Total_Relationship_Count,10127.0,3.81258,1.554408,1.0,3.0,4.0,5.0,6.0
Months_Inactive_12_mon,10127.0,2.341167,1.010622,0.0,2.0,2.0,3.0,6.0
Contacts_Count_12_mon,10127.0,2.455317,1.106225,0.0,2.0,2.0,3.0,6.0
Credit_Limit,10127.0,8631.953698,9088.77665,1438.3,2555.0,4549.0,11067.5,34516.0
Total_Revolving_Bal,10127.0,1162.814061,814.987335,0.0,359.0,1276.0,1784.0,2517.0
Avg_Open_To_Buy,10127.0,7469.139637,9090.685324,3.0,1324.5,3474.0,9859.0,34516.0
Total_Amt_Chng_Q4_Q1,10127.0,0.759941,0.219207,0.0,0.631,0.736,0.859,3.397


## Data transformation

In [7]:
# Target column: 1 for every value other than 'Existing Customer', 0 otherwise.
main_label = 'Attrition_Flag'
df[main_label] = (df[main_label] != 'Existing Customer').astype(int)

# Coarsen the integer-valued columns to the nearest multiple of `step`.
for col, step in (('Customer_Age', 5), ('Months_on_book', 6)):
    df[col] = df[col].apply(lambda x, step=step: step*round(1/step*x))

# Coarsen the ratio columns to one decimal place.
for col in ('Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'):
    df[col] = df[col].apply(lambda x: 1/10*round(10*x))

# Replace each amount/count column by log10(1 + value) rounded to steps of 0.2;
# the originals are dropped once all log10_* columns have been appended.
log_cols = ['Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Trans_Amt', 'Total_Trans_Ct']
for col in log_cols:
    df[f'log10_{col}'] = df[col].apply(lambda x: 1/5*round(5*np.log10(1+x)))
df = df.drop(columns=log_cols)

# Rare-label encode the categorical columns: the tolerance is expressed as
# 20 rows relative to the frame length, and rare labels become 'Other'.
rare_tol = 20/df.shape[0]
for col in ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']:
    encoder = RareLabelEncoder(n_categories=1, max_n_categories=50, replace_with='Other', tol=rare_tol)
    df[col] = encoder.fit_transform(df[[col]])

print(df.shape)
df.sample(5).T

(10127, 20)


Unnamed: 0,6205,3973,7460,5681,1791
Attrition_Flag,0,0,0,1,0
Customer_Age,55,35,50,40,35
Gender,F,F,F,M,F
Dependent_count,2,2,2,4,2
Education_Level,Graduate,College,Graduate,Graduate,Graduate
Marital_Status,Unknown,Married,Married,Unknown,Single
Income_Category,Less than $40K,Less than $40K,Unknown,$40K - $60K,Less than $40K
Card_Category,Blue,Blue,Blue,Silver,Blue
Months_on_book,36,18,30,30,36
Total_Relationship_Count,5,5,5,4,4


In [8]:
# Summary statistics of the numeric columns after the transformation cell.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Attrition_Flag,10127.0,0.16066,0.367235,0.0,0.0,0.0,0.0,1.0
Customer_Age,10127.0,46.331589,8.161953,25.0,40.0,45.0,50.0,75.0
Dependent_count,10127.0,2.346203,1.298908,0.0,1.0,2.0,3.0,5.0
Months_on_book,10127.0,35.900464,8.117226,12.0,30.0,36.0,42.0,54.0
Total_Relationship_Count,10127.0,3.81258,1.554408,1.0,3.0,4.0,5.0,6.0
Months_Inactive_12_mon,10127.0,2.341167,1.010622,0.0,2.0,2.0,3.0,6.0
Contacts_Count_12_mon,10127.0,2.455317,1.106225,0.0,2.0,2.0,3.0,6.0
Total_Amt_Chng_Q4_Q1,10127.0,0.759593,0.221166,0.0,0.6,0.7,0.9,3.4
Total_Ct_Chng_Q4_Q1,10127.0,0.712758,0.240036,0.0,0.6,0.7,0.8,3.7
Avg_Utilization_Ratio,10127.0,0.275165,0.277037,0.0,0.0,0.2,0.5,1.0


In [9]:
# Separate the feature matrix from the flat target vector.
X = df.drop(columns=[main_label])
y = df[main_label].values.reshape(-1,)

# Positions, within X, of the object-dtype (categorical) columns.
cat_cols = df.select_dtypes(include=['object']).columns
feature_names = list(X.columns)
cat_cols_idx = [feature_names.index(c) for c in cat_cols]

# 50/50 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5063, 19), (5064, 19), (5063,), (5064,))

In [10]:
# Wrap each split in a CatBoost Pool together with the categorical column positions.
train_pool = Pool(X_train, y_train, cat_features=cat_cols_idx)
test_pool = Pool(X_test, y_test, cat_features=cat_cols_idx)

# Training hyper-parameters.
catboost_params = dict(
    iterations=1200,
    depth=5,
    border_count=22,
    l2_leaf_reg=0.3,
    learning_rate=3e-2,
    verbose=0,
)
model = CatBoostClassifier(**catboost_params)

# Fit on the training pool only.
model.fit(train_pool)

# Score both splits using the predicted probability of the positive class.
y_train_pred = model.predict_proba(train_pool)[:, 1]
y_test_pred = model.predict_proba(test_pool)[:, 1]
roc_auc_train = roc_auc_score(y_train, y_train_pred)
roc_auc_test = roc_auc_score(y_test, y_test_pred)
print(f"ROC AUC score for train {round(roc_auc_train,4)}, and for test {round(roc_auc_test,4)}")

ROC AUC score for train 0.9991, and for test 0.9836


In [11]:
# Baseline ROC AUC: every test row receives the same score, namely the
# positive rate observed in the training labels.
roc_auc_baseline = roc_auc_score(y_test, np.full(len(y_test), np.mean(y_train)))
print(roc_auc_baseline)

0.5


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl  
import missingno as msno
import warnings

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix

import scikitplot as skplt
from imblearn.over_sampling import SMOTE

warnings.filterwarnings('ignore')
sns.set_theme(style="whitegrid")
%matplotlib inline

mpl.rc('font', family='Malgun Gothic')  # 한글 폰트 설정
                                        # 윈도우 폰트 위치 - C:\Windows\Fonts
plt.figure(figsize=(10,6))              # 그래프 사이즈 설정
sns.set(font='Malgun Gothic', rc={'axes.unicode_minus':False}, style='darkgrid') # 마이너스 처리


In [None]:
# Load the churn dataset and discard the 'cstno' column.
bank_churner_df = pd.read_csv("./data/bank_churner.csv").drop(columns='cstno')

# Row subsets by churn flag: 0 = retained, 1 = churned.
exist_customer_df = bank_churner_df[bank_churner_df['is_churned'] == 0]
churn_customer_df = bank_churner_df[bank_churner_df['is_churned'] == 1]

In [None]:
def print_category_graphs(df, column, column_desc):
    """Plot churn statistics for a categorical column on a 2x2 Plotly grid.

    Panels: (1,1) stacked churned/retained counts with the churn rate on a
    secondary y axis, (1,2) churn rate per category, (2,1) box plots of the
    column for retained and churned rows. The churn rates are also printed.

    Rows with ``is_churned != 0`` are treated as churned and rows with
    ``is_churned == 0`` as retained.

    Args:
        df: DataFrame containing ``column`` and an ``is_churned`` column.
        column: Name of the column to analyse.
        column_desc: Display label for the column, used in the figure title.
    """
    # Put every per-category series on the same sorted index. Categories missing
    # from one group count as 0 there, so the stacked bars line up and the churn
    # rate is 0 (not NaN) for categories without churned rows.
    counts = df[column].value_counts().sort_index()
    categories = counts.index
    churned_mask = df['is_churned'] != 0
    exist_counts = df.loc[~churned_mask, column].value_counts().reindex(categories, fill_value=0)
    churn_counts = df.loc[churned_mask, column].value_counts().reindex(categories, fill_value=0)
    churn_rates = churn_counts / counts

    print(churn_rates.sort_values())

    fig = make_subplots(rows=2,
                        cols=2,
                        subplot_titles=('【 전체 현황 】', '【 이탈율 】 ', '【 사분위 】'),
                        horizontal_spacing=0.1,
                        vertical_spacing=0.1,
                        specs=[[{"secondary_y": True}, {}],
                               [{}, {}]]
                        )

    # Panel (1,1): churned bars at the bottom, retained bars stacked on top of
    # them via `base`, and the churn rate as a line on the secondary axis.
    fig.add_trace(go.Bar(x=categories, y=churn_counts, offsetgroup=0),
                  secondary_y=False, row=1, col=1)
    fig.add_trace(go.Bar(x=categories, y=exist_counts, offsetgroup=0, base=churn_counts),
                  secondary_y=False, row=1, col=1)
    fig.add_trace(go.Scatter(x=categories, y=churn_rates, line_shape='linear'),
                  secondary_y=True, row=1, col=1)

    # Panel (1,2): churn rate per category.
    fig.add_trace(go.Bar(x=categories, y=churn_rates), row=1, col=2)

    # Panel (2,1): distribution of the column for retained vs. churned rows.
    fig.add_trace(go.Box(x=df.loc[~churned_mask, column], name='유지'), row=2, col=1)
    fig.add_trace(go.Box(x=df.loc[churned_mask, column], name='이탈'), row=2, col=1)

    fig.update_layout(width=1200,
                      height=800,
                      showlegend=False,
                      title_text=f'『 {column_desc} 』에 따른 분석 그래프',
                      )

    fig.show()

In [None]:

# Plotly walkthrough: eight small, independent example figures.
# Note: this cell rebinds the notebook-level names `df`, `x` and `y`.
#---------------------------------------------------------------------------------
# 1 Figure
# Build a figure with the graph_objects module
#---------------------------------------------------------------------------------
fig = go.Figure(
    # Data
    data=[go.Bar(x=[1, 2, 3], y=[1, 3, 2])],
    # Layout
    layout=go.Layout(
        title=go.layout.Title(text="1 Figure"))
    )
fig.show()


# 2 Figure
#----------
fig = px.bar(x=["a", "b", "c"], y=[1, 3, 2],
            title="2 Figure")
fig.show()


#---------------------------------------------------------------------------------
# 3 Figure
# Build a figure with the express module
#---------------------------------------------------------------------------------

# px.bar() creates the bar chart and fills in its data and layout in one call
fig = px.bar(x=["a", "b", "c"], y=[1, 3, 2], title="3 Figure")

# show() renders the figure in the notebook (e.g. Jupyter).
fig.show()


#---------------------------------------------------------------------------------
# 4 Figure
# add_trace()
#---------------------------------------------------------------------------------
fig = go.Figure()

fig.add_trace(go.Bar(x=[1, 2, 3], y=[1, 3, 2]))
fig.update_layout(title_text="4 Figure",title_font_size=30)

fig.show()


#---------------------------------------------------------------------------------
# 5 Figure
# add_trace()
# Add a trace to a figure that already has traces, drawing it on top
#---------------------------------------------------------------------------------
df = px.data.iris()
fig = px.scatter(df, x="sepal_width", y="sepal_length", color="species",
                 title="5 Figure")
fig.add_trace(
    go.Scatter(
        x=[2, 4],
        y=[4, 8],
        mode="lines",
        line=go.scatter.Line(color="gray"),
        showlegend=False)
)

fig.show()


#---------------------------------------------------------------------------------
# 6 Figure - make_subplots
# update_traces()
# update_traces() edits traces that already exist: type, colour, style, template, etc.
# Styling can also be given when a trace is first created, but update_traces() restyles
# every matching trace in one call, which shortens the code and lets the trace-creation
# part and the trace-editing part be written separately for readability.
#---------------------------------------------------------------------------------

fig = make_subplots(rows=1, cols=2)

# Add traces
fig.add_scatter(y=[4, 2, 3.5], mode="markers",
                marker=dict(size=20, color="LightSeaGreen"),
                name="a", row=1, col=1)

fig.add_bar(y=[2, 1, 3],
            marker=dict(color="MediumPurple"),
            name="b", row=1, col=1)

fig.add_scatter(y=[2, 3.5, 4], mode="markers",
                marker=dict(size=20, color="MediumPurple"),
                name="c", row=1, col=2)

fig.add_bar(y=[1, 3, 2],
            marker=dict(color="LightSeaGreen"),
            name="d", row=1, col=2)

# Recolour only the bar traces, all at once
fig.update_traces(marker=dict(color="RoyalBlue"),
                  selector=dict(type="bar"))

fig.update_layout(title_text='6 Figure - make_subplots', title_font_size=30
                 )


fig.show()


#---------------------------------------------------------------------------------
# 7 Figure
# update_layout()
# Updates figure elements outside the traces, such as size, title and text, font size
#---------------------------------------------------------------------------------

# Create the figure
fig = go.Figure(data=go.Bar(x=[1, 2, 3], y=[1, 3, 2]))

# Add a title
fig.update_layout(title_text="7 Figure",title_font_size=30)

fig.show()


#---------------------------------------------------------------------------------
# 8 Figure
# update_xaxes() / update_yaxes()
#---------------------------------------------------------------------------------
# Load the data
df = px.data.tips()
x = df["total_bill"]
y = df["tip"]

# Draw the figure
fig = go.Figure(data=go.Scatter(x=x, y=y, mode='markers'))

fig.update_layout(title_text="8 Figure",title_font_size=30)

# Add axis titles
fig.update_xaxes(title_text='Total Bill ($)')
fig.update_yaxes(title_text='Tip ($)')

fig.show()

In [None]:
    
def print_category_graphs(df, column, column_desc):
    """Plot churn statistics for a categorical column on a 3x2 Plotly grid.

    Panels: (1,1) stacked churned/retained counts with the churn rate on a
    secondary y axis fixed to [0, 1]; (1,2) churn rate per category;
    (2,1) box plots of the column for churned and retained rows;
    (2,2), (3,1), (3,2) donut charts of the category shares for all,
    retained and churned rows respectively.

    Rows with ``is_churned != 0`` are treated as churned and rows with
    ``is_churned == 0`` as retained.

    Args:
        df: DataFrame containing ``column`` and an ``is_churned`` column.
        column: Name of the column to analyse.
        column_desc: Display label for the column, used in titles and trace names.
    """
    # Put every per-category series on the same sorted index. Categories missing
    # from one group count as 0 there, so the stacked bars line up and the churn
    # rate is 0 (not NaN) for categories without churned rows.
    counts = df[column].value_counts().sort_index()
    categories = counts.index
    churned_mask = df['is_churned'] != 0
    exist_counts = df.loc[~churned_mask, column].value_counts().reindex(categories, fill_value=0)
    churn_counts = df.loc[churned_mask, column].value_counts().reindex(categories, fill_value=0)
    churn_rates = churn_counts / counts

    fig = make_subplots(rows=3,
                        cols=2,
                        subplot_titles=('【 전체 현황 】', '【 이탈율 】', '【 사분위 】', f'【 {column_desc} 중 전체 현황 】', f'【 {column_desc} 중 유지 현황 】', f'【 {column_desc} 중 이탈 현황 】'),
                        horizontal_spacing=0.1,
                        vertical_spacing=0.1,
                        specs=[[{"secondary_y": True}, {}],
                               [{}, {'type': 'domain'}],
                               [{'type': 'domain'}, {'type': 'domain'}]]
                        )

    # Panel (1,1): churned bars at the bottom, retained bars stacked on top of
    # them via `base`, churn rate as a line on the secondary axis.
    fig.add_trace(go.Bar(x=categories, y=churn_counts, marker_color="red", offsetgroup=0, name='이탈',
                         text=churn_counts,
                         hovertemplate='%{label}: %{value:,}',
                         textposition='auto'), row=1, col=1, secondary_y=False)
    fig.add_trace(go.Bar(x=categories, y=exist_counts, marker_color="blue", offsetgroup=0, name='유지',
                         texttemplate='%{value:,}',
                         hovertemplate='%{label}: %{value:,}',
                         textposition='auto', base=churn_counts), row=1, col=1, secondary_y=False)
    fig.add_trace(go.Scatter(x=categories, y=churn_rates, marker_color="green", name='이탈율',
                             line_shape='linear'), row=1, col=1, secondary_y=True)
    # The rate is a fraction, so pin its axis to the full 0..1 range.
    fig.update_yaxes(secondary_y=True, range=[0, 1], row=1, col=1)

    # Panel (1,2): churn rate per category.
    fig.add_trace(go.Bar(x=categories, y=churn_rates, marker_color="red", name='이탈율'),
                  row=1, col=2)

    # Panel (2,1): distribution of the column for churned vs. retained rows.
    fig.add_trace(go.Box(x=df.loc[churned_mask, column], marker_color="red", name='이탈'), row=2, col=1)
    fig.add_trace(go.Box(x=df.loc[~churned_mask, column], marker_color="blue", name='유지'), row=2, col=1)

    # Donut charts: all rows, retained rows, churned rows.
    donuts = (
        (counts, f'{column_desc} 분표 현황', 2, 2),
        (exist_counts, "유지", 3, 1),
        (churn_counts, "이탈", 3, 2),
    )
    for values, trace_name, row, col in donuts:
        fig.add_trace(go.Pie(labels=categories, values=values, name=trace_name,
                             texttemplate="%{label}: %{value:,} <br>(%{percent})",
                             textposition="inside",
                             hole=.4,
                             hoverinfo="label+percent+name"), row=row, col=col)

    # Labels placed at the centre of each donut (paper coordinates).
    for x_pos, y_pos, label in ((0.73, 0.5, "<b>전체</b>"),
                                (0.21, 0.13, "<b>유지</b>"),
                                (0.73, 0.13, "<b>이탈</b>")):
        fig.add_annotation(dict(x=x_pos, y=y_pos, ax=0, ay=0,
                                xref="paper", yref="paper",
                                text=label,
                                font_size=20,
                                ))

    fig.update_layout(width=1200,
                      height=1200,
                      showlegend=False,
                      title_text=f'『 {column_desc} 』에 따른 분석 그래프',
                      hovermode="x",
                      )

    fig.show()

In [None]:
# Number of rows for each distinct value of 'dependent_num'.
bank_churner_df['dependent_num'].value_counts()

In [None]:
# Category charts for 'dependent_num', with '부양가족수' passed as the display label.
print_category_graphs(bank_churner_df, 'dependent_num', '부양가족수')

In [None]:
# def print_continuous_graphs(df, column, column_desc):
#     fig = make_subplots(rows=3, 
#                         cols=1, 
#                         subplot_titles=('전체 건수 분포', '유지/이탈별 사분위', '유지/이탈별 분포'), 
#                         # shared_xaxes=True,
#                         horizontal_spacing=0.1,
#                         vertical_spacing=0.1
#                     )

#     # 전체 데이터 분포
#     # fig.add_trace(go.Histogram(x=df[column]), row=1, col=1)
#     fig.add_trace(go.Histogram(x=df[column]), row=1, col=1)
    
#     # Box Plot - 고객 유지, 이탈 구분
#     fig.add_trace(go.Box(x=df[df['is_churned']==0][column], 
#                 name='유지'), row=2, col=1)
#     fig.add_trace(go.Box(x=df[df['is_churned']!=0][column], name='이탈'), row=2, col=1)

#     # 고객 유지, 이탈별 분포
#     fig.add_trace(go.Histogram(x=df[df['is_churned']==0][column]), row=3, col=1)
#     fig.add_trace(go.Histogram(x=df[df['is_churned']!=0][column]), row=3, col=1)


#     fig.update_layout(width=1200, 
#                     height=800, 
#                     showlegend=False,
#                     title_text=f'【{column_desc}】에 따른 분석 그래프'
#                     )

#     fig.show()


def print_continuous_graphs(df, column, column_desc):
    """Plot churn statistics for a numeric column as five stacked Plotly panels.

    Rows: (1) stacked histograms of churned and retained values with the
    per-value churn rate on a secondary axis, (2) box plots for retained and
    churned rows, (3) stacked histograms again, (4) per-value churned and
    retained counts as lines with the churn rate on a secondary axis,
    (5) histogram of the per-value churn rates.

    Rows with ``is_churned != 0`` are treated as churned and rows with
    ``is_churned == 0`` as retained.

    Args:
        df: DataFrame containing ``column`` and an ``is_churned`` column.
        column: Name of the column to analyse.
        column_desc: Display label for the column, used in the figure title.
    """
    churned_mask = df['is_churned'] != 0
    exist_values = df.loc[~churned_mask, column]
    churn_values = df.loc[churned_mask, column]

    counts = df[column].value_counts().sort_index()
    exist_counts = exist_values.value_counts().sort_index()
    churn_counts = churn_values.value_counts().sort_index()
    # Align the churned counts on every observed value so that values without
    # churned rows get a rate of 0 instead of NaN.
    churn_rates = churn_counts.reindex(counts.index, fill_value=0) / counts

    fig = make_subplots(rows=5,
                        cols=1,
                        subplot_titles=('전체 건수 분포', '유지/이탈별 사분위', '유지/이탈별 분포'),
                        horizontal_spacing=0.1,
                        vertical_spacing=0.1,
                        specs=[[{"secondary_y": True}],
                               [{}],
                               [{"secondary_y": True}],
                               [{"secondary_y": True}],
                               [{}],
                               ]
                        )

    # Row 1: stacked histograms plus the churn rate on the secondary axis.
    fig.add_trace(go.Histogram(x=churn_values, texttemplate="%{x}", marker_color="red"), row=1, col=1, secondary_y=False)
    fig.add_trace(go.Histogram(x=exist_values, texttemplate="%{x}", marker_color="blue"), row=1, col=1, secondary_y=False)
    fig.add_trace(go.Scatter(x=churn_rates.index, y=churn_rates, marker_color="green", name='이탈율', line_shape='linear'),
                  row=1, col=1, secondary_y=True)
    fig.update_yaxes(secondary_y=True, range=[0, 1], row=1, col=1)

    # Row 2: box plots per group.
    fig.add_trace(go.Box(x=exist_values, name='유지'), row=2, col=1)
    fig.add_trace(go.Box(x=churn_values, name='이탈'), row=2, col=1)

    # Row 3: stacked histograms per group.
    fig.add_trace(go.Histogram(x=exist_values, marker_color="blue"), row=3, col=1)
    fig.add_trace(go.Histogram(x=churn_values, marker_color="red"), row=3, col=1)

    # Row 4: per-value counts as lines plus the churn rate on the secondary axis.
    fig.add_trace(go.Scatter(x=churn_counts.index, y=churn_counts, mode='lines+markers', marker_color="red", name='이탈'), row=4, col=1, secondary_y=False)
    fig.add_trace(go.Scatter(x=exist_counts.index, y=exist_counts, mode='lines+markers', marker_color='blue', name='유지'), row=4, col=1, secondary_y=False)
    fig.add_trace(go.Scatter(x=churn_rates.index, y=churn_rates, marker_color="green", name='이탈율', line_shape='linear'),
                  row=4, col=1, secondary_y=True)
    # The rate line lives on row 4's secondary axis (row 3 has no secondary
    # trace), so this is the axis that needs the fixed 0..1 range.
    fig.update_yaxes(secondary_y=True, range=[0, 1], row=4, col=1)

    # Row 5: distribution of the churn rates themselves.
    fig.add_trace(go.Histogram(x=churn_rates), row=5, col=1)

    fig.update_layout(width=1200,
                      height=800,
                      showlegend=False,
                      title_text=f'【{column_desc}】에 따른 분석 그래프',
                      barmode='stack'
                      )

    fig.show()


In [None]:
# Continuous-feature charts for 'age', with '나이' passed as the display label.
print_continuous_graphs(bank_churner_df, 'age', '나이')

In [None]:
# Demo: custom hover text on a Plotly Express line chart.
# Note: this cell rebinds the notebook-level name `df`.
import plotly.express as px

# Load the data
df = px.data.gapminder().query("continent=='Oceania'")

# Draw the figure
fig = px.line(df, x="year", y="lifeExp", color="country",text='pop')
fig.update_traces(mode="markers+lines")

# Hover text built from the x value, the per-point text ('pop') and the y value.
fig.update_traces(hovertemplate='연도: %{x} <br>'+
                                'pop: %{text} <br>'+
                                 'lifeExp : %{y}')

fig.show()

In [None]:
# Demo: a Plotly Express pie placed next to a graph_objects bar chart in one
# subplot figure. Note: this cell rebinds the notebook-level name `df`.
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots


# This dataframe has 244 lines, but 4 distinct values for `day`
df = px.data.tips()
pie = px.pie(df, values='tip', names='day')
# A pie needs a 'domain' subplot; the bar chart uses a regular 'xy' subplot.
fig=make_subplots(rows=1, cols=2,
                  specs=[[{"type": "domain"},{"type": "xy"}]])

# Reuse the single trace of the express pie figure in the left cell.
fig.add_trace(pie.data[0], row=1, col=1)
fig.add_trace(go.Bar(x=['A', 'B', 'C'], y=[25, 17, 19], marker_color="blue", name= "Quantity"), row=1, col=2)
fig.update_layout(width=700, height=350, bargap=0.05)

# Captions positioned in paper coordinates below each subplot.
fig.add_annotation(dict(x=0.22, y=-0.1,   ax=0, ay=0,
                    xref = "paper", yref = "paper", 
                    text= "Hello, pie chart!"
                  ))
# No fig.show() here: this last call is the cell's final expression.
fig.add_annotation(dict(x=0.72, y=-0.14,   ax=0, ay=0,
                    xref = "paper", yref = "paper", 
                    text= "My Bars!"
                  ))

In [None]:
# Category charts for 'dependent_num', with '부양가족수' passed as the display label.
print_category_graphs(bank_churner_df, 'dependent_num', '부양가족수')

In [None]:
# NOTE(review): `print_category_graphs1` is not defined in any cell visible in this
# notebook; on a fresh kernel this call raises NameError unless the name is
# defined elsewhere — confirm, or switch to `print_category_graphs` / remove.
print_category_graphs1(bank_churner_df, 'dependent_num', '부양가족수')

In [None]:
def print_category_graphs(df, column, column_desc):
    """Show a plain Plotly Express bar chart of ``df[column]``.

    Running this cell rebinds ``print_category_graphs``, so it replaces any
    definition of that name made earlier in the kernel session.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column placed on the x axis.
        column_desc: Display label for the column; accepted for signature
            compatibility but not used by this version.
    """
    fig = px.bar(df, x=df[column], title="Long-Form Input")
    fig.show()

In [None]:
# Per-value counts of 'dependent_num' for retained rows (is_churned == 0) ...
exist_counts = bank_churner_df.loc[bank_churner_df['is_churned'] == 0, 'dependent_num'].value_counts()
# ... and for churned rows (is_churned != 0).
churn_counts = bank_churner_df.loc[bank_churner_df['is_churned'] != 0, 'dependent_num'].value_counts()
exist_counts

In [None]:
# Bar chart with the 'age' values of retained rows (is_churned == 0) on the x axis.
fig = px.bar(x=bank_churner_df.loc[bank_churner_df['is_churned'] == 0, 'age'])
fig.show()

In [None]:
# Non-null counts of every remaining column for each (dependent_num, is_churned) pair.
bank_churner_df.groupby(['dependent_num', 'is_churned']).count()

In [None]:
# Category charts for 'dependent_num', with '부양가족수' passed as the display label.
print_category_graphs(bank_churner_df, 'dependent_num', '부양가족수')

In [None]:
def print_continuous_graphs(df, column, column_desc):
    """Plot a numeric column for retained vs. churned rows on three stacked panels.

    Rows: (1) stacked histograms, (2) box plots, (3) stacked histograms again.
    Retained rows are those with ``is_churned == 0``; all other rows count as
    churned.

    Args:
        df: DataFrame containing ``column`` and an ``is_churned`` column.
        column: Name of the column to analyse.
        column_desc: Display label for the column, used in the figure title.
    """
    retained = df[df['is_churned'] == 0][column]
    churned = df[df['is_churned'] != 0][column]

    fig = make_subplots(
        rows=3,
        cols=1,
        subplot_titles=('전체 건수 분포', '유지/이탈별 사분위', '유지/이탈별 분포'),
        horizontal_spacing=0.1,
        vertical_spacing=0.1,
    )

    # (row, trace) pairs in drawing order; the order is kept because Plotly
    # assigns default colours by the position of a trace in the figure.
    panels = [
        (1, go.Histogram(x=retained)),
        (1, go.Histogram(x=churned)),
        (2, go.Box(x=retained, name='유지')),
        (2, go.Box(x=churned, name='이탈')),
        (3, go.Histogram(x=retained)),
        (3, go.Histogram(x=churned)),
    ]
    for row, trace in panels:
        fig.add_trace(trace, row=row, col=1)

    layout_options = dict(
        width=1200,
        height=800,
        showlegend=False,
        title_text=f'【{column_desc}】에 따른 분석 그래프',
        barmode='stack',
    )
    fig.update_layout(**layout_options)

    fig.show()


In [None]:
# Continuous-feature charts for 'age', with '나이' passed as the display label.
print_continuous_graphs(bank_churner_df, 'age','나이')

In [None]:
# Plotting helper for continuous features
def cont_feature_graphs(column):
    """Draw a box plot and a KDE plot of ``column`` split by ``is_churned``.

    Reads the notebook-level ``bank_churner_df``. The left axes holds a
    horizontal box plot per churn flag; the right axes holds one density curve
    per churn flag, each normalised on its own (``common_norm=False``).

    Args:
        column: Name of the column in ``bank_churner_df`` to plot.
    """
    fig, (box_ax, kde_ax) = plt.subplots(1, 2, figsize=(12, 5))
    sns.boxplot(data=bank_churner_df, x=column, y='is_churned', orient='h', ax=box_ax)
    sns.kdeplot(data=bank_churner_df, x=column, hue='is_churned', common_norm=False, ax=kde_ax)
    # Blank out the y-axis labels on both panels.
    for ax in (box_ax, kde_ax):
        ax.set_ylabel('')
    plt.show()
    
# Plotting helper for categorical features
def cat_feature_graphs(column):
    """Draw churn rate and row count per category of ``column`` as two bar charts.

    Reads the notebook-level ``bank_churner_df`` (all customers) and
    ``churn_customer_df`` (churned customers). Both charts are ordered by
    ascending bar height.

    Args:
        column: Name of the categorical column to plot.
    """
    counts = bank_churner_df[column].value_counts().sort_values()
    # Align the churned counts on every category so that a category without
    # churned customers gets a rate of 0 instead of NaN.
    churned = churn_customer_df[column].value_counts().reindex(counts.index, fill_value=0)
    churn_rates = (churned / counts).sort_values()

    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
    sns.barplot(x=churn_rates.index, y=churn_rates, ax=axs[0])
    sns.barplot(x=counts.index, y=counts, ax=axs[1])
    axs[0].set_ylabel('')
    axs[0].set_title('Churn Rates')
    axs[1].set_ylabel('')
    axs[1].set_title('Counts')
    plt.show()

In [None]:
# Box plot and KDE plot of 'age' split by churn flag.
cont_feature_graphs('age')

In [None]:
# Plotting helper for categorical features (Plotly version)
def print_cat_feature_graphs(column, column_desc):
    """Show churn rate and row count per category of ``column`` side by side.

    Reads the notebook-level ``bank_churner_df`` (all customers) and
    ``churn_customer_df`` (churned customers). Both bar charts are ordered by
    ascending bar height.

    Args:
        column: Name of the categorical column to plot.
        column_desc: Display label for the column, used in the figure title.
    """
    counts = bank_churner_df[column].value_counts().sort_values()
    # Align the churned counts on every category so that a category without
    # churned customers gets a rate of 0 instead of NaN.
    churned = churn_customer_df[column].value_counts().reindex(counts.index, fill_value=0)
    churn_rates = (churned / counts).sort_values()

    fig = make_subplots(rows=1,
                        cols=2,
                        subplot_titles=("이탈율", "건수"),
                        horizontal_spacing=0.1,
                        vertical_spacing=0.1
                        )

    fig.add_trace(go.Bar(x=churn_rates.index, y=churn_rates), row=1, col=1)
    fig.add_trace(go.Bar(x=counts.index, y=counts), row=1, col=2)

    fig.update_layout(width=1000,
                      height=500,
                      showlegend=False,
                      title_text=f"【{column_desc}】에 따른 분석 그래프"
                      )

    fig.show()
    
# Churn-rate and count bars for 'marital_stat', with '결혼상태' passed as the display label.
print_cat_feature_graphs('marital_stat', '결혼상태')

In [None]:
# Seaborn churn-rate and count bars for 'marital_stat'.
cat_feature_graphs('marital_stat')

In [None]:
import plotly.graph_objects as go
name = "나이"

# 'age' values for retained (is_churned == 0) and churned (is_churned != 0) rows.
retained_age = bank_churner_df[bank_churner_df['is_churned'] == 0]['age']
churned_age = bank_churner_df[bank_churner_df['is_churned'] != 0]['age']

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=("전체 건수 분포", "유지/이탈별 사분위", "유지/이탈별 분포", "유지/이탈별 누적합 분포"),
    shared_xaxes=True,
    horizontal_spacing=0.1,
    vertical_spacing=0.1,
)

# (trace, row, col) in drawing order: overall histogram, box plots per group,
# then histograms per group. The (2, 2) cell receives no trace.
age_panels = [
    (go.Histogram(x=bank_churner_df['age']), 1, 1),
    (go.Box(x=retained_age, name='유지'), 1, 2),
    (go.Box(x=churned_age, name='이탈'), 1, 2),
    (go.Histogram(x=retained_age), 2, 1),
    (go.Histogram(x=churned_age), 2, 1),
]
for age_trace, age_row, age_col in age_panels:
    fig.add_trace(age_trace, row=age_row, col=age_col)

fig.update_layout(
    width=1200,
    height=800,
    showlegend=False,
    title_text=f"{name}에 따른 분석 그래프",
)

fig.show()

# x = bank_churner_df[bank_churner_df['is_churned']!=0]['age']
# hist_data = [x]
# group_labels = ['distplot'] # name of the dataset

# fig = ff.create_distplot(hist_data, group_labels, show_rug=False)
# fig.show()


In [None]:
# Combine two plotly express figures into one 1x2 subplot figure.
# Adapted from a published recipe; the only functional change to that recipe is
# hiding the legend entries of the second figure's traces.
import plotly.express as px
import plotly.subplots as sp

# Sample data set bundled with plotly, used for the line chart only.
my_df = px.data.medals_long()

# Build each panel as a stand-alone express figure first.
figure1 = px.box(bank_churner_df, x='age', color="is_churned")
figure2 = px.line(my_df, x="nation", y="count", color="medal")

# An express figure cannot be placed into a subplot cell directly, so take the
# individual traces out of each figure.
figure1_traces = list(figure1.data)

figure2_traces = []
for trace in figure2.data:
    # Hide this trace's legend entry in the combined figure.
    trace.showlegend = False
    figure2_traces.append(trace)

# Create the 1x2 subplot canvas.
this_figure = sp.make_subplots(rows=1, cols=2, subplot_titles=['Box', 'Line'])
this_figure.update_layout(height=500, width=1200, title_text="Medals count by country", title_font_size=25)

# Place each figure's traces into its subplot cell. add_trace is used because
# append_trace is deprecated in plotly.
for trace in figure1_traces:
    this_figure.add_trace(trace, row=1, col=1)

for trace in figure2_traces:
    this_figure.add_trace(trace, row=1, col=2)

this_figure.show()


In [None]:
import plotly.express as px

# Box plot of age with one box per is_churned value.
# No sample data set is loaded into `df` here: the plot only reads
# bank_churner_df, and rebinding `df` would overwrite the frame loaded from the
# churn CSV at the top of the notebook.
fig = px.box(bank_churner_df, x='age', color="is_churned")

fig.show()

In [None]:
import plotly.figure_factory as ff
import numpy as np

# Fix NumPy's global random seed before building the figure.
np.random.seed(1)

# Ages of the rows whose is_churned flag is non-zero.
x = bank_churner_df.loc[bank_churner_df['is_churned'] != 0, 'age']

# create_distplot takes a list of samples and a matching list of labels.
hist_data = [x]
group_labels = ['distplot']

# Distribution plot of that single sample, without the rug plot.
fig = ff.create_distplot(hist_data, group_labels, show_rug=False)
fig.show()

#fig.add_trace(go.Histogram(x=bank_churner_df[bank_churner_df['is_churned']!=0]['age'], histfunc='sum'), row=2, col=2)

In [None]:
# Print the built-in help text for make_subplots (interactive reference lookup).
help(make_subplots)

In [None]:
import plotly.express as px

# Box plot of age (y axis) for each value of the sex column (x axis).
fig = px.box(
    data_frame=bank_churner_df,
    x="sex",
    y="age",
)

fig.show()

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Minimal check that a graph_objects trace can be placed into a subplot grid.
fig = make_subplots(rows=1, cols=2)

fig.add_trace(go.Scatter(x=[1, 2, 3], y=[4, 5, 6]),
              row=1, col=1)

# Note: add_trace takes a trace, not a whole figure. px.box(...) returns a
# Figure, so to place it in a subplot, add the traces from its .data instead.

# Render the figure a single time.
fig.show()

In [None]:
# imports
import plotly.express as px
import plotly.graph_objects as go

# Sample data set bundled with plotly. It is bound to its own name so that the
# notebook's main `df` (loaded from the churn CSV) is not overwritten by this
# demo cell.
tips_df = px.data.tips()

# plotly setup
fig = go.Figure()

# One notched box trace per distinct value of the smoker column.
for smokes in tips_df['smoker'].unique():
    df_plot = tips_df[tips_df['smoker'] == smokes]

    fig.add_trace(go.Box(x=df_plot['time'], y=df_plot['total_bill'],
                         notched=True,
                         line=dict(color='black'),
                         fillcolor='yellow',
                         name='smoker=' + smokes))

# Draw the boxes of the different traces side by side per x category.
fig.update_layout(boxmode='group', xaxis_tickangle=0)
fig.show()

In [None]:
import plotly.express as px

# Box plot of age, drawn in one facet column per is_churned value.
fig = px.box(bank_churner_df, x="age", facet_col="is_churned")

fig.show()

In [None]:
# Print the column summary (names, non-null counts, dtypes) of bank_churner_df.
bank_churner_df.info()

In [None]:
from plotly.subplots import make_subplots
import plotly.express as px

def cont_feature_graphs(column, column_desc='나이'):
    """Show box plots per churn group and a histogram for a continuous column.

    The top panel holds one box plot of ``column`` per distinct ``is_churned``
    value; the bottom panel holds a histogram of ``column`` over all rows.

    Args:
        column: Name of the continuous column in ``bank_churner_df`` to plot.
        column_desc: Label used in the trace names and the figure title.
            Defaults to '나이' (age), the label that was previously hard-coded.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = make_subplots(rows=2, cols=1)

    # go.Box has no `facet_col` property (that keyword belongs to plotly
    # express), so the split by churn status is done with one trace per group.
    for churn_flag, group in bank_churner_df.groupby('is_churned'):
        box_trace = go.Box(
            x=group[column],
            name=f'{column_desc} 박스 플롯 (is_churned={churn_flag})',
            boxmean=True,
        )
        fig.add_trace(box_trace, row=1, col=1)

    # Histogram of the requested column over all rows.
    hist_trace = go.Histogram(x=bank_churner_df[column], name=f'{column_desc} 히스토그램')
    fig.add_trace(hist_trace, row=2, col=1)

    fig.update_layout(height=700, width=1200, title_text=f"{column_desc}별 분포")
    fig.show()


cont_feature_graphs('age')

In [None]:
# Plotting helper for continuous features
def cont_feature_graphs(column, xlabel):
    """Draw a box plot and a KDE plot of ``column`` split by ``is_churned``.

    Args:
        column: Name of the continuous column in ``bank_churner_df`` to plot.
        xlabel: Label written on the x axis of both panels.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: horizontal box plot, one box per is_churned value.
    sns.boxplot(data=bank_churner_df, x=column, y='is_churned', orient='h', ax=axs[0])
    # Right panel: kernel density estimate per is_churned value; with
    # common_norm=False each group's density is normalised on its own.
    sns.kdeplot(data=bank_churner_df, x=column, hue='is_churned', common_norm=False, ax=axs[1])

    # Label BOTH panels: blank y label, shared x label.
    for ax in axs:
        ax.set_ylabel('')
        ax.set_xlabel(xlabel)

    plt.show()

In [None]:
# Plot the 'age' column with the x-axis label '나이' ("age").
cont_feature_graphs('age','나이')