In [1]:
# libraries
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read data file
df = pd.read_csv('./data/churn.csv')

In [3]:
df.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [4]:
df['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [5]:
df = df[['CreditScore', 'Geography', 'Gender', 'Tenure', 'Balance', 
         'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited']]
print(df.head())

   CreditScore Geography  Gender  Tenure    Balance  NumOfProducts  HasCrCard  \
0          619    France  Female       2       0.00              1          1   
1          608     Spain  Female       1   83807.86              1          0   
2          502    France  Female       8  159660.80              3          1   
3          699    France  Female       1       0.00              2          0   
4          850     Spain  Female       2  125510.82              1          1   

   IsActiveMember  EstimatedSalary  Exited  
0               1        101348.88       1  
1               1        112542.58       0  
2               0        113931.57       1  
3               0         93826.63       0  
4               1         79084.10       0  


In [6]:
geo_df = pd.get_dummies(df['Geography'])
gender_df = pd.get_dummies(df['Gender'])

print(geo_df.head())

   France  Germany  Spain
0       1        0      0
1       0        0      1
2       1        0      0
3       1        0      0
4       0        0      1


In [7]:
df = pd.concat([df, geo_df, gender_df], axis=1)

In [8]:
df = df.drop(['Geography', 'Gender'], axis=1)
print(df.head())

   CreditScore  Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0          619       2       0.00              1          1               1   
1          608       1   83807.86              1          0               1   
2          502       8  159660.80              3          1               0   
3          699       1       0.00              2          0               0   
4          850       2  125510.82              1          1               1   

   EstimatedSalary  Exited  France  Germany  Spain  Female  Male  
0        101348.88       1       1        0      0       1     0  
1        112542.58       0       0        0      1       1     0  
2        113931.57       1       1        0      0       1     0  
3         93826.63       0       1        0      0       1     0  
4         79084.10       0       0        0      1       1     0  


In [9]:
Y = df['Exited']

X = df.copy()
X = X.drop(['Exited'], axis=1)

In [22]:
df.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [28]:
df.corr()

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,Female,Male
CreditScore,1.0,0.000842,0.006268,0.012238,-0.005458,0.025651,-0.001384,-0.027094,-0.008928,0.005538,0.00478,0.002857,-0.002857
Tenure,0.000842,1.0,-0.012254,0.013444,0.022583,-0.028362,0.007784,-0.014001,-0.002848,-0.000567,0.003868,-0.014733,0.014733
Balance,0.006268,-0.012254,1.0,-0.30418,-0.014858,-0.010084,0.012797,0.118533,-0.231329,0.40111,-0.134892,-0.012087,0.012087
NumOfProducts,0.012238,0.013444,-0.30418,1.0,0.003183,0.009612,0.014204,-0.04782,0.00123,-0.010419,0.009039,0.021859,-0.021859
HasCrCard,-0.005458,0.022583,-0.014858,0.003183,1.0,-0.011866,-0.009933,-0.007138,0.002467,0.010577,-0.01348,-0.005766,0.005766
IsActiveMember,0.025651,-0.028362,-0.010084,0.009612,-0.011866,1.0,-0.011421,-0.156128,0.003317,-0.020486,0.016732,-0.022544,0.022544
EstimatedSalary,-0.001384,0.007784,0.012797,0.014204,-0.009933,-0.011421,1.0,0.012097,-0.003332,0.010297,-0.006482,0.008112,-0.008112
Exited,-0.027094,-0.014001,0.118533,-0.04782,-0.007138,-0.156128,0.012097,1.0,-0.104955,0.173488,-0.052667,0.106512,-0.106512
France,-0.008928,-0.002848,-0.231329,0.00123,0.002467,0.003317,-0.003332,-0.104955,1.0,-0.580359,-0.575418,-0.006772,0.006772
Germany,0.005538,-0.000567,0.40111,-0.010419,0.010577,-0.020486,0.010297,0.173488,-0.580359,1.0,-0.332084,0.024628,-0.024628


In [10]:
# # Separating traing & test sets
# X = df.iloc[:,:-1]
# Y = (df['Y'] > 140).factorize()[0]

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=0.33,random_state=1) 

In [11]:
ytrain

2961    0
2919    0
2136    0
4283    0
9820    0
       ..
2895    0
7813    1
905     1
5192    0
235     1
Name: Exited, Length: 6700, dtype: int64

In [None]:
####################################################
# Linear Regression
####################################################

In [12]:
# simple linear regression, Linear Regression
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression().fit(xtrain, ytrain)

In [14]:
from sklearn.metrics import mean_squared_error

pred = lr_model.predict(xtest)
mse = mean_squared_error(ytest, pred)
np.round(mse,4)

0.2094

In [None]:
####################################################
# KNN Classifier
####################################################

In [15]:
# simple KNN classification
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier().fit(xtrain, ytrain)

In [19]:
from sklearn.metrics import classification_report

pred2 = knn_model.predict(xtest)
print('knn_model classification_report')
print(classification_report(ytest, pred2))

knn_model classification_report
              precision    recall  f1-score   support

           0       0.79      0.94      0.86      2609
           1       0.24      0.08      0.11       691

    accuracy                           0.76      3300
   macro avg       0.51      0.51      0.49      3300
weighted avg       0.68      0.76      0.70      3300



In [None]:
####################################################
# Classification Tree
####################################################

In [17]:
# classification tree , SVM, RF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

dt_model = DecisionTreeClassifier(random_state=1).fit(xtrain, ytrain)
rf_model = RandomForestClassifier(random_state=1).fit(xtrain, ytrain)

In [25]:
from sklearn.metrics import classification_report

pred3 = dt_model.predict(xtest)
print('dt_model classification_report')
print(classification_report(ytest, pred3))
print('')

pred4 = rf_model.predict(xtest)
print('rf_model classification_report')
print(classification_report(ytest, pred4))

# confusion matrix

dt_model classification_report
              precision    recall  f1-score   support

           0       0.84      0.85      0.85      2609
           1       0.41      0.40      0.40       691

    accuracy                           0.75      3300
   macro avg       0.63      0.62      0.62      3300
weighted avg       0.75      0.75      0.75      3300


rf_model classification_report
              precision    recall  f1-score   support

           0       0.83      0.95      0.89      2609
           1       0.63      0.29      0.39       691

    accuracy                           0.81      3300
   macro avg       0.73      0.62      0.64      3300
weighted avg       0.79      0.81      0.79      3300



In [30]:
# feature_importances method는 트리 기반 모델에서만 사용 가능!
rf_model.feature_importances_ # feature의 중요도

array([0.21098637, 0.11191163, 0.20332561, 0.14489901, 0.02027272,
       0.03070934, 0.21701668, 0.01039368, 0.02244213, 0.00944173,
       0.0091513 , 0.0094498 ])

In [31]:
xtrain.columns

Index(['CreditScore', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'France', 'Germany', 'Spain',
       'Female', 'Male'],
      dtype='object')

In [None]:
# Scaler(Standard, MinMax...)
# train with important features only
# Correlation(VIF, correlation matrix) - 다중공선성