In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this line is suppressed.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

In [94]:
# Read the cleaned BankChurners CSV; its first column becomes the row index.
# NOTE(review): this is an absolute, machine-specific path - adjust it before
# running the notebook on another machine.
csv_path = 'C:/Users/vumac/Desktop/Springboard_Capstone2/Dataset/BankChurners_cleaned.csv'
df = pd.read_csv(csv_path, index_col=0)

# (rows, columns) of the loaded frame
df.shape

(10127, 20)

In [95]:
# Preview the first five rows to sanity-check column names and value formats
df.head(n=5)

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,45,M,3,High School,Married,60K - 80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,Existing Customer,49,F,5,Graduate,Single,Less than 40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,Existing Customer,51,M,3,Graduate,Married,80K - 120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,Existing Customer,40,F,4,High School,,Less than 40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,Existing Customer,40,M,3,Uneducated,Married,60K - 80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [96]:
# Count missing values per column, then keep only the columns that have any
null = df.isna().sum()
null.loc[null.gt(0)]

Education_Level    1519
Marital_Status      749
Income_Category    1112
dtype: int64

In [97]:
# Fill missing values with 'Other'.
# Education_Level, Marital_Status and Income_Category are the only columns that
# contain missing values, and all three hold text categories. The fill is limited
# to these columns so that a NaN in a numeric column can never be silently
# replaced by the string 'Other' (which would turn that column into object dtype
# and break the scaling step further down).
missing_cat_cols = ['Education_Level', 'Marital_Status', 'Income_Category']
df[missing_cat_cols] = df[missing_cat_cols].fillna('Other')

# Should display False; True means some other column still holds missing values
df.isnull().values.any()

False

In [98]:
# The dependent variable is imbalanced: show each class as a share of all rows
df['Attrition_Flag'].value_counts(normalize=True)

Existing Customer    0.83934
Attrited Customer    0.16066
Name: Attrition_Flag, dtype: float64

In [99]:
# Convert Attrition_Flag to 0 for Existing Customer and 1 for Attrited Customer.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: mapping an
# already-encoded column through a dict with only the text labels would turn
# every value into NaN.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1, 0: 0, 1: 1}
df['Attrition_Flag'] = df['Attrition_Flag'].map(attrition_codes)

# Series.map yields NaN for any value missing from the dict; fail loudly rather
# than carrying an incomplete target into the train/test split.
assert df['Attrition_Flag'].notna().all(), "Unexpected label in Attrition_Flag"

In [100]:
# One-hot encode the categorical features; columns not listed here are carried
# over to df_feature as they are.
categorical_cols = [
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]
df_feature = pd.get_dummies(data=df, columns=categorical_cols)

In [101]:
# Split into training and testing sets (70% / 30%).
# The target is imbalanced, so the split is stratified on y: train and test keep
# the same proportion of churned and retained customers.
target_col = 'Attrition_Flag'
y = df_feature[target_col]
X = df_feature.drop(columns=[target_col])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=48,  # fixed seed so the split is reproducible
    stratify=y,
)

In [102]:
# Numeric feature columns that will be passed through StandardScaler
numeric_col = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]

In [103]:
# Standardise the numeric columns of the training and testing sets

# The scaler is fitted on the training rows only, so no statistics from the
# test set enter the fit; the same fitted scaler then transforms the test rows.
scaler = StandardScaler()
X_train_features = scaler.fit_transform(X_train[numeric_col].values)
X_test_features = scaler.transform(X_test[numeric_col].values)

# Work on copies so the unscaled X_train / X_test stay available
X_train_scaled = X_train.copy()
X_train_scaled[numeric_col] = X_train_features

X_test_scaled = X_test.copy()
X_test_scaled[numeric_col] = X_test_features