In [9]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# Suppress all warnings for the rest of the kernel session ("ignore" with no category
# or module filter matches every Warning subclass).
# NOTE(review): this also hides deprecation notices raised by the libraries used below;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [10]:
# Load the dataset

# The CSV location can be overridden through the BANKCHURNERS_CSV environment variable so
# the notebook also runs on machines other than the author's. When the variable is not
# set, the original absolute path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'BANKCHURNERS_CSV',
    'C:/Users/vumac/Desktop/Springboard_Capstone2/Dataset/BankChurners_cleaned.csv',
)

# The first CSV column is used as the row index.
df = pd.read_csv(DATA_PATH, index_col=0)

In [11]:
# Preprocessing from previous notebook

# Fill missing values with 'Other'. The previous EDA notebook singled out Education_Level,
# Marital_Status and Income_Category (presumably as the columns that contain missing
# values -- the original note was cut off here; confirm against that notebook).
# Note that the fill is applied to every column of df, not only to those three.
df = df.fillna('Other')

# A bare expression in the middle of a cell is evaluated and then discarded, so the
# "no missing values left" check is written as an assertion that fails loudly instead.
assert not df.isnull().values.any(), "missing values remain after fillna"

# Convert Attrition_Flag to 0 for Existing Customer and 1 for Attrited Customer.
# Series.map turns every value that is not a key of the mapping into NaN, so mapping an
# already-encoded (numeric) column -- e.g. when this cell is re-run without reloading
# df -- would wipe out the whole target. Only map while the column still holds labels.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
if not pd.api.types.is_numeric_dtype(df['Attrition_Flag']):
    df['Attrition_Flag'] = df['Attrition_Flag'].map(attrition_codes)

# One Hot Encoding for categorical features
df_feature = pd.get_dummies(
    df,
    columns=['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category'],
)

# Separate the target from the features, then hold out 30% of the rows for testing.
# The dataset is imbalanced, so the split is stratified on the target: the train and test
# sets keep the same proportion of churn and no-churn customers.
y = df_feature['Attrition_Flag']
X = df_feature.drop(columns='Attrition_Flag')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=48,  # fixed seed so the split is reproducible
    stratify=y,
)

# Numeric columns that are passed through StandardScaler
numeric_col = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]

# Fit the scaler on the training rows only and scale them in the same step; the fitted
# scaler is then reused for the test rows, so the test set never influences the fit.
scaler = StandardScaler()
X_train_features = scaler.fit_transform(X_train[numeric_col].values)
X_test_features = scaler.transform(X_test[numeric_col].values)

# Write the scaled values into copies so the unscaled X_train / X_test stay available.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[numeric_col] = X_train_features
X_test_scaled[numeric_col] = X_test_features

In [14]:
# Display the target Series as a quick check of the 0/1 encoding; pandas abbreviates the
# middle rows of a long Series and reports its name, length and dtype.
y

0        0
1        0
2        0
3        0
4        0
        ..
10122    0
10123    1
10124    1
10125    1
10126    1
Name: Attrition_Flag, Length: 10127, dtype: int64