Load Data

In [1]:
import numpy as np
import pandas as pd

In [None]:
# Read 'bank.data.csv' into a DataFrame and preview its first rows.
churn_df = pd.read_csv('bank.data.csv')
churn_df.head()

Part 1: Data Exploration

Part 1.1: Understand the Raw Dataset

In [None]:
# check data info: per-column dtype and non-null count
churn_df.info() # No missing data

In [None]:
# check the number of distinct values in each column
# (low counts hint at categorical / discrete features)
churn_df.nunique()

In [5]:
# get target variable: the 'Exited' column is what the models below are trained to predict
y = churn_df['Exited']

Part 1.2: Understand the Features

In [None]:
# check missing values: number of null entries in each column
churn_df.isnull().sum()

In [None]:
# Summary statistics for the numerical features, used to judge which ones are
# discrete vs. continuous and how the continuous ones are distributed.
churn_df.loc[:, [
    'CreditScore', 'Age', 'Tenure', 'NumOfProducts', 'Balance', 'EstimatedSalary',
]].describe()

In [9]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Box plots of each numerical feature split by churn outcome (Exited),
# drawn on a 2x3 grid filled in row-major order.
boxplot_features = ['CreditScore', 'Age', 'Tenure', 'NumOfProducts', 'Balance', 'EstimatedSalary']
_, axss = plt.subplots(2, 3, figsize=[20, 10])
for ax, feature in zip(axss.flat, boxplot_features):
    sns.boxplot(x='Exited', y=feature, data=churn_df, ax=ax)


In [None]:
# Count plots of the categorical features, grouped by churn outcome (Exited).
# HasCrCard and IsActiveMember are cast to str so they are handled as categorical labels.
# NOTE: this changes churn_df in place; the feature preprocessing further down selects
# columns by dtype and one-hot encodes these two, so it relies on this conversion.
churn_df['HasCrCard'] = churn_df['HasCrCard'].astype(str)
churn_df['IsActiveMember'] = churn_df['IsActiveMember'].astype(str)
_, axss = plt.subplots(2, 2, figsize=[20, 10])
for ax, feature in zip(axss.flat, ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']):
    sns.countplot(x='Exited', hue=feature, data=churn_df, ax=ax)

Part 2: Feature Preprocessing

In [14]:
# Build the feature matrix: discard the row/customer identifiers, which are not
# useful as features, together with the target column itself.
to_drop = [
    'RowNumber',
    'CustomerId',
    'Surname',
    'Exited',
]
X = churn_df.drop(columns=to_drop)

In [None]:
# preview the feature matrix after dropping the unused columns
X.head()

In [None]:
# inspect the dtype of every feature column (used next to separate categorical from numerical)
X.dtypes

In [19]:
# Split the feature names by dtype family instead of matching exact dtype strings,
# so the selection keeps working when integers are not int64 or when text columns
# use pandas' dedicated string dtype rather than 'object'.
num_cols = X.select_dtypes(include='number').columns
# Everything that is not numeric is treated as categorical; original column order is kept.
cat_cols = X.columns.difference(num_cols, sort=False)

In [None]:
from sklearn import model_selection
# Hold out 25% of the rows for testing. Stratifying on y ensures that both
# customers who stayed and customers who exited appear in the training and
# the test split; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=1,
)

print(f'training data has {X_train.shape[0]} observation with {X_train.shape[1]} features')
print(f'test data has {X_test.shape[0]} observation with {X_test.shape[1]} features')

In [21]:
from sklearn.preprocessing import OneHotEncoder

def OneHotEncoding(df, enc, categories):
    """Replace the given categorical columns of ``df`` with their one-hot encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named in ``categories``.
    enc : fitted encoder
        Object exposing ``transform`` and ``get_feature_names_out``
        (e.g. a fitted ``OneHotEncoder``).
    categories : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the encoded columns.
        The index is reset to 0..n-1, so it no longer matches ``df.index``.
    """
    encoded = enc.transform(df[categories])
    # Convert a sparse matrix to ndarray; encoders configured for dense output
    # return something without .toarray(), so only densify when needed.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    # np.asarray discards any index carried by the encoder output so that the
    # positional concat below lines up with the reset index of df.
    transformed = pd.DataFrame(np.asarray(encoded), columns=enc.get_feature_names_out(categories))
    return pd.concat([df.reset_index(drop=True), transformed], axis=1).drop(categories, axis=1)

# Columns to one-hot encode. The encoder is fitted on the training split only
# and then applied, unchanged, to both the training and the test split.
categories = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
enc_ohe = OneHotEncoder().fit(X_train[categories])

X_train, X_test = [OneHotEncoding(part, enc_ohe, categories) for part in (X_train, X_test)]


In [None]:
# preview the training features after one-hot encoding
X_train.head()

In [23]:
from sklearn.preprocessing import StandardScaler

# Standardize the numerical columns. The scaler is fitted on the training split
# only, and those same fitted statistics are then used to transform both splits.
scaler = StandardScaler().fit(X_train[num_cols])

for part in (X_train, X_test):
    part[num_cols] = scaler.transform(part[num_cols])

In [None]:
# preview the training features after scaling the numerical columns
X_train.head()

Part 3: Model Training and Result Evaluation

Part 3.1: Model Training

Build the models

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Instantiate the three candidate classifiers, all with default hyperparameters.

# Logistic regression
classifier_logistic = LogisticRegression()

# K-nearest neighbours
classifier_KNN = KNeighborsClassifier()

# Random forest
classifier_RF = RandomForestClassifier()

In [None]:
# train the default logistic regression on the training split
classifier_logistic.fit(X_train, y_train)

In [None]:
# predicted class labels for the test split
classifier_logistic.predict(X_test)

In [None]:
# mean accuracy of the default logistic regression on the test split
classifier_logistic.score(X_test, y_test)

How do we search for the best hyperparameters?

In [None]:
from sklearn.model_selection import GridSearchCV

def print_grid_search_metrics(gs):
    """Print the best score and the winning hyperparameters of a fitted search.

    Parameters
    ----------
    gs : fitted search object
        Must expose ``best_score_`` and ``best_params_`` (e.g. a fitted
        ``GridSearchCV``).

    The parameters are printed one per line as ``name:value``, sorted by name.
    """
    print('Best score:', gs.best_score_, sep='')
    print('Best parameters set:')
    # Sort on the name only, so parameter values are never compared with each other.
    for name, value in sorted(gs.best_params_.items(), key=lambda pair: pair[0]):
        print(name, ':', value, sep='')

In [None]:
# Grid for logistic regression: the penalty type and the regularization
# parameter C. Every combination is evaluated with 5-fold cross-validation.
parameters = dict(
    penalty=('l2', 'l1'),
    C=(0.01, 0.05, 0.1, 0.2, 1),
)

# The liblinear solver is used because it supports both the l1 and the l2 penalty.
Grid_LR = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear'),
    param_grid=parameters,
    cv=5,
)
Grid_LR.fit(X_train, y_train)

In [None]:
# best cross-validation score and hyperparameters found for logistic regression
print_grid_search_metrics(Grid_LR)

In [None]:
# keep the estimator refitted with the best hyperparameter combination
best_LR_model = Grid_LR.best_estimator_

In [None]:
# predicted class labels for the test split from the tuned logistic regression
best_LR_model.predict(X_test)

In [None]:
# mean accuracy of the tuned logistic regression on the test split
best_LR_model.score(X_test, y_test)

In [None]:
# Tabulate the grid-search results and reshape them so that rows are penalties,
# columns are C values and cells hold the mean cross-validation score.
LR_models = pd.DataFrame(Grid_LR.cv_results_)
res = (LR_models.pivot(index='param_penalty', columns='param_C', values='mean_test_score'))
# Draw the score grid as a heatmap; assigning to `_` suppresses the Axes repr.
_ = sns.heatmap(res, cmap = 'viridis')

In [None]:
# Grid for k-nearest neighbours: candidate numbers of neighbours,
# each evaluated with 5-fold cross-validation.
parameters = dict(n_neighbors=[1, 3, 5, 7, 9])
Grid_KNN = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parameters,
    cv=5,
)
Grid_KNN.fit(X_train, y_train)

In [None]:
# best cross-validation score and hyperparameters found for KNN
print_grid_search_metrics(Grid_KNN)

In [None]:
# keep the estimator refitted with the best number of neighbours
best_KNN_model = Grid_KNN.best_estimator_

In [None]:
# predicted class labels for the test split from the tuned KNN model
best_KNN_model.predict(X_test)

In [None]:
# mean accuracy of the tuned KNN model on the test split
best_KNN_model.score(X_test, y_test)

In [None]:
# Grid for the random forest: number of trees and maximum tree depth,
# each combination evaluated with 5-fold cross-validation.
parameters = {
    'n_estimators': [60, 80, 100],
    'max_depth': [1, 5, 10]
}
# RandomForestClassifier takes no `solver` argument (that option belongs to
# LogisticRegression), so none is passed. A fixed random_state makes the
# search reproducible, matching the seed used for the train/test split.
Grid_RF = GridSearchCV(RandomForestClassifier(random_state=1), parameters, cv = 5)