## Predicting the churn rate of bank customers
## with Random Forest Classifier

In [11]:
import seaborn as sns
import numpy as np
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score, accuracy_score,precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
# Read the customer table from a CSV file given by a relative path, then
# display the last five rows as a quick check of what was loaded.
df = pd.read_csv(filepath_or_buffer='bank_customers.csv')
df.tail(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


In [7]:
# Number of missing values in each column (isna is the pandas alias of isnull).
df.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

## Feature Engineering
Wrangling and encoding the features for the following classification models.

In [9]:
# Feature matrix: every column except the identifiers (CustomerId, RowNumber,
# Surname), the target (Exited) and the two string-valued columns
# (Geography, Gender). The model trained below therefore never sees
# Geography or Gender.
X = df.drop(
    columns=['CustomerId', 'RowNumber', 'Surname', 'Exited', 'Geography', 'Gender']
)
# Target: the Exited column.
y = df['Exited']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display the shapes of the four pieces as a sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8000, 8), (2000, 8), (8000,), (2000,))

In [None]:
# Disabled draft: the whole cell is a bare string literal, so none of the code
# inside it is executed. The draft bins four numeric columns of X_train with
# pd.cut (two bins each, three for Age) and writes the labelled result back
# into X_train.
# NOTE(review): only X_train is binned in this draft; X_test would presumably
# need the same bin edges applied — confirm before reviving it.
"""
# encode the columns Geography (3) and Gender (2)
# bin the columns CreditScore, Age, Balance, EstimatedSalary, then scale them 

X_train['CreditScore'] = pd.cut(X_train['CreditScore'], bins=2, labels=['low_score','high_score'])
X_train['Balance'] = pd.cut(X_train['Balance'], bins=2, labels=['low_balance','high_balance'])
X_train['EstimatedSalary'] = pd.cut(X_train['EstimatedSalary'], bins=2, labels=['low_salary','high_salary'])
X_train['Age'] = pd.cut(X_train['Age'], bins=3, labels=['young','adult','senior'])
"""

In [None]:
# Pipeline for continuous columns: fill missing values with the column median,
# then split each column into two equal-width bins.
#
# KBinsDiscretizer(encode='onehot-dense') already returns one dense 0/1
# indicator column per bin, so no OneHotEncoder step follows it: encoding the
# indicators a second time would only duplicate every bin column.

impute_bin_encode = make_pipeline(
    SimpleImputer(strategy='median'),
    KBinsDiscretizer(n_bins=2, encode='onehot-dense', strategy='uniform')
)

In [None]:
# Pipeline for numeric columns that are kept continuous:
#   1. replace missing values with the column median,
#   2. rescale each column to the [0, 1] range (MinMaxScaler's default range).

impute_then_scale = make_pipeline(SimpleImputer(strategy='median'), MinMaxScaler(feature_range=(0, 1)))

In [None]:
# Feature-engineering transformer. Every input column is routed to exactly one
# branch:
#   ohe              - one-hot encode the string categoricals
#   impute_bin_scale - impute_bin_encode pipeline on the continuous columns
#   impute_scale     - impute_then_scale pipeline on Tenure
#   keep             - pass the small integer / flag columns through unchanged
#                      (ColumnTransformer drops unlisted columns by default)
#
# Geography and Gender are string columns, so they belong only in the one-hot
# branch and must not be sent through the numeric binning pipeline. They are
# encoded only when the training frame still contains them; an empty column
# list makes ColumnTransformer skip that branch.
categorical_cols = [col for col in ['Gender', 'Geography'] if col in X_train.columns]

# sparse_threshold=0 forces a dense result regardless of what the encoder
# returns. This avoids OneHotEncoder's `sparse=` keyword, which was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4.
fe = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('impute_bin_scale', impute_bin_encode, ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']),
    ('impute_scale', impute_then_scale, ['Tenure']),
    ('keep', 'passthrough', ['NumOfProducts', 'HasCrCard', 'IsActiveMember'])
], sparse_threshold=0)

In [None]:
# Disabled draft: bare string literal, never executed. It sketches an earlier
# ColumnTransformer `ct` that one-hot encodes six columns, min-max scales
# Tenure, and passes NumOfProducts, HasCrCard and IsActiveMember through
# unchanged.
# NOTE(review): four of the one-hot encoded columns (CreditScore, Age, Balance,
# EstimatedSalary) are numeric in the raw data; this draft presumably relies on
# them having been binned into labels first — confirm before reviving it.
"""
# apply the transformations to the columns

ct = ColumnTransformer([
    ('one-hot-encoding', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['CreditScore','Geography','Gender','Age','Balance','EstimatedSalary']),
    ('scaler', MinMaxScaler(), ['Tenure']),
    ('keep', 'passthrough', ['NumOfProducts','HasCrCard','IsActiveMember'])
    ])
"""

In [None]:
# Fit the transformer on the training features and keep the transformed
# training matrix. fit() returns the transformer itself, so the calls chain.
X_train_trans = fe.fit(X_train).transform(X_train)

In [None]:
# Disabled draft: bare string literal, never executed. It fits the draft `ct`
# on X_train, wraps the transformed array in a DataFrame with hand-written
# column labels, and plots a heatmap of the absolute pairwise correlations.
# NOTE(review): the hand-written label order does not appear to follow the
# transformer order in the `ct` draft (one-hot block, then Tenure, then the
# passthrough columns) — 'tenure' sits in the middle of the list. Verify the
# labels against the real column order before reusing this.
"""
ct.fit(X_train)
X_train_trans = ct.transform(X_train)
labels = ['low_score','high_score','geo1','geo2','geo3','male','female','young','adult','senior','tenure','low_balance','high_balance','num_products','has_card','active_member','low_salary','high_salary']

X_train_trans = pd.DataFrame(X_train_trans, columns=labels)
X_train_trans.head()
sns.heatmap(abs(X_train_trans.corr()))
"""

In [None]:
# Disabled draft (everything below is commented out): transform the test
# features with `ct`, label the columns, and plot their correlations.
# NOTE(review): the first line refits `ct` on X_test. A transformer fitted on
# the training data should only be used to transform the test data; refitting
# here would learn categories and scaling ranges from the test set.
#ct.fit(X_test)
#X_test_trans = ct.transform(X_test)

#labels = ['low_score','high_score','geo1','geo2','geo3','male','female','young','adult','senior','tenure','low_balance','high_balance','num_products','has_card','active_member','low_salary','high_salary']

#X_test_trans = pd.DataFrame(X_test_trans, columns=labels)
#X_test_trans.head()
#sns.heatmap(abs(X_test_trans.corr()))

## Random Forest

In [19]:
# try different estimators (default=100) and criterion ('gini' vs 'entropy')
#
# random_state fixes the bootstrap samples and per-split feature sampling, so
# the fitted forest and the metrics computed from it are the same on every
# run. The seed matches the one used for the train/test split.
rf = RandomForestClassifier(n_estimators=200, criterion='gini', random_state=42)
rf.fit(X_train, y_train)

# Hard class predictions for the held-out rows, used by the metrics cell.
y_pred = rf.predict(X_test)

In [20]:
# Collect (label, value) pairs in reporting order: accuracy on the training
# and test sets, then precision / recall / F1 of the test predictions.
rf_metrics = [
    ("RF train score:", rf.score(X_train, y_train)),
    ("RF test score:", rf.score(X_test, y_test)),
    ("RF precision score:", precision_score(y_test, y_pred)),
    ("RF recall score:", recall_score(y_test, y_pred)),
    ("RF f1 score:", f1_score(y_test, y_pred)),
]

# One metric per line, rounded to three decimals. Joining on " \n" keeps the
# space that print's default separator placed before each line break.
print(" \n".join(label + " " + str(round(value, 3)) for label, value in rf_metrics))

RF train score: 1.0 
RF test score: 0.861 
RF precision score: 0.751 
RF recall score: 0.438 
RF f1 score: 0.553
