## Import Libraries and initialize other values

In [1]:
# import libraries
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, f1_score, confusion_matrix, recall_score, 
                             precision_score, roc_curve, roc_auc_score)

In [2]:
# set pandas parameters
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

## Import and clean data

In [3]:
# set paths
directory_path = 'bank-additional'
file_name = 'bank-additional-full.csv'

In [4]:
# import data
df_raw = pd.read_csv(os.path.join(directory_path, file_name), sep = ';',
                     dtype = {'y' : 'category'})

# view shape
df_raw.shape

(41188, 21)

In [5]:
# encode target variable as 0 and 1
df_raw['y'] = df_raw['y'].cat.codes

In [6]:
# view sample rows
df_raw.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [7]:
# view column details
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null int8
dtypes: float64(5), int64(5), int8(1), object(10)
mem

In [8]:
# view basic columns details
df_raw.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911,0.112654
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528,0.316173
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6,0.0
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1,0.0
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0,0.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1,0.0
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1,1.0


#### Analyze various columns

In [9]:
df_raw.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [10]:
df_raw['job'].value_counts() # missing values - medium

admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: job, dtype: int64

In [11]:
df_raw['marital'].value_counts() # missing values - low

married     24928
single      11568
divorced     4612
unknown        80
Name: marital, dtype: int64

In [12]:
df_raw['education'].value_counts() # missing values - low

university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
unknown                 1731
illiterate                18
Name: education, dtype: int64

In [13]:
df_raw['default'].value_counts() # missing values - low

no         32588
unknown     8597
yes            3
Name: default, dtype: int64

In [14]:
df_raw['housing'].value_counts() # missing values - low

yes        21576
no         18622
unknown      990
Name: housing, dtype: int64

In [15]:
df_raw['loan'].value_counts() # missing values - low

no         33950
yes         6248
unknown      990
Name: loan, dtype: int64

In [16]:
df_raw['contact'].value_counts() # missing values - nil

cellular     26144
telephone    15044
Name: contact, dtype: int64

In [17]:
df_raw['month'].value_counts() # missing values - nil

may    13769
jul     7174
aug     6178
jun     5318
nov     4101
apr     2632
oct      718
sep      570
mar      546
dec      182
Name: month, dtype: int64

In [18]:
df_raw['day_of_week'].value_counts() # missing values - nil

thu    8623
mon    8514
wed    8134
tue    8090
fri    7827
Name: day_of_week, dtype: int64

In [19]:
df_raw['poutcome'].value_counts() # very high value of non-existent

nonexistent    35563
failure         4252
success         1373
Name: poutcome, dtype: int64

In [20]:
df_raw['y'].value_counts() # missing valies - nil

0    36548
1     4640
Name: y, dtype: int64

In [21]:
# replace missing values for each column
df_raw.replace(to_replace = 'unknown', 
               value = {'job' : 'unemployed',
                        'marital' : 'single',
                        'education' : 'illiterate',
                        'default' : 'no',
                        'housing' : 'no',
                        'loan' : 'no'},
              inplace = True)

## Start modelling process

#### Split data into training and test set

In [22]:
# create stratified shuffle split object
sss = StratifiedShuffleSplit(n_splits = 2, test_size = 0.2, random_state = 42)

In [23]:
# reset index
df_raw.reset_index(inplace = True, drop = True)

# split between train and test dfs
for train_indices, test_indices in sss.split(df_raw, df_raw['y']):
    df_train = df_raw.loc[train_indices]
    df_test = df_raw.loc[test_indices]

print('Train data shape:', df_train.shape)
print('Test data shape:', df_test.shape)

Train data shape: (32950, 21)
Test data shape: (8238, 21)


In [24]:
print('Incidence rate in total data :', df_raw['y'].sum() / df_raw.shape[0])
print('Incidence rate in training data :', df_train['y'].sum() / df_train.shape[0])
print('Incidence rate in test data :', df_test['y'].sum() / df_test.shape[0])

Incidence rate in total data : 0.11265417111780131
Incidence rate in training data : 0.11265553869499241
Incidence rate in test data : 0.11264870114105366


#### Create pipeline for data transformation

In [25]:
df_raw.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,no,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [26]:
# create transformer for yes/no columns
class encode_categorical_cols(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X = pd.get_dummies(X, prefix = X.columns, prefix_sep = '_', drop_first = True, sparse = False)
        self.categories_ = X.columns
        return X

In [27]:
# define numeric cols
num_cols = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 
            'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

In [28]:
# define categorical cols
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 
            'loan', 'contact', 'month', 'day_of_week', 'poutcome']

In [29]:
# column transformer for standard scaler and one hot encoding
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', encode_categorical_cols(), cat_cols)
], remainder = 'drop')

In [30]:
## create X and y sets for training and testing data
# training data
X_train = full_pipeline.fit_transform(df_train)
y_train = df_train['y']

# testing data
X_test = full_pipeline.transform(df_test)
y_test = df_test['y']

In [31]:
# create list of variables
cat_encoder = full_pipeline.named_transformers_['cat']
cat_attribs = cat_encoder.categories_.tolist()

attributes = num_cols + cat_attribs

### Try different models

#### Logistic Regression

In [32]:
# initialize model object
log_mod = LogisticRegression(n_jobs = -1, 
                             solver = 'lbfgs', 
                             max_iter = 500)

In [33]:
# fit training data
log_mod.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
# view accuracy on training and test data
print('Training data accuracy:', accuracy_score(y_train, log_mod.predict(X_train)))
print('Test data accuracy:', accuracy_score(y_test, log_mod.predict(X_test)))

# view precision and recall on test data
print('Test Data Precision:', precision_score(y_test, log_mod.predict(X_test)))
print('Test Data Recall:', recall_score(y_test, log_mod.predict(X_test)))

Training data accuracy: 0.9098330804248862
Test data accuracy: 0.9172129157562515
Test Data Precision: 0.7204301075268817
Test Data Recall: 0.4331896551724138


In [35]:
# view confusion matrix on test data
confusion_matrix(y_test, log_mod.predict(X_test)) # actuals in rows

array([[7154,  156],
       [ 526,  402]])

#### Random Forest

In [36]:
# initialize model object
rf_mod = RandomForestClassifier()

# create grid search object
param_grid_rf = {'n_estimators' : [150],
                 'criterion' : ['gini', 'entropy'],
                 'max_depth' : [25, 50, 75],
                 'min_samples_leaf' : [15, 20, 50]}

# create grid search object
grid_rf = GridSearchCV(rf_mod, param_grid = param_grid_rf, n_jobs = -1, 
                       scoring = 'accuracy', cv = 3,  verbose = 2)

In [37]:
# fit training data
grid_rf.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   32.3s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [38]:
# view best params
grid_rf.best_params_

{'criterion': 'gini',
 'max_depth': 75,
 'min_samples_leaf': 15,
 'n_estimators': 150}

In [39]:
# save the best model
rf_best = grid_rf.best_estimator_

In [40]:
# view accuracy on training and test data
print('Training data accuracy:', accuracy_score(y_train, rf_best.predict(X_train)))
print('Test data accuracy:', accuracy_score(y_test, rf_best.predict(X_test)))

# view precision and recall on test data
print('Test Data Precision:', precision_score(y_test, rf_best.predict(X_test)))
print('Test Data Recall:', recall_score(y_test, rf_best.predict(X_test)))

Training data accuracy: 0.9231259484066768
Test data accuracy: 0.911750424860403
Test Data Precision: 0.7342657342657343
Test Data Recall: 0.3394396551724138


In [41]:
# view confusion matrix on test data
confusion_matrix(y_test, rf_best.predict(X_test)) # actuals in rows

array([[7196,  114],
       [ 613,  315]])

In [42]:
# view feature importances
sorted(zip(rf_best.feature_importances_, attributes), reverse = True)

[(0.3927351912298091, 'duration'),
 (0.12399649230876802, 'nr.employed'),
 (0.12224694723822498, 'euribor3m'),
 (0.050970217266991635, 'poutcome_success'),
 (0.04421795840487245, 'pdays'),
 (0.039731082895721165, 'cons.conf.idx'),
 (0.036584605345555056, 'emp.var.rate'),
 (0.036500282977070746, 'cons.price.idx'),
 (0.028970513552831256, 'age'),
 (0.01339954365463609, 'previous'),
 (0.011651406187634158, 'contact_telephone'),
 (0.009013025000359364, 'campaign'),
 (0.00876800343120149, 'month_may'),
 (0.007973707488620502, 'poutcome_nonexistent'),
 (0.00762781684287284, 'month_mar'),
 (0.007158044173870031, 'month_oct'),
 (0.004677062042613968, 'education_university.degree'),
 (0.004575132924625689, 'day_of_week_mon'),
 (0.00409968796868193, 'housing_yes'),
 (0.0039223573670901565, 'day_of_week_thu'),
 (0.003553174309362668, 'marital_single'),
 (0.003286333131268774, 'marital_married'),
 (0.0030669268462936755, 'day_of_week_tue'),
 (0.0026806316640516532, 'day_of_week_wed'),
 (0.00255892

#### GBM

In [43]:
# initialize model object
gbm_mod = GradientBoostingClassifier()

# create grid search object
param_grid_gbm = {'n_estimators' : [100, 500],
                 'learning_rate' : [0.01, 0.1],
                 'max_depth' : [1, 3, 5, 10],
                 'subsample' : [0.5, 1]}

# create grid search object
grid_gbm = GridSearchCV(gbm_mod, param_grid = param_grid_gbm, n_jobs = -1, 
                       scoring = 'roc_auc', cv = 3,  verbose = 2)

In [44]:
# fit training data
grid_gbm.fit(X_train, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  96 out of  96 | elapsed: 16.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
                                                  presort=

In [45]:
# view best params
grid_gbm.best_params_

{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500, 'subsample': 0.5}

In [46]:
# save the best model
gbm_best = grid_gbm.best_estimator_

In [47]:
# view accuracy on training and test data
print('Training data accuracy:', accuracy_score(y_train, gbm_best.predict(X_train)))
print('Test data accuracy:', accuracy_score(y_test, gbm_best.predict(X_test)))

# view precision and recall on test data
print('Test Data Precision:', precision_score(y_test, gbm_best.predict(X_test)))
print('Test Data Recall:', recall_score(y_test, gbm_best.predict(X_test)))

Training data accuracy: 0.9295902883156297
Test data accuracy: 0.9195193008011653
Test Data Precision: 0.6827586206896552
Test Data Recall: 0.5334051724137931


In [48]:
# view confusion matrix on test data
confusion_matrix(y_test, gbm_best.predict(X_test)) # actuals in rows

array([[7080,  230],
       [ 433,  495]])

In [49]:
# view feature importances
# sorted(zip(gbm_best.feature_importances_, attributes), reverse = True)

#### XG Boost

In [50]:
# define imbalance ratio
imbalance_ratio = (y_train.count() - y_train.sum()) / y_train.sum()

# initialize model object
xgb_mod = xgboost.XGBClassifier(objective = 'binary:logistic',
                                scale_pos_weight = imbalance_ratio
                               )

# create grid search object
param_grid_xgb = {'n_estimators' : [200],
                  'learning_rate' : [0.01, 0.1],
                  'max_depth' : [5, 10, 25],
                  'subsample' : [0.25, 0.5, 1],
                  'colsample_bytree' : [1],
                  'colsample_bylevel' : [0.5, 1],}

# # create grid search object
grid_xgb = GridSearchCV(xgb_mod, param_grid = param_grid_xgb, n_jobs = -1, 
                        scoring = 'recall', cv = 5,  verbose = 2)

In [51]:
# fit training data
grid_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   54.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 11.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constrai...
                                     scale_pos_weight=7.876616379310345,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bylevel': [0.5, 1], 'cols

In [52]:
# view best params
grid_xgb.best_params_

{'colsample_bylevel': 0.5,
 'colsample_bytree': 1,
 'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 200,
 'subsample': 1}

In [53]:
# save the best model
xgb_best = grid_xgb.best_estimator_

In [54]:
# view accuracy on training and test data
print('Training data accuracy:', accuracy_score(y_train, xgb_best.predict(X_train)))
print('Test data accuracy:', accuracy_score(y_test, xgb_best.predict(X_test)))

# view precision and recall on test data
print('Test Data Precision:', precision_score(y_test, xgb_best.predict(X_test)))
print('Test Data Recall:', recall_score(y_test, xgb_best.predict(X_test)))

Training data accuracy: 0.8433383915022762
Test data accuracy: 0.8413449866472444
Test Data Precision: 0.4102321174798674
Test Data Recall: 0.9331896551724138


In [55]:
# view confusion matrix on test data
confusion_matrix(y_test, xgb_best.predict(X_test)) # actuals in rows

array([[6065, 1245],
       [  62,  866]])