In [2]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


# Statistical Modelling

This section of the project will look to set up the statistical models in order to make our predictions for the effectiveness of the bank's marketing campaign based on the dataframe that we have created.


We will begin by creating a Train, Validation and Test split of our data.

We will then look to implement a RFE to see if we should look to drop any of the features we have in our dataset.

Once this is complete, we will set up a baseline model using the Logistic Regression Learner.

We will then look to set up a Decision Tree, Guassian Naive Bayes, Random Forest and Bagged Tree models.

We will also look to hypertune each of the selected models in order to increase the performance of each.

Once all of this is completed, we will select our final model based on the best ROC-AUC score and move onto the threshold selection and model evaluation

In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# import xgboost as xgb
from sklearn import metrics
import itertools
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from warnings import filterwarnings
filterwarnings('ignore')

## Reading in the data 

In [32]:
#Reading in the data saved under data_feat and saving it under a variable called data.
data = pd.read_csv('/Users/naweedahmed/Desktop/Flatiron_Feb/capstone_project/dsc-capstone-submission-checklist/CSV Files/data_feat.csv')

In [33]:
#Checking the data.
data.head()

Unnamed: 0.1,Unnamed: 0,contact,month,day_of_week,duration,campaign,pdays,previous,cons_price_idx,cons_conf_idx,...,job_unemployed,age_group_16-24,age_group_25-34,age_group_35-44,age_group_45-54,age_group_55-64,age_group_65-74,age_group_75-79,Combined_Tax,Feature_2
0,0,1,5,1,261,1,999,0,93.994,-36.4,...,0,0,0,0,0,1,0,0,62.0,50914.6344
1,1,1,5,1,149,1,999,0,93.994,-36.4,...,0,0,0,0,0,1,0,0,62.0,50914.6344
2,2,1,5,1,226,1,999,0,93.994,-36.4,...,0,0,0,1,0,0,0,0,62.0,50914.6344
3,3,1,5,1,151,1,999,0,93.994,-36.4,...,0,0,0,1,0,0,0,0,62.0,50914.6344
4,4,1,5,1,307,1,999,0,93.994,-36.4,...,0,0,0,0,0,1,0,0,62.0,50914.6344


In [34]:
#Dropping unwanted columns.
data.drop(columns='Unnamed: 0', inplace=True)

In [35]:
#Checking the data.
data.head()

Unnamed: 0,contact,month,day_of_week,duration,campaign,pdays,previous,cons_price_idx,cons_conf_idx,euribor3m,...,job_unemployed,age_group_16-24,age_group_25-34,age_group_35-44,age_group_45-54,age_group_55-64,age_group_65-74,age_group_75-79,Combined_Tax,Feature_2
0,1,5,1,261,1,999,0,93.994,-36.4,4.857,...,0,0,0,0,0,1,0,0,62.0,50914.6344
1,1,5,1,149,1,999,0,93.994,-36.4,4.857,...,0,0,0,0,0,1,0,0,62.0,50914.6344
2,1,5,1,226,1,999,0,93.994,-36.4,4.857,...,0,0,0,1,0,0,0,0,62.0,50914.6344
3,1,5,1,151,1,999,0,93.994,-36.4,4.857,...,0,0,0,1,0,0,0,0,62.0,50914.6344
4,1,5,1,307,1,999,0,93.994,-36.4,4.857,...,0,0,0,0,0,1,0,0,62.0,50914.6344


In [36]:
#Checking the data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 53 columns):
contact                     41188 non-null int64
month                       41188 non-null int64
day_of_week                 41188 non-null int64
duration                    41188 non-null int64
campaign                    41188 non-null int64
pdays                       41188 non-null int64
previous                    41188 non-null int64
cons_price_idx              41188 non-null float64
cons_conf_idx               41188 non-null float64
euribor3m                   41188 non-null float64
y                           41188 non-null int64
MOM_Inflation               41188 non-null float64
Wage_Growth                 41188 non-null float64
EURUSD                      41188 non-null float64
euro_zoneGDP_Growth_Rate    41188 non-null float64
loan_no                     41188 non-null int64
loan_yes                    41188 non-null int64
housing_no                  41188 non-null i

In [37]:
#Checking the data.
data.columns

Index(['contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'y',
       'MOM_Inflation', 'Wage_Growth', 'EURUSD', 'euro_zoneGDP_Growth_Rate',
       'loan_no', 'loan_yes', 'housing_no', 'housing_yes', 'marital_divorced',
       'marital_married', 'marital_single', 'poutcome_nonexistent',
       'poutcome_success', 'default_no', 'default_yes', 'edu_basic.4y',
       'edu_basic.6y', 'edu_basic.9y', 'edu_high.school', 'edu_illiterate',
       'edu_professional.course', 'edu_university.degree', 'job_admin.',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'age_group_16-24',
       'age_group_25-34', 'age_group_35-44', 'age_group_45-54',
       'age_group_55-64', 'age_group_65-74', 'age_group_75-79', 'Combined_Tax',
       'Feature_2'],
      dtype='object')

### Checking the value counts for our target variable 'y'.

In [38]:
#Checking value counts for y.
data['y'].value_counts()

0    36548
1     4640
Name: y, dtype: int64

We may have a class imbalance problem. We will look to address this by using class_weight in the hyperparameter section, and also look to use the SMOTE technique to address this.

### Assigning the X and y variables for our model.

In [39]:
X = data.drop(columns=['y'], axis=1)
y = data['y']

### Setting up the Train,Test and Validation splits.

In [40]:
#Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state = 42)

In [41]:
#Train Validation Split.
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, test_size=0.20, random_state = 42)

In [42]:
#Checking to see the size of each set.
print('Shapes of Each Set')
print('-------------------')
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print(f'X_val: {X_val.shape}')
print(f'y_val: {y_val.shape}')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

Shapes of Each Set
-------------------
X_train: (26360, 52)
y_train: (26360,)
X_val: (6590, 52)
y_val: (6590,)
X_test: (8238, 52)
y_test: (8238,)


## Baseline Model

### Logistic Regression

In [43]:
logreg = LogisticRegression(penalty='none', solver='lbfgs')
log_reg = logreg.fit(X_train, y_train)
log_reg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='none',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [44]:
log_reg_yval_predproba = log_reg.predict_proba(X_val)
print("ROC_AUC of Baseline model (val) =", roc_auc_score(y_val, log_reg_yval_predproba[:,1]))

ROC_AUC of Baseline model (val) = 0.9057938116009897


In [45]:
log_reg_ytrain_predproba = log_reg.predict_proba(X_train)
print("ROC_AUC of Baseline model (val) =", roc_auc_score(y_train, log_reg_ytrain_predproba[:,1]))

ROC_AUC of Baseline model (val) = 0.9073029329792693


## RFE

In [46]:
rfe = RFE(estimator=log_reg, step=1)
rfe = rfe.fit(X_train, y_train)

In [47]:

selected_rfe_features = pd.DataFrame({'Feature':list(X_train.columns),
                                      'Ranking':rfe.ranking_})
selected_rfe_features.sort_values(by='Ranking')

Unnamed: 0,Feature,Ranking
0,contact,1
50,Combined_Tax,1
28,edu_high.school,1
23,default_no,1
22,poutcome_success,1
21,poutcome_nonexistent,1
31,edu_university.degree,1
33,job_blue-collar,1
37,job_retired,1
16,housing_no,1


In [48]:
X_train.columns

Index(['contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'cons_price_idx', 'cons_conf_idx', 'euribor3m',
       'MOM_Inflation', 'Wage_Growth', 'EURUSD', 'euro_zoneGDP_Growth_Rate',
       'loan_no', 'loan_yes', 'housing_no', 'housing_yes', 'marital_divorced',
       'marital_married', 'marital_single', 'poutcome_nonexistent',
       'poutcome_success', 'default_no', 'default_yes', 'edu_basic.4y',
       'edu_basic.6y', 'edu_basic.9y', 'edu_high.school', 'edu_illiterate',
       'edu_professional.course', 'edu_university.degree', 'job_admin.',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'age_group_16-24',
       'age_group_25-34', 'age_group_35-44', 'age_group_45-54',
       'age_group_55-64', 'age_group_65-74', 'age_group_75-79', 'Combined_Tax',
       'Feature_2'],
      dtype='object')

In [49]:
X_train_rfe = rfe.transform(X_train)
X_val_rfe = rfe.transform(X_val)

In [50]:
log_reg_rfe_model = log_reg.fit(X_train_rfe, y_train)

In [51]:
rfecv = RFECV(estimator=log_reg, step=1, cv=5, scoring='accuracy')
rfecv = rfecv.fit(X_train, y_train)
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', X_train.columns[rfecv.support_])

Optimal number of features : 52
Best features : Index(['contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'cons_price_idx', 'cons_conf_idx', 'euribor3m',
       'MOM_Inflation', 'Wage_Growth', 'EURUSD', 'euro_zoneGDP_Growth_Rate',
       'loan_no', 'loan_yes', 'housing_no', 'housing_yes', 'marital_divorced',
       'marital_married', 'marital_single', 'poutcome_nonexistent',
       'poutcome_success', 'default_no', 'default_yes', 'edu_basic.4y',
       'edu_basic.6y', 'edu_basic.9y', 'edu_high.school', 'edu_illiterate',
       'edu_professional.course', 'edu_university.degree', 'job_admin.',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'age_group_16-24',
       'age_group_25-34', 'age_group_35-44', 'age_group_45-54',
       'age_group_55-64', 'age_group_65-74', 'age_group_75-79', 'Combined_Tax',
    

In [52]:
rfecv.grid_scores_

array([0.88918816, 0.8952959 , 0.89525797, 0.89567527, 0.89559939,
       0.8965478 , 0.89643399, 0.89635812, 0.89662367, 0.89688923,
       0.89707891, 0.89647193, 0.89647193, 0.89662367, 0.89635812,
       0.89628225, 0.89609256, 0.89586495, 0.89590288, 0.89582701,
       0.89609256, 0.89594082, 0.89616844, 0.89594082, 0.89590288,
       0.89616844, 0.89559939, 0.89563733, 0.89601669, 0.89962064,
       0.89943096, 0.89905159, 0.90094841, 0.90045524, 0.90030349,
       0.90037936, 0.90045524, 0.90053111, 0.89981032, 0.89981032,
       0.90254173, 0.90208649, 0.90155539, 0.90314871, 0.9026176 ,
       0.90572838, 0.90618361, 0.9064871 , 0.90667678, 0.90660091,
       0.90679059, 0.9069044 ])

### Hyperparameter Tuning

#### Iteration 1:

In [53]:
param_grid01 = {'C': [0.01,0.1,1,10,100],
             'penalty': ['l1','l2'],
             'solver': ['liblinear','saga']}

In [54]:
log_reg_gs01 = GridSearchCV(log_reg,
                         param_grid01,
                         cv=5,
                         scoring='roc_auc',
                         return_train_score=True)

log_reg_gs01.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='none',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='roc_auc', verbose=0)

In [55]:
log_reg_gs01_model = log_reg_gs01.best_estimator_
log_reg_gs01_model

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
#Checking to see the parameters and R-Squared
print(f'best model:{log_reg_gs01.best_params_}')
print(f'best score: {log_reg_gs01.best_score_}')

best model:{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
best score: 0.9253789653683622


In [57]:
log_reg_gs01_yval_predprob = log_reg_gs01_model.predict_proba(X_val)
print("ROC_AUC of model (val) =", roc_auc_score(y_val, log_reg_gs01_yval_predprob[:,1]))

ROC_AUC of model (val) = 0.9257008639348675


#### Iteration 2:

In [58]:
param_grid02 = {'C': [0.5,1,5,10],
             'penalty': ['l1','l2'],
             'solver': ['liblinear','saga']}

In [59]:
log_reg_gs02 = GridSearchCV(log_reg,
                         param_grid02,
                         cv=5,
                         scoring='roc_auc',
                         return_train_score=True)

log_reg_gs02.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='none',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.5, 1, 5, 10], 'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='roc_auc', verbose=0)

In [60]:
log_reg_gs02_model = log_reg_gs02.best_estimator_
log_reg_gs02_model

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [61]:
log_reg_gs02_yval_predprob = log_reg_gs02_model.predict_proba(X_val)
print("ROC_AUC of model (val) =", roc_auc_score(y_val, log_reg_gs02_yval_predprob[:,1]))

ROC_AUC of model (val) = 0.9254761689669285


#### Iteration 3:

In [67]:
param_grid03 = {'C': [0.5,1,5],
             'penalty': ['l1','l2'],
             'solver': ['liblinear','saga'],
            'class_weight': [None,'balanced'],
            
             }

In [68]:
log_reg_gs03 = GridSearchCV(log_reg,
                         param_grid03,
                         cv=5,
                         scoring='roc_auc',
                         return_train_score=True)

log_reg_gs03.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='none',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.5, 1, 5], 'class_weight': [None, 'balanced'],
                         'penalty': ['l1', 'l2'],
                         'solver': ['liblinear', 'saga']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='roc_auc', verbose=0)

In [71]:
log_reg_gs03_model = log_reg_gs03.best_estimator_
log_reg_gs03_model

LogisticRegression(C=0.5, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [72]:
log_reg_gs03_yval_predprob = log_reg_gs03_model.predict_proba(X_val)
print("ROC_AUC of model (val) =", roc_auc_score(y_val, log_reg_gs03_yval_predprob[:,1]))

ROC_AUC of model (val) = 0.9287769956600456


### Decision Tree

In [73]:
dec_tree = DecisionTreeClassifier(criterion='entropy')

dec_tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [74]:
dec_tree_yval_pred = dec_tree.predict(X_val)

print('Accuracy: ', accuracy_score(y_val, dec_tree_yval_pred))

Accuracy:  0.8872534142640364


In [75]:
dec_tree_yval_predproba = dec_tree.predict_proba(X_val)
print("ROC_AUC of Baseline model (val) =", roc_auc_score(y_val, dec_tree_yval_predproba[:,1]))

ROC_AUC of Baseline model (val) = 0.7229010954974354


### Bagged Tree

In [76]:
bag_tree =  BaggingClassifier(DecisionTreeClassifier(criterion='gini', max_depth=5), 
                                 n_estimators=20)

In [77]:
bag_tree.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=5,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,
   

In [78]:
bag_tree.score(X_train, y_train)

0.9200303490136571

In [79]:
bag_tree.score(X_val, y_val)

0.9132018209408195

### Random Forest

In [80]:
ran_for = RandomForestClassifier(n_estimators=100, max_depth= 5)
ran_for.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [81]:
ran_for.score(X_train, y_train)

0.9044385432473444

In [82]:
ran_for.score(X_test, y_test)

0.900825443068706

In [83]:
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)
training_preds = clf.predict(X_train)
val_preds = clf.predict(X_val)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_val, val_preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

NameError: name 'xgb' is not defined

In [118]:
test_preds = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, test_preds)
print("Test accuracy: {:.4}%".format(test_accuracy * 100))

Test accuracy: 91.72%


In [113]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)


GaussianNB(priors=None, var_smoothing=1e-09)

In [114]:
y_pred = gnb.predict(X_val)

In [115]:
print("Accuracy:",metrics.accuracy_score(y_val, y_pred))

Accuracy: 0.8629742033383915


So far our best model appears to Iteration 3 of the hyperparameter tuned Logistic Regression model. Going forward, we will look to run hyperparameter tuing for all the other models we have selected and then move onto the final model selection. Once we have our final model, we will look to threshold selection and also interpretation of the model.