In [1]:
# Importing the needed modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
warnings.simplefilter('ignore', FutureWarning)

# To display plots inside the iPython Notebook itself
%matplotlib inline

In [10]:
# To verify how data is orgainzed in file(to find the delimiter) and then
# use corresponding function to open the file. eg
# data could be in .csv. .tsv, excel format etc.
pathOfDataFile = "data/bank-full.csv"
firstFewLines = list()
noOfLinesToView = 5

with open(pathOfDataFile) as dataFile:
    firstFewLines = [next(dataFile) for i in range(noOfLinesToView)]
    for line in firstFewLines:
        print(line)

# Import the semi-colon delimited data file into pandas dataFrame
bankPromo_df = pd.read_csv(pathOfDataFile, sep = ";")

# Rename the Target/Final Outcome column from "y" to "Subscribed" as based on data description.
bankPromo_df = bankPromo_df.rename(columns={"y":"Subscribed"})


"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"

58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"

44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"

33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"

47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"



In [11]:
bankPromo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age           45211 non-null int64
job           45211 non-null object
marital       45211 non-null object
education     45211 non-null object
default       45211 non-null object
balance       45211 non-null int64
housing       45211 non-null object
loan          45211 non-null object
contact       45211 non-null object
day           45211 non-null int64
month         45211 non-null object
duration      45211 non-null int64
campaign      45211 non-null int64
pdays         45211 non-null int64
previous      45211 non-null int64
poutcome      45211 non-null object
Subscribed    45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [12]:
# Get the unique values(Levels) for categorical variables.
# List to hold names of categorical variables
categoricalVars = list()
# List to hold names of numerical variables
numericalVars = list()

for colName in bankPromo_df.columns:
    if bankPromo_df[colName].dtype == np.int64:
        numericalVars.append(colName)
    elif bankPromo_df[colName].dtype == np.object:
        categoricalVars.append(colName)
    else:
        pass
    
# Remove Target column from final categorical Var list
categoricalVars.remove('default')

print(numericalVars)
print(categoricalVars)

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
['job', 'marital', 'education', 'housing', 'loan', 'contact', 'month', 'poutcome', 'Subscribed']


### Perform One Hot Encoding for categorical variables in dataset

In [13]:
# Make a copy of original data frame
bankPromoModel_def_Df = bankPromo_df.copy()
bankPromoModel_def_Df['Target'] = bankPromoModel_def_Df['default'].apply(lambda resp : 1 if resp == "yes" else 0)
bankPromoModel_def_Df['Target'] = bankPromoModel_def_Df['Target'].astype(np.int)
# Delete the original 'Subscribed' column
del bankPromoModel_def_Df['default']

In [14]:
# Covert all categorical variables to corresponding indicator variables
for categoricalVar in categoricalVars:
    tmpDf = pd.DataFrame()
    # Remove 1st class level to avoid multicollinearity
    tmpDf = pd.get_dummies(bankPromoModel_def_Df[categoricalVar], prefix=categoricalVar, drop_first=True)
    bankPromoModel_def_Df = pd.concat((bankPromoModel_def_Df, tmpDf), axis=1)

# Now remove the original categorical vars since indicator variables are created from them.
bankPromoModel_def_Df.drop(categoricalVars, inplace=True, axis=1)
bankPromoModel_def_Df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 43 columns):
age                    45211 non-null int64
balance                45211 non-null int64
day                    45211 non-null int64
duration               45211 non-null int64
campaign               45211 non-null int64
pdays                  45211 non-null int64
previous               45211 non-null int64
Target                 45211 non-null int32
job_blue-collar        45211 non-null uint8
job_entrepreneur       45211 non-null uint8
job_housemaid          45211 non-null uint8
job_management         45211 non-null uint8
job_retired            45211 non-null uint8
job_self-employed      45211 non-null uint8
job_services           45211 non-null uint8
job_student            45211 non-null uint8
job_technician         45211 non-null uint8
job_unemployed         45211 non-null uint8
job_unknown            45211 non-null uint8
marital_married        45211 non-null uint8
marital_sin

### Perform 5 fold Cross Validation with 80/20 Split for Model Selection

In [15]:
# Training and Test Split
from sklearn.model_selection import ShuffleSplit

if 'Target' in bankPromoModel_def_Df:
    y = bankPromoModel_def_Df['Target'].values # get the labels we want
    del bankPromoModel_def_Df['Target']        # get rid of the class label
    X = bankPromoModel_def_Df.values           # use everything else to predict!

    ## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
    #    have converted them into simple matrices to use with scikit learn
    
    
# To use the cross validation object in scikit learn, we need to grab an instance
# of the object and set it up. This object will be able to split our data into 
# training and testing splits
num_cv_iterations = 5
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         test_size  = 0.2, random_state=999)
                         
print(cv_object)


ShuffleSplit(n_splits=5, random_state=999, test_size=0.2, train_size=None)


### Getting ready Additional Test Dataset(with 10% instances) for final model fitting and weights interpretation

In [16]:
pathOfAdditionalDataFile = "data/bank.csv"

# Import the semi-colon delimited data file into pandas dataFrame
bankPromoAdditional_df = pd.read_csv(pathOfAdditionalDataFile, sep = ";")

# Rename the Target/Final Outcome column from "y" to "Subscribed" as based on data description.
bankPromoAdditional_df = bankPromoAdditional_df.rename(columns={"y":"Subscribed"})

bankPromoAdditional_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
age           4521 non-null int64
job           4521 non-null object
marital       4521 non-null object
education     4521 non-null object
default       4521 non-null object
balance       4521 non-null int64
housing       4521 non-null object
loan          4521 non-null object
contact       4521 non-null object
day           4521 non-null int64
month         4521 non-null object
duration      4521 non-null int64
campaign      4521 non-null int64
pdays         4521 non-null int64
previous      4521 non-null int64
poutcome      4521 non-null object
Subscribed    4521 non-null object
dtypes: int64(7), object(10)
memory usage: 600.5+ KB


In [18]:
bankPromoAdditional_def_df = bankPromoAdditional_df.copy()

bankPromoAdditional_def_df['Target'] = bankPromoAdditional_def_df['default'].apply(lambda resp : 1 if resp == "yes" else 0)
bankPromoAdditional_def_df['Target'] = bankPromoAdditional_def_df['Target'].astype(np.int)
# Delete the original 'Subscribed' column
del bankPromoAdditional_def_df['default']

# Covert all categorical variables to corresponding indicator variables
for categoricalVar in categoricalVars:
    tmpDf = pd.DataFrame()
    # Remove 1st class level to avoid multicollinearity
    tmpDf = pd.get_dummies(bankPromoAdditional_def_df[categoricalVar], prefix=categoricalVar, drop_first=True)
    bankPromoAdditional_def_df = pd.concat((bankPromoAdditional_def_df, tmpDf), axis=1)

# Now remove the original categorical vars since indicator variables are created from them.
bankPromoAdditional_def_df.drop(categoricalVars, inplace=True, axis=1)

if 'Target' in bankPromoAdditional_def_df:
    y_Final = bankPromoAdditional_def_df['Target'].values # get the labels we want
    del bankPromoAdditional_def_df['Target']        # get rid of the class label
    X_Final = bankPromoAdditional_def_df.values

bankPromoAdditional_def_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 42 columns):
age                    4521 non-null int64
balance                4521 non-null int64
day                    4521 non-null int64
duration               4521 non-null int64
campaign               4521 non-null int64
pdays                  4521 non-null int64
previous               4521 non-null int64
job_blue-collar        4521 non-null uint8
job_entrepreneur       4521 non-null uint8
job_housemaid          4521 non-null uint8
job_management         4521 non-null uint8
job_retired            4521 non-null uint8
job_self-employed      4521 non-null uint8
job_services           4521 non-null uint8
job_student            4521 non-null uint8
job_technician         4521 non-null uint8
job_unemployed         4521 non-null uint8
job_unknown            4521 non-null uint8
marital_married        4521 non-null uint8
marital_single         4521 non-null uint8
education_secondary    4521 non-n

## Create Model

### Simple Logistic Model Fit

In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt
from sklearn.pipeline import make_pipeline
from sklearn.exceptions import DataConversionWarning
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Initialize an Empty Dataframe to store Model performance Stats
modelPerformanceMetrices = pd.DataFrame(columns=['Accuracy', 'AuC', 'F1 Score', 'Precision', 'Recall'])

# Standardize the features first, since standardizing the features could lead to
# gradient desent algo to converge faster and then run logistic regression model

logisticModel = make_pipeline(StandardScaler(), LogisticRegression(penalty='l2', C=1.0, class_weight=None, random_state=999))

for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
       
    # train the reusable logisitc regression model on the training data
    logisticModel.fit(X_train,y_train)  # train object
    y_hat = logisticModel.predict(X_test) # get test set precitions

    # now let's get the accuracy,precision,recall,auc,F1 and confusion matrix for this iterations of training/testing
    tmpDict = dict()
    tmpDict['Accuracy'] = mt.accuracy_score(y_test,y_hat)
    tmpDict['AuC'] = mt.roc_auc_score(y_test,y_hat)
    tmpDict['F1 Score'] = mt.f1_score(y_test,y_hat)
    tmpDict['Precision'] = mt.precision_score(y_test,y_hat)
    tmpDict['Recall'] = mt.recall_score(y_test,y_hat)
    
    modelPerformanceMetrices = modelPerformanceMetrices.append(tmpDict, ignore_index=True)

    conf = mt.confusion_matrix(y_test,y_hat)
  
    print("====Iteration",iter_num," ====")
    print("confusion matrix\n",conf)

display(modelPerformanceMetrices)
print("Average Model Performnace Metrices ")
display(modelPerformanceMetrices.mean())


====Iteration 0  ====
confusion matrix
 [[8871    2]
 [ 169    1]]
====Iteration 1  ====
confusion matrix
 [[8863    6]
 [ 172    2]]
====Iteration 2  ====
confusion matrix
 [[8855    1]
 [ 182    5]]
====Iteration 3  ====
confusion matrix
 [[8893    6]
 [ 140    4]]
====Iteration 4  ====
confusion matrix
 [[8891    2]
 [ 144    6]]


Unnamed: 0,Accuracy,AuC,F1 Score,Precision,Recall
0,0.98109,0.502828,0.011561,0.333333,0.005882
1,0.980316,0.505409,0.021978,0.25,0.011494
2,0.979763,0.513313,0.051813,0.833333,0.026738
3,0.983855,0.513552,0.051948,0.4,0.027778
4,0.983855,0.519888,0.075949,0.75,0.04


Average Model Performnace Metrices 


Accuracy     0.981776
AuC          0.510998
F1 Score     0.042650
Precision    0.513333
Recall       0.022378
dtype: float64

### Tuning The Model Hyper Parameters Using Grid Search

In [20]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

# Initialize an Empty Dataframe to store Model performance Stats
modelPerformanceMetrices = pd.DataFrame(columns=['Accuracy', 'AuC', 'F1 Score', 'Precision', 'Recall'])

param_grid = {
     'logisticregression__penalty' : ['l1', 'l2'],
    'logisticregression__C' : np.logspace(-4, 4, 50),
    'logisticregression__solver' : ['liblinear'],
    'logisticregression__class_weight' : [None, 'balanced']}


scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}

# Create grid search object

clf = GridSearchCV(make_pipeline(StandardScaler(), LogisticRegression(random_state=999)), \
                   param_grid = param_grid, cv = 5, verbose=False, n_jobs=-1, scoring=scoring, refit='AUC', \
                   return_train_score=True)

# Fit on data

for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
       
    # train the reusable logisitc regression model on the training data
    clf.fit(X_train,y_train)  # train object
    y_hat = clf.predict(X_test) # get test set precitions

    # now let's get the accuracy and confusion matrix for this iterations of training/testing
    tmpDict = dict()
    tmpDict['Accuracy'] = mt.accuracy_score(y_test,y_hat)
    tmpDict['AuC'] = mt.roc_auc_score(y_test,y_hat)
    tmpDict['F1 Score'] = mt.f1_score(y_test,y_hat)
    tmpDict['Precision'] = mt.precision_score(y_test,y_hat)
    tmpDict['Recall'] = mt.recall_score(y_test,y_hat)
    
    modelPerformanceMetrices = modelPerformanceMetrices.append(tmpDict, ignore_index=True)

    conf = mt.confusion_matrix(y_test,y_hat)
  
    print("====Iteration",iter_num," ====")
    print("confusion matrix\n",conf)
    print("Best Estimator Model Parameters\n", clf.best_params_)

display(modelPerformanceMetrices)
print("Average Model Performnace Metrices ")
display(modelPerformanceMetrices.mean())

====Iteration 0  ====
confusion matrix
 [[8871    2]
 [ 167    3]]
Best Estimator Model Parameters
 {'logisticregression__C': 0.5689866029018293, 'logisticregression__class_weight': None, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'liblinear'}
====Iteration 1  ====
confusion matrix
 [[8861    8]
 [ 170    4]]
Best Estimator Model Parameters
 {'logisticregression__C': 0.5689866029018293, 'logisticregression__class_weight': None, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'liblinear'}
====Iteration 2  ====
confusion matrix
 [[8853    3]
 [ 181    6]]
Best Estimator Model Parameters
 {'logisticregression__C': 0.8286427728546842, 'logisticregression__class_weight': None, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'liblinear'}
====Iteration 3  ====
confusion matrix
 [[8893    6]
 [ 140    4]]
Best Estimator Model Parameters
 {'logisticregression__C': 0.18420699693267145, 'logisticregression__class_weight': None, 'logist

Unnamed: 0,Accuracy,AuC,F1 Score,Precision,Recall
0,0.981312,0.508711,0.034286,0.6,0.017647
1,0.980316,0.511043,0.043011,0.333333,0.022989
2,0.979653,0.515873,0.061224,0.666667,0.032086
3,0.983855,0.513552,0.051948,0.4,0.027778
4,0.983744,0.519831,0.075472,0.666667,0.04


Average Model Performnace Metrices 


Accuracy     0.981776
AuC          0.513802
F1 Score     0.053188
Precision    0.533333
Recall       0.028100
dtype: float64

### Simple SVM Fit

In [26]:
from datetime import datetime
from sklearn.svm import SVC
# Initialize an Empty Dataframe to store Model performance Stats
modelPerformanceMetrices = pd.DataFrame(columns=['Accuracy', 'AuC', 'F1 Score', 'Precision', 'Recall'])

# Standardize the features first, since standardizing the features could lead to
# gradient desent algo to converge faster and then run SVM model

start = datetime.now()
svmModel = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='rbf', degree=3 , gamma='auto', random_state=999 , class_weight='balanced'))

for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
    
    
    
    # train the reusable logisitc regression model on the training data
    svmModel.fit(X_train, y_train)  # train object
    y_hat = svmModel.predict(X_test) # get test set precitions

    # now let's get the accuracy,precision,recall,auc,F1 and confusion matrix for this iterations of training/testing
    tmpDict = dict()
    tmpDict['Accuracy'] = mt.accuracy_score(y_test,y_hat)
    tmpDict['AuC'] = mt.roc_auc_score(y_test,y_hat)
    tmpDict['F1 Score'] = mt.f1_score(y_test,y_hat)
    tmpDict['Precision'] = mt.precision_score(y_test,y_hat)
    tmpDict['Recall'] = mt.recall_score(y_test,y_hat)
    
    modelPerformanceMetrices = modelPerformanceMetrices.append(tmpDict, ignore_index=True)

    conf = mt.confusion_matrix(y_test,y_hat)
  
    print("====Iteration",iter_num," ====")
    print("confusion matrix\n",conf)

display(modelPerformanceMetrices)
print("Average Model Performnace Metrices ")
display(modelPerformanceMetrices.mean())

end = datetime.now()
print("time taken for Linear SVM")
print(end-start)

====Iteration 0  ====
confusion matrix
 [[7191 1682]
 [  69  101]]
====Iteration 1  ====
confusion matrix
 [[7190 1679]
 [  68  106]]
====Iteration 2  ====
confusion matrix
 [[7123 1733]
 [  66  121]]
====Iteration 3  ====
confusion matrix
 [[7173 1726]
 [  58   86]]
====Iteration 4  ====
confusion matrix
 [[7210 1683]
 [  62   88]]


Unnamed: 0,Accuracy,AuC,F1 Score,Precision,Recall
0,0.80637,0.702277,0.103431,0.056646,0.594118
1,0.806812,0.709942,0.108218,0.059384,0.609195
2,0.801062,0.725686,0.118569,0.065264,0.647059
3,0.80272,0.701634,0.087935,0.047461,0.597222
4,0.807033,0.698708,0.091619,0.049689,0.586667


Average Model Performnace Metrices 


Accuracy     0.804799
AuC          0.707649
F1 Score     0.101954
Precision    0.055689
Recall       0.606852
dtype: float64

time taken for Linear SVM
0:07:53.800706
