In [90]:
import pandas as pd
import numpy as np
import scipy as sc
import seaborn as sns
from copy import deepcopy
from collections import OrderedDict

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score



import matplotlib.pyplot as plt
import scikitplot as skplt
%matplotlib inline

# Notebook-wide pandas display settings.
# Allow very wide / very long frames to render without truncation.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
# Use the fully-qualified option key: the bare pattern 'precision' can match
# more than one registered option in newer pandas releases (for example the
# Styler precision option) and then raises OptionError.
pd.set_option('display.precision', 2)
# Render every float with exactly three decimals; when a float_format is set
# it is what formats displayed floats.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline

In [91]:
lst = ['marital', 'education', 'job', 'contact', 'poutcome']


def _is_encodable(dtype):
    """Return True for dtypes that hold text/categorical values (object, string, category)."""
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )


def make_dummies(df, lst):
    """Split ``df`` into an indicator-encoded feature frame and the target ``y``.

    Every feature column whose dtype is object, string or category is replaced
    by its indicator (dummy) columns, named ``<column>_<level>``. The last
    indicator of each variable is dropped, so a variable with k levels yields
    k-1 columns. Non-encoded columns keep their original order and come first;
    the indicator columns follow, grouped in the order of the source columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a column named ``'y'``. It is not modified.
    lst : list
        Accepted for backward compatibility and currently ignored: the columns
        to encode are detected from their dtype, not taken from this list.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The encoded features (without ``'y'``) and the ``'y'`` column.

    Raises
    ------
    KeyError
        If ``df`` has no ``'y'`` column.
    """
    target = df['y']
    # drop() returns a new frame, so the caller's data is left untouched.
    features = df.drop(columns=['y'])

    encode_cols = [
        col for col in features.columns if _is_encodable(features[col].dtype)
    ]

    # Collect all pieces first and concatenate once instead of growing the
    # frame inside the loop.
    pieces = [features.drop(columns=encode_cols)]
    for col in encode_cols:
        indicators = pd.get_dummies(features[col], prefix=col)
        # Keep all but the last indicator column of this variable.
        pieces.append(indicators.iloc[:, :-1])

    return pd.concat(pieces, axis=1), target

In [92]:
# Both files are semicolon-separated; read them with identical parser settings.
df_bankFull, df_bank = (
    pd.read_csv(csv_name, sep=";") for csv_name in ("bank-full.csv", "bank.csv")
)

#### Standardizing Numerical Variables

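- Putting the numeric features on a common scale is essential for clustering algorithms, dimension reduction, and gradient-trained models such as neural networks. The cell below does this with min-max rescaling, mapping each numeric column to the range [0, 1].
- It's not necessary for tree-based algorithms.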

In [5]:
# Work on an independent copy so the raw frame loaded from disk stays intact.
dfBankFull = df_bankFull.copy(deep=True)
cols_to_norm = ['age', 'balance', 'duration', 'day', 'campaign', 'pdays', 'previous']

# Min-max rescale each listed column: (value - column min) / (column max - column min).
numeric_part = dfBankFull[cols_to_norm]
column_min = numeric_part.min()
column_span = numeric_part.max() - column_min
dfBankFull[cols_to_norm] = (numeric_part - column_min) / column_span

In [6]:
# Split the scaled frame into encoded features (mainDF) and the target column (y).
mainDF, y = make_dummies(df=dfBankFull, lst=lst)

#### Downsampling Majority Class:

- The cell directly below shows the large imbalance between the two classes we're trying to predict (39,922 `no` vs. 5,289 `yes`). We need to fix this so that the models trained on this data are not biased toward the majority class.

In [7]:
# Count the rows per class of the target column to inspect the class balance.
y.value_counts()

no     39922
yes     5289
Name: y, dtype: int64

In [8]:
from sklearn.utils import resample

In [9]:
# Re-attach the target to the encoded features so rows can be filtered by class.
frames = [mainDF, y]
df = pd.concat(frames, axis=1)
df_maj = df[df.y == 'no']
df_min = df[df.y == 'yes']

# Draw, without replacement, as many majority rows as there are minority rows.
# The size is taken from the data instead of a hard-coded count, so the classes
# stay balanced if the input file changes; random_state keeps the draw repeatable.
df_majority_downsampled = resample(df_maj, replace=False,
                                   n_samples=len(df_min),
                                   random_state=101)

# Stack the down-sampled majority rows on top of all minority rows.
frames_resampled = [df_majority_downsampled, df_min]
df_resampled = pd.concat(frames_resampled)

In [10]:
# Confirm that both classes now have the same number of rows.
df_resampled.y.value_counts()

no     5289
yes    5289
Name: y, dtype: int64

In [107]:
# Separate predictors from the target, then hold out 30% of the balanced rows
# for testing; the fixed random_state makes the split repeatable.
features_resampled = df_resampled.drop(columns=['y'])
target_resampled = df_resampled['y']
x_train, x_test, y_train, y_test = train_test_split(
    features_resampled,
    target_resampled,
    test_size=0.3,
    random_state=101,
)

# Neural Network via Sklearn

In [13]:
from sklearn.neural_network import MLPClassifier

In [161]:
# Hyper-parameters for the multi-layer perceptron, gathered in one place;
# every setting not listed here keeps the library default.
mlp_params = dict(
    hidden_layer_sizes=(30, 60, 90),
    activation='relu',
    solver='adam',
    random_state=101,
)
mlp = MLPClassifier(**mlp_params)

In [162]:
# Display the estimator so the full set of hyper-parameters in effect is visible.
mlp

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30, 60, 90), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=101,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [163]:
# Train on the training split (fit returns the estimator itself), then
# predict labels for the held-out test split.
predictions = mlp.fit(x_train, y_train).predict(x_test)

In [164]:
# Evaluate the test-set predictions: confusion matrix, per-class report, accuracy.
evaluation_outputs = (
    confusion_matrix(y_test, predictions),
    classification_report(y_test, predictions),
    accuracy_score(y_test, predictions),
)
for position, output in enumerate(evaluation_outputs):
    if position:
        # Blank-line separator between consecutive outputs.
        print("\n")
    print(output)

[[1296  285]
 [ 228 1365]]


             precision    recall  f1-score   support

         no       0.85      0.82      0.83      1581
        yes       0.83      0.86      0.84      1593

avg / total       0.84      0.84      0.84      3174



0.8383742911153119


In [165]:
from sklearn.model_selection import GridSearchCV