# Chapter 11: Neural nets

> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck 
>
> Code included in
>
> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) 
> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.

## Import required packages

In [1]:
import os
!pip install pandas
import pandas as pd
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from dmba import classificationSummary
%matplotlib inline

no display found. Using non-interactive Agg backend


In [2]:
# Directory that holds the dmba CSV files read by later cells.
# Set the DMBA_DATA_DIR environment variable to point at your own copy; the
# original location is kept as the fallback so an existing setup behaves
# exactly as before.
path = os.environ.get(
    'DMBA_DATA_DIR',
    '/Users/acast/Dropbox/Baruch/Teaching/Spring 2020/dmba-datasets/dmba/',
)
# Later cells open the CSVs by bare file name, so make this the working directory.
os.chdir(path)

## Table 11.2

In [3]:
# Fit a small neural net (one hidden layer with three nodes) on the tiny
# Fat/Salt example, then print the fitted parameters and class probabilities.
example_df = pd.read_csv('TinyData.csv')

predictors = ['Fat', 'Salt']
outcome = 'Acceptance'

X = example_df[predictors]
y = example_df[outcome]
# Sorted class labels; kept as a notebook-level name in case other cells use it.
classes = sorted(y.unique())

# (3,) is a real one-element tuple: a single hidden layer with 3 nodes.
# random_state=1 fixes the random initialisation so the printout is repeatable.
clf = MLPClassifier(hidden_layer_sizes=(3,), activation='logistic', solver='lbfgs', random_state=1)
clf.fit(X, y)

# Network structure
print('Intercepts')
print(clf.intercepts_)

print('Weights')
print(clf.coefs_)

# Prediction
# predict_proba orders its columns like clf.classes_, so the column labels are
# taken from the fitted model rather than from a separately built list.
print(pd.concat([
    example_df,
    pd.DataFrame(clf.predict_proba(X), columns=clf.classes_)
], axis=1))

Intercepts
[array([0.13368045, 4.07247552, 7.00768104]), array([14.30748676])]
Weights
[array([[ -1.30656481,  -4.20427792, -13.29587332],
       [ -0.04399727,  -4.91606924,  -6.03356987]]), array([[ -0.27348313],
       [ -9.01211573],
       [-17.63504694]])]
   Obs.  Fat  Salt Acceptance   dislike      like
0     1  0.2   0.9       like  0.000490  0.999510
1     2  0.1   0.1    dislike  0.999994  0.000006
2     3  0.2   0.4    dislike  0.999741  0.000259
3     4  0.2   0.5    dislike  0.997368  0.002632
4     5  0.4   0.5       like  0.002133  0.997867
5     6  0.3   0.8       like  0.000075  0.999925


In [51]:
# Manual arithmetic with rounded weights from the Table 11.2 printout; 0.2 and
# 0.9 match the Fat and Salt values of observation 1.
# NOTE(review): -1.307 and -13.296 both appear in the first row of the first
# weight matrix (the Fat row, hidden nodes 1 and 3), yet the second term is
# multiplied by the Salt value and no intercept is added — confirm which
# node's input this was meant to reproduce.
-1.307*0.2-13.296*0.9

-12.2278

In [4]:
# Fit a small neural net (one hidden layer with three nodes) on the tiny
# Fat/Salt example, then print the fitted parameters and class probabilities.
# NOTE(review): this cell repeats the Table 11.2 cell earlier in the notebook.
example_df = pd.read_csv('TinyData.csv')

predictors = ['Fat', 'Salt']
outcome = 'Acceptance'

X = example_df[predictors]
y = example_df[outcome]
# Sorted class labels; kept as a notebook-level name in case other cells use it.
classes = sorted(y.unique())

# (3,) is a real one-element tuple: a single hidden layer with 3 nodes.
# random_state=1 fixes the random initialisation so the printout is repeatable.
clf = MLPClassifier(hidden_layer_sizes=(3,), activation='logistic', solver='lbfgs', random_state=1)
clf.fit(X, y)

# Network structure
print('Intercepts')
print(clf.intercepts_)

print('Weights')
print(clf.coefs_)

# Prediction
# predict_proba orders its columns like clf.classes_, so the column labels are
# taken from the fitted model rather than from a separately built list.
print(pd.concat([
    example_df,
    pd.DataFrame(clf.predict_proba(X), columns=clf.classes_)
], axis=1))

Intercepts
[array([0.13368045, 4.07247552, 7.00768104]), array([14.30748676])]
Weights
[array([[ -1.30656481,  -4.20427792, -13.29587332],
       [ -0.04399727,  -4.91606924,  -6.03356987]]), array([[ -0.27348313],
       [ -9.01211573],
       [-17.63504694]])]
   Obs.  Fat  Salt Acceptance   dislike      like
0     1  0.2   0.9       like  0.000490  0.999510
1     2  0.1   0.1    dislike  0.999994  0.000006
2     3  0.2   0.4    dislike  0.999741  0.000259
3     4  0.2   0.5    dislike  0.997368  0.002632
4     5  0.4   0.5       like  0.002133  0.997867
5     6  0.3   0.8       like  0.000075  0.999925


In [5]:
# Pretty print layers
# clf.coefs_ and clf.intercepts_ hold one entry per layer transition. Only the
# last entry feeds the output layer; every earlier entry feeds a hidden layer,
# so the label is chosen by position from the end rather than by i == 0.
last_layer = len(clf.coefs_) - 1
for i, (weights, intercepts) in enumerate(zip(clf.coefs_, clf.intercepts_)):
    layer_name = 'Output layer' if i == last_layer else 'Hidden layer'
    # weights.shape is (inputs to the layer, nodes in the layer)
    print(layer_name, '{0[0]} => {0[1]}'.format(weights.shape))
    print(' Intercepts:\n ', intercepts)
    print(' Weights:')
    # one printed row per incoming node
    for weight in weights:
        print(' ', weight)
    print()


Hidden layer 2 => 3
 Intercepts:
  [0.13368045 4.07247552 7.00768104]
 Weights:
  [ -1.30656481  -4.20427792 -13.29587332]
  [-0.04399727 -4.91606924 -6.03356987]

Output layer 3 => 1
 Intercepts:
  [14.30748676]
 Weights:
  [-0.27348313]
  [-9.01211573]
  [-17.63504694]



# Bank example

In [8]:
# Read the Universal Bank data; the bare file name is resolved against the
# current working directory.
bank_df = pd.read_csv('UniversalBank.csv')

In [9]:
# Prepare the bank data for modelling: drop the ID and ZIP Code columns,
# replace spaces in column names with underscores, and dummy-code Education.
# Results are assigned rather than applied with inplace=True, because
# Series.cat.rename_categories(..., inplace=True) is not accepted by current
# pandas releases.
bank_df = bank_df.drop(columns=['ID', 'ZIP Code'])
bank_df.columns = [c.replace(' ', '_') for c in bank_df.columns]
bank_df['Education'] = bank_df['Education'].astype('category')
new_categories = {1: 'Undergrad', 2: 'Graduate', 3: 'Advanced/Professional'}
bank_df['Education'] = bank_df['Education'].cat.rename_categories(new_categories)
# drop_first=True leaves out the first Education level as the reference level.
# dtype='uint8' keeps the dummy columns numeric 0/1; newer pandas would
# otherwise produce bool columns.
bank_df = pd.get_dummies(bank_df, prefix_sep='_', drop_first=True, dtype='uint8')

In [10]:
# Separate the target column from the predictors, then hold out 30% of the
# rows for validation (random_state=1 makes the split repeatable).
y = bank_df['Personal_Loan']
X = bank_df.drop(columns='Personal_Loan')
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y,
    random_state=1,
    test_size=0.3,
)

In [30]:
# train neural network with one hidden layer of 20 nodes
# (20,) is a real one-element tuple; random_state=1 fixes the initialisation.
clf = MLPClassifier(hidden_layer_sizes=(20,), activation='logistic', solver='lbfgs',
                    random_state=1)
clf.fit(train_X, train_y.values)

# training performance
classificationSummary(train_y, clf.predict(train_X))

# validation performance
classificationSummary(valid_y, clf.predict(valid_X))

Confusion Matrix (Accuracy 0.9549)

       Prediction
Actual    0    1
     0 3129   40
     1  118  213
Confusion Matrix (Accuracy 0.9393)

       Prediction
Actual    0    1
     0 1332   19
     1   72   77


In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score each partition once and reuse the predictions for every metric.
train_pred = clf.predict(train_X)
valid_pred = clf.predict(valid_X)

print("Accuracy on train is:",accuracy_score(train_y,train_pred))
print("Accuracy on test is:",accuracy_score(valid_y,valid_pred))
# Precision, recall and F1 are all computed on the held-out partition
# (valid_y / valid_pred), so each of these labels says "test".
print("precision score on test is :",precision_score(valid_y,valid_pred))
print("Recall score on test is :",recall_score(valid_y,valid_pred))
print("f-1 score on test is :",f1_score(valid_y,valid_pred))

Accuracy on train is: 0.9548571428571428
Accuracy on test is: 0.9393333333333334
precision score on test is : 0.8020833333333334
Recall score on train is : 0.5167785234899329
f-1 score on train is : 0.6285714285714287


In [32]:
# H2O provides the AutoML and deep-learning estimators used in later cells.
import h2o
from h2o.estimators import H2ODeepLearningEstimator
# Connect to an H2O instance; as the recorded output shows, a local server is
# started when none is already running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_144"; Java(TM) SE Runtime Environment (build 1.8.0_144-b01); Java HotSpot(TM) 64-Bit Server VM (build 25.144-b01, mixed mode)
  Starting server from /Users/acast/opt/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/5g/xczpm95d75572js_3n8sww9w0000gn/T/tmph1zu5804
  JVM stdout: /var/folders/5g/xczpm95d75572js_3n8sww9w0000gn/T/tmph1zu5804/h2o_acast_started_from_python.out
  JVM stderr: /var/folders/5g/xczpm95d75572js_3n8sww9w0000gn/T/tmph1zu5804/h2o_acast_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,08 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.2
H2O_cluster_version_age:,6 months and 11 days !!!
H2O_cluster_name:,H2O_from_python_acast_pdqhfo
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.556 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [33]:
# Convert the prepared pandas frame into an H2OFrame.
bank_h2o = h2o.H2OFrame(bank_df)
# Turn the response into a factor (categorical) column so the H2O models treat
# it as a class label rather than a number.
bank_h2o['Personal_Loan'] = bank_h2o['Personal_Loan'].asfactor()

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [None]:
# Display the H2O frame to inspect the converted columns.
bank_h2o

In [42]:
# Run AutoML
from h2o.automl import H2OAutoML

# Budget: at most 10 models and 120 seconds, with 3-fold cross-validation and
# a fixed seed.
aml = H2OAutoML(seed=1, nfolds=3, max_models=10, max_runtime_secs=120)
# NOTE(review): x lists every column, including the response 'Personal_Loan' —
# confirm H2O leaves the response out of the predictors.
aml.train(
    training_frame=bank_h2o,
    y='Personal_Loan',
    x=bank_h2o.columns,
)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [43]:
# Check the leaderboard: one row per model from the AutoML run with its
# metrics; the bare name on the last line renders it as the cell output.
lb = aml.leaderboard
lb

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_3_AutoML_20201109_125158,0.998444,0.0304072,0.987873,0.0485804,0.0922249,0.00850543
GBM_2_AutoML_20201109_125158,0.998333,0.0320496,0.987194,0.048691,0.0952011,0.00906324
GBM_4_AutoML_20201109_125158,0.998161,0.0339355,0.986681,0.0438145,0.0955178,0.00912366
GBM_1_AutoML_20201109_125158,0.998081,0.0341978,0.986611,0.0505531,0.0963629,0.0092858
XGBoost_3_AutoML_20201109_125158,0.997197,0.0365045,0.981306,0.0429019,0.100532,0.0101067
StackedEnsemble_AllModels_AutoML_20201109_125158,0.996435,0.0374887,0.984321,0.0448562,0.0945752,0.00894447
StackedEnsemble_BestOfFamily_AutoML_20201109_125158,0.996407,0.0375031,0.984271,0.0410214,0.0942902,0.00889065
GBM_5_AutoML_20201109_125158,0.996086,0.0478638,0.975145,0.0529222,0.110044,0.0121097
XGBoost_1_AutoML_20201109_125158,0.995901,0.0454179,0.975671,0.0544524,0.106452,0.0113321
DRF_1_AutoML_20201109_125158,0.994314,0.0599529,0.981239,0.0438145,0.107,0.0114489




In [47]:
# Build and train the model:
# A 3-fold cross-validated deep-learning classifier for Personal_Loan.
# seed = 1 matches the seed / random_state used by the other models in this
# notebook. Per the H2O documentation, deep-learning results are only exactly
# repeatable when reproducible=True is also set (single-threaded); that is
# left off here so training time is unaffected.
dl = H2ODeepLearningEstimator(nfolds = 3, seed = 1)
dl.train(x = bank_h2o.columns, y = 'Personal_Loan', training_frame = bank_h2o)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [49]:
# Display the fitted model: layer summary, training and cross-validation
# metrics, scoring history and variable importances.
dl

Model Details
H2ODeepLearningEstimator :  Deep Learning
Model Key:  DeepLearning_model_python_1604939848298_1010


Status of Neuron Layers: predicting Personal_Loan, 2-class classification, bernoulli distribution, CrossEntropy loss, 43,202 weights/biases, 515.7 KB, 52,322 training samples, mini-batch size 1


Unnamed: 0,Unnamed: 1,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
0,,1,12,Input,0.0,,,,,,,,,
1,,2,200,Rectifier,0.0,0.0,0.0,0.00314273,0.0027399,0.0,-0.003736,0.104611,0.436504,0.0382117
2,,3,200,Rectifier,0.0,0.0,0.0,0.0527952,0.139942,0.0,-0.00630984,0.0730069,0.981874,0.0318575
3,,4,2,Softmax,,0.0,0.0,0.00327804,0.00594565,0.0,-0.00394354,0.382758,-0.000125139,0.024339




ModelMetricsBinomial: deeplearning
** Reported on train data. **

MSE: 0.01441609471317134
RMSE: 0.12006704257693425
LogLoss: 0.08085818767071302
Mean Per-Class Error: 0.03538901179941001
AUC: 0.9897762260324484
AUCPR: 0.9688411548514201
Gini: 0.9795524520648968

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.06174209087229067: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,4498.0,22.0,0.0049,(22.0/4520.0)
1,1,46.0,434.0,0.0958,(46.0/480.0)
2,Total,4544.0,456.0,0.0136,(68.0/5000.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.06174209,0.92735,234.0
1,max f2,0.01148116,0.924784,287.0
2,max f0point5,0.4282438,0.962092,179.0
3,max accuracy,0.06174209,0.9864,234.0
4,max precision,0.9999988,1.0,0.0
5,max recall,1.096159e-07,1.0,399.0
6,max specificity,0.9999988,1.0,0.0
7,max absolute_mcc,0.06174209,0.920222,234.0
8,max min_per_class_accuracy,0.001430729,0.9625,348.0
9,max mean_per_class_accuracy,0.001720216,0.964611,342.0



Gains/Lift Table: Avg response rate:  9.60 %, avg score:  7.94 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.01,0.9999999,10.416667,10.416667,1.0,1.0,1.0,1.0,0.104167,0.104167,941.666667,941.666667
1,,2,0.02,0.9999968,10.416667,10.416667,1.0,0.999999,1.0,0.999999,0.104167,0.208333,941.666667,941.666667
2,,3,0.03,0.9999708,10.416667,10.416667,1.0,0.999988,1.0,0.999996,0.104167,0.3125,941.666667,941.666667
3,,4,0.04,0.9996387,10.416667,10.416667,1.0,0.9998746,1.0,0.999965,0.104167,0.416667,941.666667,941.666667
4,,5,0.05,0.9955155,10.416667,10.416667,1.0,0.9984753,1.0,0.999667,0.104167,0.520833,941.666667,941.666667
5,,6,0.1,0.01419267,8.166667,9.291667,0.784,0.58605,0.892,0.792859,0.408333,0.929167,716.666667,829.166667
6,,7,0.15,0.0004279677,0.833333,6.472222,0.08,0.002982544,0.621333,0.529567,0.041667,0.970833,-16.666667,547.222222
7,,8,0.2,7.293746e-05,0.166667,4.895833,0.016,0.0002090431,0.47,0.397227,0.008333,0.979167,-83.333333,389.583333
8,,9,0.3,1.081507e-06,0.125,3.305556,0.012,1.526878e-05,0.317333,0.264823,0.0125,0.991667,-87.5,230.555556
9,,10,0.4,1.04065e-07,0.0625,2.494792,0.006,4.261164e-07,0.2395,0.198618,0.00625,0.997917,-93.75,149.479167




ModelMetricsBinomial: deeplearning
** Reported on cross-validation data. **

MSE: 0.017127521497078295
RMSE: 0.13087215707352842
LogLoss: 0.08456874003240933
Mean Per-Class Error: 0.05524520648967557
AUC: 0.986075543879056
AUCPR: 0.9454397810348495
Gini: 0.9721510877581121

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.2662444461847636: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,4478.0,42.0,0.0093,(42.0/4520.0)
1,1,58.0,422.0,0.1208,(58.0/480.0)
2,Total,4536.0,464.0,0.02,(100.0/5000.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.2662444,0.894068,185.0
1,max f2,0.212759,0.888611,197.0
2,max f0point5,0.784581,0.928218,109.0
3,max accuracy,0.2662444,0.98,185.0
4,max precision,0.9999984,1.0,0.0
5,max recall,8.138272e-07,1.0,399.0
6,max specificity,0.9999984,1.0,0.0
7,max absolute_mcc,0.2662444,0.883184,185.0
8,max min_per_class_accuracy,0.01173608,0.9375,323.0
9,max mean_per_class_accuracy,0.02775215,0.944755,281.0



Gains/Lift Table: Avg response rate:  9.60 %, avg score:  8.65 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.01,0.9999994,10.416667,10.416667,1.0,0.9999998,1.0,1.0,0.104167,0.104167,941.666667,941.666667
1,,2,0.02,0.9999858,10.416667,10.416667,1.0,0.9999957,1.0,0.999998,0.104167,0.208333,941.666667,941.666667
2,,3,0.03,0.9999386,10.416667,10.416667,1.0,0.9999699,1.0,0.999988,0.104167,0.3125,941.666667,941.666667
3,,4,0.04,0.9995984,10.416667,10.416667,1.0,0.9998042,1.0,0.999942,0.104167,0.416667,941.666667,941.666667
4,,5,0.05,0.9977856,10.416667,10.416667,1.0,0.9989781,1.0,0.99975,0.104167,0.520833,941.666667,941.666667
5,,6,0.1,0.114449,7.458333,8.9375,0.716,0.6966609,0.858,0.848205,0.372917,0.89375,645.833333,793.75
6,,7,0.15,0.006398623,1.0,6.291667,0.096,0.03068219,0.604,0.575698,0.05,0.94375,0.0,529.166667
7,,8,0.2,0.0008314144,0.583333,4.864583,0.056,0.002680242,0.467,0.432443,0.029167,0.972917,-41.666667,386.458333
8,,9,0.3,4.030541e-05,0.145833,3.291667,0.014,0.0002375372,0.316,0.288375,0.014583,0.9875,-85.416667,229.166667
9,,10,0.4,3.311143e-06,0.0625,2.484375,0.006,1.495353e-05,0.2385,0.216285,0.00625,0.99375,-93.75,148.4375




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid
0,accuracy,0.98162067,0.0032113201,0.9779499,0.9830012,0.9839109
1,auc,0.9880061,0.00397547,0.9902481,0.983416,0.9903542
2,aucpr,0.9477135,0.0065645976,0.9494479,0.94045585,0.95323676
3,err,0.018379332,0.0032113201,0.02205006,0.016998827,0.01608911
4,err_count,30.666666,5.6862407,37.0,29.0,26.0
5,f0point5,0.9265587,0.025150469,0.9002433,0.92907804,0.95035464
6,f1,0.9002657,0.011338069,0.8888889,0.90034366,0.91156465
7,f2,0.8756559,0.0022463303,0.87781733,0.87333333,0.875817
8,lift_top_group,10.437969,0.6520707,9.870588,11.150327,10.292994
9,logloss,0.0842881,0.015250193,0.087226786,0.09785508,0.06778241



See the whole table with table.as_data_frame()

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2020-11-09 12:54:14,0.000 sec,,0.0,0,0.0,,,,,,,
1,,2020-11-09 12:54:15,28.606 sec,3663 obs/sec,1.0492,1,5246.0,0.168879,0.137991,0.671366,0.979265,0.920646,10.416667,0.0276
2,,2020-11-09 12:54:21,33.807 sec,4024 obs/sec,5.2498,5,26249.0,0.134828,0.076189,0.790531,0.98904,0.954224,10.416667,0.0186
3,,2020-11-09 12:54:26,39.498 sec,3917 obs/sec,9.4306,9,47153.0,0.105966,0.056958,0.870613,0.991611,0.969506,10.416667,0.0136
4,,2020-11-09 12:54:28,41.546 sec,3783 obs/sec,10.4644,10,52322.0,0.120067,0.080858,0.833885,0.989776,0.968841,10.416667,0.0136



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Income,1.0,1.0,0.119203
1,Family,0.892216,0.892216,0.106354
2,Education_Advanced/Professional,0.800844,0.800844,0.095463
3,Education_Graduate,0.798225,0.798225,0.09515
4,CreditCard,0.654996,0.654996,0.078077
5,Online,0.645824,0.645824,0.076984
6,Experience,0.63359,0.63359,0.075526
7,CCAvg,0.632115,0.632115,0.07535
8,Age,0.599742,0.599742,0.071491
9,Securities_Account,0.591112,0.591112,0.070462




In [None]:
!conda install tensorflow

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/acast/opt/anaconda3

  added / updated specs:
    - tensorflow


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _tflow_select-2.3.0        |              mkl           3 KB
    absl-py-0.11.0             |   py37hecd8cb5_0         169 KB
    astor-0.8.1                |           py37_0          46 KB
    c-ares-1.16.1              |       haf1e3a3_0          91 KB
    conda-4.9.1                |   py37hecd8cb5_0         2.9 MB
    gast-0.2.2                 |           py37_0         154 KB
    google-pasta-0.2.0

In [None]:
# NOTE(review): `!y` runs a shell command named `y`; it does not send input to
# a command started from another cell. Looks like a leftover attempt to answer
# an install prompt — confirm and remove.
!y