# Bank Marketing

### Install Requirements

In [0]:
! pip install memory_profiler



In [0]:
# Load the memory_profiler IPython extension; it provides the %memit magic used for profiling in this notebook.
%load_ext memory_profiler

In [0]:
! apt-get install default-jre
! java -version
! pip install h2o

Reading package lists... Done
Building dependency tree       
Reading state information... Done
default-jre is already the newest version (2:1.10-63ubuntu1~02).
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.
openjdk version "10.0.2" 2018-07-17
OpenJDK Runtime Environment (build 10.0.2+13-Ubuntu-1ubuntu0.18.04.4)
OpenJDK 64-Bit Server VM (build 10.0.2+13-Ubuntu-1ubuntu0.18.04.4, mixed mode)


### Import Libraries

In [0]:
from sklearn.preprocessing import MinMaxScaler
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
import h2o
import os

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
def process_bank_data(ratios=[0.8, 0.1]):
    """ Downloads the banking dataset, preprocess the data and splits it.
    Args:
        ratio: Split ratio. Default is 0.8
    
    Returns:
        train, test and sample dataset
    
    """
    
    user = "fazilbtopal"
    key = "a01ead977f55d872c4deeadb0f173aa1"

    if '.kaggle' not in os.listdir('/root'):
        !mkdir ~/.kaggle
    !touch /root/.kaggle/kaggle.json
    !chmod 666 /root/.kaggle/kaggle.json
    with open('/root/.kaggle/kaggle.json', 'w') as f:
        f.write('{"username":"%s","key":"%s"}' % (user, key))
    !chmod 600 /root/.kaggle/kaggle.json

    print('Downloading data from web..')
    ! kaggle datasets download -d sonujha090/bank-marketing
    ! unzip -qq /content/bank-marketing.zip
    
    print('Reading the dataset..')
    # Load the data set.
    bank_df = h2o.import_file("/content/bank-full.csv")
    bank_sample_df = h2o.import_file("/content/bank.csv")


    print('Splitting train & test frames..')
    train, test, val = bank_df.split_frame(ratios=ratios, seed=1234)

    return train, test, sample

In [0]:
# Connect to the local H2O cluster; max_mem_size is the memory cap requested when a new cluster has to be started.
h2o.init(max_mem_size="4G")

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,2 hours 28 mins
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.3
H2O cluster version age:,10 days
H2O cluster name:,H2O_from_python_unknownUser_frr5wp
H2O cluster total nodes:,1
H2O cluster free memory:,3.998 Gb
H2O cluster total cores:,2
H2O cluster allowed cores:,2


In [0]:
# Download the data and unpack the training frame, the validation frame and the frame read from bank.csv.
train, val, sample = process_bank_data()

In [0]:
train.head(10)

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no
42,entrepreneur,divorced,tertiary,yes,2,yes,no,unknown,5,may,380,1,-1,0,unknown,no
58,retired,married,primary,no,121,yes,no,unknown,5,may,50,1,-1,0,unknown,no
41,admin.,divorced,secondary,no,270,yes,no,unknown,5,may,222,1,-1,0,unknown,no
29,admin.,single,secondary,no,390,yes,no,unknown,5,may,137,1,-1,0,unknown,no




In [0]:
train.describe()

In [0]:
train.types

{'age': 'int',
 'balance': 'int',
 'campaign': 'int',
 'contact': 'enum',
 'day': 'int',
 'default': 'enum',
 'duration': 'int',
 'education': 'enum',
 'housing': 'enum',
 'job': 'enum',
 'loan': 'enum',
 'marital': 'enum',
 'month': 'enum',
 'pdays': 'int',
 'poutcome': 'enum',
 'previous': 'int',
 'y': 'enum'}

## Logistic Regression

In [0]:
# Every column except the last one is a predictor; the last column is the target.
*feature_columns, label = train.col_names

In [0]:
# Binomial-family GLM (logistic regression); the solver choice is left to H2O via "AUTO".
model = H2OGeneralizedLinearEstimator(
    model_id="glm_v1",
    family="binomial",
    solver="AUTO",
)

In [0]:
# Fit the GLM on `train`; `val` is passed as validation_frame so the model report also covers validation data.
model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [0]:
model

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  glm_v1


ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.07126613766420797
RMSE: 0.2669571832039887
LogLoss: 0.23983725623041302
Null degrees of freedom: 36223
Residual degrees of freedom: 36182
Null deviance: 26065.033126033304
Residual deviance: 17375.72953938096
AIC: 17459.72953938096
AUC: 0.9070133046414631
pr_auc: 0.5455021097651589
Gini: 0.8140266092829263
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.21923662065754926: 


0,1,2,3,4
,no,yes,Error,Rate
no,29524.0,2482.0,0.0775,(2482.0/32006.0)
yes,1509.0,2709.0,0.3578,(1509.0/4218.0)
Total,31033.0,5191.0,0.1102,(3991.0/36224.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2192366,0.5758317,224.0
max f2,0.1050183,0.6887997,288.0
max f0point5,0.3465799,0.5719736,174.0
max accuracy,0.4022977,0.9031581,155.0
max precision,0.9062164,0.7154930,23.0
max recall,0.0031216,1.0,397.0
max specificity,0.9992664,0.9996251,0.0
max absolute_mcc,0.1589545,0.5206402,253.0
max min_per_class_accuracy,0.1168565,0.8342817,280.0


Gains/Lift Table: Avg response rate: 11.64 %, avg score: 11.64 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100210,0.9024603,6.1274950,6.1274950,0.7134986,0.9546178,0.7134986,0.9546178,0.0614035,0.0614035,512.7495046,512.7495046
,2,0.0200144,0.8099567,5.9309091,6.0293376,0.6906077,0.8526814,0.7020690,0.9037199,0.0592698,0.1206733,493.0909100,502.9337650
,3,0.0300077,0.7207274,5.8597382,5.9728565,0.6823204,0.7622316,0.6954922,0.8566005,0.0585586,0.1792319,485.9738191,497.2856505
,4,0.0400011,0.6325346,5.6699491,5.8971819,0.6602210,0.6758306,0.6866805,0.8114392,0.0566619,0.2358938,466.9949100,489.7181915
,5,0.0500221,0.5615671,4.9918975,5.7158253,0.5812672,0.5949416,0.6655629,0.7680680,0.0500237,0.2859175,399.1897509,471.5825271
,6,0.1000166,0.3163073,4.3579966,5.0370983,0.5074544,0.4241396,0.5865305,0.5961513,0.2178758,0.5037933,335.7996638,403.7098344
,7,0.1500110,0.2066819,3.0634013,4.3793204,0.3567090,0.2558440,0.5099374,0.4827364,0.1531532,0.6569464,206.3401336,337.9320413
,8,0.2000055,0.1467397,2.2619852,3.8500597,0.2633904,0.1729920,0.4483092,0.4053110,0.1130868,0.7700332,126.1985197,285.0059671
,9,0.2999945,0.0880610,1.2353190,2.9785596,0.1438432,0.1133180,0.3468299,0.3079889,0.1235183,0.8935514,23.5318960,197.8559638




ModelMetricsBinomialGLM: glm
** Reported on validation data. **

MSE: 0.0706300616089855
RMSE: 0.2657631682701452
LogLoss: 0.23828595057289353
Null degrees of freedom: 4594
Residual degrees of freedom: 4553
Null deviance: 3362.8735995876823
Residual deviance: 2189.847885764891
AIC: 2273.847885764891
AUC: 0.9126833311273722
pr_auc: 0.5596829193662611
Gini: 0.8253666622547444
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.20746383675718202: 


0,1,2,3,4
,no,yes,Error,Rate
no,3702.0,344.0,0.085,(344.0/4046.0)
yes,169.0,380.0,0.3078,(169.0/549.0)
Total,3871.0,724.0,0.1116,(513.0/4595.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2074638,0.5970149,214.0
max f2,0.1251457,0.7086012,264.0
max f0point5,0.3305971,0.6057577,164.0
max accuracy,0.3454317,0.9062024,159.0
max precision,0.9860435,0.7333333,3.0
max recall,0.0183376,1.0,376.0
max specificity,0.9991153,0.9995057,0.0
max absolute_mcc,0.1594383,0.5439642,241.0
max min_per_class_accuracy,0.1251457,0.8433515,264.0


Gains/Lift Table: Avg response rate: 11.95 %, avg score: 12.19 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100109,0.9071555,5.6404926,5.6404926,0.6739130,0.9646750,0.6739130,0.9646750,0.0564663,0.0564663,464.0492595,464.0492595
,2,0.0200218,0.8277024,5.0946385,5.3675655,0.6086957,0.8720386,0.6413043,0.9183568,0.0510018,0.1074681,409.4638473,436.7565534
,3,0.0300326,0.7449510,5.6404926,5.4585412,0.6739130,0.7837556,0.6521739,0.8734897,0.0564663,0.1639344,464.0492595,445.8541221
,4,0.0400435,0.6694433,6.7322008,5.7769561,0.8043478,0.7130213,0.6902174,0.8333726,0.0673953,0.2313297,573.2200839,477.6956126
,5,0.0500544,0.6134350,5.6404926,5.7496634,0.6739130,0.6396067,0.6869565,0.7946194,0.0564663,0.2877960,464.0492595,474.9663420
,6,0.1001088,0.3502584,4.7307357,5.2401996,0.5652174,0.4642480,0.6260870,0.6294337,0.2367942,0.5245902,373.0735725,424.0199572
,7,0.1501632,0.2182733,2.8384414,4.4396135,0.3391304,0.2736415,0.5304348,0.5108363,0.1420765,0.6666667,183.8441435,343.9613527
,8,0.2,0.1530650,2.4487953,3.9435337,0.2925764,0.1832102,0.4711643,0.4291971,0.1220401,0.7887067,144.8795348,294.3533698
,9,0.3001088,0.0904129,1.1280985,3.0043748,0.1347826,0.1169328,0.3589558,0.3250335,0.1129326,0.9016393,12.8098519,200.4374755



Scoring History: 


0,1,2,3,4,5
,timestamp,duration,iterations,negative_log_likelihood,objective
,2019-02-05 11:15:17,0.000 sec,0,13032.5165630,0.3597757
,2019-02-05 11:15:17,0.182 sec,1,9798.4225565,0.2745847
,2019-02-05 11:15:18,0.277 sec,2,9056.4022792,0.2521491
,2019-02-05 11:15:18,0.341 sec,3,8696.5873987,0.2429587
,2019-02-05 11:15:18,0.384 sec,4,8688.3893951,0.2428121
,2019-02-05 11:15:18,0.427 sec,5,8687.8647697,0.2428074




In [0]:
# Retrain the model under %memit to record peak memory.
# NOTE(review): %memit presumably measures only this Python process, not the H2O JVM doing the training — confirm.
%memit model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%
peak memory: 151.50 MiB, increment: 1.01 MiB


In [0]:
# Retrain the model once more under %time to record CPU and wall-clock time of the train() call.
%time model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%
CPU times: user 46.4 ms, sys: 3.6 ms, total: 50 ms
Wall time: 544 ms


In [0]:
# Retrain the model under the %prun profiler to collect Python-side function-call statistics.
%prun model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

glm Model Build progress: |███████████████████████████████████████████████| 100%
 

**Results**

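**Accuracy** (validation, maximum over thresholds): 0.9062  
**AUC** (validation): 0.91268  
**AUC Precision Recall** (validation): 0.55968  
**Average Loss** (validation LogLoss): 0.23829  
**Loss** (validation RMSE): 0.2658  
**Precision** ("no" class at the max-F1 threshold, validation): 0.9563  
**Recall** ("no" class at the max-F1 threshold, validation): 0.9150  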


**%time**    
CPU times: user 46.4 ms, sys: 3.6 ms, total: 50 ms  
Wall time: 544 ms

**%prun**   
30356 function calls (28684 primitive calls) in 0.797 seconds

**%memit**   
peak memory: 151.50 MiB, increment: 1.01 MiB

## Random Forest

In [0]:
# 200-tree distributed random forest. The seed (same value as the data split) makes the
# random row/column sampling repeatable between runs. Note that this rebinds `model`.
model = H2ORandomForestEstimator(model_id="rd_v1", ntrees=200, seed=1234)

In [0]:
# Fit the random forest on `train`; `val` is passed as validation_frame so the model report also covers validation data.
model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [0]:
model

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  rd_v1


ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.06268215294274644
RMSE: 0.2503640408340352
LogLoss: 0.20027416620381755
Mean Per-Class Error: 0.1308599765566716
AUC: 0.9315084524958824
pr_auc: 0.6113683366816596
Gini: 0.8630169049917649
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.29482220603287906: 


0,1,2,3,4
,no,yes,Error,Rate
no,29138.0,2868.0,0.0896,(2868.0/32006.0)
yes,1019.0,3199.0,0.2416,(1019.0/4218.0)
Total,30157.0,6067.0,0.1073,(3887.0/36224.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2948222,0.6220710,222.0
max f2,0.1591537,0.7355085,281.0
max f0point5,0.4538122,0.6017792,156.0
max accuracy,0.4602345,0.9077683,153.0
max precision,0.9696794,1.0,0.0
max recall,0.0000418,1.0,399.0
max specificity,0.9696794,1.0,0.0
max absolute_mcc,0.2922522,0.5746827,223.0
max min_per_class_accuracy,0.1814040,0.8629944,270.0


Gains/Lift Table: Avg response rate: 11.64 %, avg score: 11.89 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100210,0.7989653,6.8135852,6.8135852,0.7933884,0.8543801,0.7933884,0.8543801,0.0682788,0.0682788,581.3585225,581.3585225
,2,0.0200144,0.7255138,6.2630400,6.5386923,0.7292818,0.7602698,0.7613793,0.8073898,0.0625889,0.1308677,526.3040010,553.8692304
,3,0.0300077,0.6799661,5.8360146,6.3046819,0.6795580,0.7023885,0.7341306,0.7724216,0.0583215,0.1891892,483.6014555,530.4681867
,4,0.0400011,0.6383720,5.6936727,6.1520350,0.6629834,0.6580567,0.7163561,0.7438501,0.0568990,0.2460882,469.3672736,515.2035003
,5,0.0500221,0.6040176,5.4650631,6.0144132,0.6363636,0.6206244,0.7003311,0.7191641,0.0547653,0.3008535,446.5063149,501.4413158
,6,0.1000166,0.4605962,4.6567494,5.3357686,0.5422419,0.5310350,0.6213083,0.6251255,0.2328118,0.5336652,365.6749400,433.5768646
,7,0.1500110,0.3360500,3.4569962,4.7096264,0.4025400,0.3973858,0.5483990,0.5492262,0.1728307,0.7064960,245.6996245,370.9626427
,8,0.2000055,0.2212653,2.4753800,4.1511419,0.2882385,0.2765096,0.4833678,0.4810565,0.1237553,0.8302513,147.5380027,315.1141923
,9,0.2999945,0.0811368,1.1049110,3.1358251,0.1286582,0.1408049,0.3651422,0.3676497,0.1104789,0.9407302,10.4911008,213.5825058




ModelMetricsBinomial: drf
** Reported on validation data. **

MSE: 0.06171774416418573
RMSE: 0.2484305620574605
LogLoss: 0.19631429532563424
Mean Per-Class Error: 0.1270671881738874
AUC: 0.9369707381506123
pr_auc: 0.6309145213992088
Gini: 0.8739414763012245
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3218680327013135: 


0,1,2,3,4
,no,yes,Error,Rate
no,3726.0,320.0,0.0791,(320.0/4046.0)
yes,128.0,421.0,0.2332,(128.0/549.0)
Total,3854.0,741.0,0.0975,(448.0/4595.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3218680,0.6527132,193.0
max f2,0.1591111,0.7453886,267.0
max f0point5,0.4327916,0.6407840,146.0
max accuracy,0.4327916,0.9140370,146.0
max precision,0.9384309,1.0,0.0
max recall,0.0009231,1.0,395.0
max specificity,0.9384309,1.0,0.0
max absolute_mcc,0.3218680,0.6065548,193.0
max min_per_class_accuracy,0.1875453,0.8645576,251.0


Gains/Lift Table: Avg response rate: 11.95 %, avg score: 12.16 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100109,0.7914781,6.5502495,6.5502495,0.7826087,0.8401354,0.7826087,0.8401354,0.0655738,0.0655738,555.0249465,555.0249465
,2,0.0200218,0.7245978,6.0043953,6.2773224,0.7173913,0.7550527,0.75,0.7975940,0.0601093,0.1256831,500.4395343,527.7322404
,3,0.0300326,0.6737543,5.4585412,6.0043953,0.6521739,0.6958352,0.7173913,0.7636744,0.0546448,0.1803279,445.8541221,500.4395343
,4,0.0400435,0.6370252,5.8224440,5.9589075,0.6956522,0.6554225,0.7119565,0.7366115,0.0582878,0.2386157,482.2443969,495.8907500
,5,0.0500544,0.6082129,5.6404926,5.8952245,0.6739130,0.6241383,0.7043478,0.7141168,0.0564663,0.2950820,464.0492595,489.5224519
,6,0.1001088,0.4703972,5.0582482,5.4767364,0.6043478,0.5373779,0.6543478,0.6257474,0.2531876,0.5482696,405.8248198,447.6736359
,7,0.1501632,0.3488107,3.5662469,4.8399065,0.4260870,0.4062760,0.5782609,0.5525903,0.1785064,0.7267760,256.6246931,383.9906549
,8,0.2,0.2268075,2.0833035,4.1530055,0.2489083,0.2885490,0.4961915,0.4867954,0.1038251,0.8306011,108.3303505,315.3005464
,9,0.3001088,0.0854710,1.1280985,3.1439720,0.1347826,0.1479669,0.3756345,0.3737707,0.1129326,0.9435337,12.8098519,214.3971966



Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
,2019-02-05 11:23:06,0.032 sec,0.0,,,,,,,,,,,,
,2019-02-05 11:23:07,0.780 sec,1.0,0.3391352,3.6377265,0.6850405,0.1183961,4.3067389,0.1195578,0.3393706,3.6266722,0.7035260,0.1215133,4.2775047,0.1194777
,2019-02-05 11:23:07,1.255 sec,2.0,0.3295028,3.2763331,0.7168626,0.1651809,4.3773138,0.1200530,0.3050297,1.9767677,0.7853255,0.2695527,4.9959052,0.1388466
,2019-02-05 11:23:08,1.639 sec,3.0,0.3248134,2.9949378,0.7334451,0.1983478,4.3884378,0.1295817,0.2873014,1.1052341,0.8347026,0.3731574,5.8520296,0.1434168
,2019-02-05 11:23:08,2.088 sec,4.0,0.3164099,2.6330350,0.7557639,0.2322225,4.5356655,0.1297722,0.2799211,0.7733529,0.8656727,0.4334579,5.8270503,0.1170838
,2019-02-05 11:23:09,2.591 sec,5.0,0.3088143,2.2592981,0.7749824,0.2663090,4.6270353,0.1340904,0.2748885,0.5894898,0.8785387,0.4682451,6.3200253,0.1149075
,2019-02-05 11:23:09,3.132 sec,6.0,0.3028205,1.9712183,0.7922002,0.2973885,4.7420017,0.1381870,0.2700190,0.4926860,0.8913918,0.5047294,6.1863467,0.1186072
,2019-02-05 11:23:10,3.658 sec,7.0,0.2968943,1.6954449,0.8101700,0.3267194,4.7819959,0.1387644,0.2674707,0.4013613,0.9005280,0.5231048,5.8588342,0.1244831
,2019-02-05 11:23:14,7.776 sec,23.0,0.2625469,0.3814623,0.9031747,0.5290447,5.9438811,0.1147306,0.2534593,0.2454233,0.9252031,0.6084349,6.5992364,0.1022851


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
duration,130268.2265625,1.0,0.2879407
month,59591.1875000,0.4574499,0.1317184
age,41046.8476562,0.3150949,0.0907286
day,38325.8359375,0.2942071,0.0847142
poutcome,35658.2382812,0.2737294,0.0788178
job,33843.3164062,0.2597972,0.0748062
balance,24444.8984375,0.1876505,0.0540322
pdays,19133.8300781,0.1468803,0.0422928
campaign,14689.5419922,0.1127638,0.0324693




In [0]:
# Retrain the model under %memit to record peak memory.
# NOTE(review): %memit presumably measures only this Python process, not the H2O JVM doing the training — confirm.
%memit model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

drf Model Build progress: |███████████████████████████████████████████████| 100%
peak memory: 152.40 MiB, increment: 0.38 MiB


In [0]:
# Retrain the model once more under %time to record CPU and wall-clock time of the train() call.
%time model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

drf Model Build progress: |███████████████████████████████████████████████| 100%
CPU times: user 331 ms, sys: 47.6 ms, total: 379 ms
Wall time: 24.4 s


In [0]:
# Retrain the model under the %prun profiler to collect Python-side function-call statistics.
%prun model.train(x=feature_columns, y=label, training_frame=train, validation_frame=val)

drf Model Build progress: |███████████████████████████████████████████████| 100%
 

**Results**

**Train Loss (RMSE)**: 0.250364  
**Validation Loss (RMSE)**: 0.248431  

**%time**    
CPU times: user 331 ms, sys: 47.6 ms, total: 379 ms   
Wall time: 24.4 s

**%prun**   
211981 function calls (206241 primitive calls) in 24.111 seconds 

**%memit**   
peak memory: 152.40 MiB, increment: 0.38 MiB