In [1]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch

In [2]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,2 hours 26 mins
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.6
H2O cluster version age:,3 months and 17 days !!!
H2O cluster name:,h2o_python
H2O cluster total nodes:,1
H2O cluster free memory:,22.60 Gb
H2O cluster total cores:,48
H2O cluster allowed cores:,48


In [3]:
DATAFILE = "/mnt/fs-h2o/Telco-Customer-Churn.csv"
data = h2o.import_file(DATAFILE, destination_frame= "gbm_data")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
data.head()

customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No




In [5]:
data.columns

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [6]:
# mark as categorical
data['SeniorCitizen'] = data['SeniorCitizen'].asfactor()

In [7]:
# build the list of features I want to use (excluding customeriD from list)
features = ['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges']

In [8]:
train, valid = data.split_frame([0.85], seed=1234, destination_frames= ["gbm_train",
                                                                              "gbm_valid"])

In [9]:
# grid search for hyper-parameter optimization

In [10]:
# this is one possible set of params, chosen after some initial trials. We can add more with longer exec time
ntrees_opt = [10, 20, 25, 30, 40, 50]
max_depth_opt = [3,4,5,6,7]
learn_rate_opt = [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]

hyper_parameters = {"ntrees": ntrees_opt, "max_depth": max_depth_opt, "learn_rate": learn_rate_opt}

In [11]:
gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)

In [12]:
# important to give an ID to the model, to make easier to find it in H2O Flow
%time gs.train(x = features, y = 'Churn', training_frame = train, validation_frame = valid, model_id = "gbm_churn")

gbm Grid Build progress: |████████████████████████████████████████████████| 100%
CPU times: user 5.63 s, sys: 326 ms, total: 5.95 s
Wall time: 2min 13s


In [13]:
# sort the grid in order of decreasing recall (top is higher recall)
grid = gs.get_grid(sort_by="recall", decreasing=True)

In [14]:
grid

       learn_rate max_depth ntrees  \
0             0.2         6     25   
1             0.2         4     30   
2             0.5         5     25   
3             0.5         3     30   
4             0.3         5     25   
..  ..        ...       ...    ...   
175           0.5         6     20   
176          0.01         7     50   
177          0.05         4     50   
178           0.2         3     30   
179          0.05         7     10   

                                                         model_ids recall  
0     Grid_GBM_gbm_train_model_python_1579439308081_67913_model_82    1.0  
1    Grid_GBM_gbm_train_model_python_1579439308081_67913_model_100    1.0  
2     Grid_GBM_gbm_train_model_python_1579439308081_67913_model_78    1.0  
3     Grid_GBM_gbm_train_model_python_1579439308081_67913_model_96    1.0  
4     Grid_GBM_gbm_train_model_python_1579439308081_67913_model_77    1.0  
..                                                             ...    ...  
175   Grid_



In [15]:
# this way we get the Top Model !
best_gbm = grid.models[0]

In [16]:
# let's see model's characteristics
best_gbm

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_gbm_train_model_python_1579439308081_67913_model_82


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,25.0,25.0,16638.0,6.0,6.0,6.0,32.0,62.0,48.32




ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.10178347399061727
RMSE: 0.3190352237459326
LogLoss: 0.32487880350961806
Mean Per-Class Error: 0.1496028628348558
AUC: 0.9209948857906753
pr_auc: 0.801391521936984
Gini: 0.8419897715813507

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3618093028929226: 


Unnamed: 0,Unnamed: 1,No,Yes,Error,Rate
0,No,3797.0,599.0,0.1363,(599.0/4396.0)
1,Yes,291.0,1298.0,0.1831,(291.0/1589.0)
2,Total,4088.0,1897.0,0.1487,(890.0/5985.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.361809,0.744693,206.0
1,max f2,0.245301,0.827668,255.0
2,max f0point5,0.508107,0.745139,148.0
3,max accuracy,0.442085,0.859482,174.0
4,max precision,0.949624,1.0,0.0
5,max recall,0.017715,1.0,390.0
6,max specificity,0.949624,1.0,0.0
7,max absolute_mcc,0.361809,0.645948,206.0
8,max min_per_class_accuracy,0.334787,0.842039,217.0
9,max mean_per_class_accuracy,0.282367,0.850397,237.0



Gains/Lift Table: Avg response rate: 26.55 %, avg score: 26.60 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010025,0.883155,3.76652,3.76652,1.0,0.905523,1.0,0.905523,0.03776,0.03776,276.651982,276.651982
1,,2,0.02005,0.859217,3.703744,3.735132,0.983333,0.870101,0.991667,0.887812,0.03713,0.07489,270.374449,273.513216
2,,3,0.030075,0.832437,3.452643,3.640969,0.916667,0.84659,0.966667,0.874071,0.034613,0.109503,245.264317,264.096916
3,,4,0.0401,0.813001,3.264317,3.546806,0.866667,0.822179,0.941667,0.861098,0.032725,0.142228,226.431718,254.680617
4,,5,0.050292,0.789469,3.210804,3.478713,0.852459,0.799263,0.923588,0.848567,0.032725,0.174953,221.080378,247.871266
5,,6,0.100251,0.690313,3.174458,3.327093,0.842809,0.740756,0.883333,0.794841,0.15859,0.333543,217.445818,232.709251
6,,7,0.150042,0.611616,2.730095,3.12898,0.724832,0.649923,0.830735,0.74675,0.135935,0.469478,173.009491,212.897972
7,,8,0.2,0.534201,2.506814,2.973568,0.665552,0.571498,0.789474,0.702974,0.125236,0.594714,150.68142,197.356828
8,,9,0.300084,0.383633,1.936708,2.627756,0.51419,0.454753,0.697661,0.620187,0.193833,0.788546,93.670802,162.775576
9,,10,0.4,0.258056,1.19672,2.270296,0.317726,0.318325,0.602757,0.544785,0.119572,0.908118,19.672035,127.029578




ModelMetricsBinomial: gbm
** Reported on validation data. **

MSE: 0.13272008786449208
RMSE: 0.3643076829611092
LogLoss: 0.40764014173635277
Mean Per-Class Error: 0.2223925817113478
AUC: 0.8506059493206023
pr_auc: 0.6827040118104296
Gini: 0.7012118986412046

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.2615505840659436: 


Unnamed: 0,Unnamed: 1,No,Yes,Error,Rate
0,No,582.0,196.0,0.2519,(196.0/778.0)
1,Yes,54.0,226.0,0.1929,(54.0/280.0)
2,Total,636.0,422.0,0.2363,(250.0/1058.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.261551,0.643875,233.0
1,max f2,0.109375,0.754446,315.0
2,max f0point5,0.518419,0.649123,131.0
3,max accuracy,0.518419,0.811909,131.0
4,max precision,0.945534,1.0,0.0
5,max recall,0.013461,1.0,395.0
6,max specificity,0.945534,1.0,0.0
7,max absolute_mcc,0.261551,0.500202,233.0
8,max min_per_class_accuracy,0.295627,0.767352,220.0
9,max mean_per_class_accuracy,0.261551,0.777607,233.0



Gains/Lift Table: Avg response rate: 26.47 %, avg score: 26.73 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010397,0.881513,3.778571,3.778571,1.0,0.907011,1.0,0.907011,0.039286,0.039286,277.857143,277.857143
1,,2,0.020794,0.856941,3.435065,3.606818,0.909091,0.869155,0.954545,0.888083,0.035714,0.075,243.506494,260.681818
2,,3,0.030246,0.837239,3.400714,3.542411,0.9,0.848118,0.9375,0.875594,0.032143,0.107143,240.071429,254.241071
3,,4,0.040643,0.812854,2.748052,3.339203,0.727273,0.821572,0.883721,0.861775,0.028571,0.135714,174.805195,233.920266
4,,5,0.050095,0.786786,3.778571,3.422102,1.0,0.80008,0.90566,0.850134,0.035714,0.171429,277.857143,242.210243
5,,6,0.100189,0.691952,2.566577,2.99434,0.679245,0.740727,0.792453,0.79543,0.128571,0.3,156.657682,199.433962
6,,7,0.150284,0.60297,2.281402,2.756694,0.603774,0.648616,0.72956,0.746492,0.114286,0.414286,128.140162,175.669362
7,,8,0.200378,0.525306,2.06752,2.5844,0.54717,0.563196,0.683962,0.700668,0.103571,0.517857,106.752022,158.440027
8,,9,0.302457,0.376242,1.434458,2.196295,0.37963,0.446791,0.58125,0.614985,0.146429,0.664286,43.445767,119.629464
9,,10,0.399811,0.259256,1.467406,2.018811,0.38835,0.317628,0.534279,0.542579,0.142857,0.807143,46.740638,101.881121




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
0,,2020-01-19 15:35:46,31.083 sec,0.0,0.441598,0.578729,0.5,0.0,1.0,0.734503,0.441148,0.577867,0.5,0.0,1.0,0.73535
1,,2020-01-19 15:35:46,31.097 sec,1.0,0.413054,0.518441,0.861449,0.661399,3.452643,0.212364,0.413397,0.519626,0.844641,0.664874,3.387685,0.20983
2,,2020-01-19 15:35:46,31.110 sec,2.0,0.394142,0.480815,0.869829,0.679641,3.479243,0.225898,0.395838,0.485009,0.852803,0.686496,3.778571,0.189981
3,,2020-01-19 15:35:46,31.125 sec,3.0,0.380835,0.454098,0.872863,0.696725,3.652383,0.221053,0.383324,0.460057,0.858238,0.698744,3.778571,0.21172
4,,2020-01-19 15:35:46,31.142 sec,4.0,0.370958,0.433621,0.878119,0.711878,3.650627,0.217043,0.37606,0.444685,0.856718,0.699277,3.778571,0.202268
5,,2020-01-19 15:35:46,31.161 sec,5.0,0.363558,0.417995,0.88124,0.719113,3.643027,0.215372,0.371088,0.433302,0.856716,0.696391,3.778571,0.212665
6,,2020-01-19 15:35:46,31.183 sec,6.0,0.357837,0.405355,0.884828,0.726521,3.710303,0.197995,0.366655,0.422831,0.860331,0.699599,3.778571,0.218336
7,,2020-01-19 15:35:46,31.206 sec,7.0,0.353695,0.395942,0.886307,0.730673,3.76652,0.198663,0.36455,0.417207,0.859532,0.699048,3.778571,0.194707
8,,2020-01-19 15:35:46,31.231 sec,8.0,0.350387,0.388141,0.887974,0.734029,3.76652,0.196491,0.363739,0.414182,0.857398,0.698102,3.778571,0.195652
9,,2020-01-19 15:35:46,31.255 sec,9.0,0.34708,0.381017,0.890775,0.73943,3.70577,0.192982,0.363111,0.412162,0.855408,0.693928,3.778571,0.221172



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Contract,540.7901,1.0,0.356499
1,tenure,183.42366,0.339177,0.120916
2,MonthlyCharges,165.425018,0.305895,0.109051
3,TotalCharges,160.586472,0.296948,0.105861
4,InternetService,109.318932,0.202147,0.072065
5,OnlineSecurity,73.12355,0.135216,0.048204
6,PaymentMethod,56.537415,0.104546,0.03727
7,TechSupport,44.357285,0.082023,0.029241
8,PaperlessBilling,29.856867,0.05521,0.019682
9,OnlineBackup,24.678579,0.045634,0.016269




### Train the model using parameters identified.