In [1]:
import pandas as pd
import h2o
h2o.init()

dt = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

valid_ids = dt.policy_id.sample(frac=0.25)
train_dt = dt[~dt.policy_id.isin(valid_ids)]
valid_dt = dt[dt.policy_id.isin(valid_ids)]
train_dt = train_dt.reset_index(drop=True)
valid_dt = valid_dt.reset_index(drop=True)

train_dt_hex = h2o.H2OFrame(train_dt)
valid_dt_hex = h2o.H2OFrame(valid_dt)
test_hex = h2o.H2OFrame(test)

train_dt_hex['make'] = train_dt_hex['make'].asfactor()
valid_dt_hex['make'] = valid_dt_hex['make'].asfactor()
test_hex['make'] = test_hex['make'].asfactor()

train_dt_hex['is_claim'] = train_dt_hex['is_claim'].asfactor()
valid_dt_hex['is_claim'] = valid_dt_hex['is_claim'].asfactor()

selvars = ['policy_tenure','age_of_car','age_of_policyholder','population_density','airbags','width','height','ncap_rating',
    'area_cluster','segment','model','fuel_type','max_torque','max_power','engine_type','is_esc','is_adjustable_steering','is_parking_sensors','is_parking_camera','rear_brakes_type','transmission_type',
'steering_type','is_front_fog_lights','is_rear_window_wiper','is_rear_window_defogger','is_brake_assist','is_power_door_locks','is_central_locking','is_power_steering','is_driver_seat_height_adjustable']

numvars = ['policy_tenure','age_of_car','age_of_policyholder','population_density','airbags','displacement','cylinder','gear_box','turning_radius','length','width','height','gross_weight','ncap_rating']
catvars = ['area_cluster','segment','model','fuel_type','max_torque','max_power','engine_type','is_esc','is_adjustable_steering','is_tpms','is_parking_sensors','is_parking_camera','rear_brakes_type','transmission_type',
'steering_type','is_front_fog_lights','is_rear_window_wiper','is_rear_window_washer','is_rear_window_defogger','is_brake_assist','is_power_door_locks','is_central_locking','is_power_steering','is_driver_seat_height_adjustable',
'is_day_night_rear_view_mirror','is_ecw','is_speed_alert']

allvars =  ['policy_tenure','age_of_car','age_of_policyholder','population_density','airbags','displacement','cylinder','gear_box','turning_radius','length','width','height','gross_weight','ncap_rating',
'area_cluster','segment','model','fuel_type','max_torque','max_power','engine_type','is_esc','is_adjustable_steering','is_tpms','is_parking_sensors','is_parking_camera','rear_brakes_type','transmission_type',
'steering_type','is_front_fog_lights','is_rear_window_wiper','is_rear_window_washer','is_rear_window_defogger','is_brake_assist','is_power_door_locks','is_central_locking','is_power_steering','is_driver_seat_height_adjustable',
'is_day_night_rear_view_mirror','is_ecw','is_speed_alert']

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,4 hours 17 mins
H2O_cluster_timezone:,Asia/Kolkata
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.2
H2O_cluster_version_age:,16 days
H2O_cluster_name:,H2O_from_python_Paul_4kgr44
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.946 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [2]:
# glm model
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators import H2OGradientBoostingEstimator
from h2o.estimators import H2ORandomForestEstimator
othervars = [i for i in allvars if i not in selvars]
mdl_glm = H2OGeneralizedLinearEstimator(family='binomial',lambda_search=True,nfolds=5).train(x=selvars,y='is_claim',training_frame=train_dt_hex,validation_frame=valid_dt_hex)
mdl_gbm = H2OGradientBoostingEstimator(ntrees=65,min_rows=150,max_depth=3,nfolds=5).train(x=selvars,y='is_claim',training_frame=train_dt_hex,validation_frame=valid_dt_hex)
mdl_rf = H2ORandomForestEstimator(ntrees=160,min_rows=50,max_depth=6,nfolds=5).train(x=selvars,y='is_claim',training_frame=train_dt_hex,validation_frame=valid_dt_hex)

selvars_cat = [i for i in catvars if i in selvars]
from catboost import CatBoostClassifier
clf = CatBoostClassifier(iterations=25,depth=7).fit(
    train_dt[selvars], train_dt.is_claim,
    cat_features=selvars_cat,
    eval_set=(valid_dt[selvars], valid_dt.is_claim),
)

pred_dt = h2o.as_list(mdl_glm.predict(train_dt_hex),use_pandas=True)
train_dt['mdl1_glm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl_gbm.predict(train_dt_hex),use_pandas=True)
train_dt['mdl1_gbm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl_rf.predict(train_dt_hex),use_pandas=True)
train_dt['mdl1_rf_score'] = pred_dt['p1']
pred_val = clf.predict_proba(train_dt[selvars])
train_dt['mdl1_catboost_score'] = [pred_val[val][1] for val in range(len(pred_val))]

pred_dt = h2o.as_list(mdl_glm.predict(valid_dt_hex),use_pandas=True)
valid_dt['mdl1_glm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl_gbm.predict(valid_dt_hex),use_pandas=True)
valid_dt['mdl1_gbm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl_rf.predict(valid_dt_hex),use_pandas=True)
valid_dt['mdl1_rf_score'] = pred_dt['p1']
pred_val = clf.predict_proba(valid_dt[selvars])
valid_dt['mdl1_catboost_score'] = [pred_val[val][1] for val in range(len(pred_val))]

pred_dt = h2o.as_list(mdl_glm.predict(test_hex),use_pandas=True)
test['mdl1_glm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl_gbm.predict(test_hex),use_pandas=True)
test['mdl1_gbm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl_rf.predict(test_hex),use_pandas=True)
test['mdl1_rf_score'] = pred_dt['p1']
pred_val = clf.predict_proba(test[selvars])
test['mdl1_catboost_score'] = [pred_val[val][1] for val in range(len(pred_val))]

glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Learning rate set to 0.401422
0:	learn: 0.3601710	test: 0.3596923	best: 0.3596923 (0)	total: 238ms	remaining: 5.7s
1:	learn: 0.2694577	test: 0.2692912	best: 0.2692912 (1)	total: 319ms	remaining: 3.67s
2:	learn: 0.2434306	test: 0.2429648	best: 0.2429648 (2)	total: 375ms	remaining: 2.75s
3:	learn: 0.2349176	test: 0.2344204	best: 0.2344204 (3)	total: 444ms	remaining: 2.33s
4:	learn: 0.2305794	test: 0.2302244	best: 0.2302244 (4)	total: 514ms	remaining: 2.06s
5:	learn: 0.2296488	test: 0.2291888	best: 0.2291888 (5)	total: 559ms	remaining: 1.77s
6:	learn: 0.2288040	test: 0.2285216	best: 0.2285216 (6)	total: 620ms	remaining: 1.59s
7:	learn: 0.2278347	test: 0.2280351	best: 0.2280351 (7)	total: 687ms	remaining: 1.46

In [4]:
score_vars_mdls = ['mdl1_glm_score','mdl1_gbm_score','mdl1_rf_score','mdl1_catboost_score']
train_dt_hex = h2o.H2OFrame(train_dt)
valid_dt_hex = h2o.H2OFrame(valid_dt)
test_hex = h2o.H2OFrame(test)

train_dt_hex['make'] = train_dt_hex['make'].asfactor()
valid_dt_hex['make'] = valid_dt_hex['make'].asfactor()
test_hex['make'] = test_hex['make'].asfactor()

train_dt_hex['is_claim'] = train_dt_hex['is_claim'].asfactor()
valid_dt_hex['is_claim'] = valid_dt_hex['is_claim'].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


In [8]:
mdl_glm_fnl = H2OGeneralizedLinearEstimator(family='binomial',lambda_search=True,interactions=score_vars_mdls,nfolds=5).train(x=score_vars_mdls,y='is_claim',training_frame=train_dt_hex,validation_frame=valid_dt_hex)

glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


In [11]:
import numpy as np
from sklearn.metrics import f1_score

thresh = 0.079

pred_dt = h2o.as_list(mdl_glm_fnl.predict(train_dt_hex),use_pandas=True)
train_dt['score'] = pred_dt['p1']
train_dt['prediction'] = np.where(train_dt.score>=thresh,1,0)

pred_dt = h2o.as_list(mdl_glm_fnl.predict(valid_dt_hex),use_pandas=True)
valid_dt['score'] = pred_dt['p1']
valid_dt['prediction'] = np.where(valid_dt.score>=thresh,1,0)

print(f1_score(y_true=train_dt.is_claim.values,y_pred=train_dt.prediction.values))
print(f1_score(y_true=valid_dt.is_claim.values,y_pred=valid_dt.prediction.values))

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
0.19164090848004303
0.17718940936863545


In [12]:
pred_dt = h2o.as_list(mdl_glm_fnl.predict(test_hex),use_pandas=True)
test['score'] = pred_dt['p1']
test['is_claim'] = np.where(test.score>=thresh,1,0)
print(test[['policy_id','is_claim']].head(5))
test[['policy_id','is_claim']].to_csv('output_iter5.csv',index=None)

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
  policy_id  is_claim
0   ID58593         1
1   ID58594         0
2   ID58595         0
3   ID58596         0
4   ID58597         0


In [13]:
othervars = [i for i in allvars if i not in othervars]
mdl2_glm = H2OGeneralizedLinearEstimator(family='binomial',lambda_search=True,nfolds=5).train(x=othervars,y='is_claim',training_frame=train_dt_hex,validation_frame=valid_dt_hex)
mdl2_gbm = H2OGradientBoostingEstimator(ntrees=65,min_rows=150,max_depth=3,nfolds=5).train(x=othervars,y='is_claim',training_frame=train_dt_hex,validation_frame=valid_dt_hex)
mdl2_rf = H2ORandomForestEstimator(ntrees=160,min_rows=50,max_depth=6,nfolds=5).train(x=othervars,y='is_claim',training_frame=train_dt_hex,validation_frame=valid_dt_hex)

pred_dt = h2o.as_list(mdl2_glm.predict(train_dt_hex),use_pandas=True)
train_dt['mdl2_glm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl2_gbm.predict(train_dt_hex),use_pandas=True)
train_dt['mdl2_gbm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl2_rf.predict(train_dt_hex),use_pandas=True)
train_dt['mdl2_rf_score'] = pred_dt['p1']

pred_dt = h2o.as_list(mdl2_glm.predict(valid_dt_hex),use_pandas=True)
valid_dt['mdl2_glm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl2_gbm.predict(valid_dt_hex),use_pandas=True)
valid_dt['mdl2_gbm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl2_rf.predict(valid_dt_hex),use_pandas=True)
valid_dt['mdl2_rf_score'] = pred_dt['p1']

pred_dt = h2o.as_list(mdl2_glm.predict(test_hex),use_pandas=True)
test['mdl2_glm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl2_gbm.predict(test_hex),use_pandas=True)
test['mdl2_gbm_score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl2_rf.predict(test_hex),use_pandas=True)
test['mdl2_rf_score'] = pred_dt['p1']

glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |████████████████████████

In [14]:
score_vars_mdls = ['mdl1_glm_score','mdl1_gbm_score','mdl1_rf_score','mdl1_catboost_score','mdl2_glm_score','mdl2_gbm_score','mdl2_rf_score']
train_dt_hex = h2o.H2OFrame(train_dt)
valid_dt_hex = h2o.H2OFrame(valid_dt)
test_hex = h2o.H2OFrame(test)

train_dt_hex['make'] = train_dt_hex['make'].asfactor()
valid_dt_hex['make'] = valid_dt_hex['make'].asfactor()
test_hex['make'] = test_hex['make'].asfactor()

train_dt_hex['is_claim'] = train_dt_hex['is_claim'].asfactor()
valid_dt_hex['is_claim'] = valid_dt_hex['is_claim'].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [16]:
mdl_glm_fnl = H2OGeneralizedLinearEstimator(family='binomial',lambda_search=True,interactions=score_vars_mdls,nfolds=5).train(x=score_vars_mdls,y='is_claim',training_frame=train_dt_hex,validation_frame=valid_dt_hex)

glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


In [18]:
thresh = 0.08

pred_dt = h2o.as_list(mdl_glm_fnl.predict(train_dt_hex),use_pandas=True)
train_dt['score'] = pred_dt['p1']

pred_dt = h2o.as_list(mdl_glm_fnl.predict(valid_dt_hex),use_pandas=True)
valid_dt['score'] = pred_dt['p1']

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
0.19226150767178118
0.17494949494949497


In [22]:
thresh = 0.09
train_dt['prediction'] = np.where(train_dt.score>=thresh,1,0)
valid_dt['prediction'] = np.where(valid_dt.score>=thresh,1,0)
print(f1_score(y_true=train_dt.is_claim.values,y_pred=train_dt.prediction.values))
print(f1_score(y_true=valid_dt.is_claim.values,y_pred=valid_dt.prediction.values))

0.19125459746628526
0.17635270541082163


In [23]:
pred_dt = h2o.as_list(mdl_glm_fnl.predict(test_hex),use_pandas=True)
test['score'] = pred_dt['p1']
test['is_claim'] = np.where(test.score>=thresh,1,0)
print(test[['policy_id','is_claim']].head(5))
test[['policy_id','is_claim']].to_csv('output_iter6.csv',index=None)

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
  policy_id  is_claim
0   ID58593         1
1   ID58594         0
2   ID58595         0
3   ID58596         0
4   ID58597         0


In [24]:
score_vars_mdls = ['mdl1_glm_score','mdl1_gbm_score','mdl1_rf_score','mdl1_catboost_score']
train_dt_hex = h2o.H2OFrame(train_dt)
valid_dt_hex = h2o.H2OFrame(valid_dt)
test_hex = h2o.H2OFrame(test)

train_dt_hex['make'] = train_dt_hex['make'].asfactor()
valid_dt_hex['make'] = valid_dt_hex['make'].asfactor()
test_hex['make'] = test_hex['make'].asfactor()

train_dt_hex['is_claim'] = train_dt_hex['is_claim'].asfactor()
valid_dt_hex['is_claim'] = valid_dt_hex['is_claim'].asfactor()

mdl_rf_fnl = H2ORandomForestEstimator(ntrees=15,min_rows=50,max_depth=3,nfolds=5).train(x=score_vars_mdls,y='is_claim',training_frame=train_dt_hex,validation_frame=valid_dt_hex)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


In [25]:
pred_dt = h2o.as_list(mdl_rf_fnl.predict(train_dt_hex),use_pandas=True)
train_dt['score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl_rf_fnl.predict(valid_dt_hex),use_pandas=True)
valid_dt['score'] = pred_dt['p1']

drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [34]:
thresh = 0.093
train_dt['prediction'] = np.where(train_dt.score>=thresh,1,0)
valid_dt['prediction'] = np.where(valid_dt.score>=thresh,1,0)
print(f1_score(y_true=train_dt.is_claim.values,y_pred=train_dt.prediction.values))
print(f1_score(y_true=valid_dt.is_claim.values,y_pred=valid_dt.prediction.values))

0.1925730676796651
0.17800047721307563


In [35]:
pred_dt = h2o.as_list(mdl_rf_fnl.predict(test_hex),use_pandas=True)
test['score'] = pred_dt['p1']
test['is_claim'] = np.where(test.score>=thresh,1,0)
print(test[['policy_id','is_claim']].head(5))
test[['policy_id','is_claim']].to_csv('output_iter7.csv',index=None)

drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
  policy_id  is_claim
0   ID58593         1
1   ID58594         0
2   ID58595         0
3   ID58596         0
4   ID58597         0


In [37]:
from h2o.automl import H2OAutoML
aml = H2OAutoML(max_models=100, seed=1,nfolds=5,max_runtime_secs=600)
aml.train(x=score_vars_mdls, y='is_claim', training_frame=train_dt_hex,leaderboard_frame=valid_dt_hex)

AutoML progress: |█
17:02:32.44: AutoML: XGBoost is not available; skipping it.

██████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
,1,4,Input,0.0,,,,,,,,,
,2,10,Rectifier,0.0,0.0,0.0,0.0029213,0.0015719,0.0,0.0428848,0.4184036,0.6893302,0.2691964
,3,10,Rectifier,0.0,0.0,0.0,0.0043204,0.0065935,0.0,0.0007385,0.2821121,0.7810962,0.1446891
,4,10,Rectifier,0.0,0.0,0.0,0.0526745,0.1679507,0.0,-0.0470053,0.3321271,0.9501382,0.0978331
,5,2,Softmax,,0.0,0.0,0.0143212,0.0281806,0.0,-0.2477929,1.3034983,-0.008681,0.207704

Unnamed: 0,0,1,Error,Rate
0,7006.0,2296.0,0.2468,(2296.0/9302.0)
1,350.0,325.0,0.5185,(350.0/675.0)
Total,7356.0,2621.0,0.2652,(2646.0/9977.0)

metric,threshold,value,idx
max f1,0.0555839,0.1972087,185.0
max f2,0.0396064,0.3364389,255.0
max f0point5,0.0849989,0.1765562,91.0
max accuracy,0.2119865,0.9322442,0.0
max precision,0.1589506,0.3636364,7.0
max recall,0.006949,1.0,390.0
max specificity,0.2119865,0.9998925,0.0
max absolute_mcc,0.0396064,0.1435357,255.0
max min_per_class_accuracy,0.0477658,0.629972,220.0
max mean_per_class_accuracy,0.0396064,0.6428717,255.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100231,0.1161633,3.9908,3.9908,0.27,0.1340679,0.27,0.1340679,0.04,0.04,299.08,299.08,0.0321522
2,0.0200461,0.1044743,2.8083407,3.3995704,0.19,0.1095877,0.23,0.1218278,0.0281481,0.0681481,180.8340741,239.957037,0.0515926
3,0.0300692,0.0956184,2.8083407,3.2024938,0.19,0.1002197,0.2166667,0.1146251,0.0281481,0.0962963,180.8340741,220.2493827,0.0710329
4,0.0400922,0.08939,1.3302667,2.734437,0.09,0.0920472,0.185,0.1089806,0.0133333,0.1096296,33.0266667,173.4437037,0.0745834
5,0.050015,0.085696,2.3888066,2.6658651,0.1616162,0.0873266,0.1803607,0.1046845,0.0237037,0.1333333,138.8806584,166.5865063,0.0893643
6,0.1000301,0.0737608,1.8957263,2.2807957,0.1282565,0.0793248,0.1543086,0.0920047,0.0948148,0.2281481,89.5726267,128.0795665,0.137415
7,0.1500451,0.0668608,1.5698983,2.0438299,0.1062124,0.0700805,0.1382766,0.0846966,0.0785185,0.3066667,56.9898315,104.3829882,0.1679868
8,0.2000601,0.0607853,1.392174,1.8809159,0.0941884,0.0638102,0.1272545,0.079475,0.0696296,0.3762963,39.2173978,88.0915906,0.1890247
9,0.29999,0.053062,1.5121721,1.7580834,0.1023069,0.0566244,0.1189442,0.0718632,0.1511111,0.5274074,51.2172072,75.8083429,0.24392
10,0.40002,0.0469281,1.1403978,1.6036233,0.0771543,0.0498727,0.1084941,0.0663642,0.1140741,0.6414815,14.0397833,60.3623338,0.2589831

Unnamed: 0,0,1,Error,Rate
0,30119.0,11005.0,0.2676,(11005.0/41124.0)
1,1390.0,1430.0,0.4929,(1390.0/2820.0)
Total,31509.0,12435.0,0.2821,(12395.0/43944.0)

metric,threshold,value,idx
max f1,0.0922509,0.1874795,223.0
max f2,0.0714845,0.3243746,263.0
max f0point5,0.1404443,0.1549764,146.0
max accuracy,0.4468429,0.9358274,9.0
max precision,0.4468429,0.5,9.0
max recall,0.0057458,1.0,398.0
max specificity,0.6582232,0.9999757,0.0
max absolute_mcc,0.0714845,0.1394095,263.0
max min_per_class_accuracy,0.0791401,0.6322695,249.0
max mean_per_class_accuracy,0.0714845,0.6412259,263.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100127,0.2307786,3.6124178,3.6124178,0.2318182,0.2901461,0.2318182,0.2901461,0.0361702,0.0361702,261.2417795,261.2417795,0.0279512
2,0.0200027,0.2002416,2.5557505,3.0846852,0.1640091,0.2133852,0.1979522,0.2518093,0.0255319,0.0617021,155.5750497,208.4685208,0.0445589
3,0.0300155,0.1837591,2.0541199,2.740903,0.1318182,0.1914081,0.1758908,0.2316603,0.0205674,0.0822695,105.4119923,174.0903005,0.0558372
4,0.0400055,0.1714256,2.1297921,2.5882991,0.1366743,0.1769946,0.1660978,0.2180094,0.0212766,0.1035461,112.9792081,158.8299083,0.0678978
5,0.0500182,0.1631231,1.9124565,2.4530076,0.1227273,0.1669409,0.1574158,0.2077864,0.0191489,0.122695,91.245648,145.300757,0.0776605
6,0.1000137,0.1356142,1.9930892,2.2231007,0.1279017,0.1482691,0.1426621,0.1780345,0.0996454,0.2223404,99.3089222,122.3100719,0.1307151
7,0.1500091,0.1170336,1.5746114,2.0069704,0.1010469,0.1258111,0.1287925,0.1606294,0.0787234,0.3010638,57.4611414,100.6970409,0.161413
8,0.2000046,0.1057778,1.4682188,1.8722978,0.0942194,0.1109671,0.1201502,0.1482152,0.0734043,0.3744681,46.8218751,87.2297819,0.186427
9,0.2999954,0.0897249,1.54624,1.7636201,0.0992262,0.0971852,0.1131761,0.1312065,0.1546099,0.529078,54.6240037,76.3620136,0.2447915
10,0.4000091,0.0767251,1.2551478,1.6364876,0.0805461,0.0831515,0.1050176,0.1191914,0.1255319,0.6546099,25.5147774,63.6487582,0.2720596

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.7428785,0.0670237,0.7484356,0.7240869,0.6393219,0.8057799,0.7967683
auc,0.687258,0.0046353,0.6888382,0.6940007,0.6872183,0.6844727,0.6817603
err,0.2571215,0.0670237,0.2515644,0.2759131,0.3606781,0.1942201,0.2032317
err_count,2259.8,589.1118,2211.0,2425.0,3170.0,1707.0,1786.0
f0point5,0.1429709,0.007661,0.1421622,0.1368123,0.1343391,0.1507208,0.1508201
f1,0.1922772,0.0032614,0.1921812,0.1876047,0.1909137,0.195191,0.1954955
f2,0.295871,0.021483,0.2965051,0.2983802,0.329806,0.276886,0.2777778
lift_top_group,3.894307,0.6271095,4.0585246,3.8615992,4.44649,4.2618275,2.843093
logloss,0.2258537,0.0032071,0.2273229,0.225408,0.2296584,0.2209243,0.2259549
max_per_class_error,0.5267669,0.1043215,0.5353357,0.5079086,0.3607556,0.6159555,0.613879

Unnamed: 0,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
,2022-11-13 17:03:28,0.000 sec,,0.0,0,0.0,,,,,,,
,2022-11-13 17:03:28,5.698 sec,331514 obs/sec,1.041075,1,45749.0,0.2488364,0.2391094,0.0183703,0.6828098,0.1321816,4.1386074,0.274331
,2022-11-13 17:03:29,6.762 sec,389481 obs/sec,10.4053113,10,457251.0,0.2494954,0.2411137,0.0131639,0.6860059,0.1322355,3.9908,0.26521

variable,relative_importance,scaled_importance,percentage
mdl1_gbm_score,1.0,1.0,0.2933832
mdl1_rf_score,0.8910623,0.8910623,0.2614227
mdl1_glm_score,0.8246097,0.8246097,0.2419267
mdl1_catboost_score,0.692839,0.692839,0.2032674


In [38]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
DeepLearning_1_AutoML_2_20221113_170232,0.660876,0.232953,0.101359,0.402003,0.242678,0.0588924
GLM_1_AutoML_2_20221113_170232,0.659975,0.227968,0.103387,0.395451,0.242183,0.0586527
GBM_grid_1_AutoML_2_20221113_170232_model_7,0.658466,0.227243,0.103975,0.399014,0.241693,0.0584154
GBM_grid_1_AutoML_2_20221113_170232_model_11,0.6573,0.227401,0.103563,0.403477,0.241711,0.0584244
GBM_grid_1_AutoML_2_20221113_170232_model_14,0.656569,0.227781,0.100379,0.398627,0.241941,0.0585352
GBM_grid_1_AutoML_2_20221113_170232_model_17,0.656038,0.227862,0.102843,0.396492,0.241995,0.0585617
GBM_grid_1_AutoML_2_20221113_170232_model_2,0.656032,0.227613,0.105753,0.402494,0.241728,0.0584322
GBM_grid_1_AutoML_2_20221113_170232_model_24,0.655856,0.227486,0.103301,0.400683,0.241757,0.0584466
GBM_grid_1_AutoML_2_20221113_170232_model_20,0.655556,0.22769,0.102532,0.399894,0.241848,0.0584904
GBM_grid_1_AutoML_2_20221113_170232_model_9,0.654298,0.227761,0.103054,0.3986,0.241821,0.0584775


In [39]:
mdl_automl = h2o.get_model('DeepLearning_1_AutoML_2_20221113_170232')

In [40]:
pred_dt = h2o.as_list(mdl_automl.predict(train_dt_hex),use_pandas=True)
train_dt['score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl_automl.predict(valid_dt_hex),use_pandas=True)
valid_dt['score'] = pred_dt['p1']

deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%


In [56]:
thresh = 0.0549
train_dt['prediction'] = np.where(train_dt.score>=thresh,1,0)
valid_dt['prediction'] = np.where(valid_dt.score>=thresh,1,0)
print(f1_score(y_true=train_dt.is_claim.values,y_pred=train_dt.prediction.values))
print(f1_score(y_true=valid_dt.is_claim.values,y_pred=valid_dt.prediction.values))

0.1891310196394942
0.1776586974443528


In [57]:
pred_dt = h2o.as_list(mdl_automl.predict(test_hex),use_pandas=True)
test['score'] = pred_dt['p1']
test['is_claim'] = np.where(test.score>=thresh,1,0)
print(test[['policy_id','is_claim']].head(5))
test[['policy_id','is_claim']].to_csv('output_iter8.csv',index=None)

deeplearning prediction progress: |██████████████████████████████████████████████| (done) 100%
  policy_id  is_claim
0   ID58593         1
1   ID58594         0
2   ID58595         0
3   ID58596         0
4   ID58597         0


In [58]:
mdl_automl = h2o.get_model('GLM_1_AutoML_2_20221113_170232')
pred_dt = h2o.as_list(mdl_automl.predict(train_dt_hex),use_pandas=True)
train_dt['score'] = pred_dt['p1']
pred_dt = h2o.as_list(mdl_automl.predict(valid_dt_hex),use_pandas=True)
valid_dt['score'] = pred_dt['p1']

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [75]:
thresh = 0.078
train_dt['prediction'] = np.where(train_dt.score>=thresh,1,0)
valid_dt['prediction'] = np.where(valid_dt.score>=thresh,1,0)
print(f1_score(y_true=train_dt.is_claim.values,y_pred=train_dt.prediction.values))
print(f1_score(y_true=valid_dt.is_claim.values,y_pred=valid_dt.prediction.values))

0.18948187536476233
0.17697729052466718


In [76]:
pred_dt = h2o.as_list(mdl_automl.predict(test_hex),use_pandas=True)
test['score'] = pred_dt['p1']
test['is_claim'] = np.where(test.score>=thresh,1,0)
print(test[['policy_id','is_claim']].head(5))
test[['policy_id','is_claim']].to_csv('output_iter9.csv',index=None)

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
  policy_id  is_claim
0   ID58593         1
1   ID58594         0
2   ID58595         0
3   ID58596         0
4   ID58597         0


In [77]:
thresh = 0.077
train_dt['prediction'] = np.where(train_dt.score>=thresh,1,0)
valid_dt['prediction'] = np.where(valid_dt.score>=thresh,1,0)
print(f1_score(y_true=train_dt.is_claim.values,y_pred=train_dt.prediction.values))
print(f1_score(y_true=valid_dt.is_claim.values,y_pred=valid_dt.prediction.values))

0.19013771657041317
0.1776416539050536


In [None]:
pred_dt = h2o.as_list(mdl_automl.predict(test_hex),use_pandas=True)
test['score'] = pred_dt['p1']
test['is_claim'] = np.where(test.score>=thresh,1,0)
print(test[['policy_id','is_claim']].head(5))
test[['policy_id','is_claim']].to_csv('output_iter10.csv',index=None)