<a href="https://colab.research.google.com/github/naolia1211/machine_learning_Algorithms/blob/main/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O_on_Malware.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Before training

In [None]:
!pip install h2o

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn import tree

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator


import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Initialize H2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.17" 2022-10-18; OpenJDK Runtime Environment (build 11.0.17+8-post-Ubuntu-1ubuntu218.04); OpenJDK 64-Bit Server VM (build 11.0.17+8-post-Ubuntu-1ubuntu218.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpe7d26lfg
  JVM stdout: /tmp/tmpe7d26lfg/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpe7d26lfg/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.3
H2O_cluster_version_age:,6 days
H2O_cluster_name:,H2O_from_python_unknownUser_eues2r
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.172 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [None]:
# Reading dataset from Google drive
train = pd.read_csv('/content/drive/MyDrive/KDD/KDDTrain+.txt')
test = pd.read_csv('/content/drive/MyDrive/KDD/KDDTest+.txt')

In [None]:
header = ['duration', 'protocol_type', 'service',
                          'src_bytes', 'dst_bytes', 'flag', 'land',
                          'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
                          'logged_in', 'num_compromised', 'root_shell',
                          'su_attempted', 'num_root', 'num_file_creations',
                          'num_shells', 'num_access_files', 'num_outbound_cmds',
                          'is_hot_login', 'is_guest_login', 'count', 'serror_rate',
                          'rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_count',
                          'srv_serror_rate', 'srv_rerror_rate', 'srv_diff_host_rate', 'dst_host_count',
                          'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
                          'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
                          'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
                          'dst_host_srv_rerror_rate', 'class', 'servere_points']

train.columns = header 
test.columns = header

In [None]:
def dataPrep(data, label, drops):
  # We exclude all numeric columns
  num_cols = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
  
  noEncode = data.pop(label)

  data.drop(drops, axis = 1, inplace=True)

  #print(data)

  data_catcol = data.select_dtypes(exclude=num_cols)
  data_numcols = data.select_dtypes(include=num_cols)
  # Steps to one-hot encoding:
  # We iterate through each categorical column name
  # Create encoded variables for each categorical columns
  # Concatenate the encoded variables to the DataFrame
  # Remove the original categorical variable
  for col in data_catcol.columns.values:
    one_hot_encoded_variables = pd.get_dummies(data_catcol[col],prefix=col)
    data_catcol = pd.concat([data_catcol,one_hot_encoded_variables],axis=1)
    data_catcol.drop([col],axis=1, inplace=True)

  noEncode = pd.Series(map(lambda x: 1 if x == 'normal' else 0, noEncode), name='class')
  
  data = pd.concat([data_numcols, data_catcol, noEncode], axis=1)

  return data

train = dataPrep(train, 'class', ['servere_points'])
test = dataPrep(test, 'class', ['servere_points'])

In [None]:
train = h2o.H2OFrame(train)
test = h2o.H2OFrame(test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [None]:
# Convert the categorical variables into factors

categorical_cols = ['protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp',
 'service_IRC', 'service_X11', 'service_Z39_50', 'service_aol',
 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns',
 'service_ctf', 'service_daytime', 'service_discard', 'service_domain',
 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i',
 'service_efs', 'service_exec', 'service_finger', 'service_ftp',
 'service_ftp_data', 'service_gopher', 'service_harvest', 'service_hostnames',
 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001',
 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell',
 'service_ldap', 'service_link', 'service_login', 'service_mtp',
 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn',
 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u',
 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3',
 'service_printer', 'service_private', 'service_red_i', 'service_remote_job',
 'service_rje', 'service_shell', 'service_smtp', 'service_sql_net', 'service_ssh',
 'service_sunrpc', 'service_supdup', 'service_systat', 'service_telnet',
 'service_tftp_u', 'service_tim_i', 'service_time', 'service_urh_i', 'service_urp_i',
 'service_uucp', 'service_uucp_path', 'service_vmnet', 'service_whois',
 'src_bytes_OTH', 'src_bytes_REJ', 'src_bytes_RSTO', 'src_bytes_RSTOS0',
 'src_bytes_RSTR', 'src_bytes_S0', 'src_bytes_S1', 'src_bytes_S2',
 'src_bytes_S3', 'src_bytes_SF', 'src_bytes_SH',]

for col in categorical_cols:
  train[col] = train[col].asfactor()

train['class'] = train['class'].asfactor()
train['class'].levels()



[['0', '1']]

In [None]:
# Define predictors manually

predictors = [col for col in train.columns if col != 'class']

target = 'class'


In [None]:
def report(trainedModel):
  print("AUC: ", trainedModel.auc())
  print("F1: ", trainedModel.F1())
  print("MSE: ", trainedModel.mse())
  print("Gini: ", trainedModel.gini())
  print("F0.5: ", trainedModel.F0point5())
  print("F2: ", trainedModel.F2())
  print("Accuracy: ", trainedModel.accuracy())
  print("Log loss: ", trainedModel.logloss())
  print("MSE: ", trainedModel.mse())
  print("Confusion matrix: ", trainedModel.confusion_matrix())

#GENERALIZED LINEAR MODEL (Defaut Settings)

STANDARDIZATION is enabled by default

GLM with default setting
GLM using lmbda search
GLM using Grid search
GLM WITH DEFAULT SETTINGS

Logistic Regression (Binomial Family)

H2O's GLM has the "family" argument, where the family is 'binomial' if the data is categorical 2 levels/classes or binary (Enum or Int).

In [None]:
GLM_default_settings = H2OGeneralizedLinearEstimator(family='binomial', \
                                            model_id='GLM_default',nfolds = 10, \
                                            fold_assignment = "Modulo", \
                                            keep_cross_validation_predictions = True)

GLM_default_settings.train(x = predictors, y = target, training_frame = train)

glm Model Build progress: |



██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
,binomial,logit,"Elastic Net (alpha = 0.5, lambda = 7.501E-4 )",205,49,7,py_85_sid_b6d4

Unnamed: 0,0,1,Error,Rate
0,56588.0,2042.0,0.0348,(2042.0/58630.0)
1,1675.0,65667.0,0.0249,(1675.0/67342.0)
Total,58263.0,67709.0,0.0295,(3717.0/125972.0)

metric,threshold,value,idx
max f1,0.5902304,0.9724771,196.0
max f2,0.1496319,0.9841027,306.0
max f0point5,0.7063629,0.9744042,164.0
max accuracy,0.5902304,0.9704934,196.0
max precision,0.9798944,0.9987255,34.0
max recall,5.05e-05,1.0,399.0
max specificity,0.9998719,0.9995395,0.0
max absolute_mcc,0.5902304,0.9406954,196.0
max min_per_class_accuracy,0.6418096,0.9695889,182.0
max mean_per_class_accuracy,0.602873,0.9701568,193.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100022,0.9981763,1.8290609,1.8290609,0.9777778,0.9989551,0.9777778,0.9989551,0.0182947,0.0182947,82.9060946,82.9060946,0.0178171
2,0.0200044,0.9975418,1.8706305,1.8498457,1.0,0.9977871,0.9888889,0.9983711,0.0187105,0.0370051,87.0630513,84.9845729,0.0365276
3,0.0300067,0.9971087,1.8706305,1.856774,1.0,0.9973091,0.9925926,0.9980171,0.0187105,0.0557156,87.0630513,85.6773991,0.055238
4,0.040001,0.99691,1.8706305,1.8602361,1.0,0.9970031,0.9944433,0.9977637,0.0186956,0.0744112,87.0630513,86.0236059,0.0739336
5,0.0500032,0.9967694,1.8691459,1.8620183,0.9992063,0.9968359,0.9953961,0.9975782,0.0186956,0.0931068,86.9145886,86.2018307,0.0926122
6,0.1000064,0.9960178,1.8703335,1.8661759,0.9998412,0.9964085,0.9976187,0.9969933,0.0935226,0.1866294,87.033354,86.6175924,0.1861178
7,0.1500016,0.9947616,1.8682544,1.8668687,0.9987298,0.9954652,0.997989,0.996484,0.0934038,0.2800333,86.8254355,86.6868661,0.2793851
8,0.2000048,0.9920596,1.8700366,1.8676607,0.9996825,0.9935819,0.9984124,0.9957585,0.0935078,0.373541,87.0036568,86.7660669,0.3728588
9,0.3000032,0.9863652,1.8691455,1.8681556,0.9992062,0.9887739,0.998677,0.9934303,0.1869116,0.5604526,86.9145532,86.815561,0.5595998
10,0.4000016,0.970391,1.8599387,1.8661014,0.9942844,0.9816525,0.9975788,0.9904859,0.1859909,0.7464435,85.993865,86.6101411,0.7443627

Unnamed: 0,0,1,Error,Rate
0,56392.0,2238.0,0.0382,(2238.0/58630.0)
1,1531.0,65811.0,0.0227,(1531.0/67342.0)
Total,57923.0,68049.0,0.0299,(3769.0/125972.0)

metric,threshold,value,idx
max f1,0.5553209,0.9721621,201.0
max f2,0.1545588,0.9838937,307.0
max f0point5,0.7164815,0.9741985,158.0
max accuracy,0.5955539,0.9701045,191.0
max precision,0.9802495,0.9987327,31.0
max recall,5.37e-05,1.0,399.0
max specificity,0.999884,0.9995395,0.0
max absolute_mcc,0.5955539,0.9399125,191.0
max min_per_class_accuracy,0.6412238,0.9693331,178.0
max mean_per_class_accuracy,0.6023813,0.9698265,189.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100022,0.9979735,1.8275763,1.8275763,0.9769841,0.9988208,0.9769841,0.9988208,0.0182798,0.0182798,82.7576319,82.7576319,0.0177852
2,0.0200044,0.9973112,1.8706305,1.8491034,1.0,0.9975821,0.9884921,0.9982014,0.0187105,0.0369903,87.0630513,84.9103416,0.0364957
3,0.0300067,0.9969485,1.8706305,1.8562791,1.0,0.9971015,0.992328,0.9978348,0.0187105,0.0557008,87.0630513,85.6279115,0.0552061
4,0.040001,0.9967611,1.8706305,1.8598648,1.0,0.996851,0.9942449,0.997589,0.0186956,0.0743964,87.0630513,85.9864828,0.0739017
5,0.0500032,0.9966142,1.8691459,1.8617213,0.9992063,0.9966863,0.9952373,0.9974084,0.0186956,0.093092,86.9145886,86.1721334,0.0925803
6,0.1000064,0.9958349,1.8703335,1.8660274,0.9998412,0.9962395,0.9975393,0.996824,0.0935226,0.1866146,87.033354,86.6027437,0.1860859
7,0.1500016,0.994527,1.8682544,1.8667697,0.9987298,0.9952623,0.9979361,0.9963035,0.0934038,0.2800184,86.8254355,86.6769665,0.2793532
8,0.2000048,0.9917266,1.8700366,1.8675864,0.9996825,0.9933111,0.9983727,0.9955554,0.0935078,0.3735262,87.0036568,86.7586423,0.3728269
9,0.3000032,0.9862079,1.869294,1.8681556,0.9992855,0.9885711,0.998677,0.9932273,0.1869264,0.5604526,86.929403,86.815561,0.5595998
10,0.4000016,0.9692985,1.8534047,1.864468,0.9907915,0.9812204,0.9967056,0.9902257,0.1853375,0.7457901,85.3404734,86.4467964,0.7429588

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
accuracy,0.9706363,0.0011585,0.9713447,0.9698365,0.9713424,0.9722156,0.9707867,0.9694372,0.9687227,0.970231,0.9722156,0.970231
auc,0.9949009,0.0005436,0.9945607,0.9947052,0.9953798,0.9953271,0.9950663,0.9941571,0.9946351,0.9949182,0.9959480,0.9943119
err,0.0293637,0.0011585,0.0286553,0.0301635,0.0286576,0.0277844,0.0292133,0.0305628,0.0312773,0.0297690,0.0277844,0.0297690
err_count,369.9,14.594139,361.0,380.0,361.0,350.0,368.0,385.0,394.0,375.0,350.0,375.0
f0point5,0.9699516,0.0023987,0.9714001,0.9670294,0.9711923,0.9710094,0.9678368,0.9680839,0.9664139,0.9707687,0.9730758,0.9727054
f1,0.9726591,0.0011756,0.9734968,0.9716967,0.9733835,0.9741888,0.9728132,0.9719367,0.9707975,0.9717769,0.974359,0.9721417
f2,0.975388,0.0019063,0.9756026,0.9764093,0.9755845,0.9773891,0.9778411,0.9758204,0.9752212,0.9727873,0.9756455,0.9715788
lift_top_group,1.8276885,0.0198980,1.8270015,1.846831,1.8053722,1.8385062,1.8326485,1.7900052,1.8367546,1.8542025,1.835099,1.8104639
logloss,0.0855102,0.0035576,0.0868858,0.0871271,0.0838902,0.0812805,0.0836265,0.0895657,0.0885456,0.0876679,0.0785201,0.0879924
max_per_class_error,0.0369202,0.0040897,0.0352719,0.0410844,0.0352679,0.0362146,0.0411075,0.0411551,0.0420197,0.0333613,0.0328322,0.0308874

Unnamed: 0,timestamp,duration,iterations,negative_log_likelihood,objective,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
,2022-11-29 17:48:04,0.000 sec,0,87015.642796,0.6907538,,,,,,,
,2022-11-29 17:48:05,0.869 sec,1,25233.4607429,0.2070473,,,,,,,
,2022-11-29 17:48:05,1.320 sec,2,15683.9500471,0.1343506,,,,,,,
,2022-11-29 17:48:06,1.733 sec,3,12392.0473476,0.1103337,,,,,,,
,2022-11-29 17:48:06,2.120 sec,4,11156.7031354,0.1025011,,,,,,,
,2022-11-29 17:48:06,2.551 sec,5,10735.5608704,0.1007458,,,,,,,
,2022-11-29 17:48:07,3.005 sec,6,10634.784609,0.1005626,,,,,,,
,2022-11-29 17:48:07,3.304 sec,7,10622.6298563,0.100558,0.1506937,0.0843253,0.9087291,0.9951001,0.9933915,1.8290609,0.0295066

variable,relative_importance,scaled_importance,percentage
service_private.0,1.7371504,1.0,0.0598050
service_private.1,1.7371479,0.9999986,0.0598049
dst_host_srv_count,1.4096733,0.8114860,0.0485309
service_eco_i.1,1.2278910,0.7068420,0.0422727
protocol_type_icmp.1,1.1610413,0.6683597,0.0399712
service_urp_i.1,1.1564823,0.6657353,0.0398143
src_bytes_REJ.1,1.1518837,0.6630881,0.0396560
service_ecr_i.1,1.1400074,0.6562514,0.0392471
count,1.1125814,0.6404635,0.0383029
service_other.1,1.0804071,0.6219422,0.0371952


In [None]:
print(h2o.save_model(model=GLM_default_settings, path="/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/", force=True))

/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GLM_default


In [None]:
''' GLM_default_settings = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GLM_default')
print(h2o.get_model(GLM_default_settings)) '''

" GLM_default_settings = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GLM_default')\nprint(h2o.get_model(GLM_default_settings)) "

# **GLM WITH LAMBDA SEARCH**

The model parameter, lambda, controls the amount of regularization in a GLM model
Setting  lambda_search = True gives us optimal lambda value for the regularization strength.

In [None]:
GLM_regularized = H2OGeneralizedLinearEstimator(family='binomial', model_id='GLM', \
                                                lambda_search=True, nfolds = 10, \
                                                fold_assignment = "Modulo", \
                                                keep_cross_validation_predictions = True)

GLM_regularized.train(x = predictors, y = target,training_frame = train)

glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
,binomial,logit,"Elastic Net (alpha = 0.5, lambda = 7.501E-5 )","nlambda = 100, lambda.max = 0.7501, lambda.min = 7.501E-5, lambda.1se = 1.088E-4",205,75,199,py_85_sid_b6d4

Unnamed: 0,0,1,Error,Rate
0,57133.0,1497.0,0.0255,(1497.0/58630.0)
1,1790.0,65552.0,0.0266,(1790.0/67342.0)
Total,58923.0,67049.0,0.0261,(3287.0/125972.0)

metric,threshold,value,idx
max f1,0.654562,0.9755415,182.0
max f2,0.1469595,0.98461,317.0
max f0point5,0.8586696,0.9789543,115.0
max accuracy,0.654562,0.9739069,182.0
max precision,0.9863687,0.9982286,30.0
max recall,2.35e-05,1.0,399.0
max specificity,0.9996882,0.9995054,0.0
max absolute_mcc,0.654562,0.9475903,182.0
max min_per_class_accuracy,0.6464281,0.9737848,184.0
max mean_per_class_accuracy,0.6679321,0.9739463,179.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100022,0.9997017,1.8290609,1.8290609,0.9777778,0.9998516,0.9777778,0.9998516,0.0182947,0.0182947,82.9060946,82.9060946,0.0178171
2,0.0200044,0.9996259,1.8706305,1.8498457,1.0,0.999667,0.9888889,0.9997593,0.0187105,0.0370051,87.0630513,84.9845729,0.0365276
3,0.0300067,0.9994389,1.8706305,1.856774,1.0,0.9995425,0.9925926,0.999687,0.0187105,0.0557156,87.0630513,85.6773991,0.055238
4,0.040001,0.9992109,1.8661731,1.8591224,0.9976172,0.9993191,0.993848,0.9995951,0.0186511,0.0743667,86.6173093,85.9122367,0.0738379
5,0.0500032,0.9989343,1.8706305,1.8614244,1.0,0.9990702,0.9950786,0.9994901,0.0187105,0.0930771,87.0630513,86.1424362,0.0925484
6,0.1000064,0.9976754,1.865879,1.8636517,0.9974599,0.998146,0.9962692,0.9988181,0.0932999,0.186377,86.5878951,86.3651656,0.1855754
7,0.1500016,0.9969962,1.8700365,1.8657797,0.9996824,0.9973628,0.9974069,0.998333,0.0934929,0.2798699,87.0036473,86.5779703,0.2790342
8,0.2000048,0.9957263,1.8706305,1.8669925,1.0,0.9964224,0.9980552,0.9978553,0.0935375,0.3734074,87.0630513,86.6992454,0.3725716
9,0.3000032,0.9892373,1.8669181,1.8669677,0.9980154,0.9928471,0.9980419,0.996186,0.1866888,0.5600962,86.6918061,86.6967657,0.5588341
10,0.4000016,0.9825941,1.8673636,1.8670666,0.9982536,0.9867633,0.9980948,0.9938303,0.1867334,0.7468296,86.7363555,86.7066629,0.7451922

Unnamed: 0,0,1,Error,Rate
0,57120.0,1510.0,0.0258,(1510.0/58630.0)
1,1814.0,65528.0,0.0269,(1814.0/67342.0)
Total,58934.0,67038.0,0.0264,(3324.0/125972.0)

metric,threshold,value,idx
max f1,0.6567449,0.9752642,182.0
max f2,0.1394762,0.984466,318.0
max f0point5,0.8615769,0.9787562,114.0
max accuracy,0.6567449,0.9736132,182.0
max precision,0.9852018,0.9982049,31.0
max recall,3.5e-05,1.0,399.0
max specificity,0.9997021,0.9995054,0.0
max absolute_mcc,0.6567449,0.9470017,182.0
max min_per_class_accuracy,0.6479485,0.9734266,184.0
max mean_per_class_accuracy,0.6567449,0.9736541,182.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100022,0.9996836,1.8275763,1.8275763,0.9769841,0.9998418,0.9769841,0.9998418,0.0182798,0.0182798,82.7576319,82.7576319,0.0177852
2,0.0200124,0.9996033,1.8706305,1.849112,1.0,0.9996461,0.9884966,0.9997439,0.0187253,0.0370051,87.0630513,84.9111955,0.0365105
3,0.0300067,0.9994102,1.8706305,1.8562791,1.0,0.9995166,0.992328,0.9996682,0.0186956,0.0557008,87.0630513,85.6279115,0.0552061
4,0.040001,0.9991666,1.8661731,1.8587511,0.9976172,0.9992834,0.9936495,0.9995721,0.0186511,0.0743518,86.6173093,85.8751137,0.073806
5,0.0500032,0.9988803,1.8706305,1.8611274,1.0,0.9990222,0.9949198,0.9994621,0.0187105,0.0930623,87.0630513,86.1127389,0.0925165
6,0.1000064,0.9976589,1.8670668,1.8640971,0.9980949,0.9981071,0.9965074,0.9987846,0.0933593,0.1864215,86.7066842,86.4097115,0.1856711
7,0.1500016,0.9969665,1.8700365,1.8660767,0.9996824,0.9973412,0.9975656,0.9983035,0.0934929,0.2799145,87.0036473,86.6076692,0.2791299
8,0.2000048,0.9956655,1.8691456,1.866844,0.9992062,0.9963838,0.9979758,0.9978236,0.0934632,0.3733777,86.914565,86.6843962,0.3725078
9,0.3000032,0.9891997,1.8672151,1.8669677,0.9981742,0.9927594,0.9980419,0.9961356,0.1867185,0.5600962,86.7215057,86.6967657,0.5588341
10,0.4000016,0.9823446,1.8675121,1.8671038,0.9983329,0.9866778,0.9981147,0.9937712,0.1867482,0.7468445,86.7512053,86.7103753,0.7452241

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
accuracy,0.9739783,0.0011447,0.9742023,0.9733291,0.9752322,0.9738827,0.9749147,0.9732476,0.9718981,0.973327,0.9758673,0.9738827
auc,0.9962393,0.0003755,0.9962255,0.9961179,0.9966180,0.9964621,0.9963534,0.9958773,0.9960503,0.9961193,0.9969413,0.9956277
err,0.0260217,0.0011447,0.0257977,0.0266709,0.0247678,0.0261173,0.0250853,0.0267524,0.0281019,0.0266730,0.0241327,0.0261173
err_count,327.8,14.420664,325.0,336.0,312.0,329.0,316.0,337.0,354.0,336.0,304.0,329.0
f0point5,0.9768243,0.0020790,0.9785964,0.975276,0.9788368,0.9762294,0.9781372,0.9741647,0.9738125,0.9772175,0.9800154,0.9759577
f1,0.9756062,0.0011690,0.9759455,0.9747482,0.9768305,0.9755772,0.9763827,0.9753204,0.9735505,0.9745532,0.9775878,0.9755663
f2,0.9743941,0.0012792,0.9733089,0.9742211,0.9748325,0.9749258,0.9746344,0.9764789,0.9732887,0.9719033,0.9751721,0.9751752
lift_top_group,1.8276991,0.0197476,1.8122675,1.846831,1.8053722,1.8385062,1.8326485,1.7900052,1.8367546,1.8542025,1.835099,1.8253038
logloss,0.0750383,0.0034743,0.0751435,0.0758453,0.0747050,0.0701203,0.0740809,0.0769329,0.0780452,0.0795108,0.0684273,0.0775721
max_per_class_error,0.0280104,0.0017377,0.0284409,0.0272773,0.0264950,0.0268193,0.0265276,0.0314716,0.0294815,0.0298552,0.0264317,0.0273038

Unnamed: 0,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_xval,deviance_se,alpha,iterations,training_rmse,training_logloss,training_r2,training_auc,training_pr_auc,training_lift,training_classification_error
,2022-11-29 17:59:38,0.000 sec,1,.75E0,1,1.3815077,1.3815276,0.0004267,0.5,,,,,,,,
,2022-11-29 17:59:38,0.104 sec,3,.68E0,3,1.3321086,1.3815077,0.0004250,0.5,,,,,,,,
,2022-11-29 17:59:38,0.178 sec,5,.62E0,5,1.2653235,1.3402035,0.0004049,0.5,,,,,,,,
,2022-11-29 17:59:38,0.280 sec,7,.57E0,5,1.1957394,1.2750741,0.0003996,0.5,,,,,,,,
,2022-11-29 17:59:38,0.371 sec,9,.52E0,9,1.1269199,1.2046049,0.0003588,0.5,,,,,,,,
,2022-11-29 17:59:38,0.449 sec,11,.47E0,9,1.0632580,1.1359401,0.0004142,0.5,,,,,,,,
,2022-11-29 17:59:39,0.527 sec,13,.43E0,9,1.0064124,1.0713165,0.0005948,0.5,,,,,,,,
,2022-11-29 17:59:39,0.606 sec,15,.39E0,10,0.9548708,1.0136210,0.0007860,0.5,,,,,,,,
,2022-11-29 17:59:39,0.690 sec,17,.36E0,10,0.9067852,0.9616676,0.0009649,0.5,,,,,,,,
,2022-11-29 17:59:39,0.779 sec,19,.32E0,10,0.8635332,0.9129300,0.0011376,0.5,,,,,,,,

variable,relative_importance,scaled_importance,percentage
service_IRC.1,4.1746135,1.0,0.0607542
src_bytes_REJ.1,2.9484501,0.7062810,0.0429096
service_urp_i.1,2.7419724,0.6568207,0.0399046
service_smtp.1,2.6284637,0.6296304,0.0382527
service_telnet.1,2.4825642,0.5946812,0.0361294
src_bytes_S0.1,2.1368866,0.5118765,0.0310987
src_bytes_RSTR.1,2.0723369,0.4964141,0.0301593
service_private.0,1.9983155,0.4786827,0.0290820
service_private.1,1.9791451,0.4740906,0.0288030
same_srv_rate,1.9709543,0.4721286,0.0286838


In [None]:
GLM_regularized.confusion_matrix

<bound method H2OBinomialModel.confusion_matrix of H2OGeneralizedLinearEstimator({'parms': {'model_id': {'__meta': {'schema_version': 3, 'schema_name': 'ModelParameterSchemaV3', 'schema_type': 'Iced'}, 'name': 'model_id', 'label': 'model_id', 'help': 'Destination id for this model; auto-generated if not specified.', 'required': False, 'type': 'Key<Model>', 'default_value': None, 'actual_value': {'__meta': {'schema_version': 3, 'schema_name': 'ModelKeyV3', 'schema_type': 'Key<Model>'}, 'name': 'GLM', 'type': 'Key<Model>', 'URL': '/3/Models/GLM'}, 'input_value': None, 'level': 'critical', 'values': [], 'is_member_of_frames': [], 'is_mutually_exclusive_with': [], 'gridable': False}, 'training_frame': {'__meta': {'schema_version': 3, 'schema_name': 'ModelParameterSchemaV3', 'schema_type': 'Iced'}, 'name': 'training_frame', 'label': 'training_frame', 'help': 'Id of the training data frame.', 'required': False, 'type': 'Key<Frame>', 'default_value': None, 'actual_value': {'__meta': {'schema_

In [None]:
print(h2o.save_model(model=GLM_regularized, path="/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/", force=True))

/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GLM


In [None]:
''' GLM_regularized = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GLM')
print(h2o.get_model(GLM_regularized)) '''

" GLM_regularized = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GLM')\nprint(h2o.get_model(GLM_regularized)) "

#**GLM WITH GRID SEARCH**

GLM needs to find the optimal values of the regularization parameters α and λ
lambda: controls the amount of regularization, when set to 0 it gets disabled

alpha : controls the distribution between lasso & ridge regression penalties.

random grid search: H2o supports 2 types of grid search, cartesian and random. We make use of the random as the search criteria for faster computation

Stopping metric: we specify the metric used for early stopping. AUTO takes log loss as default

source: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/algo-params/lambda.html



In [None]:
hyper_parameters = { 'alpha': [0.0001, 0.001, 0.01, 0.1],
                     'lambda': [0.001, 0.01, 0.1] }
search_criteria = { 'strategy': "RandomDiscrete", 
                    'stopping_metric': "AUTO",
                    'stopping_rounds': 5}

GLM_grid_search = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial', \
                  nfolds = 10, fold_assignment = "Modulo", \
                  keep_cross_validation_predictions = True),\
                  hyper_parameters, grid_id="GLM_grid", search_criteria=search_criteria)

GLM_grid_search.train(x= predictors,y= target, training_frame=train)


glm Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%




Unnamed: 0,alpha,lambda,model_ids,logloss
,0.0001,0.001,GLM_grid_model_6,0.0865
,0.001,0.001,GLM_grid_model_8,0.0865215
,0.01,0.001,GLM_grid_model_12,0.0865592
,0.1,0.001,GLM_grid_model_7,0.0869008
,0.0001,0.01,GLM_grid_model_1,0.115981
,0.001,0.01,GLM_grid_model_4,0.1160181
,0.01,0.01,GLM_grid_model_3,0.1162848
,0.1,0.01,GLM_grid_model_9,0.1202857
,0.0001,0.1,GLM_grid_model_2,0.1826163
,0.001,0.1,GLM_grid_model_10,0.1828146


# Get the grid results, sorted by validation AUC
  

In [None]:
# Get the grid results, sorted by validation AUC
GLM_grid_sorted = GLM_grid_search.get_grid(sort_by='auc', decreasing=True)
GLM_grid_sorted

Unnamed: 0,alpha,lambda,model_ids,auc
,0.0001,0.001,GLM_grid_model_6,0.9947265
,0.001,0.001,GLM_grid_model_8,0.9947118
,0.01,0.001,GLM_grid_model_12,0.9947036
,0.1,0.001,GLM_grid_model_7,0.9946563
,0.0001,0.01,GLM_grid_model_1,0.9907448
,0.001,0.01,GLM_grid_model_4,0.9907383
,0.01,0.01,GLM_grid_model_3,0.9907224
,0.1,0.01,GLM_grid_model_9,0.9903759
,0.0001,0.1,GLM_grid_model_2,0.9859579
,0.001,0.1,GLM_grid_model_10,0.9859377


In [None]:
# Extract the best model from random grid search
Best_GLM_model_from_Grid = GLM_grid_sorted.model_ids[0]

#model performance
Best_GLM_model_from_Grid = h2o.get_model(Best_GLM_model_from_Grid)
print(Best_GLM_model_from_Grid)

Model Details
H2OGeneralizedLinearEstimator : Generalized Linear Modeling
Model Key: GLM_grid_model_6


GLM Model: summary
    family    link    regularization                                 number_of_predictors_total    number_of_active_predictors    number_of_iterations    training_frame
--  --------  ------  ---------------------------------------------  ----------------------------  -----------------------------  ----------------------  ----------------
    binomial  logit   Elastic Net (alpha = 1.0E-4, lambda = 0.001 )  205                           123                            7                       py_85_sid_b6d4

ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.022744166710172143
RMSE: 0.15081169288278726
LogLoss: 0.08527331372651978
AUC: 0.9949105146141968
AUCPR: 0.9932161188607552
Gini: 0.9898210292283935
Null degrees of freedom: 125971
Residual degrees of freedom: 125848
Null deviance: 174031.28559199083
Residual deviance: 21484.099753551214
AIC: 21732.

In [None]:
print(h2o.save_model(model=Best_GLM_model_from_Grid, path="/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/", force=True))

/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GLM_grid_model_6


In [None]:
''' Best_GLM_model_from_Grid = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GLM_grid_model_8')
print(h2o.get_model(Best_GLM_model_from_Grid)) '''

" Best_GLM_model_from_Grid = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GLM_grid_model_8')\nprint(h2o.get_model(Best_GLM_model_from_Grid)) "

# RF WITH DEFAULT SETTINGS

  

In [None]:
# Build a RF model with default settings
RF_default_settings = H2ORandomForestEstimator(model_id = 'RF_D',\
                                nfolds = 10, fold_assignment = "Modulo", \
                                keep_cross_validation_predictions = True)

# Use train() to build the model
RF_default_settings.train(x = predictors, y = target, training_frame = train)

drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,50.0,50.0,259939.0,20.0,20.0,20.0,256.0,509.0,406.0

Unnamed: 0,0,1,Error,Rate
0,58348.0,282.0,0.0048,(282.0/58630.0)
1,136.0,67206.0,0.002,(136.0/67342.0)
Total,58484.0,67488.0,0.0033,(418.0/125972.0)

metric,threshold,value,idx
max f1,0.4570282,0.9968998,216.0
max f2,0.3885909,0.9979735,231.0
max f0point5,0.6918804,0.9970749,165.0
max accuracy,0.4570282,0.9966818,216.0
max precision,0.9998302,1.0,0.0
max recall,0.0630428,1.0,346.0
max specificity,0.9998302,1.0,0.0
max absolute_mcc,0.4570282,0.9933333,216.0
max min_per_class_accuracy,0.5327513,0.9964955,201.0
max mean_per_class_accuracy,0.4792275,0.9966066,212.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100022,0.9999137,1.8706305,1.8706305,1.0,0.9999613,1.0,0.9999613,0.0187105,0.0187105,87.0630513,87.0630513,0.0187105
2,0.0200044,0.9998239,1.8706305,1.8706305,1.0,0.9998696,1.0,0.9999154,0.0187105,0.0374209,87.0630513,87.0630513,0.0374209
3,0.0300067,0.9996909,1.8706305,1.8706305,1.0,0.9997647,1.0,0.9998652,0.0187105,0.0561314,87.0630513,87.0630513,0.0561314
4,0.040001,0.9995152,1.8706305,1.8706305,1.0,0.9996049,1.0,0.9998002,0.0186956,0.074827,87.0630513,87.0630513,0.074827
5,0.0500032,0.9993577,1.8706305,1.8706305,1.0,0.9994372,1.0,0.9997275,0.0187105,0.0935375,87.0630513,87.0630513,0.0935375
6,0.1000064,0.9985413,1.8706305,1.8706305,1.0,0.9989065,1.0,0.999317,0.0935375,0.1870749,87.0630513,87.0630513,0.1870749
7,0.1500016,0.997927,1.8706305,1.8706305,1.0,0.9982554,1.0,0.9989632,0.0935226,0.2805975,87.0630513,87.0630513,0.2805975
8,0.2000048,0.9971174,1.8706305,1.8706305,1.0,0.9975191,1.0,0.9986021,0.0935375,0.374135,87.0630513,87.0630513,0.374135
9,0.3000032,0.9953033,1.8706305,1.8706305,1.0,0.9962479,1.0,0.9978174,0.1870601,0.5611951,87.0630513,87.0630513,0.5611951
10,0.4000016,0.990741,1.8703335,1.8705563,0.9998412,0.9937438,0.9999603,0.9967991,0.1870304,0.7482255,87.0333517,87.0556265,0.7481914

Unnamed: 0,0,1,Error,Rate
0,58418.0,212.0,0.0036,(212.0/58630.0)
1,137.0,67205.0,0.002,(137.0/67342.0)
Total,58555.0,67417.0,0.0028,(349.0/125972.0)

metric,threshold,value,idx
max f1,0.5120692,0.9974102,206.0
max f2,0.3603075,0.9981844,235.0
max f0point5,0.6113496,0.9975393,185.0
max accuracy,0.5120692,0.9972295,206.0
max precision,0.999572,1.0,0.0
max recall,0.1118632,1.0,316.0
max specificity,0.999572,1.0,0.0
max absolute_mcc,0.5120692,0.9944327,206.0
max min_per_class_accuracy,0.5606734,0.9968222,196.0
max mean_per_class_accuracy,0.5120692,0.9971749,206.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100657,0.9995342,1.8706305,1.8706305,1.0,0.9996459,1.0,0.9996459,0.0188293,0.0188293,87.0630513,87.0630513,0.0188293
2,0.02006,0.9992726,1.8706305,1.8706305,1.0,0.999407,1.0,0.9995269,0.0186956,0.0375249,87.0630513,87.0630513,0.0375249
3,0.0300781,0.9989758,1.8706305,1.8706305,1.0,0.9991263,1.0,0.9993935,0.0187402,0.056265,87.0630513,87.0630513,0.056265
4,0.040001,0.9985442,1.8706305,1.8706305,1.0,0.9987816,1.0,0.9992417,0.018562,0.074827,87.0630513,87.0630513,0.074827
5,0.0500111,0.9979153,1.8706305,1.8706305,1.0,0.9982522,1.0,0.9990436,0.0187253,0.0935523,87.0630513,87.0630513,0.0935523
6,0.1000143,0.9974492,1.8706305,1.8706305,1.0,0.9976457,1.0,0.9983447,0.0935375,0.1870898,87.0630513,87.0630513,0.1870898
7,0.1500016,0.9967502,1.8706305,1.8706305,1.0,0.9970145,1.0,0.9979014,0.0935078,0.2805975,87.0630513,87.0630513,0.2805975
8,0.2000127,0.9965236,1.8706305,1.8706305,1.0,0.9966309,1.0,0.9975837,0.0935523,0.3741499,87.0630513,87.0630513,0.3741499
9,0.3002969,0.995561,1.8706305,1.8706305,1.0,0.9961583,1.0,0.9971077,0.1875947,0.5617445,87.0630513,87.0630513,0.5617445
10,0.4000016,0.988032,1.8706305,1.8706305,1.0,0.9927977,1.0,0.9960334,0.1865106,0.7482552,87.0630513,87.0630513,0.7482552

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
accuracy,0.9974201,0.0005516,0.9982537,0.9975393,0.9981742,0.9973009,0.9968246,0.9965865,0.9974597,0.9970628,0.9978566,0.9971422
auc,0.999939,2.79e-05,0.9999725,0.9999689,0.9999555,0.99994,0.99994,0.9999073,0.999956,0.9998823,0.9999424,0.9999244
err,0.0025799,0.0005516,0.0017463,0.0024607,0.0018258,0.0026991,0.0031754,0.0034135,0.0025403,0.0029372,0.0021434,0.0028578
err_count,32.5,6.948221,22.0,31.0,23.0,34.0,40.0,43.0,32.0,37.0,27.0,36.0
f0point5,0.9973178,0.0006432,0.9981151,0.9977171,0.9978122,0.9977442,0.9966649,0.9961041,0.9977883,0.9968965,0.9974488,0.9968866
f1,0.997587,0.0005179,0.9983797,0.9976721,0.9982992,0.9974778,0.9970211,0.9968486,0.9976094,0.997212,0.9980195,0.9973302
f2,0.9978565,0.0005858,0.9986445,0.9976272,0.9987867,0.9972115,0.9973777,0.9975944,0.9974306,0.9975277,0.9985908,0.9977741
lift_top_group,1.8707694,0.0170127,1.8564692,1.8918756,1.8645648,1.8681595,1.8773472,1.8486938,1.8815534,1.899427,1.8497797,1.8698233
logloss,0.0214093,0.0012878,0.0204206,0.02225,0.0214512,0.021071,0.0222294,0.0229254,0.021767,0.0201637,0.0189109,0.0229039
max_per_class_error,0.0033833,0.0008624,0.0024088,0.0025257,0.0029105,0.002966,0.0040768,0.0051876,0.0026886,0.0036882,0.003456,0.0039249

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2022-11-29 18:11:52,3 min 42.809 sec,0.0,,,,,,
,2022-11-29 18:11:52,3 min 43.388 sec,1.0,0.1044474,0.1119417,0.9961768,0.9948582,1.8629304,0.0126938
,2022-11-29 18:11:53,3 min 43.852 sec,2.0,0.1014131,0.0918377,0.9970534,0.9960337,1.8645059,0.012222
,2022-11-29 18:11:53,3 min 44.506 sec,3.0,0.1071315,0.0789664,0.997286,0.9964202,1.8654873,0.013217
,2022-11-29 18:11:54,3 min 44.922 sec,4.0,0.1007675,0.0660511,0.9979254,0.9972412,1.8666109,0.0118555
,2022-11-29 18:11:54,3 min 45.344 sec,5.0,0.0923026,0.0560817,0.9983299,0.9976988,1.8669657,0.0095633
,2022-11-29 18:11:55,3 min 45.780 sec,6.0,0.0864554,0.0478006,0.9986203,0.9981331,1.8676016,0.0082008
,2022-11-29 18:11:55,3 min 46.327 sec,7.0,0.0805762,0.0410756,0.9988593,0.9983325,1.867455,0.0067925
,2022-11-29 18:12:00,3 min 50.509 sec,18.0,0.0727,0.0275681,0.9996446,0.9994205,1.867795,0.0047404
,2022-11-29 18:12:04,3 min 54.699 sec,29.0,0.0679699,0.0237205,0.9998687,0.9998794,1.8706305,0.0040326

variable,relative_importance,scaled_importance,percentage
srv_rerror_rate,132039.4531250,1.0,0.1128465
dst_host_same_srv_rate,102489.3828125,0.7762027,0.0875918
src_bytes_SF,98310.7187500,0.7445556,0.0840205
dst_host_srv_count,95625.6953125,0.7242206,0.0817258
logged_in,93583.9140625,0.7087572,0.0799808
rerror_rate,51121.9296875,0.3871716,0.0436910
src_bytes_S0,46465.0664062,0.3519029,0.0397110
count,40648.4101562,0.3078505,0.0347399
dst_host_diff_srv_rate,38098.3398438,0.2885375,0.0325605
service_http,36973.3671875,0.2800176,0.0315990


In [None]:
#Let's see the default parameters that RF model utilizes:
RF_default_settings.summary()

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,50.0,50.0,259939.0,20.0,20.0,20.0,256.0,509.0,406.0


In [None]:
print(h2o.save_model(model=RF_default_settings, path="/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/", force=True))

/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/RF_D


In [None]:
''' RF_default_settings = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/RF_D') 
print(h2o.get_model(RF_default_settings)'''

" RF_default_settings = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/RF_D') \nprint(h2o.get_model(RF_default_settings)"

# RF with GRID SEARCH to extract the best model

  

In [None]:
hyper_params = {'sample_rate':[0.7,0.9],
                'col_sample_rate_per_tree': [0.8, 0.9],
                'max_depth': [3, 5],
                'ntrees': [200, 300]
               }

In [None]:
RF_grid_search = H2OGridSearch(H2ORandomForestEstimator(nfolds = 10, \
                             fold_assignment = "Modulo", \
                             keep_cross_validation_predictions = True, \
                             stopping_metric = 'AUC',stopping_rounds = 5), \
                             hyper_params = hyper_params, \
                             grid_id= 'RF_gridsearch')

# Use train() to start the grid search
RF_grid_search.train(x = predictors, y = target, training_frame = train)

drf Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,col_sample_rate_per_tree,max_depth,ntrees,sample_rate,model_ids,logloss
,0.9,5.0,83.0,0.7,RF_gridsearch_model_8,0.1195883
,0.8,5.0,9.0,0.7,RF_gridsearch_model_3,0.1207654
,0.9,5.0,21.0,0.7,RF_gridsearch_model_4,0.1207921
,0.8,5.0,10.0,0.9,RF_gridsearch_model_11,0.1227574
,0.9,5.0,49.0,0.9,RF_gridsearch_model_12,0.1257021
,0.9,5.0,19.0,0.9,RF_gridsearch_model_16,0.1281171
,0.8,5.0,15.0,0.7,RF_gridsearch_model_7,0.1283488
,0.8,5.0,39.0,0.9,RF_gridsearch_model_15,0.1296725
,0.8,3.0,14.0,0.7,RF_gridsearch_model_1,0.1805834
,0.9,3.0,7.0,0.7,RF_gridsearch_model_6,0.1888521


In [None]:
# Sort the grid models
RF_grid_sorted = RF_grid_search.get_grid(sort_by='auc', decreasing=True)
print(RF_grid_sorted)

Hyper-Parameter Search Summary: ordered by decreasing auc
    col_sample_rate_per_tree    max_depth    ntrees    sample_rate    model_ids               auc
--  --------------------------  -----------  --------  -------------  ----------------------  --------
    0.9                         5            83        0.7            RF_gridsearch_model_8   0.99609
    0.8                         5            9         0.7            RF_gridsearch_model_3   0.995738
    0.9                         5            21        0.7            RF_gridsearch_model_4   0.99548
    0.8                         5            15        0.7            RF_gridsearch_model_7   0.995327
    0.9                         5            49        0.9            RF_gridsearch_model_12  0.994963
    0.9                         5            19        0.9            RF_gridsearch_model_16  0.994386
    0.8                         5            39        0.9            RF_gridsearch_model_15  0.994331
    0.8               

In [None]:
# Extract the best model from random grid search
Best_RF_model_from_Grid = RF_grid_sorted.model_ids[0]

# Model performance
Best_RF_model_from_Grid = h2o.get_model(Best_RF_model_from_Grid) 
print(Best_RF_model_from_Grid)

Model Details
H2ORandomForestEstimator : Distributed Random Forest
Model Key: RF_gridsearch_model_8


Model Summary: 
    number_of_trees    number_of_internal_trees    model_size_in_bytes    min_depth    max_depth    mean_depth    min_leaves    max_leaves    mean_leaves
--  -----------------  --------------------------  ---------------------  -----------  -----------  ------------  ------------  ------------  -------------
    83                 83                          30482                  5            5            5             14            32            24.494

ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.02986948777216001
RMSE: 0.17282791375284262
LogLoss: 0.12265954440935405
Mean Per-Class Error: 0.026968617473474007
AUC: 0.9949837037894649
AUCPR: 0.9951737574167863
Gini: 0.9899674075789298

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6343285948222775
       0      1      Error    Rate
-----  -----  -----  -------  -----------------
0      5617

In [None]:
print(h2o.save_model(model=Best_RF_model_from_Grid, path="/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/", force=True))

/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/RF_gridsearch_model_8


In [None]:
''' Best_RF_model_from_Grid = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/RF_gridsearch_model_23')
print(h2o.get_model(Best_RF_model_from_Grid)) '''

" Best_RF_model_from_Grid = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/RF_gridsearch_model_23')\nprint(h2o.get_model(Best_RF_model_from_Grid)) "

#GBM WITH DEFAULT SETTINGS

In [None]:
GBM_default_settings = H2OGradientBoostingEstimator(model_id = 'GBM_default', \
                       nfolds = 10, \
                       fold_assignment = "Modulo", \
                       keep_cross_validation_predictions = True)

# Use train() to build the model
GBM_default_settings.train(x = predictors, y = target, training_frame = train)

gbm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,50.0,50.0,21596.0,5.0,5.0,5.0,18.0,32.0,29.68

Unnamed: 0,0,1,Error,Rate
0,58367.0,263.0,0.0045,(263.0/58630.0)
1,210.0,67132.0,0.0031,(210.0/67342.0)
Total,58577.0,67395.0,0.0038,(473.0/125972.0)

metric,threshold,value,idx
max f1,0.5824247,0.9964895,195.0
max f2,0.3451289,0.9974252,241.0
max f0point5,0.6732151,0.9965707,176.0
max accuracy,0.5862533,0.9962452,194.0
max precision,0.9945194,1.0,0.0
max recall,0.021619,1.0,369.0
max specificity,0.9945194,1.0,0.0
max absolute_mcc,0.5824247,0.9924542,195.0
max min_per_class_accuracy,0.6438371,0.9960089,184.0
max mean_per_class_accuracy,0.5862533,0.9962001,194.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0519242,0.9945203,1.8706305,1.8706305,1.0,0.9945207,1.0,0.9945207,0.0971311,0.0971311,87.0630513,87.0630513,0.0971311
2,0.2835948,0.994451,1.8706305,1.8706305,1.0,0.994451,1.0,0.9944638,0.43337,0.530501,87.0630513,87.0630513,0.530501
3,0.306584,0.9938433,1.8706305,1.8706305,1.0,0.9941251,1.0,0.9944384,0.0430044,0.5735054,87.0630513,87.0630513,0.5735054
4,0.4002636,0.9878489,1.8699965,1.8704821,0.999661,0.992073,0.9999207,0.9938848,0.1751804,0.7486858,86.9996455,87.0482115,0.7486176
5,0.5,0.9598512,1.8608039,1.8685516,0.9947469,0.9774877,0.9988886,0.990614,0.18559,0.9342758,86.0803896,86.8551573,0.9330819
6,0.6021973,0.0205874,0.643111,1.6605853,0.3437937,0.3578573,0.8877142,0.8832305,0.0657242,1.0,-35.6889028,66.0585289,0.854716
7,0.701386,0.0056542,0.0,1.4257484,0.0,0.0106848,0.7621753,0.7598367,0.0,1.0,-100.0,42.5748401,0.6415999
8,0.9294923,0.0054565,0.0,1.0758562,0.0,0.0054585,0.5751302,0.5747051,0.0,1.0,-100.0,7.5856179,0.1514924
9,1.0,0.0049576,0.0,1.0,0.0,0.0053551,0.5345791,0.5345616,0.0,1.0,-100.0,0.0,0.0

Unnamed: 0,0,1,Error,Rate
0,58336.0,294.0,0.005,(294.0/58630.0)
1,236.0,67106.0,0.0035,(236.0/67342.0)
Total,58572.0,67400.0,0.0042,(530.0/125972.0)

metric,threshold,value,idx
max f1,0.5812843,0.9960666,191.0
max f2,0.3333362,0.9972682,238.0
max f0point5,0.6246063,0.9961199,181.0
max accuracy,0.5812843,0.9957927,191.0
max precision,0.9945971,1.0,0.0
max recall,0.0181557,1.0,371.0
max specificity,0.9945971,1.0,0.0
max absolute_mcc,0.5812843,0.9915449,191.0
max min_per_class_accuracy,0.6299019,0.9956639,180.0
max mean_per_class_accuracy,0.608193,0.9957532,185.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.010542,0.994606,1.8706305,1.8706305,1.0,0.9947031,1.0,0.9947031,0.0197202,0.0197202,87.0630513,87.0630513,0.0197202
2,0.022251,0.9945354,1.8706305,1.8706305,1.0,0.9945404,1.0,0.9946175,0.0219031,0.0416234,87.0630513,87.0630513,0.0416234
3,0.031896,0.9944487,1.8706305,1.8706305,1.0,0.9944551,1.0,0.9945684,0.0180422,0.0596656,87.0630513,87.0630513,0.0596656
4,0.0559728,0.9944141,1.8706305,1.8706305,1.0,0.9944148,1.0,0.9945023,0.0450388,0.1047043,87.0630513,87.0630513,0.1047043
5,0.1045709,0.9943628,1.8706305,1.8706305,1.0,0.9943707,1.0,0.9944412,0.0909091,0.1956134,87.0630513,87.0630513,0.1956134
6,0.1641397,0.9942376,1.8706305,1.8706305,1.0,0.994259,1.0,0.9943751,0.1114312,0.3070446,87.0630513,87.0630513,0.3070446
7,0.2185009,0.9941484,1.8706305,1.8706305,1.0,0.9941887,1.0,0.9943287,0.1016899,0.4087345,87.0630513,87.0630513,0.4087345
8,0.3019639,0.9938693,1.8706305,1.8706305,1.0,0.9940298,1.0,0.9942461,0.1561284,0.5648629,87.0630513,87.0630513,0.5648629
9,0.4000016,0.9878324,1.8700246,1.870482,0.9996761,0.9920495,0.9999206,0.9937077,0.1833328,0.7481958,87.0024641,87.0482018,0.7481276
10,0.5,0.9601124,1.8581567,1.868017,0.9933317,0.9777315,0.9986029,0.9905125,0.1858127,0.9340085,85.8156673,86.8016988,0.9325076

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
accuracy,0.9960149,0.0006703,0.997063,0.9958724,0.9967452,0.995872,0.9965071,0.99484,0.9960308,0.9959514,0.9961102,0.9951576
auc,0.9997136,8.75e-05,0.9997884,0.999739,0.9998195,0.9995201,0.999791,0.9996504,0.9996691,0.9996842,0.9997431,0.9997312
err,0.003985,0.0006703,0.002937,0.0041276,0.0032547,0.004128,0.0034929,0.00516,0.0039692,0.0040486,0.0038898,0.0048424
err_count,50.2,8.4432745,37.0,52.0,41.0,52.0,44.0,65.0,50.0,51.0,49.0,61.0
f0point5,0.9957196,0.0010511,0.9968792,0.9954724,0.9970978,0.9953532,0.9968986,0.994493,0.9959999,0.9955739,0.9955753,0.9938521
f1,0.9962765,0.0006197,0.9972756,0.9960996,0.996965,0.9961493,0.9967203,0.9952363,0.9962676,0.9961588,0.9964074,0.9954852
f2,0.9968346,0.0004578,0.9976723,0.9967276,0.9968322,0.9969466,0.996542,0.9959809,0.9965354,0.9967443,0.9972409,0.9971235
lift_top_group,1.8707694,0.0170127,1.8564692,1.8918756,1.8645648,1.8681595,1.8773472,1.8486938,1.8815534,1.899427,1.8497797,1.8698233
logloss,0.0237507,0.001655,0.0208895,0.0241316,0.022049,0.0251804,0.0231866,0.0262427,0.025024,0.0236761,0.0223077,0.0248194
max_per_class_error,0.0053761,0.001586,0.0039573,0.0055565,0.0032564,0.0059788,0.0035768,0.0070897,0.0047442,0.0053646,0.0058752,0.0083618

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2022-11-29 18:40:54,2 min 58.567 sec,0.0,0.4988029,0.6907538,0.5,0.5345791,1.0,0.4654209
,2022-11-29 18:40:55,2 min 59.225 sec,1.0,0.4525528,0.6021464,0.9936156,0.9924638,1.8706305,0.0199171
,2022-11-29 18:40:55,2 min 59.560 sec,2.0,0.4116087,0.5297397,0.9936542,0.9924969,1.8706305,0.0198377
,2022-11-29 18:40:56,2 min 59.861 sec,3.0,0.3752828,0.4694598,0.994388,0.9932246,1.8706305,0.0174324
,2022-11-29 18:40:56,3 min 0.198 sec,4.0,0.3430322,0.4185526,0.9944083,0.9932389,1.8706305,0.0163131
,2022-11-29 18:40:56,3 min 0.503 sec,5.0,0.3137088,0.374212,0.9968681,0.9970089,1.8706305,0.012995
,2022-11-29 18:40:57,3 min 0.809 sec,6.0,0.2882521,0.3368188,0.9968969,0.9970264,1.8706305,0.0118598
,2022-11-29 18:40:57,3 min 1.148 sec,7.0,0.2658323,0.304692,0.9981682,0.9976429,1.8706305,0.0109866
,2022-11-29 18:40:57,3 min 1.471 sec,8.0,0.2461357,0.276866,0.9981479,0.9976198,1.8706305,0.0113041
,2022-11-29 18:40:58,3 min 1.809 sec,9.0,0.2285163,0.2522971,0.9983299,0.9978226,1.8706305,0.0111295

variable,relative_importance,scaled_importance,percentage
src_bytes_SF,76145.9921875,1.0,0.4589154
protocol_type_icmp,21535.5195312,0.2828188,0.1297899
srv_serror_rate,19567.0722656,0.2569679,0.1179265
dst_host_same_srv_rate,12542.1464844,0.1647118,0.0755888
service_private,8225.2460938,0.1080194,0.0495718
dst_host_same_src_port_rate,5011.5625,0.0658152,0.0302036
hot,2933.1872559,0.0385206,0.0176777
dst_host_srv_count,2234.5944824,0.0293462,0.0134674
srv_rerror_rate,2176.9460449,0.0285891,0.0131200
dst_host_count,1555.5339355,0.0204283,0.0093749


In [None]:
print(h2o.save_model(model=GBM_default_settings, path="/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/", force=True))

/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GBM_default


In [None]:
''' GBM_default_settings = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GBM_default')
print(h2o.get_model(GBM_default_settings)) '''

" GBM_default_settings = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GBM_default')\nprint(h2o.get_model(GBM_default_settings)) "

#GBM WITH GRID SEARCH

In [None]:
hyper_params = {'sample_rate': [0.8, 0.9],
                'col_sample_rate': [0.2, 0.5],
                'max_depth': [5, 9],
                'ntrees' : [100, 200]
               }

In [None]:
GBM_grid_search = H2OGridSearch(H2OGradientBoostingEstimator(nfolds = 10, \
                        fold_assignment = "Modulo", \
                        keep_cross_validation_predictions = True,\
                        stopping_metric = 'AUC', stopping_rounds = 5),
                        hyper_params = hyper_params, grid_id= 'GBM_Grid')

# Use train() to start the grid search
GBM_grid_search.train(x = predictors, y = target, training_frame = train)

gbm Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,col_sample_rate,max_depth,ntrees,sample_rate,model_ids,logloss
,0.2,9.0,84.0,0.8,GBM_Grid_model_7,0.0075273
,0.2,9.0,69.0,0.9,GBM_Grid_model_15,0.0089321
,0.2,9.0,64.0,0.8,GBM_Grid_model_3,0.0095565
,0.2,5.0,196.0,0.8,GBM_Grid_model_5,0.0102657
,0.2,5.0,187.0,0.9,GBM_Grid_model_13,0.0110438
,0.5,5.0,116.0,0.8,GBM_Grid_model_6,0.0112247
,0.2,9.0,54.0,0.9,GBM_Grid_model_11,0.0112784
,0.5,9.0,50.0,0.9,GBM_Grid_model_16,0.0115519
,0.5,5.0,116.0,0.9,GBM_Grid_model_14,0.0116728
,0.5,5.0,99.0,0.9,GBM_Grid_model_10,0.0126291


In [None]:
# Sort and show the grid search results
GBM_grid_sorted = GBM_grid_search.get_grid(sort_by='auc', decreasing=True)
print(GBM_grid_sorted)

Hyper-Parameter Search Summary: ordered by decreasing auc
    col_sample_rate    max_depth    ntrees    sample_rate    model_ids          auc
--  -----------------  -----------  --------  -------------  -----------------  --------
    0.2                9            84        0.8            GBM_Grid_model_7   0.999962
    0.2                9            64        0.8            GBM_Grid_model_3   0.999952
    0.2                9            69        0.9            GBM_Grid_model_15  0.999951
    0.5                9            50        0.9            GBM_Grid_model_16  0.999945
    0.2                9            54        0.9            GBM_Grid_model_11  0.999944
    0.5                9            46        0.8            GBM_Grid_model_4   0.999943
    0.5                9            47        0.9            GBM_Grid_model_12  0.99993
    0.5                9            45        0.8            GBM_Grid_model_8   0.999926
    0.2                5            196       0.8         

In [None]:
# Extract the best model from random grid search
Best_GBM_model_from_Grid = GBM_grid_sorted.model_ids[0]

Best_GBM_model_from_Grid = h2o.get_model(Best_GBM_model_from_Grid) 
print(Best_GBM_model_from_Grid)

Model Details
H2OGradientBoostingEstimator : Gradient Boosting Machine
Model Key: GBM_Grid_model_7


Model Summary: 
    number_of_trees    number_of_internal_trees    model_size_in_bytes    min_depth    max_depth    mean_depth    min_leaves    max_leaves    mean_leaves
--  -----------------  --------------------------  ---------------------  -----------  -----------  ------------  ------------  ------------  -------------
    84                 84                          167241                 9            9            9             35            262           153.738

ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.0009911023909017806
RMSE: 0.03148177871248352
LogLoss: 0.005508074004352803
Mean Per-Class Error: 0.0010955280048753407
AUC: 0.9999847792501563
AUCPR: 0.9999864748326862
Gini: 0.9999695585003125

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5730558786944447
       0      1      Error    Rate
-----  -----  -----  -------  ----------------
0      5

In [None]:
print(h2o.save_model(model=Best_GBM_model_from_Grid, path="/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/", force=True))

/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GBM_Grid_model_7


In [None]:
''' Best_GBM_model_from_Grid = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GBM_Grid_model_17')
print(h2o.get_model(Best_GBM_model_from_Grid)) '''

" Best_GBM_model_from_Grid = h2o.load_model('/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GBM_Grid_model_17')\nprint(h2o.get_model(Best_GBM_model_from_Grid)) "

#STACKED ENSEMBLE

In [None]:
# list the best models from each grid
all_models = [Best_GLM_model_from_Grid, Best_RF_model_from_Grid, Best_GBM_model_from_Grid]

In [None]:
# Set up Stacked Ensemble
ensemble = H2OStackedEnsembleEstimator(model_id = "ensemble", base_models = all_models, metalearner_algorithm = "deeplearning")

# uses GLM as the default metalearner
ensemble.train(y = target, training_frame = train)

stackedensemble Model Build progress: |



██████████████████████████████████████████| (done) 100%


Unnamed: 0,0,1,Error,Rate
0,4633.0,5.0,0.0011,(5.0/4638.0)
1,0.0,5356.0,0.0,(0.0/5356.0)
Total,4633.0,5361.0,0.0005,(5.0/9994.0)

metric,threshold,value,idx
max f1,0.0001246,0.9995335,389.0
max f2,0.0001246,0.9998133,389.0
max f0point5,0.9681049,0.9996635,375.0
max accuracy,0.0001246,0.9994997,389.0
max precision,0.9999995,1.0,0.0
max recall,0.0001246,1.0,389.0
max specificity,0.9999995,1.0,0.0
max absolute_mcc,0.0001246,0.9989946,389.0
max min_per_class_accuracy,0.1074548,0.9993532,384.0
max mean_per_class_accuracy,0.0001246,0.999461,389.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.010006,0.9999928,1.8659447,1.8659447,1.0,0.9999974,1.0,0.9999974,0.0186706,0.0186706,86.5944735,86.5944735,0.0186706
2,0.020012,0.9999926,1.8659447,1.8659447,1.0,0.9999927,1.0,0.999995,0.0186706,0.0373413,86.5944735,86.5944735,0.0373413
3,0.030018,0.9999926,1.8659447,1.8659447,1.0,0.9999926,1.0,0.9999942,0.0186706,0.0560119,86.5944735,86.5944735,0.0560119
4,0.040024,0.9999926,1.8659447,1.8659447,1.0,0.9999926,1.0,0.9999938,0.0186706,0.0746826,86.5944735,86.5944735,0.0746826
5,0.05003,0.9999926,1.8659447,1.8659447,1.0,0.9999926,1.0,0.9999936,0.0186706,0.0933532,86.5944735,86.5944735,0.0933532
6,0.10006,0.9999924,1.8659447,1.8659447,1.0,0.9999925,1.0,0.999993,0.0933532,0.1867065,86.5944735,86.5944735,0.1867065
7,0.14999,0.9999923,1.8659447,1.8659447,1.0,0.9999924,1.0,0.9999928,0.0931665,0.279873,86.5944735,86.5944735,0.279873
8,0.20002,0.9999922,1.8659447,1.8659447,1.0,0.9999923,1.0,0.9999927,0.0933532,0.3732263,86.5944735,86.5944735,0.3732263
9,0.29998,0.9999892,1.8659447,1.8659447,1.0,0.9999911,1.0,0.9999922,0.1865198,0.5597461,86.5944735,86.5944735,0.5597461
10,0.40004,0.999985,1.8659447,1.8659447,1.0,0.9999878,1.0,0.9999911,0.1867065,0.7464526,86.5944735,86.5944735,0.7464526


In [None]:
print(h2o.save_model(model=ensemble, path="/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/", force=True))

/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/ensemble


### Checking model performance of all base learners

In [None]:
# Checking the model performance for all GLM models built

model_perf_GLM_default = GLM_default_settings.model_performance(test)
print("---------------------------------------------------------------------------------")
print("Report for GLM default: ")
report(model_perf_GLM_default)
model_perf_GLM_regularized = GLM_regularized.model_performance(test)
print("---------------------------------------------------------------------------------")
print("Report for GLM_regularized: ")
report(model_perf_GLM_regularized)
model_perf_Best_GLM_model_from_Grid = Best_GLM_model_from_Grid.model_performance(test)
print("---------------------------------------------------------------------------------")
print("Report for Best_GLM_model_from_Grid: ")
report(model_perf_Best_GLM_model_from_Grid)

---------------------------------------------------------------------------------
Report for GLM default: 
AUC:  0.8715208442311994
F1:  [[0.9437380916455735, 0.8001539127507095]]
MSE:  0.22405602850221104
Gini:  0.7430416884623987
F0.5:  [[0.9818581846958288, 0.8149315342107015]]
F2:  [[0.8098619426352783, 0.8576628204135368]]
Accuracy:  [[0.9784644501851787, 0.8309896641973118]]
Log loss:  1.0154901591103882
MSE:  0.22405602850221104
Confusion matrix:  Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9437380916455735
       0      1      Error    Rate
-----  -----  -----  -------  ----------------
0      10070  2762   0.2152   (2762.0/12832.0)
1      1393   8318   0.1434   (1393.0/9711.0)
Total  11463  11080  0.1843   (4155.0/22543.0)
---------------------------------------------------------------------------------
Report for GLM_regularized: 
AUC:  0.829018721314056
F1:  [[0.96655908218939, 0.7967228915662651]]
MSE:  0.22396611285362147
Gini:  0.6580374426281119
F0.5:  [[0.971

In [None]:
# Checking the model performance for all RF models built

model_perf_RF_default_settings = RF_default_settings.model_performance(test)
print("---------------------------------------------------------------------------------")
print("Report for RF_default_settings: ")
report(model_perf_RF_default_settings)

model_perf_Best_RF_model_from_Grid = Best_RF_model_from_Grid.model_performance(test)
print("---------------------------------------------------------------------------------")
print("Report for Best_RF_model_from_Grid: ")
report(model_perf_Best_RF_model_from_Grid)

---------------------------------------------------------------------------------
Report for RF_default_settings: 
AUC:  0.9542264508510414
F1:  [[0.9695379968309378, 0.8838955327934788]]
MSE:  0.16988970337777626
Gini:  0.9084529017020828
F0.5:  [[0.9859537609121832, 0.9244168398440237]]
F2:  [[0.5203240258799182, 0.8853069515498043]]
Accuracy:  [[0.9804022713873891, 0.9062680211152021]]
Log loss:  0.5468070427132175
MSE:  0.16988970337777626
Confusion matrix:  Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9695379968309378
       0      1     Error    Rate
-----  -----  ----  -------  ----------------
0      12137  695   0.0542   (695.0/12832.0)
1      1470   8241  0.1514   (1470.0/9711.0)
Total  13607  8936  0.096    (2165.0/22543.0)
---------------------------------------------------------------------------------
Report for Best_RF_model_from_Grid: 
AUC:  0.9397352101031532
F1:  [[0.9073016434399597, 0.8459818933720801]]
MSE:  0.1758784265475983
Gini:  0.8794704202063064
F0

In [None]:
# Checking the model performance for all GBM models built

model_perf_GBM_default_settings = GBM_default_settings.model_performance(test)
print("---------------------------------------------------------------------------------")
print("Report for GBM_default_settings: ")
report(model_perf_GBM_default_settings)

model_perf_Best_GBM_model_from_Grid = Best_GBM_model_from_Grid.model_performance(test)
print("---------------------------------------------------------------------------------")
print("Report for Best_GBM_model_from_Grid: ")
report(model_perf_Best_GBM_model_from_Grid)

---------------------------------------------------------------------------------
Report for GBM_default_settings: 
AUC:  0.9430293228351734
F1:  [[0.9877113326646165, 0.8563647257475898]]
MSE:  0.17913870719862002
Gini:  0.8860586456703468
F0.5:  [[0.9927021709407542, 0.9210626474842047]]
F2:  [[0.16871014326029604, 0.8846275989613495]]
Accuracy:  [[0.9913261787428174, 0.8851971787251032]]
Log loss:  0.6707171055651723
MSE:  0.17913870719862002
Confusion matrix:  Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9877113326646165
       0      1     Error    Rate
-----  -----  ----  -------  ----------------
0      12045  787   0.0613   (787.0/12832.0)
1      1850   7861  0.1905   (1850.0/9711.0)
Total  13895  8648  0.117    (2637.0/22543.0)
---------------------------------------------------------------------------------
Report for Best_GBM_model_from_Grid: 
AUC:  0.9501784192528153
F1:  [[0.9923519309572126, 0.8677388260362233]]
MSE:  0.1893199152622362
Gini:  0.9003568385056306

### Best AUC from the base learners

In [None]:
# Best AUC from the base learner models
best_auc = max(model_perf_GLM_default.auc(), model_perf_GLM_regularized.auc(), \
               model_perf_Best_GLM_model_from_Grid.auc(), \
               model_perf_RF_default_settings.auc(), \
               model_perf_Best_RF_model_from_Grid.auc(), \
               model_perf_GBM_default_settings.auc(), \
               model_perf_Best_GBM_model_from_Grid.auc())

print("Best AUC out of all the models performed: ", format(best_auc))

Best AUC out of all the models performed:  0.9542264508510414


### AUC from the Ensemble Learner

In [None]:
# Eval ensemble performance on the test data
Ensemble_model = ensemble.model_performance(test)
print("---------------------------------------------------------------------------------")
print("Report for ensemble: ")
report(Ensemble_model)

---------------------------------------------------------------------------------
Report for ensemble: 
AUC:  0.932440007648729
F1:  [[0.9999910155305758, 0.8765419082762257]]
MSE:  0.1881140496450202
Gini:  0.8648800152974581
F0.5:  [[0.9999910155305758, 0.8892549254497615]]
F2:  [[0.9999340025481704, 0.9139721616899307]]
Accuracy:  [[0.9999910155305758, 0.8961096570997649]]
Log loss:  1.868792661393939
MSE:  0.1881140496450202
Confusion matrix:  Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9999910155305758
       0      1     Error    Rate
-----  -----  ----  -------  ----------------
0      11887  945   0.0736   (945.0/12832.0)
1      1397   8314  0.1439   (1397.0/9711.0)
Total  13284  9259  0.1039   (2342.0/22543.0)


In [None]:
print(Ensemble_model)

ModelMetricsBinomial: stackedensemble
** Reported on test data. **

MSE: 0.1881140496450202
RMSE: 0.43372116577937514
LogLoss: 1.868792661393939
Mean Per-Class Error: 0.10875074808473616
AUC: 0.932440007648729
AUCPR: 0.8797636626772504
Gini: 0.8648800152974581

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.9999910155305758
       0      1     Error    Rate
-----  -----  ----  -------  ----------------
0      11887  945   0.0736   (945.0/12832.0)
1      1397   8314  0.1439   (1397.0/9711.0)
Total  13284  9259  0.1039   (2342.0/22543.0)

Maximum Metrics: Maximum metrics at their respective thresholds
metric                       threshold    value     idx
---------------------------  -----------  --------  -----
max f1                       0.999991     0.876542  0
max f2                       0.999934     0.913972  2
max f0point5                 0.999991     0.889255  0
max accuracy                 0.999991     0.89611   0
max precision                0.999991     0.897937  0
m

#Loading pretrained models

In [None]:
''' # Set up Stacked Ensemble
ensemble = H2OStackedEnsembleEstimator(model_id = "ensemble", base_models = all_models, metalearner_algorithm = "deeplearning")

# uses GLM as the default metalearner
ensemble.train(y = target, training_frame = train)
ensemble = h2o.load_model("/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GBM_default")
Ensemble_model = ensemble.model_performance(test) '''

' # Set up Stacked Ensemble\nensemble = H2OStackedEnsembleEstimator(model_id = "ensemble", base_models = all_models, metalearner_algorithm = "deeplearning")\n\n# uses GLM as the default metalearner\nensemble.train(y = target, training_frame = train)\nensemble = h2o.load_model("/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GBM_default")\nEnsemble_model = ensemble.model_performance(test) '

In [None]:
''' hyper_parameters = { 'alpha': [0.0001, 0.001, 0.01, 0.1],
                     'lambda': [0.001, 0.01, 0.1] }
search_criteria = { 'strategy': "RandomDiscrete", 
                    'stopping_metric': "AUTO",
                    'stopping_rounds': 5}

GLM_grid_search = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial', \
                  nfolds = 10, fold_assignment = "Modulo", \
                  keep_cross_validation_predictions = True),\
                  hyper_parameters, grid_id="GLM_grid", search_criteria=search_criteria)

#GLM_grid_search.train(x= predictors,y= target, training_frame=train)
GLM_grid_search = h2o.load_model("/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GLM_grid_model_10")
GLM_grid_search_performance = GLM_grid_search.model_performance(test)
 '''

' hyper_parameters = { \'alpha\': [0.0001, 0.001, 0.01, 0.1],\n                     \'lambda\': [0.001, 0.01, 0.1] }\nsearch_criteria = { \'strategy\': "RandomDiscrete", \n                    \'stopping_metric\': "AUTO",\n                    \'stopping_rounds\': 5}\n\nGLM_grid_search = H2OGridSearch(H2OGeneralizedLinearEstimator(family=\'binomial\',                   nfolds = 10, fold_assignment = "Modulo",                   keep_cross_validation_predictions = True),                  hyper_parameters, grid_id="GLM_grid", search_criteria=search_criteria)\n\n#GLM_grid_search.train(x= predictors,y= target, training_frame=train)\nGLM_grid_search = h2o.load_model("/content/drive/MyDrive/Ensemble model/Heterogeneous_Ensemble_to_Predict_Credit_Card_Default_using_H2O/models/GLM_grid_model_10")\nGLM_grid_search_performance = GLM_grid_search.model_performance(test)\n '