In [1]:
import h2o
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_121"; OpenJDK Runtime Environment (Zulu 8.20.0.5-linux64) (build 1.8.0_121-b15); OpenJDK 64-Bit Server VM (Zulu 8.20.0.5-linux64) (build 25.121-b15, mixed mode)
  Starting server from /home/alchemist/.conda/envs/tensorflow/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp4cj5xkck
  JVM stdout: /tmp/tmp4cj5xkck/h2o_alchemist_started_from_python.out
  JVM stderr: /tmp/tmp4cj5xkck/h2o_alchemist_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,03 secs
H2O cluster timezone:,Asia/Kolkata
H2O data parsing timezone:,UTC
H2O cluster version:,3.23.0.4597
H2O cluster version age:,13 days
H2O cluster name:,H2O_from_python_alchemist_uh8t0y
H2O cluster total nodes:,1
H2O cluster free memory:,1.280 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [3]:
data = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/airlines/allyears2k_headers.zip")
train, valid, test = data.split_frame([0.8,0.1],seed = 69)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
print("%d %d %d"% (train.nrows,valid.nrows, test.nrows))

35255 4272 4451


In [5]:
y = "IsArrDelayed"
ignoreFields = [
    "ArrDelay","DepDelay","CarrierDelay",
    "WeatherDelay","NASDelay","SecurityDelay",
    "LateAircraftDelay","IsDepDelayed","IsArrDelayed","ActualElapsedTime","Arrtime","TailNum"] #But CRSElapsedTime is fine.
x = [i for i in train.names if i not in ignoreFields]

In [6]:
nfolds = 5
train2 = train.rbind(valid)

In [7]:
train2.nrow

39527

In [8]:
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

In [9]:
m_GLM = H2OGeneralizedLinearEstimator(
    family = "binomial",
    model_id = "glm_def",
    nfolds = nfolds,
    fold_assignment = "Modulo",
    keep_cross_validation_predictions = True
)
m_GLM.train(x,y,train2)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [10]:
m_GBM = H2OGradientBoostingEstimator(
    model_id = "gbm_def",
    nfolds = nfolds,
    fold_assignment = "Modulo",
    keep_cross_validation_predictions = True
)
m_GBM.train(x,y,train2)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [11]:
m_RF = H2ORandomForestEstimator(
    model_id = "rf_def",
    nfolds = nfolds,
    fold_assignment = "Modulo",
    keep_cross_validation_predictions = True
)
m_RF.train(x,y,train2)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [12]:
model = [m_GLM.model_id,m_GBM.model_id,m_RF.model_id]

In [13]:
m_SE = H2OStackedEnsembleEstimator(model_id = "se_glm_gbm_rf", base_models = model)
m_SE.train(x, y, train2)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [14]:
import pandas as pd

### Analyzing the performance

In [20]:
all_models = [m_GLM, m_GBM, m_RF , m_SE]

In [21]:
names = ["GLM","GBM","RF","SE"]

In [22]:
pd.Series(map(lambda x : x.logloss(),all_models),names)

GLM    0.573228
GBM    0.468854
RF     0.457297
SE     0.162360
dtype: float64

In [24]:
pd.Series(map(lambda x : x.auc(),all_models),names)

GLM    0.768445
GBM    0.897356
RF     0.887733
SE     0.996629
dtype: float64

In [19]:
pd.Series(map(lambda x : x.auc(xval = True),all_models),names)

GLM    0.761067
GBM    0.857355
RF     0.889048
SE          NaN
dtype: float64

In [25]:
test_perf = list(map(lambda x : x.model_performance(test),all_models))

In [26]:
pd.Series(map(lambda x : x.logloss(),test_perf),names)

GLM    0.580618
GBM    0.499171
RF     0.419713
SE     0.395240
dtype: float64

In [27]:
pd.Series(map(lambda x : x.auc(),test_perf),names)

GLM    0.755456
GBM    0.859658
RF     0.900333
SE     0.901220
dtype: float64