In [1]:
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "11.0.3" 2019-04-16 LTS; Java(TM) SE Runtime Environment 18.9 (build 11.0.3+12-LTS); Java HotSpot(TM) 64-Bit Server VM 18.9 (build 11.0.3+12-LTS, mixed mode)
  Starting server from /Users/charlottefeng/opt/anaconda3/envs/ml/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/xp/08mc5j0141gd6235k_vvnd3h0000gn/T/tmp0x4kwle2
  JVM stdout: /var/folders/xp/08mc5j0141gd6235k_vvnd3h0000gn/T/tmp0x4kwle2/h2o_charlottefeng_started_from_python.out
  JVM stderr: /var/folders/xp/08mc5j0141gd6235k_vvnd3h0000gn/T/tmp0x4kwle2/h2o_charlottefeng_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,1 month and 13 days
H2O_cluster_name:,H2O_from_python_charlottefeng_plx6ap
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


# Data Import

In [2]:
df = h2o.import_file("data/train.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [3]:
df.shape

(2000, 21)

In [4]:
df.columns

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time',
 'three_g',
 'touch_screen',
 'wifi',
 'price_range']

# Preprocessing

## Encode response variable

- y: `price_range`.
    - 0(low cost), 1(medium cost), 2(high cost) and 3(very high cost).

In [5]:
df['price_range'] = df['price_range'].asfactor()
df['price_range'].levels()

[['0', '1', '2', '3']]

## Train-test-split

In [6]:
train, val, test = df.split_frame(ratios=[0.8, 0.1], seed=0)

In [7]:
print('Training set size:', train.nrow)
print('Validation set size:', val.nrow)
print('Test set size:', test.nrow)

Training set size: 1597
Validation set size: 206
Test set size: 197


In [8]:
y = 'price_range'
x = list(df.columns)
x.remove(y)

# Modeling

## Random Forest

In [9]:
from h2o.estimators import H2ORandomForestEstimator

In [10]:
rf = H2ORandomForestEstimator(seed=0)

rf.train(x=x, y=y, training_frame=train, validation_frame=val)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [11]:
rf_perf = rf.model_performance(test_data=test)

In [12]:
rf_perf


ModelMetricsMultinomial: drf
** Reported on test data. **

MSE: 0.16355058089756336
RMSE: 0.40441387327534073
LogLoss: 0.502290467124602
Mean Per-Class Error: 0.11288950932255062

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,0,1,2,3,Error,Rate
0,46.0,5.0,0.0,0.0,0.098039,5 / 51
1,2.0,48.0,2.0,0.0,0.076923,4 / 52
2,1.0,5.0,37.0,4.0,0.212766,10 / 47
3,0.0,0.0,3.0,44.0,0.06383,3 / 47
4,49.0,58.0,42.0,48.0,0.111675,22 / 197



Top-4 Hit Ratios: 


Unnamed: 0,k,hit_ratio
0,1,0.888325
1,2,0.989848
2,3,1.0
3,4,1.0




In [13]:
print('Training loss:', rf.logloss(train=True))
print('Validation loss:', rf.logloss(valid=True))

Training loss: 0.6051669360631697
Validation loss: 0.5161306617719451


## GBM

In [14]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

gbm = H2OGradientBoostingEstimator(seed=0)
gbm.train(x=x, y=y, training_frame=train, validation_frame=val)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [15]:
gbm_perf = gbm.model_performance(test_data=test)

In [16]:
gbm_perf


ModelMetricsMultinomial: gbm
** Reported on test data. **

MSE: 0.06101225604688036
RMSE: 0.24700659110007644
LogLoss: 0.1997476830528826
Mean Per-Class Error: 0.08324909341805463

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,0,1,2,3,Error,Rate
0,49.0,2.0,0.0,0.0,0.039216,2 / 51
1,0.0,50.0,2.0,0.0,0.038462,2 / 52
2,0.0,5.0,37.0,5.0,0.212766,10 / 47
3,0.0,0.0,2.0,45.0,0.042553,2 / 47
4,49.0,57.0,41.0,50.0,0.081218,16 / 197



Top-4 Hit Ratios: 


Unnamed: 0,k,hit_ratio
0,1,0.918782
1,2,1.0
2,3,1.0
3,4,1.0




In [17]:
print('Training loss:', gbm.logloss(train=True))
print('Validation loss:', gbm.logloss(valid=True))

Training loss: 0.04494925795614103
Validation loss: 0.22848238084995362


## GBM with Random Search of Hyperparameters
- [Reference](https://github.com/h2oai/h2o-tutorials/blob/master/h2o-open-tour-2016/chicago/grid-search-model-selection.ipynb)

In [18]:
gbm_params = {
    'learn_rate': [i * 0.01 for i in range(1, 11)], 
    'max_depth': list(range(2, 11)),
    'sample_rate': [i * 0.1 for i in range(5, 11)],
    'col_sample_rate': [i * 0.1 for i in range(1, 11)]
}

search_criteria = {'strategy': 'RandomDiscrete', 'max_models': 36}

In [19]:
from h2o.grid.grid_search import H2OGridSearch

gbm_grid = H2OGridSearch(model=H2OGradientBoostingEstimator,
                         hyper_params=gbm_params,
                         search_criteria=search_criteria)
gbm_grid.train(x=x, y=y, 
               training_frame=train, 
               validation_frame=val, 
               ntrees=100,
               seed=0)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [20]:
gbm_grid_perf = gbm_grid.get_grid(sort_by='logloss', decreasing=False)

In [21]:
gbm_grid_perf

          col_sample_rate learn_rate max_depth         sample_rate  \
0                     0.8       0.06         6  0.6000000000000001   
1                     0.9       0.05        10  0.6000000000000001   
2                     0.9       0.08         9  0.7000000000000001   
3                     0.9       0.05         7  0.7000000000000001   
4                     0.4        0.1         4                 1.0   
5                     0.8       0.08         3                 0.8   
6                     0.9       0.08         7  0.7000000000000001   
7                     0.9       0.08         8  0.7000000000000001   
8                     0.5       0.07         5                 0.5   
9                     1.0       0.06         4                 1.0   
10                    0.8       0.09         8  0.7000000000000001   
11     0.6000000000000001       0.08         5                 1.0   
12     0.6000000000000001       0.05         6                 1.0   
13     0.60000000000



In [22]:
best_gbm_model = gbm_grid_perf.models[0]

In [23]:
gbm_best_perf = best_gbm_model.model_performance(test_data=test)

In [24]:
gbm_best_perf


ModelMetricsMultinomial: gbm
** Reported on test data. **

MSE: 0.058305168991132084
RMSE: 0.2414646330027072
LogLoss: 0.1912323538372744
Mean Per-Class Error: 0.07363370880267

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,0,1,2,3,Error,Rate
0,49.0,2.0,0.0,0.0,0.039216,2 / 51
1,0.0,52.0,0.0,0.0,0.0,0 / 52
2,0.0,5.0,38.0,4.0,0.191489,9 / 47
3,0.0,0.0,3.0,44.0,0.06383,3 / 47
4,49.0,59.0,41.0,48.0,0.071066,14 / 197



Top-4 Hit Ratios: 


Unnamed: 0,k,hit_ratio
0,1,0.928934
1,2,1.0
2,3,1.0
3,4,1.0




In [25]:
print(gbm_best_perf.logloss())

0.1912323538372744
