### Create a simple h2o cars price model

Model based on scraped gaspedaal data

In [16]:
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

### import csv file

In [25]:
cars = pd.read_csv("cars.csv")
cars = (
    cars
    .assign(ouderdom = 2016 - cars.bouwjaar + 1)
)

### start h2o and upload

In [43]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "13.0.1" 2019-10-15; OpenJDK Runtime Environment (build 13.0.1+9); OpenJDK 64-Bit Server VM (build 13.0.1+9, mixed mode, sharing)
  Starting server from /Users/lamlon/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/y6/jrqktfnx2dxdcryrpygdr2s9rt72yb/T/tmpcv2rj9zn
  JVM stdout: /var/folders/y6/jrqktfnx2dxdcryrpygdr2s9rt72yb/T/tmpcv2rj9zn/h2o_lamlon_started_from_python.out
  JVM stderr: /var/folders/y6/jrqktfnx2dxdcryrpygdr2s9rt72yb/T/tmpcv2rj9zn/h2o_lamlon_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Amsterdam
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.1
H2O_cluster_version_age:,23 days
H2O_cluster_name:,H2O_from_python_lamlon_hlsyk8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [44]:
carsh = h2o.H2OFrame(cars)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [45]:
train, test = carsh.split_frame()

In [46]:
train.head(5)

KM,bouwjaar,Prijs,Transmissie,Merk,Model,Motor,Brandstof,ouderdom
12865,2013,7250,Handgeschakeld,Peugeot,107,998,Benzine,4
4620,2016,11339,Handgeschakeld,Peugeot,108,998,Benzine,1
198,2003,1275,Handgeschakeld,Ford,Focus,1388,Benzine,14
451,2016,17144,Handgeschakeld,Ford,Fiesta,1499,Diesel,1
999,2015,29995,Automaat,Ford,C-Max,1999,Anders,2




### Just lazy, turn on autoML

In [47]:
aml = H2OAutoML(max_runtime_secs = 30)

In [48]:
aml.train(
    y = "Prijs",
    x = ["ouderdom", "KM"],
    training_frame = train,
    validation_frame = test
)

AutoML progress: |
21:32:54.683: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.

████████████████████████████████████████████████████████| 100%


### dump winner model to mojo

In [54]:
modelfile = aml.leader.download_mojo(path=".", get_genmodel_jar=True)

In [49]:
aml.leaderboard

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_AutoML_20200426_213254,436915000.0,20902.5,436915000.0,8212.61,0.703241
StackedEnsemble_BestOfFamily_AutoML_20200426_213254,438289000.0,20935.3,438289000.0,8192.48,0.700179
DeepLearning_1_AutoML_20200426_213254,469795000.0,21674.8,469795000.0,8161.79,
GBM_3_AutoML_20200426_213254,535291000.0,23136.4,535291000.0,11438.5,1.18275
GBM_2_AutoML_20200426_213254,537657000.0,23187.4,537657000.0,11525.2,1.18944
GBM_4_AutoML_20200426_213254,538523000.0,23206.1,538523000.0,11526.3,1.18925
GBM_5_AutoML_20200426_213254,538592000.0,23207.6,538592000.0,11528.0,1.18936
GBM_1_AutoML_20200426_213254,541556000.0,23271.4,541556000.0,11634.6,1.19762
GLM_1_AutoML_20200426_213254,560873000.0,23682.8,560873000.0,12152.5,1.23359
XGBoost_2_AutoML_20200426_213254,631619000.0,25132.0,631619000.0,11517.0,1.14732




In [32]:
perf = aml.leader.model_performance(test)

In [33]:
perf


ModelMetricsRegressionGLM: stackedensemble
** Reported on test data. **

MSE: 495401577.72121614
RMSE: 22257.618419795414
MAE: 8321.435026980478
RMSLE: 0.7082577129686142
R^2: 0.2092245177908143
Mean Residual Deviance: 495401577.72121614
Null degrees of freedom: 108969
Residual degrees of freedom: 108960
Null deviance: 68269864677839.06
Residual deviance: 53983909924280.92
AIC: 2490940.676862295




In [55]:
modelfile

'/Users/lamlon/Documents/Personal/Projects/cars_model/StackedEnsemble_AllModels_AutoML_20200426_213254.zip'

In [38]:
h2o.shutdown()

H2O session _sid_92bf closed.


## Score using mojo

In [56]:
mycars = pd.DataFrame({"ouderdom":[1,3], "KM":[2000, 40000]})
mycars

Unnamed: 0,ouderdom,KM
0,1,2000
1,3,40000


In [57]:
h2o.mojo_predict_pandas(mycars,mojo_zip_path = modelfile)

Unnamed: 0,predict
0,34985.790999
1,22556.508331
