In [1]:
from pysparkling import H2OContext, H2OConf

In [2]:
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SQLContext

In [3]:
conf = SparkConf()
conf.setMaster("local").setAppName("test")
conf.set("spark.sql.shuffle.partitions", 3)
conf.set("spark.default.parallelism", 3)
conf.set("spark.debug.maxToStringFields", 100)
sc = pyspark.SparkContext(conf=conf)

In [4]:
sqlContext = SQLContext(sc)

In [5]:
hc = H2OContext.getOrCreate(sc, H2OConf(sc).set_internal_cluster_mode())

Method H2OContext.getOrCreate with argument of type SparkContext is deprecated and parameter of type SparkSession is preferred.
Method H2OContext.getOrCreate with argument of type SparkContext is deprecated and parameter of type SparkSession is preferred.


Connecting to H2O server at http://10.74.33.10:54323 ... successful.


0,1
H2O cluster uptime:,27 secs
H2O cluster timezone:,Europe/Paris
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,1 month and 20 days
H2O cluster name:,sparkling-water-lrubio_local-1580990243875
H2O cluster total nodes:,1
H2O cluster free memory:,810 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4



Sparkling Water Context:
 * Sparkling Water Version: 3.28.0.1-1-2.4
 * H2O name: sparkling-water-lrubio_local-1580990243875
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,10.74.33.10,54323)
  ------------------------

  Open H2O Flow in browser: http://10.74.33.10:54323 (CMD + click in Mac OSX)

    


In [6]:
from pyspark.sql.types import StructType, StructField, FloatType, StringType

schema = StructType([
    StructField("sepal_length", FloatType(), True),
    StructField("sepal_width", FloatType(), True),
    StructField("petal_length", FloatType(), True),
    StructField("petal_width", FloatType(), True),
    StructField("class", StringType(), True)])

In [7]:
iris_df = sqlContext.read \
        .format('com.databricks.spark.csv') \
        .option('header', 'false') \
        .option('delimiter', ',') \
        .schema(schema) \
        .load('../../../../Downloads/iris.data')

In [8]:
from pyspark.ml.pipeline import Pipeline
from ai.h2o.sparkling.ml.algos import H2OGridSearch
from ai.h2o.sparkling.ml.algos import H2OGBM

In [9]:
gbm_params = {'learnRate': [0.01, 0.1],
              'ntrees': [100 , 200, 300, 500]}
gbm_grid = H2OGridSearch()\
    .setLabelCol("class") \
    .setHyperParameters(gbm_params)\
    .setAlgo(H2OGBM().setMaxDepth(30))

model_pipeline = Pipeline().setStages([gbm_grid])
model = model_pipeline.fit(iris_df)

Py4JJavaError: An error occurred while calling o111.fit.
: java.lang.NullPointerException
	at ai.h2o.sparkling.ml.algos.H2OGridSearch.extractH2OParameters(H2OGridSearch.scala:352)
	at ai.h2o.sparkling.ml.algos.H2OGridSearch.fit(H2OGridSearch.scala:64)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)


In [10]:
gbm_grid = H2OGridSearch()\
    .setLabelCol("class") \
    .setAlgo(H2OGBM().setMaxDepth(30))

model_pipeline = Pipeline().setStages([gbm_grid])
model = model_pipeline.fit(iris_df)

Py4JJavaError: An error occurred while calling o615.fit.
: java.lang.NullPointerException
	at ai.h2o.sparkling.ml.algos.H2OGridSearch.extractH2OParameters(H2OGridSearch.scala:352)
	at ai.h2o.sparkling.ml.algos.H2OGridSearch.fit(H2OGridSearch.scala:64)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)


In [11]:
gbm_grid = H2OGridSearch(algo=H2OGBM().setMaxDepth(30),
                         #hyperParameters=gbm_params,
                         withDetailedPredictionCol=True,
                         labelCol='class',
                         stoppingMetric="AUC")
model_pipeline = Pipeline().setStages([gbm_grid])
model = model_pipeline.fit(iris_df)
model.stages[0].transform(iris_df).head()

Row(sepal_length=5.099999904632568, sepal_width=3.5, petal_length=1.399999976158142, petal_width=0.20000000298023224, class='Iris-setosa', detailed_prediction=Row(label='Iris-setosa', probabilities=[0.9989910227091852, 0.0005042546003770538, 0.0005047226904378534]), prediction='Iris-setosa')

In [12]:
gbm_grid = H2OGridSearch(algo=H2OGBM().setMaxDepth(30),
                         hyperParameters={'learnRate': [0.01, 0.1]},
                         withDetailedPredictionCol=True,
                         labelCol='class',
                         stoppingMetric="AUC")
model_pipeline = Pipeline().setStages([gbm_grid])
model = model_pipeline.fit(iris_df)
model.stages[0].transform(iris_df).head()

Py4JJavaError: An error occurred while calling o1817.fit.
: java.lang.NoSuchFieldException: learnRate
	at java.lang.Class.getField(Unknown Source)
	at ai.h2o.sparkling.ml.algos.H2OGridSearch.findField(H2OGridSearch.scala:170)
	at ai.h2o.sparkling.ml.algos.H2OGridSearch.processHyperParams(H2OGridSearch.scala:154)
	at ai.h2o.sparkling.ml.algos.H2OGridSearch.fit(H2OGridSearch.scala:71)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)


In [14]:
gbm_model = H2OGBM(labelCol='class',
                   withDetailedPredictionCol=True).setLearnRate(0.01).setMaxDepth(5).setNtrees(100)

model_pipeline = Pipeline().setStages([gbm_model])
model = model_pipeline.fit(iris_df)
model.stages[0].transform(iris_df).head()

Row(sepal_length=5.099999904632568, sepal_width=3.5, petal_length=1.399999976158142, petal_width=0.20000000298023224, class='Iris-setosa', detailed_prediction=Row(label='Iris-setosa', probabilities=[0.791673028985822, 0.10416326177822503, 0.10416370923595292]), prediction='Iris-setosa')