<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#GBTRegressor" data-toc-modified-id="GBTRegressor-1">GBTRegressor</a></span><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1.1">Load Data</a></span></li><li><span><a href="#Train-test-split" data-toc-modified-id="Train-test-split-1.2">Train-test split</a></span></li><li><span><a href="#Build-Pipeline" data-toc-modified-id="Build-Pipeline-1.3">Build Pipeline</a></span></li><li><span><a href="#Train-Model" data-toc-modified-id="Train-Model-1.4">Train Model</a></span></li><li><span><a href="#Predict-&amp;-Validate" data-toc-modified-id="Predict-&amp;-Validate-1.5">Predict &amp; Validate</a></span></li><li><span><a href="#Feature-Importances" data-toc-modified-id="Feature-Importances-1.6">Feature Importances</a></span></li></ul></li><li><span><a href="#h2o" data-toc-modified-id="h2o-2">h2o</a></span><ul class="toc-item"><li><span><a href="#Convert-to-h2o-Frame" data-toc-modified-id="Convert-to-h2o-Frame-2.1">Convert to h2o Frame</a></span></li><li><span><a href="#Pre-process" data-toc-modified-id="Pre-process-2.2">Pre-process</a></span></li><li><span><a href="#AutoML" data-toc-modified-id="AutoML-2.3">AutoML</a></span></li></ul></li></ul></div>

In [63]:
# Standard lib
import os

# Standard pyspark lib
from pyspark import SparkContext

# SparkML
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.classification import GBTClassifier

# SparkSQL
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# H2O
import h2o
from h2o.automl import H2OAutoML
from pysparkling import *

In [64]:
# Get the ball rolling: fetch the active SparkContext / SparkSession,
# creating them if this kernel does not have one yet
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

------------------

# GBTRegressor

## Load Data

In [65]:
# Data path: CSV input, given as a relative path
data_path = "../../data/processed/lite_300.csv"

In [66]:
# TODO: s3_utils.get_data()
# Read the CSV, treating the first row as a header and letting Spark infer
# the column types
df_pills = (
    ss.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(data_path)
)

In [67]:
# n_observations: row count of the loaded frame
df_pills.count()

300

-------------

## Train-test split

In [68]:
# Train-test split (80/20). A fixed seed is passed so that repeated runs draw
# the same partition; without it the split -- and every score derived from
# it -- changes on each execution.
df_train, df_test = df_pills.randomSplit(weights=[0.80, 0.20], seed=42)

-------

## Build Pipeline

In [69]:
# Predictor columns: everything except the target "label" column.
# Filtering by name (instead of slicing off the last column) keeps this
# correct even when "label" is not the final column of the input file.
train_cols = [name for name in df_train.columns if name != "label"]

In [70]:
# Transformer: packs the predictor columns (``train_cols``, i.e. without
# "label") into a single "features" vector column
va = VectorAssembler(inputCols=train_cols, outputCol="features")

# Estimator: gradient-boosted trees regressor with maxIter set to 100
gbt = GBTRegressor(maxIter=100)

In [71]:
# Assemble features: vectorise each split, keep only the two columns the
# estimator/evaluator read, and persist the result for reuse
train_lpoints = (
    va.transform(df_train)
    .select("features", "label")
    .persist()
)
test_lpoints = (
    va.transform(df_test)
    .select("features", "label")
    .persist()
)

---------------

## Train Model

In [72]:
# Train model: fit the GBT regressor on the assembled training rows
gbt_model = gbt.fit(train_lpoints)

-----------

## Predict & Validate

In [73]:
# Predict: apply the fitted model to the held-out rows
gbt_predict = gbt_model.transform(test_lpoints)

In [74]:
# Validate: evaluator configured for mean absolute error ("mae"), comparing
# the "prediction" column against the "label" column
metric = RegressionEvaluator(
    metricName="mae",
    labelCol="label",
    predictionCol="prediction",
)

# Evaluate the held-out predictions
score = metric.evaluate(gbt_predict)

In [75]:
# Display the MAE computed by the evaluator in the previous cell
score

10.462529598947514

-----------

## Feature Importances

In [76]:
# Feature-importance vector exposed by the fitted model
feature_importances = gbt_model.featureImportances

# Maps {col_num: importance}, built from the vector's indices and values
importances_dict = {
    col_num: importance
    for col_num, importance in zip(feature_importances.indices,
                                   feature_importances.values)
}

# Tuples (col_num, importance), largest ``importance`` first
importances_tups = sorted(
    importances_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [77]:
# Grab most important features
top_n = 10
top_tups = importances_tups[:top_n]

In [78]:
# Display the selected (col_num, importance) pairs
top_tups

[(0, 0.025772512555205775),
 (66, 0.025054024958800332),
 (47, 0.023112448372701287),
 (43, 0.023041126822569816),
 (14, 0.022110750953390962),
 (83, 0.02195664365415609),
 (7, 0.021263067842595632),
 (79, 0.018695018783958142),
 (42, 0.01830506469394253),
 (3, 0.018230218705549906)]

--------------------

# h2o

In [79]:
# Get the ball rolling: get (or create) the Sparkling Water H2OContext for
# the existing SparkSession
hc = H2OContext.getOrCreate(ss)


Sparkling Water Context:
 * Sparkling Water Version: 3.28.0.1-1-2.4
 * H2O name: sparkling-water-matthew_local-1579260140931
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,192.168.29.219,54321)
  ------------------------

  Open H2O Flow in browser: http://192.168.29.219:54321 (CMD + click in Mac OSX)

    


## Convert to h2o Frame

In [80]:
# Convert the Spark DataFrame to an h2o frame
# NOTE(review): the recorded run of this cell ended in an H2OConnectionError
# (the connection to the H2O node timed out) -- confirm the node is reachable
# before relying on anything derived from ``h2o_df``
h2o_df = hc.as_h2o_frame(df_pills)

H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.29.219', port=54321): Max retries exceeded with url: /3/Frames/frame_rdd_6880_ace81a0bde7fb4883e832d239a2442d0/light?row_count=10&row_offset=0&column_count=-1&full_column_count=-1&column_offset=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x114368d68>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [None]:
# Check shape of the converted frame
h2o_df.shape

-------------

## Pre-process

In [None]:
# Identify target
target = "label"

# Cast target to categorical (enum); this overwrites the column in ``h2o_df``
# NOTE(review): the Spark section fits a regressor on this same column and
# scores it with MAE, whereas an enum target here presumably makes AutoML
# treat the task as classification -- confirm which framing is intended
h2o_df[target] = h2o_df[target].asfactor()

-----------

## AutoML

In [None]:
# Identify predictors: copy the frame's column names, then drop the target
predictors = list(h2o_df.names)
predictors.remove(target)

In [None]:
# Build model: AutoML run with max_runtime_secs=120 and nfolds=5
model = H2OAutoML(
    nfolds=5,
    max_runtime_secs=120,
)

In [None]:
# Train model: run AutoML with the chosen predictors/target.
# The training frame is the whole converted dataset (``h2o_df`` comes from
# ``df_pills``), not the train split used in the Spark section.
model.train(x=predictors, y=target, training_frame=h2o_df)

In [None]:
# Print the AutoML leaderboard
print(model.leaderboard)