# Chapter 23 - RFormula

In [1]:
%run common.ipynb

Name: Error parsing magics!
Message: Magics [run] do not exist!
StackTrace: 

In [2]:
%ShowTypes on

Types will be printed.


# Setup

In [3]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import org.apache.spark.ml.feature._

import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

In [4]:
%%html
<!-- To left align the HTML components in Markdown -->
<style>
table {float:left}
</style>

### Spark partition control based on core availability

In [5]:
// Explicit import so this cell does not rely on the kernel having pre-imported SparkSession.
import org.apache.spark.sql.SparkSession

// Target parallelism is NUM_CORES * NUM_PARTITIONS (NUM_PARTITIONS partitions per core).
val NUM_CORES = 8
val NUM_PARTITIONS = 3

// Lazily built session: nothing is started until `spark` is first referenced below.
lazy val spark: SparkSession = SparkSession.builder()
    .appName("mllib-rformula")
    // spark.default.parallelism is a core (SparkContext-level) property, so it is handed to
    // the builder. Setting it through spark.conf on an existing session does not change the
    // running context. NOTE(review): if a context already exists, getOrCreate() reuses it and
    // this value may not be applied -- confirm against the kernel's launch configuration.
    .config("spark.default.parallelism", (NUM_CORES * NUM_PARTITIONS).toString)
    .getOrCreate()

// spark.sql.shuffle.partitions is a runtime SQL option and can be changed on a live session.
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)

/*
Deployment-level settings kept for reference. These have to be supplied when the
application is launched (spark-submit options / kernel configuration), not on a live session:
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://masa:7077")
*/
import spark.implicits._

Name: org.apache.spark.SparkException
Message: Uncaught exception: org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException: Invalid resource request! Cannot allocate containers as requested resource is greater than maximum allowed allocation. Requested resource type=[vcores], Requested resource=<memory:4505, vCores:8>, maximum allowed allocation=<memory:8192, vCores:4>, please note that maximum allowed allocation is calculated by scheduler based on maximum resource of registered NodeManagers, which might be less than configured maximum allocation=<memory:8192, vCores:4>
	at org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils.throwInvalidResourceException(SchedulerUtils.java:491)
	at org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils.checkResourceRequestAgainstAvailableResource(SchedulerUtils.java:387)
	at org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils.validateResourceRequest(SchedulerUtils.java:315)
	at org.apache

In [6]:
// Keep the configuration map itself in configMap (binding the result of foreach would
// only store Unit), then print one key/value pair per line.
val configMap = spark.conf.getAll
configMap.foreach(println)

Waiting for a Spark session to start...

lastException: Throwable = null


Name: java.lang.IllegalStateException
Message: Spark context stopped while waiting for backend
StackTrace:   at org.apache.spark.scheduler.TaskSchedulerImpl.waitBackendReady(TaskSchedulerImpl.scala:818)
  at org.apache.spark.scheduler.TaskSchedulerImpl.postStartHook(TaskSchedulerImpl.scala:196)
  at org.apache.spark.SparkContext.<init>(SparkContext.scala:560)
  at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2520)
  at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:935)
  at org.apache.spark.sql.SparkSession$Builder$$anonfun$7.apply(SparkSession.scala:926)
  at scala.Option.getOrElse(Option.scala:121)
  at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:926)
  at org.apache.toree.kernel.api.Kernel$$anonfun$1.apply(Kernel.scala:428)
  at org.apache.toree.kernel.api.Kernel$$anonfun$1.apply(Kernel.scala:428)
  at scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
  at scala.concurre

## Constants

In [7]:
// Location constants: URI scheme prefix, input data directory, and output directory.
val PROTOCOL: String = "file://"
val DATA_DIR: String = "/home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe/data"
val RESULT_DIR: String = "."

PROTOCOL: String = file://
DATA_DIR: String = /home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe/data
RESULT_DIR: String = .


lastException: Throwable = null


RESULT_DIR: String = .


# Dataframe

In [None]:
// Load the JSON files under <DATA_DIR>/simple-ml into a DataFrame.
// Declared as val: the DataFrame is never reassigned in this notebook.
val df = spark.read.json(s"${PROTOCOL}${DATA_DIR}/simple-ml")

Waiting for a Spark session to start...

In [None]:
// Print the rows sorted by value2 without truncating long cell values.
df
  .orderBy("value2")
  .show(truncate = false)

# Split train/test data

In [None]:
// 70/30 train/test split. A fixed seed keeps the split (and every metric computed
// from it further down) reproducible between notebook runs.
val Array(train, test) = df.randomSplit(Array(0.7, 0.3), seed = 42L)

# Pipeline

## RFormula

In [None]:
// RFormula cannot be fitted without a formula, and the pipeline below is fitted directly
// (outside the parameter grid), so a default formula is set here. The grid search further
// down supplies its own rFormula.formula candidates during tuning.
val rFormula = new RFormula()
  .setFormula("lab ~ . + color:value1 + color:value2")

## Logistic regression

In [None]:
import org.apache.spark.ml.classification.LogisticRegression

// Logistic regression stage reading the "features" vector and the "label" column.
val lr = new LogisticRegression()
  .setLabelCol("label")
  .setFeaturesCol("features")

## Pipeline

In [None]:
import org.apache.spark.ml.{Pipeline, PipelineModel}

// Two-stage pipeline: the RFormula transformer followed by the logistic regression estimator.
val pipeline = new Pipeline().setStages(Array(rFormula, lr))

// Fits both stages on the training split using whatever params the stages carry right now.
// NOTE(review): rFormula needs a formula set before this line runs -- confirm.
val model: PipelineModel = pipeline.fit(train)

## Parameter Grid

In [None]:
import org.apache.spark.ml.tuning.ParamGridBuilder

// Hyperparameter grid: 2 formulas x 3 elastic-net mixes x 2 regularisation strengths.
val params = {
  val formulaCandidates = Array(
    "lab ~ . + color:value1",
    "lab ~ . + color:value1 + color:value2"
  )
  val elasticNetCandidates = Array(0.0, 0.5, 1.0)
  val regParamCandidates = Array(0.1, 2.0)

  new ParamGridBuilder()
    .addGrid(rFormula.formula, formulaCandidates)
    .addGrid(lr.elasticNetParam, elasticNetCandidates)
    .addGrid(lr.regParam, regParamCandidates)
    .build()
}

## Evaluator

In [None]:
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

// Area under ROC is a ranking metric, so it is computed from the classifier's scores
// ("rawPrediction") rather than from the thresholded 0/1 "prediction" column, which
// would reduce the ROC curve to a single operating point.
val evaluator = new BinaryClassificationEvaluator()
  .setMetricName("areaUnderROC")
  .setRawPredictionCol("rawPrediction")
  .setLabelCol("label")

# Train-Validation Split (hyperparameter tuning)

In [None]:
import org.apache.spark.ml.tuning.TrainValidationSplit

// Hold-out tuning: every param map in `params` is fitted with `pipeline` and scored
// with `evaluator` on the validation part of the data.
val tvs = new TrainValidationSplit()
  .setEstimator(pipeline)
  .setEstimatorParamMaps(params)
  .setEvaluator(evaluator)
  .setTrainRatio(0.75) // also the default.

In [None]:
// Run the train-validation search on the training split and keep the fitted model.
val tvsFitted: org.apache.spark.ml.tuning.TrainValidationSplitModel =
  tvs.fit(train)

In [None]:
// Score the held-out test split with the fitted model and report the evaluator's metric.
{
  val testPredictions = tvsFitted.transform(test)
  evaluator.evaluate(testPredictions)
}