# Chapter 25 - Preprocessing - Estimator/Transformer

In [8]:
%ShowTypes on

Types will be printed.


# Setup

In [34]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import org.apache.spark.ml.feature._
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector

import spark.implicits._

import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

In [10]:
%%html
<!-- To left align the HTML components in Markdown -->
<style>
table {float:left}
</style>

### Spark parition control based on core availability

In [11]:
val NUM_CORES = 4
val NUM_PARTITIONS = 4

lazy val spark: SparkSession = SparkSession.builder()
    .appName("mllib-cross-validation")
    .getOrCreate()

spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
/*
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://masa:7077")
*/

NUM_CORES: Int = 4
NUM_PARTITIONS: Int = 4
spark: org.apache.spark.sql.SparkSession = <lazy>


spark: org.apache.spark.sql.SparkSession = <lazy>


In [12]:
val configMap = spark.conf.getAll.foreach(println)

(spark.serializer,org.apache.spark.serializer.KryoSerializer)
(spark.driver.host,172.17.0.1)
(spark.eventLog.enabled,true)
(spark.driver.port,35271)
(spark.hadoop.validateOutputSpecs,True)
(spark.repl.class.uri,spark://172.17.0.1:35271/classes)
(spark.jars,file:/home/oonisim/.local/share/jupyter/kernels/apache_toree_scala/lib/toree-assembly-0.3.0-incubating.jar)
(spark.repl.class.outputDir,/tmp/spark-0457f37e-a701-4ec8-a1fb-241b65acf6f1/repl-33d143d8-b21e-4a5b-9877-5ed0f16b42e6)
(spark.app.name,mllib-cross-validation)
(spark.driver.memory,3g)
(spark.executor.instances,2)
(spark.history.fs.logdirectory,hdfs://oonisim:8020/logs_spark)
(spark.default.parallelism,16)
(spark.executor.id,driver)
(spark.submit.deployMode,client)
(spark.master,yarn)
(spark.ui.filters,org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter)
(spark.executor.memory,4g)
(spark.eventLog.dir,hdfs://oonisim:8020/logs_spark)
(spark.executor.cores,4)
(spark.driver.appUIAddress,http://172.17.0.1:4040)
(spark.org.apac

configMap: Unit = ()


## Constants

In [13]:
val PROTOCOL="file://"
val DATA_DIR="/home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe/data"
val RESULT_DIR="."

PROTOCOL: String = file://
DATA_DIR: String = /home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe/data
RESULT_DIR: String = .


RESULT_DIR: String = .


# Dataframe

In [16]:
val sales = spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load(PROTOCOL + DATA_DIR + "/retail-data/by-day/*.csv")
  .coalesce(5)
  .where("Description IS NOT NULL")
val fakeIntDF = spark.read.parquet(PROTOCOL + DATA_DIR + "/simple-ml-integers")
var simpleDF = spark.read.json(PROTOCOL + DATA_DIR + "/simple-ml")
val scaleDF = spark.read.parquet(PROTOCOL + DATA_DIR + "/simple-ml-scaling")

sales: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [InvoiceNo: string, StockCode: string ... 6 more fields]
fakeIntDF: org.apache.spark.sql.DataFrame = [int1: int, int2: int ... 1 more field]
simpleDF: org.apache.spark.sql.DataFrame = [color: string, lab: string ... 2 more fields]
scaleDF: org.apache.spark.sql.DataFrame = [id: int, features: vector]


lastException: Throwable = null


scaleDF: org.apache.spark.sql.DataFrame = [id: int, features: vector]


In [18]:
sales.cache()
sales.printSchema
sales.show()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538| 

In [52]:
val countries = sales.select("Country").distinct
println(countries.count)
countries.show(5)

38
+---------+
|  Country|
+---------+
|Australia|
|   Israel|
|   Sweden|
|  Denmark|
|  Bahrain|
+---------+
only showing top 5 rows



countries: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Country: string]


countries: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Country: string]


# Estimators

## OneHotEncoderEstimator
37 bits are used to express 38 countries and Australia is encoded as 23th bit, Israel is encoded as 18th bit.

```
|Australia|          23.0|(37,[23],[1.0])|
|   Israel|          18.0|(37,[18],[1.0])|
```

In [53]:
// Convert categorical string to numerical index
val indexer = new StringIndexer()
  .setInputCol("Country")
  .setOutputCol("CountryIndexed")

// Convert numerical category into one hot encode
val encoder = new OneHotEncoderEstimator()
  .setInputCols(Array("CountryIndexed"))
  .setOutputCols(Array("CountryEncoded"))

// Pipeline is also an estimator that generates a transformer
val pipeline = new Pipeline()
  .setStages(Array(indexer, encoder))

// Fit the pipeline to generate transformer (PipelineModel)
val model = pipeline.fit(countries)

// Transform country into one hot encoded 
model.transform(countries).show(3)

+---------+--------------+---------------+
|  Country|CountryIndexed| CountryEncoded|
+---------+--------------+---------------+
|Australia|          23.0|(37,[23],[1.0])|
|   Israel|          18.0|(37,[18],[1.0])|
|   Sweden|          20.0|(37,[20],[1.0])|
+---------+--------------+---------------+
only showing top 3 rows



indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_891d875559f0
encoder: org.apache.spark.ml.feature.OneHotEncoderEstimator = oneHotEncoder_707c9ebf5548
pipeline: org.apache.spark.ml.Pipeline = pipeline_f6298553a1a0
model: org.apache.spark.ml.PipelineModel = pipeline_f6298553a1a0


model: org.apache.spark.ml.PipelineModel = pipeline_f6298553a1a0


# Transformers

## SQLTransformer

In [23]:
import org.apache.spark.ml.feature.SQLTransformer

val basicTransformation = new SQLTransformer()
  .setStatement("""
    SELECT sum(Quantity), count(*), CustomerID
    FROM __THIS__
    GROUP BY CustomerID
  """)

basicTransformation.transform(sales).show(3)

+-------------+--------+----------+
|sum(Quantity)|count(1)|CustomerID|
+-------------+--------+----------+
|         3810|     474|   18041.0|
|          129|      16|   16619.0|
|         1070|     107|   12782.0|
+-------------+--------+----------+
only showing top 3 rows



basicTransformation: org.apache.spark.ml.feature.SQLTransformer = sql_2cd839464d13


basicTransformation: org.apache.spark.ml.feature.SQLTransformer = sql_2cd839464d13


## VectorAssembler
Group multiple columns into a vector (to be processed as ML features)

In [27]:
import org.apache.spark.ml.feature.VectorAssembler
val va = new VectorAssembler()
    .setInputCols(Array("int1", "int2", "int3"))
    .setOutputCol("features")
va.transform(fakeIntDF).show()

+----+----+----+-------------+
|int1|int2|int3|     features|
+----+----+----+-------------+
|   7|   8|   9|[7.0,8.0,9.0]|
|   1|   2|   3|[1.0,2.0,3.0]|
|   4|   5|   6|[4.0,5.0,6.0]|
+----+----+----+-------------+



va: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_c7d496a88fa5


va: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_c7d496a88fa5
