# Chapter 26 - Classification - Logistic Regression

In [9]:
%ShowTypes on

Types will be printed.


# Setup

In [10]:
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import org.apache.spark.ml.feature._
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification._

import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp
import java.time.temporal.ChronoUnit

In [11]:
%%html
<!-- To left align the HTML components in Markdown -->
<style>
table {float:left}
</style>

### Spark parition control based on core availability

In [12]:
val NUM_CORES = 4
val NUM_PARTITIONS = 4

lazy val spark: SparkSession = SparkSession.builder()
    .appName("mllib-cross-validation")
    .getOrCreate()

spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
/*
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://masa:7077")
*/

import spark.implicits._

NUM_CORES: Int = 4
NUM_PARTITIONS: Int = 4
spark: org.apache.spark.sql.SparkSession = <lazy>


spark: org.apache.spark.sql.SparkSession = <lazy>


In [13]:
val configMap = spark.conf.getAll.foreach(println)

(spark.serializer,org.apache.spark.serializer.KryoSerializer)
(spark.driver.host,172.19.210.168)
(spark.eventLog.enabled,true)
(spark.driver.port,46361)
(spark.hadoop.validateOutputSpecs,True)
(spark.repl.class.uri,spark://172.19.210.168:46361/classes)
(spark.jars,file:/home/oonisim/.local/share/jupyter/kernels/apache_toree_scala/lib/toree-assembly-0.3.0-incubating.jar)
(spark.repl.class.outputDir,/tmp/spark-ba348052-5850-48b9-84b2-91a8da343c06/repl-1c601d40-eff9-40ce-92e5-da71ab984df5)
(spark.app.name,mllib-cross-validation)
(spark.driver.memory,3g)
(spark.executor.instances,2)
(spark.history.fs.logdirectory,hdfs://oonisim:8020/logs_spark)
(spark.default.parallelism,16)
(spark.executor.id,driver)
(spark.submit.deployMode,client)
(spark.master,yarn)
(spark.ui.filters,org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter)
(spark.executor.memory,4g)
(spark.eventLog.dir,hdfs://oonisim:8020/logs_spark)
(spark.executor.cores,4)
(spark.driver.appUIAddress,http://172.19.210.168:4040)
(sp

configMap: Unit = ()


## Constants

In [14]:
val PROTOCOL="file://"
val DATA_DIR="/home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe/data"
val RESULT_DIR="."

PROTOCOL: String = file://
DATA_DIR: String = /home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe/data
RESULT_DIR: String = .


RESULT_DIR: String = .


# Dataframe

In [15]:
val bInput = spark.read.format("parquet").load(PROTOCOL + DATA_DIR + "/binary-classification")
  .selectExpr("features", "cast(label as double) as label")

bInput: org.apache.spark.sql.DataFrame = [features: vector, label: double]


bInput: org.apache.spark.sql.DataFrame = [features: vector, label: double]


In [16]:
bInput.show(3)

+--------------+-----+
|      features|label|
+--------------+-----+
|[3.0,10.1,3.0]|  1.0|
|[1.0,0.1,-1.0]|  0.0|
|[1.0,0.1,-1.0]|  0.0|
+--------------+-----+
only showing top 3 rows



# Logistic Regression 

In [18]:
val lr = new LogisticRegression()
println(lr.explainParams()) // see all parameters
val lrModel = lr.fit(bInput)

aggregationDepth: suggested depth for treeAggregate (>= 2) (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial. (default: auto)
featuresCol: features column name (default: features)
fitIntercept: whether to fit an intercept term (default: true)
labelCol: label column name (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. (undefined)
maxIter: maximum number of iterations (>= 0) (default: 100)
predictionCol: prediction column name (default: prediction)
probabilityCol: Column name for predicted class conditional probabilities. Note: 

lr: org.apache.spark.ml.classification.LogisticRegression = logreg_ba88cd353c1e
lrModel: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid = logreg_ba88cd353c1e, numClasses = 2, numFeatures = 3


lrModel: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid = logreg_ba88cd353c1e, numClasses = 2, numFeatures = 3


## Model
Weights of the features (coefficients) and intercept. For a multinomial model (the current one is binary), lrModel.coefficientMatrix and lrModel.interceptVecto

In [19]:
println(lrModel.coefficients)
println(lrModel.intercept)

[6.848741326855034,0.35356589010197487,14.814900276915889]
-10.225695864480993


## Model performance
Note that for the area under the curve, instance weighting is not taken into account, so if you wanted to see how you performed on the values you weighed more highly, you’d have to do that manually. This will probably change in future Spark versions.

The model summary is currently only available for binary logistic regression problems, but multiclass summaries will likely be added in the future.

In [20]:
val summary = lrModel.summary
val bSummary = summary.asInstanceOf[BinaryLogisticRegressionSummary]
println(bSummary.areaUnderROC)
bSummary.roc.show()
bSummary.pr.show()

1.0
+---+------------------+
|FPR|               TPR|
+---+------------------+
|0.0|               0.0|
|0.0|0.3333333333333333|
|0.0|               1.0|
|1.0|               1.0|
|1.0|               1.0|
+---+------------------+

+------------------+---------+
|            recall|precision|
+------------------+---------+
|               0.0|      1.0|
|0.3333333333333333|      1.0|
|               1.0|      1.0|
|               1.0|      0.6|
+------------------+---------+



summary: org.apache.spark.ml.classification.LogisticRegressionTrainingSummary = org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummaryImpl@25c6e15f
bSummary: org.apache.spark.ml.classification.BinaryLogisticRegressionSummary = org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummaryImpl@25c6e15f


bSummary: org.apache.spark.ml.classification.BinaryLogisticRegressionSummary = org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummaryImpl@25c6e15f


### Conversion Rate
The speed at which the model descends to the final result is shown in the objective history.

In [22]:
summary.objectiveHistory.foreach(println)

0.6730116670092565
0.5042829330409728
0.36356862066874396
0.1252407018038338
0.08532556611276212
0.03550487641573043
0.01819649450857124
0.008817369922959128
0.004413673785392138
0.002194038351234706
0.001096564114808084
5.476575519853136E-4
2.73762379514901E-4
1.368465223657475E-4
6.84180903707058E-5
3.4207077910384856E-5
1.710317666423191E-5
8.551470106426885E-6
4.275703677941412E-6
2.1378240117781396E-6
1.0688564054651744E-6
5.342600202575258E-7
2.668135105897087E-7
1.32046278653136E-7
6.768401481681801E-8
3.3145477184834547E-8
1.6151438837488498E-8
8.309350118268437E-9
