In [1]:
// 1. GET THE DATA

// Import SparkSession and LogisticRegression.
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.SparkSession

// Import VectorAssembler and Vectors
import org.apache.spark.ml.feature.{VectorAssembler,StringIndexer,VectorIndexer,OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors

// Import Pipeline
import org.apache.spark.ml.Pipeline

// For Metrics and Evaluation import MulticlassMetrics
import org.apache.spark.mllib.evaluation.MulticlassMetrics

// Set error reporting.
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

// Create a spark session.
val spark = SparkSession.builder().getOrCreate()

// Read in the advertising.csv file.
val data = spark.read
    .option("header","true")
    .option("inferSchema","true")
    .format("csv").load("advertising.csv")

// Print the schema of the dataframe.
data.printSchema()

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.22:4040
SparkContext available as 'sc' (version = 3.0.0, master = local[*], app id = local-1601729290158)
SparkSession available as 'spark'


root
 |-- Daily Time Spent on Site: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Area Income: double (nullable = true)
 |-- Daily Internet Usage: double (nullable = true)
 |-- Ad Topic Line: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Male: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- Clicked on Ad: integer (nullable = true)



import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.Pipeline
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.log4j._
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@3eba8e61
data: org.apache.spark.sql.DataFrame = [Daily Time Spent on Site: double, Age: int ... 8 more fields]


In [2]:
// 2 DISPLAY THE DATA
// Print out a sample row of the data (multiple ways to do this)
val colnames = data.columns
val firstrow = data.head(1)(0)
println("\n")
println("Example Data Row")
for(ind <- Range(1,colnames.length)){
    println(colnames(ind))
    println(firstrow(ind))
    println("\n")
    }



Example Data Row
Age
35


Area Income
61833.9


Daily Internet Usage
256.09


Ad Topic Line
Cloned 5thgeneration orchestration


City
Wrightburgh


Male
0


Country
Tunisia


Timestamp
2016-03-27 00:53:11


Clicked on Ad
0




colnames: Array[String] = Array(Daily Time Spent on Site, Age, Area Income, Daily Internet Usage, Ad Topic Line, City, Male, Country, Timestamp, Clicked on Ad)
firstrow: org.apache.spark.sql.Row = [68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0]


In [3]:
// 2 SETUP DATAFRAME FOR MACHINE LEARNING

//    - Rename the Clicked on Ad column to "label"
//    - Grab the following columns "Daily Time Spent on Site","Age","Area Income","Daily Internet Usage","Timestamp","Male"
//    - Create a new column called Hour from the Timestamp containing the Hour of the click

val timedata = data.withColumn("Hour",hour(data("Timestamp")))

val logregdataall = timedata.select(
    data("Clicked on Ad").as("label"),
    $"Daily Time Spent on Site",
    $"Age",
    $"Area Income",
    $"Daily Internet Usage",
//     $"Timestamp",
    $"Hour",
    $"Male")

val logregdata = logregdataall.na.drop()

// Create a new VectorAssembler object called assembler for the feature
// columns as the input Set the output column to be called features

val assembler = (new VectorAssembler()
    .setInputCols(Array(
        "Daily Time Spent on Site",
        "Age",
        "Area Income",
        "Daily Internet Usage",
//         "Timestamp",
        "Hour",
        "Male",
        ))
    .setOutputCol("features"))


// Use randomSplit to create a train test split of 70/30
val Array(training, test) = logregdata
    .randomSplit(Array(0.7, 0.3), seed = 12345)

timedata: org.apache.spark.sql.DataFrame = [Daily Time Spent on Site: double, Age: int ... 9 more fields]
logregdataall: org.apache.spark.sql.DataFrame = [label: int, Daily Time Spent on Site: double ... 5 more fields]
logregdata: org.apache.spark.sql.DataFrame = [label: int, Daily Time Spent on Site: double ... 5 more fields]
assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_51fa94905a18, handleInvalid=error, numInputCols=6
training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, Daily Time Spent on Site: double ... 5 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, Daily Time Spent on Site: double ... 5 more fields]


In [4]:
// 3 SETUP THE PIPE LINE

// Create a new LogisticRegression object called lr
val lr = new LogisticRegression()

// Create a new pipeline with the stages: assembler, lr
val pipeline = new Pipeline()
    .setStages(Array(assembler, lr))

// Fit the pipeline to training set.
val model = pipeline.fit(training)

// Get Results on Test Set with transform
val results = model.transform(test)

lr: org.apache.spark.ml.classification.LogisticRegression = logreg_1f32e2f9eaa7
pipeline: org.apache.spark.ml.Pipeline = pipeline_cbd5233be56d
model: org.apache.spark.ml.PipelineModel = pipeline_cbd5233be56d
results: org.apache.spark.sql.DataFrame = [label: int, Daily Time Spent on Site: double ... 9 more fields]


In [5]:
// 4 MODEL EVALUATION

// Convert the test results to an RDD using .as and .rdd
val predictionAndLabels = results.select($"prediction",$"label").as[(Double, Double)].rdd

// Instantiate a new MulticlassMetrics object
val metrics = new MulticlassMetrics(predictionAndLabels)

// Print out the Confusion matrix
println("Confusion matrix:")
println(metrics.confusionMatrix)

Confusion matrix:
136.0  1.0    
4.0    146.0  


predictionAndLabels: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[94] at rdd at <console>:38
metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@7b285208
