In [2]:
// Logistic Regression Example
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.SparkSession

// Optional: restrict log output from loggers under "org" to ERROR level
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.SparkSession
import org.apache.log4j._


In [3]:
// Obtain the SparkSession (an existing one is reused, otherwise one is created).
val spark = SparkSession.builder().getOrCreate()

// Load titanic.csv into a DataFrame: the first line is treated as the header
// and column types are inferred from the values.
val data = spark.read
  .format("csv")
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .load("titanic.csv")

// Show the column names and the inferred types.
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@240e4d4a
data: org.apache.spark.sql.DataFrame = [PassengerId: int, Survived: int ... 10 more fields]


In [4]:

// Print one example row, one column name / value pair at a time.

val colnames = data.columns
val firstrow = data.head(1)(0)
println("\n")
println("Example Data Row")
// NOTE(review): iteration starts at index 1, so the first column is never
// printed — confirm that skipping it is intentional.
(1 until colnames.length).foreach { idx =>
  println(colnames(idx))
  println(firstrow(idx))
  println("\n")
}



Example Data Row
Survived
0


Pclass
3


Name
Braund, Mr. Owen Harris


Sex
male


Age
22.0


SibSp
1


Parch
0


Ticket
A/5 21171


Fare
7.25


Cabin
null


Embarked
S




colnames: Array[String] = Array(PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked)
firstrow: org.apache.spark.sql.Row = [1,0,3,Braund, Mr. Owen Harris,male,22.0,1,0,A/5 21171,7.25,null,S]


In [5]:
// Prepare the DataFrame for machine learning

// Keep the target column (renamed to "label") together with the columns
// that will be used to build the feature vector.
val logregdataall = data.select(
  $"Survived".alias("label"),
  $"Pclass",
  $"Sex",
  $"Age",
  $"SibSp",
  $"Parch",
  $"Fare",
  $"Embarked"
)
// Remove the rows that contain missing values in the selected columns.
val logregdata = logregdataall.na.drop()

logregdataall: org.apache.spark.sql.DataFrame = [label: int, Pclass: int ... 6 more fields]
logregdata: org.apache.spark.sql.DataFrame = [label: int, Pclass: int ... 6 more fields]


In [6]:
// A few things we need to do before Spark can accept the data!
// We need to deal with the Categorical columns

// Import the feature transformers (VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder) and Vectors
import org.apache.spark.ml.feature.{VectorAssembler,StringIndexer,VectorIndexer,OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors


In [7]:
// Handle the categorical columns ("Sex" and "Embarked")

// Step 1: convert the string categories to numerical indices
val genderIndexer = (new StringIndexer()
  .setInputCol("Sex")
  .setOutputCol("SexIndex"))
val embarkIndexer = (new StringIndexer()
  .setInputCol("Embarked")
  .setOutputCol("EmbarkIndex"))

// Step 2: one-hot encode the indices produced in step 1
val genderEncoder = (new OneHotEncoder()
  .setInputCol("SexIndex")
  .setOutputCol("SexVec"))
val embarkEncoder = (new OneHotEncoder()
  .setInputCol("EmbarkIndex")
  .setOutputCol("EmbarkVec"))

genderIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_5df031bcf5a0
embarkIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_bd926fb2d4b1
genderEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHotEncoder_592439a87496
embarkEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHotEncoder_90b5abcee24b


In [8]:
// Combine the numeric columns and the one-hot vectors into a single
// "features" column, giving the ("label", "features") layout.
val assembler = {
  val featureInputs = Array("Pclass", "SexVec", "Age", "SibSp", "Parch", "Fare", "EmbarkVec")
  new VectorAssembler()
    .setInputCols(featureInputs)
    .setOutputCol("features")
}

assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_329c572f1324, handleInvalid=error, numInputCols=7


In [9]:
// Randomly divide the cleaned rows into a training set (weight 0.7) and a
// test set (weight 0.3), using a fixed seed for the sampling.
val Array(training, test) =
  logregdata.randomSplit(weights = Array(0.7, 0.3), seed = 12345)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, Pclass: int ... 6 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, Pclass: int ... 6 more fields]


In [10]:
// Set up the pipeline
import org.apache.spark.ml.Pipeline

val lr = new LogisticRegression()

// Stages run in the listed order: index the string columns, one-hot encode
// the indices, assemble the feature vector, then the logistic regression.
val pipeline = new Pipeline().setStages(
  Array(genderIndexer, embarkIndexer, genderEncoder, embarkEncoder, assembler, lr)
)

// Fit the whole pipeline on the training split.
val model = pipeline.fit(training)

// Run the fitted pipeline on the test split to obtain the predictions.
val results = model.transform(test)

import org.apache.spark.ml.Pipeline
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_cf8a94339527
pipeline: org.apache.spark.ml.Pipeline = pipeline_80bda54bc50a
model: org.apache.spark.ml.PipelineModel = pipeline_80bda54bc50a
results: org.apache.spark.sql.DataFrame = [label: int, Pclass: int ... 14 more fields]


In [11]:

// MODEL EVALUATION

// For Metrics and Evaluation
import org.apache.spark.mllib.evaluation.MulticlassMetrics

// MulticlassMetrics is built from an RDD of (prediction, label) pairs,
// so pull those two columns out of the results as doubles.
val predictionAndLabels = results
  .select("prediction", "label")
  .as[(Double, Double)]
  .rdd

// Build the metrics object from the pairs.
val metrics = new MulticlassMetrics(predictionAndLabels)

// Print the confusion matrix.
println("Confusion matrix:")
println(metrics.confusionMatrix)

Confusion matrix:
107.0  13.0  
18.0   62.0  


import org.apache.spark.mllib.evaluation.MulticlassMetrics
predictionAndLabels: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[92] at rdd at <console>:40
metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@1d6dd02b


In [21]:
// Print the precision computed for each label known to the metrics object.
val labels = metrics.labels
for (l <- labels) {
  println(s"Precision($l) = ${metrics.precision(l)}")
}

Precision(0.0) = 0.856
Precision(1.0) = 0.8266666666666667


labels: Array[Double] = Array(0.0, 1.0)
