In [34]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.Pipeline
import org.apache.spark.mllib.evaluation.MulticlassMetrics

import org.apache.log4j._
// Raise the threshold of every logger under the "org" namespace to ERROR so the
// notebook output is not flooded with lower-severity log messages.
Logger.getLogger("org").setLevel(Level.ERROR)

// Session handle used by all later cells. getOrCreate() hands back an already
// running session when one exists instead of always constructing a new one.
val spark = (SparkSession
  .builder()
  .getOrCreate())

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.Pipeline
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.log4j._
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6132a509


In [6]:
// Read the Titanic passenger CSV into a DataFrame.
//   header      -> take column names from the first line of the file
//   inferSchema -> let Spark derive column types from the data
//   multiline   -> allow a single record to span more than one physical line
val df = (spark.read
  .format("csv")
  .option("header","true")
  .option("inferSchema","true")
  .option("multiline","true")
  .load("../../data/ml_scala/titanic.csv"))

df: org.apache.spark.sql.DataFrame = [PassengerId: int, Survived: int ... 10 more fields]


In [7]:
// First look at the raw data: column types, summary statistics, then a sample
// of the first five rows.
df.printSchema()
df.describe().show()
df.show(numRows = 5)

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+--------------

In [8]:
// Dump the first record vertically (column name, then value) because the
// tabular output is too wide to read comfortably.
val colnames = df.columns
val firstrow = df.head(1)(0)
println("\n")
println("Example Data Row")
// NOTE: the walk starts at index 1, so column 0 (PassengerId in this dataset)
// is not printed.
colnames.indices.drop(1).foreach { ind =>
  println(colnames(ind))
  println(firstrow(ind))
  println("\n")
}



Example Data Row
Survived
0


Pclass
3


Name
Braund, Mr. Owen Harris


Sex
male


Age
22.0


SibSp
1


Parch
0


Ticket
A/5 21171


Fare
7.25


Cabin
null


Embarked
S




colnames: Array[String] = Array(PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked)
firstrow: org.apache.spark.sql.Row = [1,0,3,Braund, Mr. Owen Harris,male,22.0,1,0,A/5 21171,7.25,null,S]


In [15]:
// Keep only the target and the candidate predictor columns. "Survived" is
// renamed to "label", the default label column name expected by the classifier.
val df_detail = df.select(
  $"Survived".as("label"),
  $"Pclass", $"Sex", $"Age", $"SibSp", $"Parch", $"Fare", $"Embarked"
)

// Drop every row that has a null/NaN in any of the selected columns
// (this removes incomplete rows; it does not de-duplicate).
val df_clean = df_detail.na.drop()

// Preview the cleaned data
df_clean.show(numRows = 5)

+-----+------+------+----+-----+-----+-------+--------+
|label|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+-----+------+------+----+-----+-----+-------+--------+
|    0|     3|  male|22.0|    1|    0|   7.25|       S|
|    1|     1|female|38.0|    1|    0|71.2833|       C|
|    1|     3|female|26.0|    0|    0|  7.925|       S|
|    1|     1|female|35.0|    1|    0|   53.1|       S|
|    0|     3|  male|35.0|    0|    0|   8.05|       S|
+-----+------+------+----+-----+-----+-------+--------+
only showing top 5 rows



df_detail: org.apache.spark.sql.DataFrame = [label: int, Pclass: int ... 6 more fields]
df_clean: org.apache.spark.sql.DataFrame = [label: int, Pclass: int ... 6 more fields]


In [26]:
// Categorical features, step 1: map each string category to a numeric index.
val genderIndexer = (new StringIndexer()
  .setInputCol("Sex")
  .setOutputCol("SexIndex"))
val embarkIndexer = (new StringIndexer()
  .setInputCol("Embarked")
  .setOutputCol("EmbarkIndex"))

// Categorical features, step 2: turn each numeric index into a one-hot vector.
val genderEncoder = (new OneHotEncoder()
  .setInputCol("SexIndex")
  .setOutputCol("SexVec"))
val embarkEncoder = (new OneHotEncoder()
  .setInputCol("EmbarkIndex")
  .setOutputCol("EmbarkVec"))

// Spark ML estimators consume a ("label", "features") layout, so gather the
// numeric columns and the two encoded vectors into one "features" vector.
val assembler = (new VectorAssembler()
  .setInputCols(Array(
    "Pclass", "SexVec", "Age",
    "SibSp", "Parch", "Fare",
    "EmbarkVec"))
  .setOutputCol("features"))

genderIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_0a2e624fabd7
embarkIndexer: org.apache.spark.ml.feature.StringIndexer = strIdx_43ae4971ebb1
genderEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHotEncoder_9bdce6af4013
embarkEncoder: org.apache.spark.ml.feature.OneHotEncoder = oneHotEncoder_cce212d6b070
assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_6cadf61c1a6a, handleInvalid=error, numInputCols=7


In [27]:
// Create train/test datasets (weights 0.7 / 0.3, i.e. roughly a 70/30 split).
// A fixed seed is supplied so the row assignment -- and with it the fitted
// model and the evaluation numbers further down -- is repeatable when the
// notebook is re-run on the same input. Without a seed every run draws a
// different split.
val Array(training, test) = df_clean.randomSplit(Array(0.7,0.3), seed = 12345L)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, Pclass: int ... 6 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: int, Pclass: int ... 6 more fields]


In [29]:
// Logistic-regression estimator, constructed without overriding any parameter.
val clf_lr = new LogisticRegression()

// Stage order matters: index the string columns, one-hot encode the indices,
// assemble the feature vector, and finally run the classifier.
val pipeline = (new Pipeline()
  .setStages(Array(
    genderIndexer, embarkIndexer,
    genderEncoder, embarkEncoder,
    assembler,
    clf_lr)))

// Fit all stages on the training split only
val model = pipeline.fit(training)

clf_lr: org.apache.spark.ml.classification.LogisticRegression = logreg_fffb245d5cdd
pipeline: org.apache.spark.ml.Pipeline = pipeline_78f0038338c2
model: org.apache.spark.ml.PipelineModel = pipeline_78f0038338c2


In [31]:
// Run the fitted pipeline over the held-out split; the result carries the
// original columns plus the intermediate and prediction columns.
val results = model.transform(test)

// Preview the first five scored rows
results.show(numRows = 5)

+-----+------+------+----+-----+-----+--------+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|label|Pclass|   Sex| Age|SibSp|Parch|    Fare|Embarked|SexIndex|EmbarkIndex|       SexVec|    EmbarkVec|            features|       rawPrediction|         probability|prediction|
+-----+------+------+----+-----+-----+--------+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|    0|     1|female| 2.0|    1|    2|  151.55|       S|     1.0|        0.0|    (1,[],[])|(2,[0],[1.0])|[1.0,0.0,2.0,1.0,...|[-4.2182401181549...|[0.01451086935212...|       1.0|
|    0|     1|female|25.0|    1|    2|  151.55|       S|     1.0|        0.0|    (1,[],[])|(2,[0],[1.0])|[1.0,0.0,25.0,1.0...|[-3.1472194135218...|[0.04120098096572...|       1.0|
|    0|     1|  male|21.0|    0|    1| 77.2875|       S|     0.0|        0.0|(1,[0],[1.0])|(2,[0],[1

results: org.apache.spark.sql.DataFrame = [label: int, Pclass: int ... 14 more fields]


In [37]:
// MulticlassMetrics comes from the RDD-based mllib API, so the two columns of
// interest are projected out, typed as (prediction, label) Double pairs and
// converted to an RDD.
val predictionAndLabels = (results
  .select($"prediction",$"label")
  .as[(Double, Double)]
  .rdd)

// Evaluate the model on the test split and print its confusion matrix
val metrics = new MulticlassMetrics(predictionAndLabels)
println("Confusion matrix:")
println(metrics.confusionMatrix)

Confusion matrix:
107.0  22.0  
21.0   55.0  


predictionAndLabels: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[158] at rdd at <console>:67
metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@6e9018a9


In [None]:
// Shut the Spark session down now that the analysis is finished.
spark.stop()