In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,lit, desc, col, when, max
from functools import reduce
from pyspark.ml import Pipeline
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors, SparseVector, DenseVector
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.tree import DecisionTree
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [2]:
sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession.builder.getOrCreate()

In [3]:
def title(s):
    print("---- %s -----" %s)    
    
def see(s, v):
    print("---- %s -----" %s)
    print(v)

# 1: Preprocessing 

## 1.1 OneHot encoding for "sales" and "salary" columns

In [4]:
files = "../data/HR_comma_sep.csv"
rawDF = spark.read.csv(files, header=True, inferSchema=True)
stages=[]
categoricalColumns = ["sales", "salary"]
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "Vec")
    stages += [stringIndexer, encoder]

In [None]:
stages
rawDF.printSchema()

## 1.2 Convert label into label indices using the StringIndexer

In [5]:
label_stringIdx = StringIndexer(inputCol = "left", outputCol = "label")
stages += [label_stringIdx]

## 1.3 Transform all features into a vector using VectorAssembler

In [6]:
numericCols = ["satisfaction_level",\
               "last_evaluation",\
               "number_project",\
               "average_montly_hours",\
               "time_spend_company",\
               "Work_accident",\
              "promotion_last_5years"]
assemblerInputs = numericCols + list(map(lambda c: c + "Vec", categoricalColumns))
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

## 1.4 Create a Pipeline, Fit and Transform

In [7]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(rawDF)
rawDF = pipelineModel.transform(rawDF)

## 1.5 Keep relevant columns

In [8]:
selectedcols = ["label", "features"]
rawDF = rawDF.select(selectedcols)
rawDF = rawDF.rdd.map(list)

In [9]:
#display(rawDF)
rawDF.take(1)

[[1.0,
  SparseVector(18, {0: 0.38, 1: 0.53, 2: 2.0, 3: 157.0, 4: 3.0, 7: 1.0, 16: 1.0})]]

## 1.6 convert Sparse vectors to DenseVectors and have LabeledPoints set

In [10]:
def preprocessing(elem):
    label = elem[0]
    featureVector = Vectors.dense(elem[1])
    return LabeledPoint(label, featureVector)

data = rawDF.map(preprocessing)

In [11]:
data.take(10)

[LabeledPoint(1.0, [0.38,0.53,2.0,157.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0]),
 LabeledPoint(1.0, [0.8,0.86,5.0,262.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]),
 LabeledPoint(1.0, [0.11,0.88,7.0,272.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]),
 LabeledPoint(1.0, [0.72,0.87,5.0,223.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0]),
 LabeledPoint(1.0, [0.37,0.52,2.0,159.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0]),
 LabeledPoint(1.0, [0.41,0.5,2.0,153.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0]),
 LabeledPoint(1.0, [0.1,0.77,6.0,247.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0]),
 LabeledPoint(1.0, [0.92,0.85,5.0,259.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0]),
 LabeledPoint(1.0, [0.89,1.0,5.0,224.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0]),
 LabeledPoint(1.0, [0.42,0.53,2.0,142.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0])]

# 2: Split the data into 3 sets

In [12]:
see("data", data.count())
sampledData = data.sample(fraction=0.05, withReplacement=False, seed=17)

see("sampledData", sampledData.count())

---- data -----
14999
---- sampledData -----
754


In [13]:
sampledData.take(1)

[LabeledPoint(1.0, [0.38,0.46,2.0,137.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0])]

In [14]:
(trainData, cvData, testData) = data.randomSplit(weights=[0.8, 0.1, 0.1])
#(trainData, cvData, testData) = sampledData.randomSplit(weights=[0.8, 0.1, 0.1])
trainData.cache()
cvData.cache()
testData.cache()

PythonRDD[43] at RDD at PythonRDD.scala:48

# 3: Building a decison tree classifier model

## 3.1 Training

In [15]:
def buildDecisionTreeClassifier(trainData):
    model = DecisionTree.trainClassifier(trainData,numClasses=2,\
                                         categoricalFeaturesInfo={},\
                                         impurity="gini",\
                                         maxDepth=4,\
                                         maxBins=100)
    return  model
%time m1 =  buildDecisionTreeClassifier(trainData)

CPU times: user 10 ms, sys: 0 ns, total: 10 ms
Wall time: 7.15 s


In [16]:
m1

DecisionTreeModel classifier of depth 4 with 27 nodes

# 3.2 Testing the trained model (prediction)

In [17]:
testPoint = testData.first()
see("testPoint", testPoint)

label = testPoint.label
see("label", label)

features = testPoint.features
see("features", features)

---- testPoint -----
(1.0,[0.11,0.88,7.0,272.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0])
---- label -----
1.0
---- features -----
[0.11,0.88,7.0,272.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]


In [18]:
predictedLabel = m1.predict(testPoint.features)
see("predictedLabel", predictedLabel)

---- predictedLabel -----
1.0


In [19]:
testPoints = testData.take(10)
for testPoint in testPoints:
    label = testPoint.label
    predictedLabel = m1.predict(testPoint.features)
    print("label: %s , predicted: %s, correct: %s"  %(label,predictedLabel, label==predictedLabel))

label: 1.0 , predicted: 1.0, correct: True
label: 1.0 , predicted: 1.0, correct: True
label: 1.0 , predicted: 1.0, correct: True
label: 1.0 , predicted: 1.0, correct: True
label: 1.0 , predicted: 1.0, correct: True
label: 1.0 , predicted: 1.0, correct: True
label: 1.0 , predicted: 0.0, correct: False
label: 1.0 , predicted: 1.0, correct: True
label: 1.0 , predicted: 1.0, correct: True
label: 1.0 , predicted: 0.0, correct: False


# 4: Evaluate the model using the Accuracy metric in SPARK

In [20]:
#Caclusating the metrics using Spark Mulitclass
def getMetrics(model, data):
    labels = data.map(lambda d: d.label)
    features = data.map(lambda d: d.features)
    predictions = model.predict(features)
    predictionsAndLabels = predictions.zip(labels)
    return MulticlassMetrics(predictionsAndLabels)

In [21]:
metrics = getMetrics(m1, cvData)
%time accuracy = metrics.accuracy
see("accuracy", accuracy)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.73 s
---- accuracy -----
0.966042966042966


# 5: Tuning HyperParams using CVData

## 5.1 Trying combinations of hyper params

In [22]:
def evaluate(trainData, cvData):
    evaluations = []
    #Take combinations of 3 hyperparams, evaluating the accuracy of each using CV Dataset
    for impurity in ["gini", "entropy"]:
        for depth in [1, 20]:
            for bins in [10, 300]:
                model = DecisionTree.trainClassifier(trainData,numClasses=2, categoricalFeaturesInfo={}, impurity=impurity, maxDepth=depth, maxBins=bins)
                accuracy = getMetrics(model, cvData).accuracy
                evaluations.append(((impurity, depth, bins), accuracy))

    return evaluations
evaluations = evaluate(trainData, cvData)
evaluations

[(('gini', 1, 10), 0.7657657657657657),
 (('gini', 1, 300), 0.7914067914067914),
 (('gini', 20, 10), 0.9653499653499653),
 (('gini', 20, 300), 0.9715869715869716),
 (('entropy', 1, 10), 0.7657657657657657),
 (('entropy', 1, 300), 0.7914067914067914),
 (('entropy', 20, 10), 0.9702009702009702),
 (('entropy', 20, 300), 0.972972972972973)]

In [26]:
sortedEvals = sorted(evaluations, key=lambda a:a[1], reverse=True)
title("Sorted Evaluations")
for e in sortedEvals:
    print(e)

---- Sorted Evaluations -----
(('entropy', 20, 300), 0.972972972972973)
(('gini', 20, 300), 0.9715869715869716)
(('entropy', 20, 10), 0.9702009702009702)
(('gini', 20, 10), 0.9653499653499653)
(('gini', 1, 300), 0.7914067914067914)
(('entropy', 1, 300), 0.7914067914067914)
(('gini', 1, 10), 0.7657657657657657)
(('entropy', 1, 10), 0.7657657657657657)


In [27]:
bestParams = sortedEvals[0][0]
see("bestParams", bestParams)
type(bestParams)

---- bestParams -----
('entropy', 20, 300)


tuple

# 6: Evaluate best hyper params using the test set

In [28]:
(i, d, b) = bestParams
model = DecisionTree.trainClassifier(
    trainData.union(cvData),
    numClasses=2, 
    categoricalFeaturesInfo={}, 
    impurity=i, 
    maxDepth=d, 
    maxBins=b)
see("testData accuracy",getMetrics(model, testData).accuracy)
see("testData+cvData accuracy", getMetrics(model, testData.union(cvData)).accuracy)

---- testData accuracy -----
0.9864253393665159
---- testData+cvData accuracy -----
0.9929765886287626


# Clean up persisted data

In [22]:
trainData.unpersist()
cvData.unpersist()
testData.unpersist()

PythonRDD[43] at RDD at PythonRDD.scala:48