# Importing libraries and initialising the spark session

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession

spark = SparkSession \
.builder \
.appName("attrition example") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()

data = spark.read.load("attrition-db.csv",format="csv",header=True,inferSchema=True)

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="Attrition", outputCol="indexedAttrition").fit(data)
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.

# Pre processing the data as per the requirements

In [2]:
indexer = StringIndexer(inputCol="BusinessTravel", outputCol="indBusinessTravel")
indexed_att_data = indexer.fit(data).transform(data)

indexer = StringIndexer(inputCol="Department", outputCol="indDepartment")
indexed_att_data = indexer.fit(indexed_att_data).transform(indexed_att_data)

indexer = StringIndexer(inputCol="EducationField", outputCol="indEducationField")
indexed_att_data = indexer.fit(indexed_att_data).transform(indexed_att_data)

indexer = StringIndexer(inputCol="Gender", outputCol="indGender")
indexed_att_data = indexer.fit(indexed_att_data).transform(indexed_att_data)

indexer = StringIndexer(inputCol="JobRole", outputCol="indJobRole")
indexed_att_data = indexer.fit(indexed_att_data).transform(indexed_att_data)

indexer = StringIndexer(inputCol="MaritalStatus", outputCol="indMaritalStatus")
indexed_att_data = indexer.fit(indexed_att_data).transform(indexed_att_data)

indexer = StringIndexer(inputCol="Over18", outputCol="indOver18")
indexed_att_data = indexer.fit(indexed_att_data).transform(indexed_att_data)

indexer = StringIndexer(inputCol="OverTime", outputCol="indOverTime")
indexed_att_data = indexer.fit(indexed_att_data).transform(indexed_att_data)

indexed_att_data.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- DailyRate: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- DistanceFromHome: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- EmployeeCount: integer (nullable = true)
 |-- EmployeeNumber: integer (nullable = true)
 |-- EnvironmentSatisfaction: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourlyRate: integer (nullable = true)
 |-- JobInvolvement: integer (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- MonthlyIncome: integer (nullable = true)
 |-- MonthlyRate: integer (nullable = true)
 |-- NumCompaniesWorked: integer (nullable = true)
 |-- Over18: string (nullable = true)
 |-- OverTime: string 

# Vectoring the features

In [3]:
from pyspark.ml.feature import VectorAssembler
vectorAss = VectorAssembler(inputCols=["Age",
                                       "indBusinessTravel",
                                       "DailyRate",
                                       "indDepartment",
                                      "DistanceFromHome","Education","indEducationField",
                                      "EmployeeCount","EmployeeNumber","EnvironmentSatisfaction",
                                      "indGender","HourlyRate","JobInvolvement","JobLevel","indJobRole",
                                      "JobSatisfaction","indMaritalStatus","MonthlyIncome","MonthlyRate",
                                      "NumCompaniesWorked","indOver18","indOverTime","PercentSalaryHike",
                                      "PerformanceRating","RelationshipSatisfaction","StandardHours",
                                      "StockOptionLevel","TotalWorkingYears","TotalWorkingYears",
                                      "TrainingTimesLastYear","WorkLifeBalance","YearsAtCompany",
                                      "YearsAtCompany","YearsInCurrentRole","YearsSinceLastPromotion",
                                      "YearsWithCurrManager"]
                            ,outputCol="features")

In [4]:
data = vectorAss.transform(indexed_att_data)

In [5]:
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Splitting the data into training and testing data

In [6]:
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Applying DecisionTreeClassifier

In [7]:
dt = DecisionTreeClassifier(labelCol="indexedAttrition", featuresCol="indexedFeatures")

# Adding the pipeline

In [8]:
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

# Training the model

In [9]:
model = pipeline.fit(trainingData)

# Making predictions

In [10]:
predictions = model.transform(testData)

# Evaluating the model

In [11]:
predictions.select("prediction", "indexedAttrition", "features").show(5)

+----------+----------------+--------------------+
|prediction|indexedAttrition|            features|
+----------+----------------+--------------------+
|       1.0|             0.0|[18.0,2.0,1124.0,...|
|       1.0|             0.0|[18.0,2.0,1431.0,...|
|       0.0|             1.0|(36,[0,2,4,5,7,8,...|
|       0.0|             0.0|[19.0,0.0,645.0,0...|
|       1.0|             1.0|[19.0,2.0,504.0,0...|
+----------+----------------+--------------------+
only showing top 5 rows



In [12]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedAttrition", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))
print("Accuracy = %g " % (accuracy))


treeModel = model.stages[2]
# summary only
print(treeModel)

Test Error = 0.155556 
Accuracy = 0.844444 
DecisionTreeClassificationModel (uid=DecisionTreeClassifier_40579e5fd1563ec63d22) of depth 5 with 61 nodes


# Confusion Matrix

In [13]:
indexer_new = StringIndexer(inputCol="prediction", outputCol="indexedPredictedLabel")
ind_pred = indexer_new.fit(predictions).transform(predictions)

tp = ind_pred[(ind_pred.indexedAttrition == 1) & (ind_pred.prediction == 1)].count()
tn = ind_pred[(ind_pred.indexedAttrition == 0) & (ind_pred.prediction == 0)].count()
fp = ind_pred[(ind_pred.indexedAttrition == 0) & (ind_pred.prediction == 1)].count()
fn = ind_pred[(ind_pred.indexedAttrition == 1) & (ind_pred.prediction == 0)].count()
print ("True Positives:", tp)
print ("True Negatives:", tn)
print ("False Positives:", fp)
print ("False Negatives:", fn)
print ("Total", data.count())

r = float(tp)/(tp + fn)
print ("recall", r)

p = float(tp) / (tp + fp)
print ("precision - true", p)

p1 = float(tn) / (tn + fn)
print ("precision - false", p1)

True Positives: 24
True Negatives: 356
False Positives: 31
False Negatives: 39
Total 1470
recall 0.38095238095238093
precision - true 0.43636363636363634
precision - false 0.9012658227848102
