In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=3bf40ba4b667447142b829660b000003bcbffb62fd88bd251a1c7918d3b8a283
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [3]:
import pyspark
from pyspark.sql import SparkSession

In [4]:
spark=SparkSession.builder.appName("classification").getOrCreate()

In [5]:
hrdata=spark.read.csv("/content/drive/MyDrive/HR Analytics/train_LZdllcl.csv",
                      inferSchema=True,header=True)

In [9]:
hrdata.groupBy('education').count().show()

+----------------+-----+
|       education|count|
+----------------+-----+
| Below Secondary|  805|
|Master's & above|14925|
|      Bachelor's|39078|
+----------------+-----+



In [8]:
hrdata=hrdata.na.fill(value="Bachelor's",subset=['education'])

In [12]:
hrdata.groupBy('previous_year_rating').count().show()

+--------------------+-----+
|previous_year_rating|count|
+--------------------+-----+
|                   1| 6223|
|                   3|22742|
|                   5|11741|
|                   4| 9877|
|                   2| 4225|
+--------------------+-----+



In [11]:
hrdata=hrdata.na.fill(value=3,subset=['previous_year_rating'])

In [13]:
hrdata.columns

['employee_id',
 'department',
 'region',
 'education',
 'gender',
 'recruitment_channel',
 'no_of_trainings',
 'age',
 'previous_year_rating',
 'length_of_service',
 'KPIs_met >80%',
 'awards_won?',
 'avg_training_score',
 'is_promoted']

In [15]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

In [43]:
indexer=[StringIndexer(inputCol=col,outputCol=col+"index").fit(hrdata)
for col in list(set(hrdata.columns)-set(['no_of_trainings','age',
                                         'length_of_service',
                                         'avg_training_score','employee_id']))]

In [44]:
pipeline=Pipeline(stages=indexer)

In [45]:
hrdatadf=pipeline.fit(hrdata).transform(hrdata)

In [62]:
hrdatadf.columns

['employee_id',
 'department',
 'no_of_trainings',
 'age',
 'length_of_service',
 'avg_training_score',
 'genderindex',
 'is_promotedindex',
 'KPIs_met >80%index',
 'awards_won?index',
 'educationindex',
 'recruitment_channelindex',
 'departmentindex',
 'regionindex',
 'previous_year_ratingindex',
 'features',
 'label']

In [47]:
columnstodrop=['employee_id''department','region','education','gender',
               'recruitment_channel','previous_year_rating','KPIs_met >80%',
               'awards_won?','is_promoted']

In [48]:
hrdatadf=hrdatadf.drop(*columnstodrop)

In [49]:
from pyspark.ml.feature import RFormula

In [50]:
formula=RFormula(formula="is_promotedindex~.",featuresCol='features',
                 labelCol='label')

In [51]:
hrdatadf=formula.fit(hrdatadf).transform(hrdatadf)

In [52]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator,BinaryClassificationEvaluator

In [53]:
logit=LogisticRegression()

In [54]:
logitmodel=logit.fit(hrdatadf)

In [55]:
logitmodel.summary.accuracy

0.9325098525762663

In [56]:
logitmodel.summary.areaUnderROC

0.8683283464397373

In [57]:
accuracy=MulticlassClassificationEvaluator(metricName="accuracy")

In [58]:
auc=BinaryClassificationEvaluator()

In [59]:
from pyspark.ml.classification import DecisionTreeClassifier

In [64]:
tree=DecisionTreeClassifier(maxBins=35)

In [66]:
treemodel=tree.fit(hrdatadf)

In [67]:
treepredict=treemodel.transform(hrdatadf)

In [69]:
accuracy.evaluate(treepredict)

0.9261786600496278

In [70]:
auc.evaluate(treepredict)

0.5785816850778404

In [71]:
from pyspark.ml.classification import RandomForestClassifier

In [72]:
RF=RandomForestClassifier(maxBins=35)

In [73]:
RFmodel=RF.fit(hrdatadf)

In [74]:
RFpredict=RFmodel.transform(hrdatadf)

In [75]:
accuracy.evaluate(RFpredict)

0.9237520070062765

In [76]:
auc.evaluate(RFpredict)

0.8557354168397036

In [77]:
from pyspark.ml.classification import GBTClassifier

In [78]:
gbm=GBTClassifier(maxBins=35)

In [79]:
gbmmodel=gbm.fit(hrdatadf)

In [80]:
gbmpredict=gbmmodel.transform(hrdatadf)

In [81]:
accuracy.evaluate(gbmpredict)

0.9419245365640052

In [82]:
auc.evaluate(gbmpredict)

0.9169767944528242

In [83]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [84]:
nn=MultilayerPerceptronClassifier(layers=[11,100,2])
# 11 - input , hidden layer 1 with 100 neurons and output layer 2

In [98]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [99]:
nnmodel=nn.fit(hrdatadf)

In [100]:
nnpredict=nnmodel.transform(hrdatadf)