In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('interation4').getOrCreate()

In [2]:
# Importing data which has a header. Schema is automatically configured.
dataset = spark.read.csv('usa-stackoverflow-iteration4.csv', header=True, inferSchema=True)



In [3]:
# Let's see the data. You'll notice nulls.
dataset.show(10)

+----------+-----+----------+-------+------------------+---------------+--------------+-----------+-----------+---------------+---------------+---------------+--------------+-----------+--------+-------------+------------+---------+--------+---+
|Respondent|Hobby|OpenSource|Student|        Employment|FormalEducation|UndergradMajor|CompanySize|YearsCoding|YearsCodingProf|JobSatisfaction|ConvertedSalary|NumberMonitors|CheckInCode|WakeTime|HoursComputer|HoursOutside|SkipMeals|Exercise|Age|
+----------+-----+----------+-------+------------------+---------------+--------------+-----------+-----------+---------------+---------------+---------------+--------------+-----------+--------+-------------+------------+---------+--------+---+
|         1|  Yes|       Yes|     No|Employed full-time|             Nd|            Cd|          8|          4|              1|              1|         120000|             2|          5|       6|            4|           0|        1|       0|  2|
|         2|  Ye

In [4]:
dataset.columns

['Respondent',
 'Hobby',
 'OpenSource',
 'Student',
 'Employment',
 'FormalEducation',
 'UndergradMajor',
 'CompanySize',
 'YearsCoding',
 'YearsCodingProf',
 'JobSatisfaction',
 'ConvertedSalary',
 'NumberMonitors',
 'CheckInCode',
 'WakeTime',
 'HoursComputer',
 'HoursOutside',
 'SkipMeals',
 'Exercise',
 'Age']

In [5]:
dataset.describe().show()

+-------+------------------+-----+----------+--------------+------------------+---------------+--------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|        Respondent|Hobby|OpenSource|       Student|        Employment|FormalEducation|UndergradMajor|       CompanySize|       YearsCoding|   YearsCodingProf|   JobSatisfaction|  ConvertedSalary|    NumberMonitors|       CheckInCode|          WakeTime|     HoursComputer|      HoursOutside|         SkipMeals|          Exercise|               Age|
+-------+------------------+-----+----------+--------------+------------------+---------------+--------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------

In [6]:
dataset.printSchema()

root
 |-- Respondent: integer (nullable = true)
 |-- Hobby: string (nullable = true)
 |-- OpenSource: string (nullable = true)
 |-- Student: string (nullable = true)
 |-- Employment: string (nullable = true)
 |-- FormalEducation: string (nullable = true)
 |-- UndergradMajor: string (nullable = true)
 |-- CompanySize: integer (nullable = true)
 |-- YearsCoding: integer (nullable = true)
 |-- YearsCodingProf: integer (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- ConvertedSalary: decimal(7,0) (nullable = true)
 |-- NumberMonitors: integer (nullable = true)
 |-- CheckInCode: integer (nullable = true)
 |-- WakeTime: integer (nullable = true)
 |-- HoursComputer: integer (nullable = true)
 |-- HoursOutside: integer (nullable = true)
 |-- SkipMeals: integer (nullable = true)
 |-- Exercise: integer (nullable = true)
 |-- Age: integer (nullable = true)



In [7]:
dataset.count()

13832

In [8]:
dataset = dataset[dataset["Employment"]=="Employed full-time"]

In [9]:
dataset.count()

11995

In [10]:
dataset = dataset[dataset["ConvertedSalary"].isNotNull()]

In [11]:
dataset.count()

11482

In [12]:
from pyspark.sql.functions import isnan
dataset.filter((dataset["ConvertedSalary"] == "") | dataset["ConvertedSalary"].isNull() | isnan(dataset["ConvertedSalary"])).count()

0

In [13]:
dataset=dataset.na.drop()

In [14]:
dataset.count()

9959

In [15]:
bounds = dataset.approxQuantile("ConvertedSalary", [0.25, 0.75], 0)
IQR = bounds[1]-bounds[0]
lowerRange = bounds[0] - 1.5*IQR
upperRange = bounds[1]+ 1.5*IQR
dataset = dataset.filter((dataset['ConvertedSalary'] >= lowerRange) & (dataset['ConvertedSalary'] <= upperRange))

In [16]:
dataset.count()

9391

In [17]:
dataset.approxQuantile("ConvertedSalary", [0.5], 0)

[100000.0]

In [18]:
from pyspark.sql import functions as f
dataset = dataset.withColumn('NewSalaryLevel', f.when(f.col('ConvertedSalary') < 100000, 0).otherwise(1))

In [19]:
columns_to_drop = ['Student', 'Employment','ConvertedSalary','Respondent','Hobby','OpenSource']
dataset = dataset.drop(*columns_to_drop)

In [20]:
dataset.show(10)

+---------------+--------------+-----------+-----------+---------------+---------------+--------------+-----------+--------+-------------+------------+---------+--------+---+--------------+
|FormalEducation|UndergradMajor|CompanySize|YearsCoding|YearsCodingProf|JobSatisfaction|NumberMonitors|CheckInCode|WakeTime|HoursComputer|HoursOutside|SkipMeals|Exercise|Age|NewSalaryLevel|
+---------------+--------------+-----------+-----------+---------------+---------------+--------------+-----------+--------+-------------+------------+---------+--------+---+--------------+
|             Nd|            Cd|          8|          4|              1|              1|             2|          5|       6|            4|           0|        1|       0|  2|             1|
|             Bd|            We|          5|          3|              1|             -3|             2|          1|       2|            2|           1|        0|       3|  2|             0|
|             Bd|            Cd|          6|      

In [21]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)
# First create a string indexer (convert every string into a number, such as male = 0 and female = 1).
# A number will be assigned to every category in the column.
FormalEducation_indexer = StringIndexer(inputCol='FormalEducation',outputCol='FormalEducationIndex')
# Now we can one hot encode these numbers. This converts the various outputs into a single vector.
# This makes it easier to process when you have multiple classes.
FormalEducation_encoder = OneHotEncoder(inputCol='FormalEducationIndex',outputCol='FormalEducationVec')

UndergradMajor_indexer = StringIndexer(inputCol='UndergradMajor',outputCol='UndergradMajorIndex')
UndergradMajor__encoder = OneHotEncoder(inputCol='UndergradMajorIndex',outputCol='UndergradMajorVec')

# assemble all of this as one vector in the features column. 
assembler = VectorAssembler(inputCols=[
 'FormalEducationVec',
 'UndergradMajorVec',
 'CompanySize',
 'YearsCoding',
 'YearsCodingProf',
 'JobSatisfaction',
 'NumberMonitors',
 'CheckInCode',
 'WakeTime',
 'HoursComputer',
 'HoursOutside',
 'SkipMeals',
 'Exercise',
 'Age'],outputCol='features')


# Train/test split. 
trainset, testset= dataset.randomSplit([0.8,0.2])

In [22]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [23]:
# Logistic Regression Model
# Note that survived is a categorial variable but didn't require any transformation.
# That's because it's already in the format of 1's and 0's. 

log_reg = LogisticRegression(featuresCol='features',labelCol='NewSalaryLevel',maxIter=100,regParam=0,elasticNetParam=0)



# Lists everything we want to do. Index data, encode data, assemble data and then pass in the actual model.
pipeline_log = Pipeline(stages=[FormalEducation_indexer,UndergradMajor_indexer,
                           FormalEducation_encoder,UndergradMajor__encoder,
                           assembler,log_reg])

# Note pipeline. Call it as you would call a machine learning object.
fit_model = pipeline_log.fit(trainset)

# Transform test data. 
predictions_log_reg = fit_model.transform(testset)

# If we select the actual and predicted results, we can see that some predictions were correct while others were wrong.
predictions_log_reg.select('NewSalaryLevel','prediction').show()

# Evaluate the model using the binary classifer.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

my_eval= BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='NewSalaryLevel')
print('Accuracy:', my_eval.evaluate(predictions_log_reg ))

+--------------+----------+
|NewSalaryLevel|prediction|
+--------------+----------+
|             1|       1.0|
|             1|       0.0|
|             1|       1.0|
|             0|       0.0|
|             1|       1.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       1.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       0.0|
|             1|       0.0|
|             0|       0.0|
+--------------+----------+
only showing top 20 rows

Accuracy: 0.7347719723494031


In [24]:
# result of Logistic Regression Model
print("Binomial  coefficients: " + str(fit_model.stages[-1].coefficients)) 
print("-----")
print("Binomial  intercept: " + str(fit_model.stages[-1].intercept))

Binomial  coefficients: [0.12167500105013086,0.5992443225793456,-0.09856448051285559,-0.4586539153122765,0.9126710811409071,-0.19860791672802613,-0.5629081431806874,0.08109984309832637,-0.23231281846133792,-0.2185040268371931,-0.2484617160026629,-0.014164421323728044,-0.46798655120824684,-0.11520198261109531,-0.8802851901480553,-0.006498473527394809,0.12670496503378637,0.11992781400227936,0.36584760573764746,0.08583951514509376,-0.1982332150264456,0.254618624661691,0.18644000069317165,-0.0988323515415614,0.17987628030560696,-0.06571098049282462,0.05031826903144528,-0.10383721783300638]
-----
Binomial  intercept: -3.319970572952229


In [25]:
# Decision Tree Model

from pyspark.ml.classification import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(featuresCol='features',labelCol='NewSalaryLevel',maxDepth=1, maxBins=10)


# Lists everything we want to do. Index data, encode data, assemble data and then pass in the actual model.
pipeline_dec = Pipeline(stages=[FormalEducation_indexer,UndergradMajor_indexer,
                           FormalEducation_encoder,UndergradMajor__encoder,
                           assembler,decision_tree])

# Note pipeline. Call it as you would call a machine learning object.
fit_model_dec = pipeline_dec.fit(trainset)

# Transform test data. 
predictions_decision_tree = fit_model_dec.transform(testset)


# If we select the actual and predicted results, we can see that some predictions were correct while others were wrong.
predictions_decision_tree.select('NewSalaryLevel','prediction').show()


# We can then evaluate using AUC (area under the curve). AUC is linked to ROC.
print('Accuracy:', my_eval.evaluate(predictions_decision_tree ))

+--------------+----------+
|NewSalaryLevel|prediction|
+--------------+----------+
|             1|       1.0|
|             1|       0.0|
|             1|       1.0|
|             0|       1.0|
|             1|       1.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       1.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       1.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       1.0|
|             1|       1.0|
|             0|       1.0|
+--------------+----------+
only showing top 20 rows

Accuracy: 0.7153761558488195


In [26]:
# result of Decision Tree Model
print("Decision Tree Feature Importances: " + str(fit_model_dec.stages[-1].featureImportances)) 
print("-----")
print("Full description of model: " + str(fit_model_dec.stages[-1].toDebugString))

Decision Tree Feature Importances: (28,[18],[1.0])
-----
Full description of model: DecisionTreeClassificationModel (uid=DecisionTreeClassifier_419c9adab8a842faa38b) of depth 1 with 3 nodes
  If (feature 18 <= 2.0)
   Predict: 0.0
  Else (feature 18 > 2.0)
   Predict: 1.0



In [27]:
# Random Forest Model
 
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol='features',labelCol='NewSalaryLevel',numTrees=30, maxDepth=5)

# Lists everything we want to do. Index data, encode data, assemble data and then pass in the actual model.
pipeline_rf = Pipeline(stages=[FormalEducation_indexer,UndergradMajor_indexer,
                           FormalEducation_encoder,UndergradMajor__encoder,
                           assembler,rf])

# Note pipeline. Call it as you would call a machine learning object.
fit_model_rf = pipeline_rf.fit(trainset)

# Transform test data. 
predictions_rf = fit_model_rf.transform(testset)


# If we select the actual and predicted results, we can see that some predictions were correct while others were wrong.
predictions_rf.select('NewSalaryLevel','prediction').show()


# We can then evaluate using AUC (area under the curve). AUC is linked to ROC.
print('Accuracy:', my_eval.evaluate(predictions_rf ))

+--------------+----------+
|NewSalaryLevel|prediction|
+--------------+----------+
|             1|       1.0|
|             1|       0.0|
|             1|       1.0|
|             0|       1.0|
|             1|       1.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       1.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       1.0|
|             1|       1.0|
|             0|       1.0|
+--------------+----------+
only showing top 20 rows

Accuracy: 0.72179055570518


In [28]:
# result of Random Forest Model
print("Random Forest Feature Importances: " + str(fit_model_rf.stages[-1].featureImportances))
print("-----")
print("trees of model: " + str(fit_model_rf.stages[-1].trees))

Random Forest Feature Importances: (28,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27],[0.004622908636607892,0.024572698180738102,0.002087089300563357,0.00342973350090119,0.007852638727789164,0.003789784683128826,0.004146560192849915,0.002736591119047684,0.000950544239131764,0.0012989534027582495,0.00038411165082145545,0.0015481015679914719,0.0004382047620228625,0.0009199338568529519,0.005697247662607596,0.0002691089712472885,0.030803073994245467,0.22258533852782061,0.4825717200806161,0.010518600895284879,0.012916273500800707,0.021449094584362626,0.015440440731413207,0.002721494725623307,0.011850646886356902,0.005306438040055002,0.005292660616147167,0.11380000696221405])
-----
trees of model: [DecisionTreeClassificationModel (uid=dtc_21dba5f7505e) of depth 5 with 53 nodes, DecisionTreeClassificationModel (uid=dtc_e7b2129d6c54) of depth 5 with 61 nodes, DecisionTreeClassificationModel (uid=dtc_df4c613e478e) of depth 5 with 57 nodes, DecisionTreeClassification

In [29]:
# Gradient-boosted tree model
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(featuresCol='features',labelCol='NewSalaryLevel',maxIter=20, maxDepth=5, seed=42)

# Lists everything we want to do. Index data, encode data, assemble data and then pass in the actual model.
pipeline_gbt = Pipeline(stages=[FormalEducation_indexer,UndergradMajor_indexer,
                           FormalEducation_encoder,UndergradMajor__encoder,
                           assembler,gbt])

# Note pipeline. Call it as you would call a machine learning object.
fit_model_gbt = pipeline_gbt.fit(trainset)

# Transform test data. 
predictions_gbt = fit_model_gbt.transform(testset)


# If we select the actual and predicted results, we can see that some predictions were correct while others were wrong.
predictions_gbt.select('NewSalaryLevel','prediction').show()


# We can then evaluate using AUC (area under the curve). AUC is linked to ROC.
print('Accuracy:', my_eval.evaluate(predictions_gbt ))

+--------------+----------+
|NewSalaryLevel|prediction|
+--------------+----------+
|             1|       1.0|
|             1|       0.0|
|             1|       1.0|
|             0|       1.0|
|             1|       1.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       1.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             1|       1.0|
|             1|       1.0|
|             0|       1.0|
+--------------+----------+
only showing top 20 rows

Accuracy: 0.7287952239877905


In [30]:
# result of Gradient-boosted tree model
print("GBT Feature Importances: " + str(fit_model_gbt.stages[-1].featureImportances))
print("-----")
print("trees of model: " + str(fit_model_gbt.stages[-1].trees))

GBT Feature Importances: (28,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27],[0.008999481518899755,0.029679853162566133,0.01638331773768424,0.004939142626011476,0.02127373775436978,0.018544690634268838,0.034928161684240275,0.018500397896068766,0.006749242806209594,0.002682707344937205,0.002449991929391356,0.009108444373766152,0.011693840902974388,0.008376668310176437,0.023602726793289095,0.00552371809274188,0.10547648956145282,0.08868073848891556,0.10466688049549959,0.06376237659422997,0.054943144007791815,0.07122902658813679,0.08903125969185284,0.030598607566361354,0.05396678612134074,0.025075504168066726,0.045395717435814345,0.04373734571294186])
-----
trees of model: [DecisionTreeRegressionModel (uid=dtr_ee36b94dbe09) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_13d5dc2afd43) of depth 5 with 61 nodes, DecisionTreeRegressionModel (uid=dtr_d5abbeb8111b) of depth 5 with 63 nodes, DecisionTreeRegressionModel (uid=dtr_02f7c78368d6) of depth 5

In [31]:
print("Full description of model: " + str(fit_model_gbt.stages[-1].toDebugString))

Full description of model: GBTClassificationModel (uid=GBTClassifier_4e379cb72031870feca9) with 20 trees
  Tree 0 (weight 1.0):
    If (feature 18 <= 2.0)
     If (feature 18 <= 1.0)
      If (feature 16 <= 4.0)
       If (feature 21 <= 4.0)
        If (feature 4 in {0.0})
         Predict: -0.8924731182795699
        Else (feature 4 not in {0.0})
         Predict: 0.3333333333333333
       Else (feature 21 > 4.0)
        If (feature 23 <= 2.0)
         Predict: -0.4714285714285714
        Else (feature 23 > 2.0)
         Predict: -0.75
      Else (feature 16 > 4.0)
       If (feature 22 <= 4.0)
        If (feature 17 <= 2.0)
         Predict: -0.6477987421383647
        Else (feature 17 > 2.0)
         Predict: -0.37755102040816324
       Else (feature 22 > 4.0)
        If (feature 19 <= -1.0)
         Predict: -0.6
        Else (feature 19 > -1.0)
         Predict: 0.23595505617977527
     Else (feature 18 > 1.0)
      If (feature 19 <= 2.0)
       If (feature 20 <= 1.0)
        If (