In [1]:

from pyspark import SparkConf, SparkContext
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.regression import LabeledPoint
from numpy import array

In [2]:

conf = SparkConf().setMaster('local').setAppName('SparkDecisionTree')
sc = SparkContext(conf = conf)

In [3]:

def binary(YN):
    if (YN == 'Y'):
        return 1
    else:
        return 0

In [4]:

def map_education(degree):
    if degree == 'BS':
        return 1
    if degree == 'MS':
        return 2
    if degree == 'PhD':
        return 3
    else:
        return 0

In [5]:

def create_labeled_points(fields):
    years_experience = int(fields[0])
    employed = binary(fields[1])
    previous_employers = binary(fields[2])
    education_level = map_education(fields[3])
    top_tier = binary(fields[4])
    interned = binary(fields[5])
    hired = binary(fields[6])
    
    return LabeledPoint(hired, array([years_experience, employed, previous_employers, education_level, top_tier, interned]))
    

In [6]:

raw_data = sc.textFile('DataScience-Python3/PastHires.csv')
header = raw_data.first()
raw_data = raw_data.filter(lambda x: x != header)

In [7]:
csv_data = raw_data.map(lambda x: x.split(','))

In [8]:
training_data = csv_data.map(create_labeled_points)

In [9]:
# create a fake candidate RDD to test the model
test_candidate = [array([10, 1, 3, 3, 1, 0])]
test_data = sc.parallelize(test_candidate)

In [10]:

# train the decision tree classifier using the data set
model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={1:2, 3:4, 4:2, 5:2}, impurity='gini', maxDepth=5, maxBins=32)

In [11]:

# run inference and get a prediction for the test candidate
predictions = model.predict(test_data)
print ("Hire prediction: ")
results = predictions.collect()

for result in results:
    print (result)

Hire prediction: 
1.0


In [12]:

# We can also print out the decision tree itself
print('Learned classification tree model: ')
print(model.toDebugString())

Learned classification tree model: 
DecisionTreeModel classifier of depth 4 with 9 nodes
  If (feature 1 in {0.0})
   If (feature 5 in {0.0})
    If (feature 0 <= 0.5)
     If (feature 3 in {1.0})
      Predict: 0.0
     Else (feature 3 not in {1.0})
      Predict: 1.0
    Else (feature 0 > 0.5)
     Predict: 0.0
   Else (feature 5 not in {0.0})
    Predict: 1.0
  Else (feature 1 not in {0.0})
   Predict: 1.0

