In [78]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook runs on machines other than the author's; fall back to the
# original install location when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/nick/spark-3.0.1-bin-hadoop2.7'))

from pyspark.sql import SparkSession
from pyspark.sql.functions import count, isnan, isnull, when

from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline

# Build (or fetch, if one already exists) the session used by every cell below.
session_builder = SparkSession.builder.appName('Tree Methods')
spark = session_builder.getOrCreate()

In [4]:
# Load the college dataset; the first row holds column names and column types
# are inferred from the values.
college_csv_path = 'Tree_Methods/College.csv'
data = spark.read.csv(college_csv_path, header=True, inferSchema=True)

In [12]:
# Quick look at the raw data: first five rows, the inferred schema, then
# summary statistics for every column.
data.show(n=5)
data.printSchema()
summary_stats = data.describe()
summary_stats.show()

+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|              School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+--------------------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|Abilene Christian...|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|
|  Adelphi University|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|
|      Adrian College|    Yes|1428|  1097|   336|       22|       50|       1036|         99|  

In [10]:
# Check for missing data: one row of per-column counts for NaN values, then
# one row of per-column counts for nulls.
#
# The counted value must be a non-null literal. count() skips nulls, so
# counting the column itself under isnull(c) would count only nulls-that-are-
# skipped and always report 0, hiding any missing values.
data.select([count(when(isnan(c), 1)).alias(c) for c in data.columns]).show()
data.select([count(when(isnull(c), 1)).alias(c) for c in data.columns]).show()

+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|School|Private|Apps|Accept|Enroll|Top10perc|Top25perc|F_Undergrad|P_Undergrad|Outstate|Room_Board|Books|Personal|PhD|Terminal|S_F_Ratio|perc_alumni|Expend|Grad_Rate|
+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+
|     0|      0|   0|     0|     0|        0|        0|          0|          0|       0|         0|    0|       0|  0|       0|        0|          0|     0|        0|
+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+

+------+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+--------

In [17]:
# The first two columns (School, Private) are the identifier and the label;
# every column after them is used as a model input.
n_leading_non_features = 2
feature_columns = data.columns[n_leading_non_features:]

In [31]:
# Pack the selected input columns into the single 'features' vector column
# that the classifiers below read.
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)
output = assembler.transform(data)

# Encode the string column 'Private' as the numeric label column 'Private_IX'.
indexer = StringIndexer(outputCol='Private_IX', inputCol='Private')
indexer_model = indexer.fit(output)
output_fixed = indexer_model.transform(output)

In [32]:
# Print the schema of the transformed frame to confirm the derived columns
# 'features' (assembled vector) and 'Private_IX' (indexed label) are present.
output_fixed.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- Private_IX: double (nullable = false)



In [33]:
# Keep only the two columns the estimators are configured to read: the
# assembled feature vector and the indexed label.
model_columns = ['features', 'Private_IX']
final_data = output_fixed.select(*model_columns)

In [34]:
# 70/30 train/test split. The explicit seed keeps the split, and therefore
# every metric reported below, stable across re-runs; without it randomSplit
# draws a fresh random seed on each call.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [73]:
# Three tree-based classifiers trained on the same label/feature columns so
# their scores can be compared directly. Each gets an explicit seed so that
# the fitted models do not change between kernel restarts.
dtc = DecisionTreeClassifier(labelCol='Private_IX', featuresCol='features', seed=42)
# A larger forest than the library default.
rfc = RandomForestClassifier(labelCol='Private_IX', featuresCol='features', numTrees=200, seed=42)
# Deeper trees than the library default.
gbt = GBTClassifier(labelCol='Private_IX', featuresCol='features', maxDepth=10, seed=42)

In [74]:
# Fit each estimator on the training split, in the same order as before:
# decision tree, random forest, gradient-boosted trees.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [75]:
# Score the held-out test split with each fitted model.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data) for fitted in (dtc_model, rfc_model, gbt_model)
)

In [76]:
# Binary evaluator against the indexed label. The raw-prediction column and
# the metric are spelled out here; both are the library defaults, so the
# scores are the same as with the defaults left implicit.
binary_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='Private_IX',
    metricName='areaUnderROC',
)

In [77]:
# Print each model's heading followed by its score from binary_eval.
for heading, preds in (
    ('Decision Tree Classifier:', dtc_preds),
    ('Random Forest Classifier:', rfc_preds),
    ('Gradient Boosted Tree:', gbt_preds),
):
    print(heading)
    print(binary_eval.evaluate(preds))

Decision Tree Classifier:
0.9607237801217734
Random Forest Classifier:
0.9873081210873853
Gradient Boosted Tree:
0.961366949661264


In [81]:
# Accuracy evaluator against the indexed label; the prediction column is
# spelled out and is the library default.
multiclass_eval = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='Private_IX',
    metricName='accuracy',
)

In [83]:
# Score the random forest's test-split predictions with multiclass_eval
# (configured with metricName='accuracy').
rfc_accuracy = multiclass_eval.evaluate(rfc_preds)
# Bare last expression so the notebook displays the value.
rfc_accuracy

0.9453781512605042