In [55]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml.classification import GBTClassifier,DecisionTreeClassifier,RandomForestClassifier
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator

In [31]:
spark=SparkSession.builder.appName("decisionTree").getOrCreate()

In [32]:
df=spark.read.csv("College.csv",inferSchema=True,header=True)

In [33]:
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [34]:
feature_list=[ 'Apps','Accept','Enroll','Top10perc','Top25perc','F_Undergrad','P_Undergrad','Outstate','Room_Board','Books',
              'Personal','PhD','Terminal','S_F_Ratio','perc_alumni','Expend','Grad_Rate']
label=["Private_Index"]

In [35]:
Label_Indexer=StringIndexer(inputCol="Private",outputCol="Private_Index")
df=Label_Indexer.fit(df).transform(df)

In [37]:
assembler=VectorAssembler(inputCols=feature_list,outputCol="Feature_Index")
df=assembler.transform(df)

In [41]:
train_data,test_data=df.randomSplit([0.7,0.3])

In [38]:
dt_model=DecisionTreeClassifier(featuresCol="Feature_Index",labelCol="Private_Index")
gbt_model=GBTClassifier(featuresCol="Feature_Index",labelCol="Private_Index")
rf_model=RandomForestClassifier(featuresCol="Feature_Index",labelCol="Private_Index")

In [42]:
df_model_fit=dt_model.fit(train_data)
gbt_model_fit=gbt_model.fit(train_data)
rf_model_fit=rf_model.fit(train_data)

In [43]:
df_results=df_model_fit.transform(test_data)
gbt_results=gbt_model_fit.transform(test_data)
rf_results=rf_model_fit.transform(test_data)

In [48]:
rf_results.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- Private_Index: double (nullable = true)
 |-- Feature_Index: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)



# Binary Classification Evaluator

In [50]:
eval=BinaryClassificationEvaluator(rawPredictionCol="prediction",labelCol="Private_Index")

# Multiclass Classification Evaluator

In [61]:
eval=MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="Private_Index",metricName="f1")

In [62]:
eval.evaluate(rf_results)

0.963085218982548

In [63]:
eval.evaluate(df_results)

0.9227708318617409

In [64]:
eval.evaluate(gbt_results)

0.9381737467551607