In [2]:
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('decision_tree')
    .getOrCreate()
)

In [5]:
import os

# Location of the college.csv dataset. Defaults to the path the notebook was
# originally written against; set the COLLEGE_CSV_PATH environment variable to
# point at a copy elsewhere without editing this cell.
input_file_path = os.environ.get(
    "COLLEGE_CSV_PATH",
    "file:///C:/Users/ckp43_000/Documents/college.csv",
)

In [6]:
# Load the CSV, taking column names from the first line and letting Spark
# infer each column's type.
data = spark.read.csv(
    path=input_file_path,
    header=True,
    inferSchema=True,
)

In [7]:
# Show the column names and the types Spark inferred for them.
data.printSchema()

root
 |-- private: string (nullable = true)
 |-- apps: integer (nullable = true)
 |-- accept: integer (nullable = true)
 |-- enroll: integer (nullable = true)
 |-- top10perc: integer (nullable = true)
 |-- top25perc: integer (nullable = true)
 |-- f_undergrad: integer (nullable = true)
 |-- p_undergrad: integer (nullable = true)
 |-- outstate: integer (nullable = true)
 |-- room_board: integer (nullable = true)
 |-- books: integer (nullable = true)
 |-- personal: integer (nullable = true)
 |-- phd: integer (nullable = true)
 |-- terminal: integer (nullable = true)
 |-- s_f_ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- expend: integer (nullable = true)
 |-- grad_rate: integer (nullable = true)



In [8]:
# Number of rows loaded from the CSV.
data.count()

777

In [11]:
from pyspark.ml.feature import VectorAssembler

In [13]:
# List the column names of the loaded frame.
data.columns

['private',
 'apps',
 'accept',
 'enroll',
 'top10perc',
 'top25perc',
 'f_undergrad',
 'p_undergrad',
 'outstate',
 'room_board',
 'books',
 'personal',
 'phd',
 'terminal',
 's_f_ratio',
 'perc_alumni',
 'expend',
 'grad_rate']

In [15]:
# Pack every column except the string column `private` into a single
# `features` vector column.
assembler = VectorAssembler(
    inputCols=[
        'apps', 'accept', 'enroll', 'top10perc', 'top25perc',
        'f_undergrad', 'p_undergrad', 'outstate', 'room_board', 'books',
        'personal', 'phd', 'terminal', 's_f_ratio', 'perc_alumni',
        'expend', 'grad_rate',
    ],
    outputCol='features',
)

In [16]:
# Append the assembled `features` column to the loaded frame.
output = assembler.transform(dataset=data)

In [17]:
# Preview the first two rows, including the new `features` column.
output.show(n=2)

+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|private|apps|accept|enroll|top10perc|top25perc|f_undergrad|p_undergrad|outstate|room_board|books|personal|phd|terminal|s_f_ratio|perc_alumni|expend|grad_rate|            features|
+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+-----+--------+---+--------+---------+-----------+------+---------+--------------------+
|    Yes|1660|  1232|   721|       23|       52|       2885|        537|    7440|      3300|  450|    2200| 70|      78|     18.1|         12|  7041|       60|[1660.0,1232.0,72...|
|    Yes|2186|  1924|   512|       16|       29|       2683|       1227|   12280|      6450|  750|    1500| 29|      30|     12.2|         16| 10527|       56|[2186.0,1924.0,51...|
+-------+----+------+------+---------+---------+-----------+-----------+--------+----------+---

In [18]:
from pyspark.ml.feature import StringIndexer

In [25]:
# Encode the string column `private` as a numeric column `privateIndex`.
indexer = StringIndexer(
    inputCol='private',
    outputCol='privateIndex',
)

In [26]:
# Fit the indexer on the assembled frame, then apply it to that same frame
# to add the `privateIndex` column.
output_fixed = (
    indexer
    .fit(output)
    .transform(output)
)

In [27]:
# Look at the first row to see how `private` was encoded into `privateIndex`.
output_fixed.head(1)

[Row(private='Yes', apps=1660, accept=1232, enroll=721, top10perc=23, top25perc=52, f_undergrad=2885, p_undergrad=537, outstate=7440, room_board=3300, books=450, personal=2200, phd=70, terminal=78, s_f_ratio=18.1, perc_alumni=12, expend=7041, grad_rate=60, features=DenseVector([1660.0, 1232.0, 721.0, 23.0, 52.0, 2885.0, 537.0, 7440.0, 3300.0, 450.0, 2200.0, 70.0, 78.0, 18.1, 12.0, 7041.0, 60.0]), privateIndex=0.0)]

In [28]:
# Keep only the two columns the classifiers read: the feature vector and the label.
final_data = output_fixed.select('features', 'privateIndex')

In [29]:
# Hold out roughly 30% of the rows for testing. The fixed seed makes the split
# repeatable across kernel restarts; without it every run draws a different
# partition, so counts and metrics cannot be reproduced. (Outputs recorded from
# an earlier unseeded run will differ slightly from a fresh run.)
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [30]:
# Row count of the training split.
train_data.count()

530

In [31]:
# Row count of the test split.
test_data.count()

247

In [32]:
from pyspark.ml.classification import  (DecisionTreeClassifier,
                                        RandomForestClassifier,
                                        GBTClassifier)


In [33]:
# `Pipeline` (capital P) is the class that chains ML stages; the lowercase
# name would import the `pyspark.ml.pipeline` submodule instead.
# NOTE(review): not referenced by the cells visible here — confirm it is needed.
from pyspark.ml import Pipeline

In [35]:
# Three tree-based classifiers, each predicting `privateIndex` from the
# `features` vector with otherwise-default hyperparameters. The explicit seed
# pins the randomised parts of training so repeated runs build the same models.
dtc = DecisionTreeClassifier(labelCol='privateIndex', featuresCol='features', seed=42)
rfc = RandomForestClassifier(labelCol='privateIndex', featuresCol='features', seed=42)
gbt = GBTClassifier(labelCol='privateIndex', featuresCol='features', seed=42)

In [36]:
# Train each classifier on the training split, in the order tree, forest, GBT.
dtc_model, rfc_model, gbt_model = (
    estimator.fit(train_data) for estimator in (dtc, rfc, gbt)
)

In [37]:
# Score the held-out test split with each fitted model.
dtc_preds, rfc_preds, gbt_preds = (
    fitted.transform(test_data) for fitted in (dtc_model, rfc_model, gbt_model)
)
                        

In [38]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [39]:
# Binary evaluator scoring the `rawPrediction` column against `privateIndex`
# by area under the ROC curve. The last two arguments spell out the
# evaluator's documented defaults.
my_binary_eval = BinaryClassificationEvaluator(
    labelCol='privateIndex',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC',
)

In [41]:
# Print the binary evaluator's score for the decision tree predictions.
print("DTC")
print(my_binary_eval.evaluate(dataset=dtc_preds))

DTC
0.8859375000000002


In [44]:
# Print the binary evaluator's score for the random forest predictions.
print("RFC")
print(my_binary_eval.evaluate(dataset=rfc_preds))

RFC
0.9737689393939387


In [46]:
# Inspect the schema of the GBT predictions frame.
gbt_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- privateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [47]:
# Inspect the schema of the random forest predictions frame.
rfc_preds.printSchema()

root
 |-- features: vector (nullable = true)
 |-- privateIndex: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [48]:
# Print the binary evaluator's score for the gradient-boosted tree predictions.
print("GBT")
print(my_binary_eval.evaluate(dataset=gbt_preds))

GBT
0.9361742424242422


In [49]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [53]:
# Evaluator reporting plain accuracy of `prediction` against `privateIndex`.
# `predictionCol` is written out here at its documented default.
acc_eval = MulticlassClassificationEvaluator(
    labelCol='privateIndex',
    predictionCol='prediction',
    metricName='accuracy',
)

In [54]:
# Accuracy of the random forest on the test split.
rfc_acc = acc_eval.evaluate(dataset=rfc_preds)

In [55]:
# Display the random forest accuracy.
rfc_acc

0.9352226720647774

In [56]:
# Accuracy of the decision tree on the test split.
dtc_acc = acc_eval.evaluate(dataset=dtc_preds)

In [57]:
# Display the decision tree accuracy.
dtc_acc

0.9149797570850202

In [59]:
# Accuracy of the gradient-boosted trees on the test split.
gbt_acc = acc_eval.evaluate(dataset=gbt_preds)

In [60]:
# Check the Python type of the value the evaluator returned.
type(gbt_acc)

float

In [61]:
# Display the gradient-boosted trees accuracy.
gbt_acc

0.9149797570850202