In [89]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [90]:
spark=SparkSession.builder.appName("randomForestProject").getOrCreate()

In [91]:
df=spark.read.csv("dog_food.csv",inferSchema=True,header=True)

In [92]:
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [93]:
df.show(2)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
+---+---+----+---+-------+
only showing top 2 rows



In [94]:
df=df.withColumn("E",(100-(df["A"]+df["B"]+df["C"]+df["D"])))

In [95]:
feature_list=["A","B","C","D","E"]

In [96]:
assembler=VectorAssembler(inputCols=feature_list,outputCol="featureVector")

In [97]:
df=assembler.transform(df)

In [98]:
train_data,test_data=df.randomSplit([0.7,0.3])

In [99]:
rf=RandomForestClassifier(featuresCol="featureVector",labelCol="Spoiled",numTrees=200,maxDepth=20)

In [100]:
model=rf.fit(train_data)

In [101]:
result=model.transform(test_data)

In [102]:
result.show(5)

+---+---+----+---+-------+----+--------------------+-------------+-------------+----------+
|  A|  B|   C|  D|Spoiled|   E|       featureVector|rawPrediction|  probability|prediction|
+---+---+----+---+-------+----+--------------------+-------------+-------------+----------+
|  1|  1|10.0|  8|    1.0|80.0|[1.0,1.0,10.0,8.0...| [171.0,29.0]|[0.855,0.145]|       0.0|
|  1|  1|13.0|  3|    1.0|82.0|[1.0,1.0,13.0,3.0...|  [1.0,199.0]|[0.005,0.995]|       1.0|
|  1|  2| 9.0|  4|    0.0|84.0|[1.0,2.0,9.0,4.0,...|  [200.0,0.0]|    [1.0,0.0]|       0.0|
|  1|  3| 8.0|  3|    0.0|85.0|[1.0,3.0,8.0,3.0,...|  [200.0,0.0]|    [1.0,0.0]|       0.0|
|  1|  4|13.0| 10|    1.0|72.0|[1.0,4.0,13.0,10....|  [1.0,199.0]|[0.005,0.995]|       1.0|
+---+---+----+---+-------+----+--------------------+-------------+-------------+----------+
only showing top 5 rows



In [103]:
eval=BinaryClassificationEvaluator(rawPredictionCol="prediction",labelCol="Spoiled")

In [104]:
eval.evaluate(result)

0.9499244142101284

In [105]:
model.featureImportances

SparseVector(5, {0: 0.0224, 1: 0.0161, 2: 0.8959, 3: 0.0159, 4: 0.0497})