In [1]:
import os
import sys

In [2]:
import pyspark 
from pyspark.sql import SparkSession 

In [3]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc = SparkContext()
sqlContext = SQLContext(sc)

In [4]:
data = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('wine.csv')

In [5]:
data.show(5)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|
|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|
|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|
|          7.2|            0.23|       0.32|           8.5|    0.058|               47.0|           

In [6]:
data.printSchema

<bound method DataFrame.printSchema of DataFrame[fixed acidity: double, volatile acidity: double, citric acid: double, residual sugar: double, chlorides: double, free sulfur dioxide: double, total sulfur dioxide: double, density: double, pH: double, sulphates: double, alcohol: double, quality: int]>

In [13]:
#data_1 = data.(data.columns.map('quality' = col('quality').cast(StringType))
               
from pyspark.sql.types import StringType
data = data.withColumn("quality", data["quality"].cast(StringType()))               

In [14]:
data.select('quality').distinct().show()

+-------+
|quality|
+-------+
|      7|
|      3|
|      8|
|      5|
|      6|
|      9|
|      4|
+-------+



In [15]:
data.printSchema

<bound method DataFrame.printSchema of DataFrame[fixed acidity: double, volatile acidity: double, citric acid: double, residual sugar: double, chlorides: double, free sulfur dioxide: double, total sulfur dioxide: double, density: double, pH: double, sulphates: double, alcohol: double, quality: string]>

In [16]:
from pyspark.ml.linalg import Vectors

def vectorize(df):
    return df.rdd.map(lambda r : [r[11],Vectors.dense(r[0:10])]).toDF(['label','features'])

In [17]:
vectorize_data = vectorize(data)

In [18]:
vectorize_data.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    6|[7.0,0.27,0.36,20...|
|    6|[6.3,0.3,0.34,1.6...|
|    6|[8.1,0.28,0.4,6.9...|
|    6|[7.2,0.23,0.32,8....|
|    6|[7.2,0.23,0.32,8....|
+-----+--------------------+
only showing top 5 rows



In [19]:
vectorize_data.take(5)

[Row(label='6', features=DenseVector([7.0, 0.27, 0.36, 20.7, 0.045, 45.0, 170.0, 1.001, 3.0, 0.45])),
 Row(label='6', features=DenseVector([6.3, 0.3, 0.34, 1.6, 0.049, 14.0, 132.0, 0.994, 3.3, 0.49])),
 Row(label='6', features=DenseVector([8.1, 0.28, 0.4, 6.9, 0.05, 30.0, 97.0, 0.9951, 3.26, 0.44])),
 Row(label='6', features=DenseVector([7.2, 0.23, 0.32, 8.5, 0.058, 47.0, 186.0, 0.9956, 3.19, 0.4])),
 Row(label='6', features=DenseVector([7.2, 0.23, 0.32, 8.5, 0.058, 47.0, 186.0, 0.9956, 3.19, 0.4]))]

In [20]:
from pyspark.ml.feature import StringIndexer

labelIndexer = StringIndexer(inputCol = 'label',outputCol = 'indexedlabel')

In [21]:
index_data = labelIndexer.fit(vectorize_data).transform(vectorize_data)
index_data.take(2)

[Row(label='6', features=DenseVector([7.0, 0.27, 0.36, 20.7, 0.045, 45.0, 170.0, 1.001, 3.0, 0.45]), indexedlabel=0.0),
 Row(label='6', features=DenseVector([6.3, 0.3, 0.34, 1.6, 0.049, 14.0, 132.0, 0.994, 3.3, 0.49]), indexedlabel=0.0)]

In [22]:
(traindata,testdata) = index_data.randomSplit([0.8,0.2])

In [23]:
from pyspark.ml.classification import DecisionTreeClassifier

dtree = DecisionTreeClassifier(labelCol = 'indexedlabel',featuresCol = 'features',
                               maxDepth = 3,impurity='gini')

In [24]:
model = dtree.fit(traindata)

In [25]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol = 'label',predictionCol = 'prediction',metricName='f1')

In [26]:
transformed_data = model.transform(testdata)
transformed_data.show(5)

+-----+--------------------+------------+--------------------+--------------------+----------+
|label|            features|indexedlabel|       rawPrediction|         probability|prediction|
+-----+--------------------+------------+--------------------+--------------------+----------+
|    3|[6.9,0.39,0.4,4.6...|         5.0|[155.0,19.0,208.0...|[0.36299765807962...|       2.0|
|    3|[7.1,0.875,0.05,5...|         5.0|[873.0,1183.0,164...|[0.36696090794451...|       1.0|
|    3|[7.4,1.185,0.0,4....|         5.0|[873.0,1183.0,164...|[0.36696090794451...|       1.0|
|    3|[8.5,0.26,0.21,16...|         5.0|[873.0,1183.0,164...|[0.36696090794451...|       1.0|
|    3|[9.1,0.59,0.38,1....|         5.0|[873.0,1183.0,164...|[0.36696090794451...|       1.0|
+-----+--------------------+------------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
print(evaluator.getMetricName(),'accuracy:',evaluator.evaluate(transformed_data))