In [1]:
from pyspark.sql import SparkSession 

In [2]:
spark = SparkSession.builder.master("local").appName("pyspark-command-1").getOrCreate()
spark.sparkContext

In [3]:
df = spark.read.csv("/home/jovyan/dataset/bezdekIris.data", inferSchema=True)\
.toDF("sep_len", "sep_wid", "pet_len", "pet_wid", "label")

In [4]:
df.show()

+-------+-------+-------+-------+-----------+
|sep_len|sep_wid|pet_len|pet_wid|      label|
+-------+-------+-------+-------+-----------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|
|    5.4|    3.9|    1.7|    0.4|Iris-setosa|
|    4.6|    3.4|    1.4|    0.3|Iris-setosa|
|    5.0|    3.4|    1.5|    0.2|Iris-setosa|
|    4.4|    2.9|    1.4|    0.2|Iris-setosa|
|    4.9|    3.1|    1.5|    0.1|Iris-setosa|
|    5.4|    3.7|    1.5|    0.2|Iris-setosa|
|    4.8|    3.4|    1.6|    0.2|Iris-setosa|
|    4.8|    3.0|    1.4|    0.1|Iris-setosa|
|    4.3|    3.0|    1.1|    0.1|Iris-setosa|
|    5.8|    4.0|    1.2|    0.2|Iris-setosa|
|    5.7|    4.4|    1.5|    0.4|Iris-setosa|
|    5.4|    3.9|    1.3|    0.4|Iris-setosa|
|    5.1|    3.5|    1.4|    0.3|Iris-setosa|
|    5.7|    3.8|    1.7|    0.3|I

In [5]:
#We have to perform some transformations to join all feature columns into a single column using VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [6]:
#To transform 4 features columns into one column with features vector
vector_assembler=VectorAssembler(inputCols=["sep_len","sep_wid","pet_len","pet_wid"], outputCol="features")
df_temp=vector_assembler.transform(df)
df_temp.show()

+-------+-------+-------+-------+-----------+-----------------+
|sep_len|sep_wid|pet_len|pet_wid|      label|         features|
+-------+-------+-------+-------+-----------+-----------------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|    5.4|    3.9|    1.7|    0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|    4.6|    3.4|    1.4|    0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|    5.0|    3.4|    1.5|    0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|    4.4|    2.9|    1.4|    0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|
|    4.9|    3.1|    1.5|    0.1|Iris-setosa|[4.9,3.1,1.5,0.1]|
|    5.4|    3.7|    1.5|    0.2|Iris-setosa|[5.4,3.7,1.5,0.2]|
|    4.8|    3.4|    1.6|    0.2|Iris-setosa|[4.8,3.4,1.6,0.2]|
|    4.8|    3.0|    1.4|    0.1|Iris-se

In [7]:
# remove unnecessary columns
df=df_temp.drop('sep_len', 'sep_wid', 'pet_len', 'pet_wid')
df.show(3)

+-----------+-----------------+
|      label|         features|
+-----------+-----------------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|
|Iris-setosa|[4.9,3.0,1.4,0.2]|
|Iris-setosa|[4.7,3.2,1.3,0.2]|
+-----------+-----------------+
only showing top 3 rows



In [8]:
from pyspark.ml.feature import StringIndexer

In [9]:
# 需要index目標欄位
l_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
df=l_indexer.fit(df).transform(df)

In [11]:
df.show(10)

+-----------+-----------------+----------+
|      label|         features|labelIndex|
+-----------+-----------------+----------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|
|Iris-setosa|[4.9,3.0,1.4,0.2]|       0.0|
|Iris-setosa|[4.7,3.2,1.3,0.2]|       0.0|
|Iris-setosa|[4.6,3.1,1.5,0.2]|       0.0|
|Iris-setosa|[5.0,3.6,1.4,0.2]|       0.0|
|Iris-setosa|[5.4,3.9,1.7,0.4]|       0.0|
|Iris-setosa|[4.6,3.4,1.4,0.3]|       0.0|
|Iris-setosa|[5.0,3.4,1.5,0.2]|       0.0|
|Iris-setosa|[4.4,2.9,1.4,0.2]|       0.0|
|Iris-setosa|[4.9,3.1,1.5,0.1]|       0.0|
+-----------+-----------------+----------+
only showing top 10 rows



In [12]:
# divide our data into training and test sets (30% held out for testing)
(trainingData, testData) = df.randomSplit([0.7, 0.3])

# Decision tree classifier

In [13]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [14]:
#train a DecisionTree model
dt=DecisionTreeClassifier(labelCol='labelIndex', featuresCol="features")
model=dt.fit(trainingData)
predictions=model.transform(testData)

In [15]:
predictions = model.transform(testData)

In [16]:
predictions.select("prediction", "labelIndex").show(5)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



In [17]:
# estimate the accuracy of the prediction
evaluator=MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")

In [18]:
accuracy=evaluator.evaluate(predictions)
print("Test set accuracy={}".format(str(accuracy)))

Test set accuracy=0.9767441860465116


In [19]:
print("Test error=%g" % (1.0-accuracy))

Test error=0.0232558


In [20]:
print(model)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_c99491df28e8) of depth 5 with 15 nodes


# Naive Bayes classifier

In [21]:
splits=df.randomSplit([0.6, 0.4], 1234)
train=splits[0]
test=splits[1]

In [22]:
from pyspark.ml.classification import NaiveBayes

In [23]:
nb=NaiveBayes(labelCol="labelIndex", featuresCol="features", smoothing=1.0, modelType="multinomial")

In [24]:
model=nb.fit(train)

In [25]:
predictions=model.transform(test)
predictions.select("label", "labelIndex", "probability", "prediction").show()

+---------------+----------+--------------------+----------+
|          label|labelIndex|         probability|prediction|
+---------------+----------+--------------------+----------+
|    Iris-setosa|       0.0|[0.72723788653438...|       0.0|
|    Iris-setosa|       0.0|[0.64170595827692...|       0.0|
|    Iris-setosa|       0.0|[0.67184222484015...|       0.0|
|    Iris-setosa|       0.0|[0.68647236934182...|       0.0|
|    Iris-setosa|       0.0|[0.79151826954673...|       0.0|
|    Iris-setosa|       0.0|[0.66189579367600...|       0.0|
|    Iris-setosa|       0.0|[0.65307352257988...|       0.0|
|    Iris-setosa|       0.0|[0.73045962362363...|       0.0|
|    Iris-setosa|       0.0|[0.59100133493054...|       0.0|
|    Iris-setosa|       0.0|[0.75334864217418...|       0.0|
|    Iris-setosa|       0.0|[0.71961717211598...|       0.0|
|    Iris-setosa|       0.0|[0.70085626812960...|       0.0|
|    Iris-setosa|       0.0|[0.75135089290995...|       0.0|
|    Iris-setosa|       

In [26]:
# evaluate this classifier, let’s compute the accuracy on the test set
evaluator=MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")

In [27]:
accuracy=evaluator.evaluate(predictions)
print("Test set accuracy={}".format(str(accuracy)))

Test set accuracy=0.8235294117647058


# Random forest classifier

In [28]:
from pyspark.ml.classification import RandomForestClassifier

In [29]:
rf=RandomForestClassifier(labelCol="labelIndex", featuresCol="features", numTrees=10)
model=rf.fit(trainingData)

In [30]:
predictions.select("prediction", "labelIndex").show(5)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



In [31]:
evaluator=MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")

In [32]:
accuracy=evaluator.evaluate(predictions)

In [33]:
print("Test Error = %g" % (1.0-accuracy))

Test Error = 0.176471


In [34]:
print(model)

RandomForestClassificationModel (uid=RandomForestClassifier_4843d2b526ed) with 10 trees


In [36]:
spark.stop()