In [1]:
from pyspark.sql import SparkSession 

In [3]:
spark = SparkSession.builder.master("local").appName("pyspark-mllib-dt-rf-nb").getOrCreate()
spark.sparkContext

In [4]:
#轉成dataframe
df = spark.read.csv("/home/jovyan/dataset/bezdekIris.data", inferSchema=True)\
.toDF("sep_len", "sep_wid", "pet_len", "pet_wid", "label")

In [5]:
df.show()

+-------+-------+-------+-------+-----------+
|sep_len|sep_wid|pet_len|pet_wid|      label|
+-------+-------+-------+-------+-----------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|
|    5.4|    3.9|    1.7|    0.4|Iris-setosa|
|    4.6|    3.4|    1.4|    0.3|Iris-setosa|
|    5.0|    3.4|    1.5|    0.2|Iris-setosa|
|    4.4|    2.9|    1.4|    0.2|Iris-setosa|
|    4.9|    3.1|    1.5|    0.1|Iris-setosa|
|    5.4|    3.7|    1.5|    0.2|Iris-setosa|
|    4.8|    3.4|    1.6|    0.2|Iris-setosa|
|    4.8|    3.0|    1.4|    0.1|Iris-setosa|
|    4.3|    3.0|    1.1|    0.1|Iris-setosa|
|    5.8|    4.0|    1.2|    0.2|Iris-setosa|
|    5.7|    4.4|    1.5|    0.4|Iris-setosa|
|    5.4|    3.9|    1.3|    0.4|Iris-setosa|
|    5.1|    3.5|    1.4|    0.3|Iris-setosa|
|    5.7|    3.8|    1.7|    0.3|I

In [6]:
#利用VectorAssembler結合所有欄位
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [7]:
#將這四個欄位進行VectorAssembler
vector_assembler=VectorAssembler(inputCols=["sep_len","sep_wid","pet_len","pet_wid"], outputCol="features")
df_temp=vector_assembler.transform(df)
df_temp.show()

+-------+-------+-------+-------+-----------+-----------------+
|sep_len|sep_wid|pet_len|pet_wid|      label|         features|
+-------+-------+-------+-------+-----------+-----------------+
|    5.1|    3.5|    1.4|    0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|    4.9|    3.0|    1.4|    0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|    4.7|    3.2|    1.3|    0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|    4.6|    3.1|    1.5|    0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|    5.0|    3.6|    1.4|    0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|    5.4|    3.9|    1.7|    0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|    4.6|    3.4|    1.4|    0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|    5.0|    3.4|    1.5|    0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|    4.4|    2.9|    1.4|    0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|
|    4.9|    3.1|    1.5|    0.1|Iris-setosa|[4.9,3.1,1.5,0.1]|
|    5.4|    3.7|    1.5|    0.2|Iris-setosa|[5.4,3.7,1.5,0.2]|
|    4.8|    3.4|    1.6|    0.2|Iris-setosa|[4.8,3.4,1.6,0.2]|
|    4.8|    3.0|    1.4|    0.1|Iris-se

In [8]:
#移除不需要的欄位
df=df_temp.drop('sep_len', 'sep_wid', 'pet_len', 'pet_wid')
df.show(3)

+-----------+-----------------+
|      label|         features|
+-----------+-----------------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|
|Iris-setosa|[4.9,3.0,1.4,0.2]|
|Iris-setosa|[4.7,3.2,1.3,0.2]|
+-----------+-----------------+
only showing top 3 rows



In [9]:
#整理應變數
from pyspark.ml.feature import StringIndexer

In [10]:
#需要index應變數
l_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
df=l_indexer.fit(df).transform(df)

In [11]:
df.show(10)

+-----------+-----------------+----------+
|      label|         features|labelIndex|
+-----------+-----------------+----------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|       0.0|
|Iris-setosa|[4.9,3.0,1.4,0.2]|       0.0|
|Iris-setosa|[4.7,3.2,1.3,0.2]|       0.0|
|Iris-setosa|[4.6,3.1,1.5,0.2]|       0.0|
|Iris-setosa|[5.0,3.6,1.4,0.2]|       0.0|
|Iris-setosa|[5.4,3.9,1.7,0.4]|       0.0|
|Iris-setosa|[4.6,3.4,1.4,0.3]|       0.0|
|Iris-setosa|[5.0,3.4,1.5,0.2]|       0.0|
|Iris-setosa|[4.4,2.9,1.4,0.2]|       0.0|
|Iris-setosa|[4.9,3.1,1.5,0.1]|       0.0|
+-----------+-----------------+----------+
only showing top 10 rows



In [15]:
df.filter(df['label']!='Iris-setosa').show()

+---------------+-----------------+----------+
|          label|         features|labelIndex|
+---------------+-----------------+----------+
|Iris-versicolor|[7.0,3.2,4.7,1.4]|       1.0|
|Iris-versicolor|[6.4,3.2,4.5,1.5]|       1.0|
|Iris-versicolor|[6.9,3.1,4.9,1.5]|       1.0|
|Iris-versicolor|[5.5,2.3,4.0,1.3]|       1.0|
|Iris-versicolor|[6.5,2.8,4.6,1.5]|       1.0|
|Iris-versicolor|[5.7,2.8,4.5,1.3]|       1.0|
|Iris-versicolor|[6.3,3.3,4.7,1.6]|       1.0|
|Iris-versicolor|[4.9,2.4,3.3,1.0]|       1.0|
|Iris-versicolor|[6.6,2.9,4.6,1.3]|       1.0|
|Iris-versicolor|[5.2,2.7,3.9,1.4]|       1.0|
|Iris-versicolor|[5.0,2.0,3.5,1.0]|       1.0|
|Iris-versicolor|[5.9,3.0,4.2,1.5]|       1.0|
|Iris-versicolor|[6.0,2.2,4.0,1.0]|       1.0|
|Iris-versicolor|[6.1,2.9,4.7,1.4]|       1.0|
|Iris-versicolor|[5.6,2.9,3.6,1.3]|       1.0|
|Iris-versicolor|[6.7,3.1,4.4,1.4]|       1.0|
|Iris-versicolor|[5.6,3.0,4.5,1.5]|       1.0|
|Iris-versicolor|[5.8,2.7,4.1,1.0]|       1.0|
|Iris-versico

In [18]:
df.filter((df['label']!='Iris-setosa') & (df['label']!='Iris-versicolor')).show()

+--------------+-----------------+----------+
|         label|         features|labelIndex|
+--------------+-----------------+----------+
|Iris-virginica|[6.3,3.3,6.0,2.5]|       2.0|
|Iris-virginica|[5.8,2.7,5.1,1.9]|       2.0|
|Iris-virginica|[7.1,3.0,5.9,2.1]|       2.0|
|Iris-virginica|[6.3,2.9,5.6,1.8]|       2.0|
|Iris-virginica|[6.5,3.0,5.8,2.2]|       2.0|
|Iris-virginica|[7.6,3.0,6.6,2.1]|       2.0|
|Iris-virginica|[4.9,2.5,4.5,1.7]|       2.0|
|Iris-virginica|[7.3,2.9,6.3,1.8]|       2.0|
|Iris-virginica|[6.7,2.5,5.8,1.8]|       2.0|
|Iris-virginica|[7.2,3.6,6.1,2.5]|       2.0|
|Iris-virginica|[6.5,3.2,5.1,2.0]|       2.0|
|Iris-virginica|[6.4,2.7,5.3,1.9]|       2.0|
|Iris-virginica|[6.8,3.0,5.5,2.1]|       2.0|
|Iris-virginica|[5.7,2.5,5.0,2.0]|       2.0|
|Iris-virginica|[5.8,2.8,5.1,2.4]|       2.0|
|Iris-virginica|[6.4,3.2,5.3,2.3]|       2.0|
|Iris-virginica|[6.5,3.0,5.5,1.8]|       2.0|
|Iris-virginica|[7.7,3.8,6.7,2.2]|       2.0|
|Iris-virginica|[7.7,2.6,6.9,2.3]|

In [19]:
(trainingData, testData) = df.randomSplit([0.7, 0.3])

# Decision tree classifier

In [20]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [21]:
#訓練決策樹模型
dt=DecisionTreeClassifier(labelCol='labelIndex', featuresCol="features")
model=dt.fit(trainingData)

In [22]:
#預測
predictions = model.transform(testData)

In [23]:
predictions.select("prediction", "labelIndex").show(5)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



In [24]:
#模型成效
evaluator=MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")

In [25]:
accuracy=evaluator.evaluate(predictions)
print("Test set accuracy={}".format(str(accuracy)))

Test set accuracy=0.95


In [26]:
print("Test error=%g" % (1.0-accuracy))

Test error=0.05


In [27]:
print(model)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_b6c982345361) of depth 5 with 11 nodes


# Naive Bayes classifier

In [28]:
#分割資料
splits=df.randomSplit([0.6, 0.4], 1234)
train=splits[0]
test=splits[1]

In [29]:
from pyspark.ml.classification import NaiveBayes

In [30]:
#建立貝氏分類模型
nb=NaiveBayes(labelCol="labelIndex", featuresCol="features", smoothing=1.0, modelType="multinomial")

In [31]:
#訓練模型
model=nb.fit(train)

In [32]:
#預測
predictions=model.transform(test)
predictions.select("label", "labelIndex", "probability", "prediction").show()

+---------------+----------+--------------------+----------+
|          label|labelIndex|         probability|prediction|
+---------------+----------+--------------------+----------+
|    Iris-setosa|       0.0|[0.72723788653438...|       0.0|
|    Iris-setosa|       0.0|[0.64170595827692...|       0.0|
|    Iris-setosa|       0.0|[0.67184222484015...|       0.0|
|    Iris-setosa|       0.0|[0.68647236934182...|       0.0|
|    Iris-setosa|       0.0|[0.79151826954673...|       0.0|
|    Iris-setosa|       0.0|[0.66189579367600...|       0.0|
|    Iris-setosa|       0.0|[0.65307352257988...|       0.0|
|    Iris-setosa|       0.0|[0.73045962362363...|       0.0|
|    Iris-setosa|       0.0|[0.59100133493054...|       0.0|
|    Iris-setosa|       0.0|[0.75334864217418...|       0.0|
|    Iris-setosa|       0.0|[0.71961717211598...|       0.0|
|    Iris-setosa|       0.0|[0.70085626812960...|       0.0|
|    Iris-setosa|       0.0|[0.75135089290995...|       0.0|
|    Iris-setosa|       

In [33]:
#模型成效
evaluator=MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")

In [34]:
accuracy=evaluator.evaluate(predictions)
print("Test set accuracy={}".format(str(accuracy)))

Test set accuracy=0.8235294117647058


# Random forest classifier

In [35]:
from pyspark.ml.classification import RandomForestClassifier

In [36]:
#建立隨機森林模型
#10顆CART弱分類器
rf=RandomForestClassifier(labelCol="labelIndex", featuresCol="features", numTrees=10)
model=rf.fit(trainingData)

In [37]:
#預測
predictions.select("prediction", "labelIndex").show(5)

+----------+----------+
|prediction|labelIndex|
+----------+----------+
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
+----------+----------+
only showing top 5 rows



In [38]:
#模型成效
evaluator=MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")

In [41]:
accuracy=evaluator.evaluate(predictions)
print("Test set accuracy={}".format(str(accuracy)))

Test set accuracy=0.8235294117647058


In [42]:
print("Test Error = %g" % (1.0-accuracy))

Test Error = 0.176471


In [43]:
print(model)

RandomForestClassificationModel (uid=RandomForestClassifier_fe0547ff9ec1) with 10 trees


In [44]:
spark.stop()