In [3]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType
from pyspark.ml.feature import VectorAssembler,StringIndexer,PCA
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit

# Create (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName("SparkML TrainValidation").getOrCreate()

# Read the iris CSV: the first line is treated as a header and column types are inferred.
irisDF = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "True")
    .csv("datasets/iris-dataset.txt")
)

# Index the string 'class' column into a numeric 'label' column.
strIndexer = StringIndexer(inputCol='class', outputCol='label')
irisDF = strIndexer.fit(irisDF).transform(irisDF)

# Display the number of rows per class.
classesDF = irisDF.select("class").groupBy("class").count()
classesDF.show()

# Assemble the first four columns into a single vector column, then keep only
# that vector and the label.
vec = VectorAssembler(inputCols=irisDF.columns[:4], outputCol='featuresold')
irisDF = vec.transform(irisDF).select('featuresold', 'label')
irisDF.show()

+---------------+-----+
|          class|count|
+---------------+-----+
| Iris-virginica|   50|
|    Iris-setosa|   50|
|Iris-versicolor|   50|
+---------------+-----+

+-----------------+-----+
|      featuresold|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
|[5.4,3.9,1.7,0.4]|  0.0|
|[4.6,3.4,1.4,0.3]|  0.0|
|[5.0,3.4,1.5,0.2]|  0.0|
|[4.4,2.9,1.4,0.2]|  0.0|
|[4.9,3.1,1.5,0.1]|  0.0|
|[5.4,3.7,1.5,0.2]|  0.0|
|[4.8,3.4,1.6,0.2]|  0.0|
|[4.8,3.0,1.4,0.1]|  0.0|
|[4.3,3.0,1.1,0.1]|  0.0|
|[5.8,4.0,1.2,0.2]|  0.0|
|[5.7,4.4,1.5,0.4]|  0.0|
|[5.4,3.9,1.3,0.4]|  0.0|
|[5.1,3.5,1.4,0.3]|  0.0|
|[5.7,3.8,1.7,0.3]|  0.0|
|[5.1,3.8,1.5,0.3]|  0.0|
+-----------------+-----+
only showing top 20 rows



In [4]:

# Dimensionality reduction: project the assembled 'featuresold' vector onto
# 3 principal components, written to the 'features' column.
pca = PCA(inputCol="featuresold", outputCol="features", k=3)

# Make the cell safe to re-run: irisDF is overwritten below with a frame that
# already contains 'features', and PCA refuses to write to an existing output
# column. DataFrame.drop is a no-op when the column is absent (first run).
irisDF = irisDF.drop("features")
irisDF = pca.fit(irisDF).transform(irisDF)
irisDF.show(truncate=False)

# 75% / 25% train/test split with a fixed seed.
trainDF, testDF = irisDF.randomSplit([0.75, 0.25], seed=1112)

# Multilayer perceptron: 3 inputs (the k=3 PCA components), one hidden layer of
# 4 units, 3 outputs (one per class). It reads the default 'features' and
# 'label' columns. The seed is pinned so that training does not depend on the
# library's default seed. The variable name is kept as-is in case later cells
# refer to it.
dtClassifier = MultilayerPerceptronClassifier(layers=[3, 4, 3], seed=1112)
model = dtClassifier.fit(trainDF)

# Prediction on the held-out rows.
resultDF = model.transform(testDF)
resultDF.show(100)

# Accuracy computed from the evaluator's default label/prediction columns.
eva = MulticlassClassificationEvaluator(metricName='accuracy')
accuracy = eva.evaluate(resultDF)
print("Test Accuracy : ",accuracy)

+-----------------+-----+-----------------------------------------------------------+
|featuresold      |label|features                                                   |
+-----------------+-----+-----------------------------------------------------------+
|[5.1,3.5,1.4,0.2]|0.0  |[-2.827135972679027,-5.641331045573321,0.6642769315107171] |
|[4.9,3.0,1.4,0.2]|0.0  |[-2.7959524821488437,-5.145166883252896,0.8462865195142029]|
|[4.7,3.2,1.3,0.2]|0.0  |[-2.6215235581650584,-5.177378121203909,0.6180558535097703]|
|[4.6,3.1,1.5,0.2]|0.0  |[-2.7649059004742402,-5.003599415056946,0.605093119223434] |
|[5.0,3.6,1.4,0.2]|0.0  |[-2.7827501159516603,-5.648648294377395,0.5465353947341569]|
|[5.4,3.9,1.7,0.4]|0.0  |[-3.231445736773378,-6.062506444034077,0.46843947549237885]|
|[4.6,3.4,1.4,0.3]|0.0  |[-2.690452415602345,-5.232619219784267,0.37851400931804624]|
|[5.0,3.4,1.5,0.2]|0.0  |[-2.8848611044591563,-5.485129079769225,0.6585666047730699]|
|[4.4,2.9,1.4,0.2]|0.0  |[-2.6233845324473406,-4.74392