In [3]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType
from pyspark.ml.feature import VectorAssembler,StringIndexer,PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder

# Obtain the notebook's Spark session (created on first use, reused afterwards).
spark = (
    SparkSession.builder
    .appName("SparkML TrainValidation")
    .getOrCreate()
)

# Load the iris CSV: the first line is read as a header and column types are inferred.
irisReader = spark.read.option("header", "true").option("inferSchema", "True")
irisDF = irisReader.csv("datasets/iris-dataset.txt")
irisDF.show(5)

# Index the "class" column into a numeric "label" column.
strIndexer = StringIndexer(inputCol='class', outputCol='label')
labelIndexerModel = strIndexer.fit(irisDF)
irisDF = labelIndexerModel.transform(irisDF)

# Row count per class. Nothing is displayed here; call classesDF.show() to inspect it.
classesDF = irisDF.select("class").groupBy("class").count()

# Assemble the first four columns into a single vector column, then keep only
# that vector and the label for the modelling steps.
# Debug aids: print(featureCols), irisDF.show()
featureCols = irisDF.columns[:4]
vec = VectorAssembler(inputCols=featureCols, outputCol='featuresold')
irisDF = vec.transform(irisDF).select('featuresold', 'label')

# --- Dimensionality reduction ------------------------------------------------
# Project the assembled "featuresold" vector onto 3 principal components and
# write them to "features".
pca = PCA(inputCol="featuresold", outputCol="features", k=3)

# Start from just the two input columns so that re-running this cell does not
# fail because irisDF already carries a "features" column from a previous run.
pcaInputDF = irisDF.select('featuresold', 'label')
# NOTE(review): the PCA is still fitted on all rows (train + test). It never sees
# the labels, but for a strictly leak-free pipeline fit it on the training split only.
irisDF = pca.fit(pcaInputDF).transform(pcaInputDF)
#irisDF.show(truncate=False)

# --- Train / test split --------------------------------------------------------
# 75% / 25% split with a fixed seed so the split is repeatable.
trainDF, testDF = irisDF.randomSplit([0.75, 0.25], seed=1112)

# --- Model, metric and search grid ---------------------------------------------
rfClassifier = RandomForestClassifier()
eva = MulticlassClassificationEvaluator(metricName='accuracy')

# 3 x 3 x 2 = 18 parameter combinations.
params = (
    ParamGridBuilder()
    .addGrid(rfClassifier.numTrees, [6, 7, 8])
    .addGrid(rfClassifier.maxDepth, [4, 6, 8])
    .addGrid(rfClassifier.impurity, ['gini', 'entropy'])
    .build()
)

# 5-fold cross-validation over the grid; collectSubModels=True keeps every
# per-fold model so they can be inspected through model.subModels.
validator = CrossValidator(
    estimator=rfClassifier,
    estimatorParamMaps=params,
    evaluator=eva,
    numFolds=5,
    parallelism=4,
    seed=123,
    collectSubModels=True,
)

# --- Hyper-parameter tuning ----------------------------------------------------
# Fit on the training split ONLY. Fitting on irisDF (which contains the testDF
# rows) would let the model see the held-out data, making the "Test Accuracy"
# printed below an over-optimistic, leaked estimate.
model = validator.fit(trainDF)

print(model.subModels)

# getNumTrees is read without parentheses: PySpark exposes it as a property.
print("Num Trees : ", model.bestModel.getNumTrees)
# NOTE(review): these two reach into the private _java_obj handle. Where the Python
# model exposes getMaxDepth()/getImpurity() directly, prefer the public getters -
# confirm against the Spark version this notebook targets.
print("Max Depth : ", model.bestModel._java_obj.getMaxDepth())
print("Impurtiy : ", model.bestModel._java_obj.getImpurity())

# CrossValidatorModel.transform applies the best model found during tuning.
resultDF = model.transform(testDF)

accuracy = eva.evaluate(resultDF)
print("Test Accuracy : ", accuracy)

[[RandomForestClassificationModel: uid=RandomForestClassifier_2abdeb1fe07b, numTrees=6, numClasses=3, numFeatures=3, RandomForestClassificationModel: uid=RandomForestClassifier_2abdeb1fe07b, numTrees=6, numClasses=3, numFeatures=3, RandomForestClassificationModel: uid=RandomForestClassifier_2abdeb1fe07b, numTrees=6, numClasses=3, numFeatures=3, RandomForestClassificationModel: uid=RandomForestClassifier_2abdeb1fe07b, numTrees=6, numClasses=3, numFeatures=3, RandomForestClassificationModel: uid=RandomForestClassifier_2abdeb1fe07b, numTrees=6, numClasses=3, numFeatures=3, RandomForestClassificationModel: uid=RandomForestClassifier_2abdeb1fe07b, numTrees=6, numClasses=3, numFeatures=3, RandomForestClassificationModel: uid=RandomForestClassifier_2abdeb1fe07b, numTrees=7, numClasses=3, numFeatures=3, RandomForestClassificationModel: uid=RandomForestClassifier_2abdeb1fe07b, numTrees=7, numClasses=3, numFeatures=3, RandomForestClassificationModel: uid=RandomForestClassifier_2abdeb1fe07b, numT

In [1]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType
from pyspark.ml.feature import VectorAssembler,StringIndexer,PCA
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder

# Obtain the notebook's Spark session (created on first use, reused afterwards).
spark = (
    SparkSession.builder
    .appName("SparkML TrainValidation")
    .getOrCreate()
)

# Load the iris CSV: the first line is read as a header and column types are inferred.
irisReader = spark.read.option("header", "true").option("inferSchema", "True")
irisDF = irisReader.csv("datasets/iris-dataset.txt")
irisDF.show(5)

# Index the "class" column into a numeric "label" column.
strIndexer = StringIndexer(inputCol='class', outputCol='label')
labelIndexerModel = strIndexer.fit(irisDF)
irisDF = labelIndexerModel.transform(irisDF)

# Row count per class. Nothing is displayed here; call classesDF.show() to inspect it.
classesDF = irisDF.select("class").groupBy("class").count()

# Assemble the first four columns into a single vector column, then keep only
# that vector and the label for the modelling steps.
# Debug aids: print(featureCols), irisDF.show()
featureCols = irisDF.columns[:4]
vec = VectorAssembler(inputCols=featureCols, outputCol='featuresold')
irisDF = vec.transform(irisDF).select('featuresold', 'label')

+------------+-----------+------------+-----------+-----------+
|sepal-length|sepal-width|petal-length|petal-width|      class|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [2]:
# --- Dimensionality reduction ------------------------------------------------
# Project the assembled "featuresold" vector onto 3 principal components and
# write them to "features".
pca = PCA(inputCol="featuresold", outputCol="features", k=3)

# Start from just the two input columns so that re-running this cell does not
# fail because irisDF already carries a "features" column from a previous run.
pcaInputDF = irisDF.select('featuresold', 'label')
# NOTE(review): the PCA is still fitted on all rows (train + test). It never sees
# the labels, but for a strictly leak-free pipeline fit it on the training split only.
irisDF = pca.fit(pcaInputDF).transform(pcaInputDF)
#irisDF.show(truncate=False)

# --- Train / test split --------------------------------------------------------
# 75% / 25% split with a fixed seed so the split is repeatable.
trainDF, testDF = irisDF.randomSplit([0.75, 0.25], seed=1112)

# --- Model, metric and search grid ---------------------------------------------
rfClassifier = RandomForestClassifier()
eva = MulticlassClassificationEvaluator(metricName='accuracy')

# 3 x 3 x 2 = 18 parameter combinations.
params = (
    ParamGridBuilder()
    .addGrid(rfClassifier.numTrees, [6, 7, 8])
    .addGrid(rfClassifier.maxDepth, [4, 6, 8])
    .addGrid(rfClassifier.impurity, ['gini', 'entropy'])
    .build()
)

# 5-fold cross-validation over the grid; collectSubModels=True keeps every
# per-fold model so they can be inspected through model.subModels.
validator = CrossValidator(
    estimator=rfClassifier,
    estimatorParamMaps=params,
    evaluator=eva,
    numFolds=5,
    parallelism=4,
    seed=123,
    collectSubModels=True,
)

# --- Hyper-parameter tuning ----------------------------------------------------
# Fit on the training split ONLY. Fitting on irisDF (which contains the testDF
# rows) would let the model see the held-out data, making the "Test Accuracy"
# printed below an over-optimistic, leaked estimate.
model = validator.fit(trainDF)

print(model.subModels)

# getNumTrees is read without parentheses: PySpark exposes it as a property.
print("Num Trees : ", model.bestModel.getNumTrees)
# NOTE(review): these two reach into the private _java_obj handle. Where the Python
# model exposes getMaxDepth()/getImpurity() directly, prefer the public getters -
# confirm against the Spark version this notebook targets.
print("Max Depth : ", model.bestModel._java_obj.getMaxDepth())
print("Impurtiy : ", model.bestModel._java_obj.getImpurity())

# CrossValidatorModel.transform applies the best model found during tuning.
resultDF = model.transform(testDF)

accuracy = eva.evaluate(resultDF)
print("Test Accuracy : ", accuracy)

[[RandomForestClassificationModel (uid=RandomForestClassifier_8e21bc45d01f) with 6 trees, RandomForestClassificationModel (uid=RandomForestClassifier_8e21bc45d01f) with 6 trees, RandomForestClassificationModel (uid=RandomForestClassifier_8e21bc45d01f) with 6 trees, RandomForestClassificationModel (uid=RandomForestClassifier_8e21bc45d01f) with 6 trees, RandomForestClassificationModel (uid=RandomForestClassifier_8e21bc45d01f) with 6 trees, RandomForestClassificationModel (uid=RandomForestClassifier_8e21bc45d01f) with 6 trees, RandomForestClassificationModel (uid=RandomForestClassifier_8e21bc45d01f) with 7 trees, RandomForestClassificationModel (uid=RandomForestClassifier_8e21bc45d01f) with 7 trees, RandomForestClassificationModel (uid=RandomForestClassifier_8e21bc45d01f) with 7 trees, RandomForestClassificationModel (uid=RandomForestClassifier_8e21bc45d01f) with 7 trees, RandomForestClassificationModel (uid=RandomForestClassifier_8e21bc45d01f) with 7 trees, RandomForestClassificationMode