In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [21]:
spark = SparkSession.builder.appName("sparkdev-ML-tutorial").master("local[*]").getOrCreate()

In [22]:
df = spark.read.csv(path='iris_dataset.csv',header=True)
df.printSchema()
df.show(10, truncate=False)

root
 |-- sepal_length(cm): string (nullable = true)
 |-- sepal_width(cm): string (nullable = true)
 |-- petal_length(cm): string (nullable = true)
 |-- petal_width(cm): string (nullable = true)
 |-- target: string (nullable = true)

+----------------+---------------+----------------+---------------+-----------+
|sepal_length(cm)|sepal_width(cm)|petal_length(cm)|petal_width(cm)|target     |
+----------------+---------------+----------------+---------------+-----------+
|5.1             |3.5            |1.4             |0.2            |Iris-setosa|
|4.9             |3.0            |1.4             |0.2            |Iris-setosa|
|4.7             |3.2            |1.3             |0.2            |Iris-setosa|
|4.6             |3.1            |1.5             |0.2            |Iris-setosa|
|5.0             |3.6            |1.4             |0.2            |Iris-setosa|
|5.4             |3.9            |1.7             |0.4            |Iris-setosa|
|4.6             |3.4            |1.4         

In [23]:
input_columns = ['sepal_length(cm)','sepal_width(cm)','petal_length(cm)','petal_width(cm)']

# Assuming your dataframe is named "df" and the columns you want to convert are "col1", "col2", "col3"
df = df.select([col(c).cast("double").alias(c) if c in input_columns else col(c) for c in df.columns])
df.printSchema()
df.show(5)

root
 |-- sepal_length(cm): double (nullable = true)
 |-- sepal_width(cm): double (nullable = true)
 |-- petal_length(cm): double (nullable = true)
 |-- petal_width(cm): double (nullable = true)
 |-- target: string (nullable = true)

+----------------+---------------+----------------+---------------+-----------+
|sepal_length(cm)|sepal_width(cm)|petal_length(cm)|petal_width(cm)|     target|
+----------------+---------------+----------------+---------------+-----------+
|             5.1|            3.5|             1.4|            0.2|Iris-setosa|
|             4.9|            3.0|             1.4|            0.2|Iris-setosa|
|             4.7|            3.2|             1.3|            0.2|Iris-setosa|
|             4.6|            3.1|             1.5|            0.2|Iris-setosa|
|             5.0|            3.6|             1.4|            0.2|Iris-setosa|
+----------------+---------------+----------------+---------------+-----------+
only showing top 5 rows



In [24]:
from pyspark.ml.feature import VectorAssembler

In [25]:
vector_assembler = VectorAssembler(inputCols=input_columns, outputCol="features", handleInvalid="skip")
transformed_df = vector_assembler.transform(df)
transformed_df.show(5, truncate=False)

+----------------+---------------+----------------+---------------+-----------+-----------------+
|sepal_length(cm)|sepal_width(cm)|petal_length(cm)|petal_width(cm)|target     |features         |
+----------------+---------------+----------------+---------------+-----------+-----------------+
|5.1             |3.5            |1.4             |0.2            |Iris-setosa|[5.1,3.5,1.4,0.2]|
|4.9             |3.0            |1.4             |0.2            |Iris-setosa|[4.9,3.0,1.4,0.2]|
|4.7             |3.2            |1.3             |0.2            |Iris-setosa|[4.7,3.2,1.3,0.2]|
|4.6             |3.1            |1.5             |0.2            |Iris-setosa|[4.6,3.1,1.5,0.2]|
|5.0             |3.6            |1.4             |0.2            |Iris-setosa|[5.0,3.6,1.4,0.2]|
+----------------+---------------+----------------+---------------+-----------+-----------------+
only showing top 5 rows



In [26]:
required_df = transformed_df.drop('sepal_length(cm)','sepal_width(cm)','petal_length(cm)','petal_width(cm)')
required_df.show(5, truncate=False)

+-----------+-----------------+
|target     |features         |
+-----------+-----------------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|
|Iris-setosa|[4.9,3.0,1.4,0.2]|
|Iris-setosa|[4.7,3.2,1.3,0.2]|
|Iris-setosa|[4.6,3.1,1.5,0.2]|
|Iris-setosa|[5.0,3.6,1.4,0.2]|
+-----------+-----------------+
only showing top 5 rows



In [27]:
required_df.select(col("target")).distinct().show(truncate=False)

+---------------+
|target         |
+---------------+
|Iris-virginica |
|Iris-setosa    |
|Iris-versicolor|
+---------------+



In [28]:
from pyspark.ml.feature import StringIndexer

In [29]:
string_indexer = StringIndexer(inputCol='target', outputCol='targetIndexed')
project_df = string_indexer.fit(required_df).transform(required_df)
project_df.show(5, truncate=False)

+-----------+-----------------+-------------+
|target     |features         |targetIndexed|
+-----------+-----------------+-------------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|0.0          |
|Iris-setosa|[4.9,3.0,1.4,0.2]|0.0          |
|Iris-setosa|[4.7,3.2,1.3,0.2]|0.0          |
|Iris-setosa|[4.6,3.1,1.5,0.2]|0.0          |
|Iris-setosa|[5.0,3.6,1.4,0.2]|0.0          |
+-----------+-----------------+-------------+
only showing top 5 rows



In [30]:
(train_set, test_set) = project_df.randomSplit([0.8,0.2])

In [31]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [32]:
dt = DecisionTreeClassifier(featuresCol='features', labelCol='targetIndexed')
model = dt.fit(train_set)

In [33]:
predictions = model.transform(test_set)
predictions.show(10, truncate=False)

+-----------+-----------------+-------------+--------------+-------------+----------+
|target     |features         |targetIndexed|rawPrediction |probability  |prediction|
+-----------+-----------------+-------------+--------------+-------------+----------+
|Iris-setosa|[4.6,3.6,1.0,0.2]|0.0          |[37.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[4.9,3.0,1.4,0.2]|0.0          |[37.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[4.9,3.1,1.5,0.1]|0.0          |[37.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[5.0,3.0,1.6,0.2]|0.0          |[37.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[5.0,3.3,1.4,0.2]|0.0          |[37.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[5.0,3.5,1.3,0.3]|0.0          |[37.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[5.0,3.5,1.6,0.6]|0.0          |[37.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[5.1,3.4,1.5,0.2]|0.0          |[37.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[5.1,3.5,1.4,0.3]|0.0          |[37.0,0.0

In [34]:
evaluator = MulticlassClassificationEvaluator(labelCol='targetIndexed', predictionCol='prediction')
accuracy = evaluator.evaluate(predictions)
print('Accuracy=', accuracy)
print('Test Error=', 1.0-accuracy)

Accuracy= 0.821078431372549
Test Error= 0.178921568627451


In [35]:
rfc = RandomForestClassifier(featuresCol='features',labelCol='targetIndexed', numTrees=10)
model_rfc = rfc.fit(train_set)

In [36]:
predictions_rfc = model_rfc.transform(test_set)
predictions_rfc.show(5, truncate=False)

+-----------+-----------------+-------------+--------------+-------------+----------+
|target     |features         |targetIndexed|rawPrediction |probability  |prediction|
+-----------+-----------------+-------------+--------------+-------------+----------+
|Iris-setosa|[4.6,3.6,1.0,0.2]|0.0          |[10.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[4.9,3.0,1.4,0.2]|0.0          |[10.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[4.9,3.1,1.5,0.1]|0.0          |[10.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[5.0,3.0,1.6,0.2]|0.0          |[10.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
|Iris-setosa|[5.0,3.3,1.4,0.2]|0.0          |[10.0,0.0,0.0]|[1.0,0.0,0.0]|0.0       |
+-----------+-----------------+-------------+--------------+-------------+----------+
only showing top 5 rows



In [37]:
evaluator = MulticlassClassificationEvaluator(labelCol='targetIndexed', predictionCol='prediction')
accuracy = evaluator.evaluate(predictions_rfc)
print('Accuracy=', accuracy)
print('Test Error=', 1.0-accuracy)

Accuracy= 0.8487889273356402
Test Error= 0.15121107266435985
