In [56]:
#import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [57]:
# Create a spark session/application
spark = SparkSession.builder.appName('iris_dataset').getOrCreate()

In [58]:
# Load the dataset
df = spark.read.csv('../../data/iris.csv', inferSchema=True, header=True)

In [59]:
display(df.printSchema())
display(df.show(5))

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



None

+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width|  species|
+------------+-----------+------------+-----------+---------+
|         6.0|        3.0|         4.8|        1.8|virginica|
|         6.9|        3.1|         5.4|        2.1|virginica|
|         6.7|        3.1|         5.6|        2.4|virginica|
|         6.9|        3.1|         5.1|        2.3|virginica|
|         5.8|        2.7|         5.1|        1.9|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 5 rows



None

In [60]:
#encode the species column
indexer = StringIndexer(inputCol='species', outputCol='label')
df = indexer.fit(df).transform(df)

In [61]:
df.show(5)

+------------+-----------+------------+-----------+---------+-----+
|sepal_length|sepal_width|petal_length|petal_width|  species|label|
+------------+-----------+------------+-----------+---------+-----+
|         6.0|        3.0|         4.8|        1.8|virginica|  2.0|
|         6.9|        3.1|         5.4|        2.1|virginica|  2.0|
|         6.7|        3.1|         5.6|        2.4|virginica|  2.0|
|         6.9|        3.1|         5.1|        2.3|virginica|  2.0|
|         5.8|        2.7|         5.1|        1.9|virginica|  2.0|
+------------+-----------+------------+-----------+---------+-----+
only showing top 5 rows



In [62]:
# Create a vector assembler for the feature columns
assembler = VectorAssembler(inputCols=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], outputCol='features')
df = assembler.transform(df)

In [63]:
df.show(5)

+------------+-----------+------------+-----------+---------+-----+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|  species|label|         features|
+------------+-----------+------------+-----------+---------+-----+-----------------+
|         6.0|        3.0|         4.8|        1.8|virginica|  2.0|[6.0,3.0,4.8,1.8]|
|         6.9|        3.1|         5.4|        2.1|virginica|  2.0|[6.9,3.1,5.4,2.1]|
|         6.7|        3.1|         5.6|        2.4|virginica|  2.0|[6.7,3.1,5.6,2.4]|
|         6.9|        3.1|         5.1|        2.3|virginica|  2.0|[6.9,3.1,5.1,2.3]|
|         5.8|        2.7|         5.1|        1.9|virginica|  2.0|[5.8,2.7,5.1,1.9]|
+------------+-----------+------------+-----------+---------+-----+-----------------+
only showing top 5 rows



In [64]:
# Split the data into train and test sets
train, test = df.randomSplit([0.7, 0.3])

In [65]:
# Create a logistic regression model
lr = LogisticRegression(featuresCol='features', labelCol='label')
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label')
rf = RandomForestClassifier(featuresCol='features', labelCol='label')

In [66]:
# Fit all the models on the training data
lr_model = lr.fit(train)
dt_model = dt.fit(train)
rf_model = rf.fit(train)

In [67]:
# Make predictions
lr_predictions = lr_model.transform(test)
dt_predictions = dt_model.transform(test)
rf_predictions = rf_model.transform(test)

In [68]:
# Evaluate the models
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')

lr_accuracy = evaluator.evaluate(lr_predictions)
dt_accuracy = evaluator.evaluate(dt_predictions)
rf_accuracy = evaluator.evaluate(rf_predictions)

print('Logistic Regression Accuracy: ', lr_accuracy)
print('Decision Tree Accuracy: ', dt_accuracy)
print('Random Forest Accuracy: ', rf_accuracy)

Logistic Regression Accuracy:  0.918918918918919
Decision Tree Accuracy:  0.9459459459459459
Random Forest Accuracy:  0.8918918918918919


In [70]:
# Classification report
lr_metrics = MulticlassMetrics(lr_predictions.select('label', 'prediction').rdd)
dt_metrics = MulticlassMetrics(dt_predictions.select('label', 'prediction').rdd)
rf_metrics = MulticlassMetrics(rf_predictions.select('label', 'prediction').rdd)

In [71]:
# Create Precision, Recall and F1 score for each class
labels = [0.0, 1.0, 2.0]
report = {"Logistic Regression": {"precision": {}, "recall": {}, "f1-score": {}},
            "Decision Tree": {"precision": {}, "recall": {}, "f1-score": {}},
            "Random Forest": {"precision": {}, "recall": {}, "f1-score": {}}}

for label in labels:
    report["Logistic Regression"]["precision"][label] = lr_metrics.precision(label)
    report["Logistic Regression"]["recall"][label] = lr_metrics.recall(label)
    report["Logistic Regression"]["f1-score"][label] = lr_metrics.fMeasure(label)

    report["Decision Tree"]["precision"][label] = dt_metrics.precision(label)
    report["Decision Tree"]["recall"][label] = dt_metrics.recall(label)
    report["Decision Tree"]["f1-score"][label] = dt_metrics.fMeasure(label)

    report["Random Forest"]["precision"][label] = rf_metrics.precision(label)
    report["Random Forest"]["recall"][label] = rf_metrics.recall(label)
    report["Random Forest"]["f1-score"][label] = rf_metrics.fMeasure(label)

In [72]:
# Display the report
print(report)

{'Logistic Regression': {'precision': {0.0: 1.0, 1.0: 0.8181818181818182, 2.0: 0.9166666666666666}, 'recall': {0.0: 1.0, 1.0: 0.9, 2.0: 0.8461538461538461}, 'f1-score': {0.0: 1.0, 1.0: 0.8571428571428572, 2.0: 0.8799999999999999}}, 'Decision Tree': {'precision': {0.0: 1.0, 1.0: 0.8181818181818182, 2.0: 1.0}, 'recall': {0.0: 1.0, 1.0: 1.0, 2.0: 0.8571428571428571}, 'f1-score': {0.0: 1.0, 1.0: 0.9, 2.0: 0.923076923076923}}, 'Random Forest': {'precision': {0.0: 1.0, 1.0: 0.8181818181818182, 2.0: 0.8333333333333334}, 'recall': {0.0: 1.0, 1.0: 0.8181818181818182, 2.0: 0.8333333333333334}, 'f1-score': {0.0: 1.0, 1.0: 0.8181818181818182, 2.0: 0.8333333333333334}}}


In [73]:
# Display the confusion matrix
print("Logistic Regression Confusion Matrix\n", lr_metrics.confusionMatrix().toArray())
print("Decision Tree Confusion Matrix\n", dt_metrics.confusionMatrix().toArray())
print("Random Forest Confusion Matrix\n", rf_metrics.confusionMatrix().toArray())

Logistic Regression Confusion Matrix
 [[14.  0.  0.]
 [ 0.  9.  1.]
 [ 0.  2. 11.]]
Decision Tree Confusion Matrix
 [[14.  0.  0.]
 [ 0.  9.  0.]
 [ 0.  2. 12.]]
Random Forest Confusion Matrix
 [[14.  0.  0.]
 [ 0.  9.  2.]
 [ 0.  2. 10.]]
