<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/PySpark-Analytics-Hub/blob/main/Iris%20Dataset%20Analysis%20%26%20ML%20Model/Iris_Analysis_ML_Model_PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Dataset

In [6]:
!kaggle datasets download uciml/iris
!unzip iris.zip

Dataset URL: https://www.kaggle.com/datasets/uciml/iris
License(s): CC0-1.0
iris.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  iris.zip
  inflating: Iris.csv                
  inflating: database.sqlite         


# Import Libraries

In [38]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

# Initialize SparkSession

In [4]:
# Build the notebook's SparkSession under the application name "Iris"
# (getOrCreate hands back an existing session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("Iris")
    .getOrCreate()
)

# Import Dataset

In [57]:
# Read the extracted Iris CSV into a Spark DataFrame: comma-separated, first
# row is the header, and column types are inferred from the data.
df = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .option("inferSchema", True)
    .csv(r"/content/Iris.csv")
)

# Check Dataset

In [12]:
# Print column names and inferred types to confirm the CSV was parsed as expected
# (four double-typed measurements, an integer Id and a string Species label).
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



In [13]:
# Preview the first rows of the raw dataset.
df.show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

In [42]:
# Number of distinct class labels in the Species column
df.agg(f.count_distinct('Species')).show()

+-----------------------+
|count(DISTINCT Species)|
+-----------------------+
|                      3|
+-----------------------+



In [43]:
# List the distinct class labels found in the Species column
unique_species = df.select('Species').distinct()
unique_species.show()

+---------------+
|        Species|
+---------------+
| Iris-virginica|
|    Iris-setosa|
|Iris-versicolor|
+---------------+



In [52]:
# Draw a seeded random sample of species labels, targeting roughly 15 rows.
# The fraction is only a target proportion, so the number of rows returned is
# approximate (this run returned 19 rows).
species = df.select('Species')
total_rows = species.count()
species.sample(withReplacement=False, fraction=15 / total_rows, seed=42).show()

+---------------+
|        Species|
+---------------+
|    Iris-setosa|
|    Iris-setosa|
|    Iris-setosa|
|Iris-versicolor|
|Iris-versicolor|
|Iris-versicolor|
|Iris-versicolor|
|Iris-versicolor|
|Iris-versicolor|
|Iris-versicolor|
|Iris-versicolor|
| Iris-virginica|
| Iris-virginica|
| Iris-virginica|
| Iris-virginica|
| Iris-virginica|
| Iris-virginica|
| Iris-virginica|
| Iris-virginica|
+---------------+



In [36]:
# Descriptive statistics (count, mean, stddev, min, quartiles, ...) for every column.
df.summary().show()

+-------+------------------+------------------+-------------------+------------------+------------------+--------------+
|summary|                Id|     SepalLengthCm|       SepalWidthCm|     PetalLengthCm|      PetalWidthCm|       Species|
+-------+------------------+------------------+-------------------+------------------+------------------+--------------+
|  count|               150|               150|                150|               150|               150|           150|
|   mean|              75.5| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|          NULL|
| stddev|43.445367992456916|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|          NULL|
|    min|                 1|               4.3|                2.0|               1.0|               0.1|   Iris-setosa|
|    25%|                38|               5.1|                2.8|               1.6|               0.3|          NULL|
|    50%|                75|    

In [39]:
def check_nans(df):
    """Display the number of missing values in each column of ``df``.

    A value counts as missing when it is SQL NULL. For ``float``/``double``
    columns a floating-point NaN also counts, because NaN is a regular
    (non-null) value in Spark and ``isNull()`` alone would not detect it.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        None. The one-row table of per-column counts is printed via ``show()``.
    """
    # isnan() is only meaningful for floating-point columns, so restrict it to them.
    float_cols = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

    def _is_missing(name):
        column = f.col(name)
        if name in float_cols:
            return column.isNull() | f.isnan(column)
        return column.isNull()

    # when() yields a non-null marker only for missing rows; count() ignores the
    # NULLs produced for all other rows, giving the number of missing values.
    return df.select(
        [f.count(f.when(_is_missing(c), c)).alias(c) for c in df.columns]
    ).show()

In [40]:
# Count missing values per column of the raw dataset.
check_nans(df)

+---+-------------+------------+-------------+------------+-------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|Species|
+---+-------------+------------+-------------+------------+-------+
|  0|            0|           0|            0|           0|      0|
+---+-------------+------------+-------------+------------+-------+



# Preprocessing

### Handle Missing Value
__This step assumes the DataFrame contains missing values. The check above found none, so the imputation below is shown for demonstration only.__

In [58]:
# Filling Missing Values
# Impute each numeric feature with its OWN column mean. (Previously
# PetalWidthCm was filled with the mean of SepalLengthCm.)
# All four means are computed in a single aggregation instead of one Spark
# job per column.
numeric_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
col_means = df.select([f.mean(c).alias(c) for c in numeric_cols]).first().asDict()

# A mean is None when a column has no non-null values; fillna() does not accept
# None as a replacement, so such columns are skipped and left unchanged.
df = df.fillna({c: m for c, m in col_means.items() if m is not None})

df.show()

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-setosa|
| 11|          5.4|         3.7|          1.5|         0.2|Iris-

### Categorical Encoding

In [59]:
from pyspark.ml.feature import StringIndexer

# Encode the string class label as a numeric index column, which the
# classifier later uses as its label column.
indexer = StringIndexer(inputCol='Species', outputCol='Species_Indexed')

# Learn the label -> index mapping from the data, then append the new column.
# NOTE(review): df is reassigned here, so re-running this cell presumably fails
# because 'Species_Indexed' already exists — confirm.
indexer_model = indexer.fit(df)
df = indexer_model.transform(df)
df.show()

+---+-------------+------------+-------------+------------+-----------+---------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|Species_Indexed|
+---+-------------+------------+-------------+------------+-----------+---------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|            0.0|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|            0.0|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|            0.0|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|            0.0|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|            0.0|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|            0.0|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|            0.0|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|            0.0|
|  9|          4.4|         2.9|

In [60]:
# Check the distinct values of the encoded label column (one index per species)
df.select('Species_Indexed').distinct().show()

+---------------+
|Species_Indexed|
+---------------+
|            0.0|
|            1.0|
|            2.0|
+---------------+



### Normalize Data

In [62]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.linalg import Vectors

# The four measurement columns that make up the model's feature vector.
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# Pack the measurement columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

df = assembler.transform(df)
df.show()

+---+-------------+------------+-------------+------------+-----------+---------------+-----------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|Species_Indexed|         features|
+---+-------------+------------+-------------+------------+-----------+---------------+-----------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|            0.0|[5.1,3.5,1.4,0.2]|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|            0.0|[4.9,3.0,1.4,0.2]|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|            0.0|[4.7,3.2,1.3,0.2]|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|            0.0|[4.6,3.1,1.5,0.2]|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|            0.0|[5.0,3.6,1.4,0.2]|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|            0.0|[5.4,3.9,1.7,0.4]|
|  7|          4.6|         3.4|          1.4|

In [63]:
# StandardScaler
# Scale the assembled 'features' vector into 'scaled_features'.
# No withMean/withStd arguments are given, so the library defaults apply; the
# displayed output (e.g. 5.1 -> 6.1589, i.e. 5.1 / stddev 0.828) shows values
# divided by the column standard deviation without mean-centering.
# NOTE(review): the scaler is fit on the full DataFrame before the train/test
# split, so test rows contribute to the scaling statistics.
scaler = (
    StandardScaler()
    .setInputCol('features')
    .setOutputCol('scaled_features')
)

scaler_model = scaler.fit(df)
df = scaler_model.transform(df)
df.show()

+---+-------------+------------+-------------+------------+-----------+---------------+-----------------+--------------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|Species_Indexed|         features|     scaled_features|
+---+-------------+------------+-------------+------------+-----------+---------------+-----------------+--------------------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|            0.0|[5.1,3.5,1.4,0.2]|[6.15892840883878...|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|            0.0|[4.9,3.0,1.4,0.2]|[5.9174018045706,...|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|            0.0|[4.7,3.2,1.3,0.2]|[5.67587520030241...|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|            0.0|[4.6,3.1,1.5,0.2]|[5.55511189816831...|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|            0.0|[5.0,3.6,1.4,0.2]|[6.038

In [66]:
# Save preprocessed dataframe
# Converts the Spark DataFrame to a pandas DataFrame, then writes it to a local
# CSV file without the pandas index column.
# NOTE(review): the vector columns ('features', 'scaled_features') are presumably
# written as their string representation — confirm before reloading this file.
df.toPandas().to_csv('preprocessed_iris.csv', index=False)

# PySpark Modelling

In [74]:
# Seeded random 80/20 train/test split. The proportions are approximate:
# the confusion matrix at the end of the notebook totals 24 test rows out of 150.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=42)

In [71]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Column names shared by the classifier and the evaluator.
label_col = 'Species_Indexed'
prediction_col = 'prediction'

# Decision tree trained on the scaled feature vector to predict the encoded species.
dt_model = DecisionTreeClassifier(
    featuresCol='scaled_features',
    labelCol=label_col,
    predictionCol=prediction_col,
)

# Accuracy evaluator; used for model selection in cross-validation and for
# scoring the test-set predictions.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol=label_col,
    predictionCol=prediction_col,
)

In [75]:
# Define the parameter grid for hyperparameter tuning
# (3 maxDepth values x 3 maxBins values = 9 candidate models).
paramGrid = (
    ParamGridBuilder()
    .addGrid(dt_model.maxDepth, [5, 10, 15])
    .addGrid(dt_model.maxBins, [32, 64, 128])
    .build()
)

# Set up cross-validation
# seed fixes the fold assignment so the selected model and reported metrics are
# reproducible across runs, matching the seeded train/test split.
crossval = CrossValidator(
    estimator=dt_model,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    parallelism=2,
    seed=42,
)

# Train the model with cross-validation
cvModel = crossval.fit(train)

In [80]:
# Score the held-out test rows with the cross-validated model
predictions = cvModel.transform(test)

# Accuracy comes from the evaluator that was also used for model selection
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy}")

# Additional metrics: one evaluator per metric, all reading the same columns
shared_cols = dict(labelCol="Species_Indexed", predictionCol="prediction")
precision_evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision", **shared_cols)
recall_evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall", **shared_cols)
f1_evaluator = MulticlassClassificationEvaluator(metricName="f1", **shared_cols)

precision, recall, f1 = (
    metric_evaluator.evaluate(predictions)
    for metric_evaluator in (precision_evaluator, recall_evaluator, f1_evaluator)
)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Test Accuracy: 0.9166666666666666
Precision: 0.9166666666666666
Recall: 0.9166666666666666
F1 Score: 0.9166666666666667


In [82]:
# Confusion Matrix
from pyspark.mllib.evaluation import MulticlassMetrics

# MulticlassMetrics is constructed from an RDD, so take the RDD of
# (prediction, label) pairs from the predictions DataFrame.
predictionAndLabels = predictions.select(["prediction", "Species_Indexed"]).rdd

# Build the metrics object and extract the confusion matrix from it
metrics = MulticlassMetrics(predictionAndLabels)
conf_matrix = metrics.confusionMatrix()

# Header line followed by the matrix on the next line
print("Confusion Matrix:", conf_matrix, sep="\n")




Confusion Matrix:
DenseMatrix([[12.,  0.,  0.],
             [ 0.,  4.,  1.],
             [ 0.,  1.,  6.]])
