In [1]:
# Import necessary modules
from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import StructType, StructField, FloatType

In [2]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,103 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Ign:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 http://security.ubuntu.com/ubuntu

In [3]:
# Create (or reuse) the SparkSession for this notebook, with the Spark UI port set to 4040.
spark = (
    SparkSession.builder
    .appName("Decision-Trees-Example")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Restrict Spark's log output to the ERROR level.
spark.sparkContext.setLogLevel("ERROR")

In [20]:
from pyspark.sql.types import StructField, StructType, FloatType, IntegerType, StringType

# Explicit schema for the Iris CSV: an integer Id, four float measurement columns,
# and the species name as a string. Every field is declared nullable.
measurement_fields = [
    StructField(column_name, FloatType(), True)
    for column_name in ("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm")
]
iris_prediction_schema = StructType(
    [StructField("Id", IntegerType(), True)]
    + measurement_fields
    + [StructField("Species", StringType(), True)]
)

In [21]:
# Read the Iris CSV using the explicit schema; the file has a header row and
# "permissive" parsing mode is requested.
iris_reader = (
    spark.read.format("csv")
    .schema(iris_prediction_schema)
    .option("header", "true")
    .option("mode", "permissive")
    .option("path", "/content/Iris.csv")
)
iris_df = iris_reader.load()

# Inspect the applied schema and the first ten rows without truncating cell values.
iris_df.printSchema()
iris_df.show(n=10, truncate=False)

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: float (nullable = true)
 |-- SepalWidthCm: float (nullable = true)
 |-- PetalLengthCm: float (nullable = true)
 |-- PetalWidthCm: float (nullable = true)
 |-- Species: string (nullable = true)

+---+-------------+------------+-------------+------------+-----------+
|Id |SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|Species    |
+---+-------------+------------+-------------+------------+-----------+
|1  |5.1          |3.5         |1.4          |0.2         |Iris-setosa|
|2  |4.9          |3.0         |1.4          |0.2         |Iris-setosa|
|3  |4.7          |3.2         |1.3          |0.2         |Iris-setosa|
|4  |4.6          |3.1         |1.5          |0.2         |Iris-setosa|
|5  |5.0          |3.6         |1.4          |0.2         |Iris-setosa|
|6  |5.4          |3.9         |1.7          |0.4         |Iris-setosa|
|7  |4.6          |3.4         |1.4          |0.3         |Iris-setosa|
|8  |5.0          |3.4    

In [22]:
from pyspark.ml.feature import StringIndexer

# Preprocessing 1: index the string "Species" column into a numeric "label" column.
label_indexer = StringIndexer(inputCol="Species", outputCol="label")
label_indexer_model = label_indexer.fit(iris_df)
data = label_indexer_model.transform(iris_df)

In [23]:
# Preview the first ten rows, which now include the indexed "label" column.
data.show(10)

+---+-------------+------------+-------------+------------+-----------+-----+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|label|
+---+-------------+------------+-------------+------------+-----------+-----+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|  0.0|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|  0.0|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|  0.0|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|  0.0|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|  0.0|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|  0.0|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|  0.0|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|  0.0|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|  0.0|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-

In [25]:
# Combine the four measurement columns into one vector column named "features",
# then keep only the two columns the classifier consumes.
feature_columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = assembler.transform(data)
data_with_features = assembled.select("label", "features")

In [26]:
# Randomly split the data with weights 0.8 (training) and 0.2 (testing), using a fixed seed.
splits = data_with_features.randomSplit([0.8, 0.2], seed=13)
train, test = splits

In [27]:
# Configure the Decision Tree estimator; this only constructs it, fitting happens separately.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")

In [28]:
# ============================
# TRAIN
# ============================

# Fit the estimator on the training split to obtain the model
dt_model = dt.fit(train)

# Print the textual description of the learned tree
tree_description = dt_model.toDebugString
print("Decision Tree model summary:{0}".format(tree_description))

Decision Tree model summary:DecisionTreeClassificationModel: uid=DecisionTreeClassifier_487cb66de046, depth=5, numNodes=15, numClasses=3, numFeatures=4
  If (feature 2 <= 2.449999988079071)
   Predict: 0.0
  Else (feature 2 > 2.449999988079071)
   If (feature 3 <= 1.75)
    If (feature 2 <= 5.1499998569488525)
     If (feature 3 <= 1.6500000357627869)
      Predict: 1.0
     Else (feature 3 > 1.6500000357627869)
      If (feature 0 <= 4.950000047683716)
       Predict: 2.0
      Else (feature 0 > 4.950000047683716)
       Predict: 1.0
    Else (feature 2 > 5.1499998569488525)
     Predict: 2.0
   Else (feature 3 > 1.75)
    If (feature 2 <= 4.8500001430511475)
     If (feature 0 <= 5.950000047683716)
      Predict: 1.0
     Else (feature 0 > 5.950000047683716)
      Predict: 2.0
    Else (feature 2 > 4.8500001430511475)
     Predict: 2.0



In [29]:
# ============================
# PREDICTIONS
# ============================

# Apply the trained model to the test split
predictions = dt_model.transform(test)

# Display the first 20 prediction rows (20 is the default row count of show())
predictions.show(n=20)

+-----+--------------------+--------------+--------------------+----------+
|label|            features| rawPrediction|         probability|prediction|
+-----+--------------------+--------------+--------------------+----------+
|  0.0|[4.40000009536743...|[40.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|  0.0|[4.59999990463256...|[40.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|  0.0|[4.80000019073486...|[40.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|  0.0|[5.0,3.4000000953...|[40.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|  0.0|[5.0,3.4000000953...|[40.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|  0.0|[5.0,3.5,1.600000...|[40.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|  0.0|[5.0,3.5999999046...|[40.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|  0.0|[5.09999990463256...|[40.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|  0.0|[5.19999980926513...|[40.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|  0.0|[5.69999980926513...|[40.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|  1.0|[5.69

In [30]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate the predictions: compare the "prediction" column against the "label" column.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Printed label -> evaluator metric name.
# Note: "Precision" and "Recall" are the weighted variants of those metrics.
metric_names = {
    "Accuracy": "accuracy",
    "Precision": "weightedPrecision",
    "Recall": "weightedRecall",
    "F1 Score": "f1",
}

# A single loop replaces four copy-pasted evaluate/print stanzas; the metric is selected
# per call by overriding the evaluator's metricName param.
scores = {}
for display_name, metric_name in metric_names.items():
    scores[display_name] = evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    print(f"{display_name}: {scores[display_name]}")

# Keep the individual result names available, as before.
accuracy = scores["Accuracy"]
precision = scores["Precision"]
recall = scores["Recall"]
f1 = scores["F1 Score"]

# Stop Spark session
spark.stop()

Accuracy: 0.9629629629629629
Precision: 0.9675925925925926
Recall: 0.9629629629629629
F1 Score: 0.9632228719948018
