In [14]:
# Import necessary modules
from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import StructType, StructField, FloatType

In [15]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.82)] [[0m                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:6 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state inform

In [16]:
# Build the notebook's SparkSession (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName("Decision-Trees-Example")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Restrict Spark's log output to ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")

In [17]:
from pyspark.sql.types import StructField, StructType, FloatType, IntegerType, StringType

# Column name / Spark type pairs describing the Iris CSV, in file order.
_iris_columns = [
    ("Id", IntegerType()),
    ("SepalLengthCm", FloatType()),
    ("SepalWidthCm", FloatType()),
    ("PetalLengthCm", FloatType()),
    ("PetalWidthCm", FloatType()),
    ("Species", StringType()),
]

# Explicit schema used when reading the CSV; every column is declared nullable.
iris_prediction_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _iris_columns]
)

In [18]:
# Configure a CSV reader: first row is a header, parse mode is "permissive",
# and the columns are typed by the explicit schema instead of being inferred.
iris_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("mode", "permissive")
    .option("path", "/content/Iris.csv")
    .schema(iris_prediction_schema)
)
iris_df = iris_reader.load()

# Sanity check: print the resulting schema and the first 10 rows untruncated.
iris_df.printSchema()
iris_df.show(n=10, truncate=False)

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: float (nullable = true)
 |-- SepalWidthCm: float (nullable = true)
 |-- PetalLengthCm: float (nullable = true)
 |-- PetalWidthCm: float (nullable = true)
 |-- Species: string (nullable = true)

+---+-------------+------------+-------------+------------+-----------+
|Id |SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|Species    |
+---+-------------+------------+-------------+------------+-----------+
|1  |5.1          |3.5         |1.4          |0.2         |Iris-setosa|
|2  |4.9          |3.0         |1.4          |0.2         |Iris-setosa|
|3  |4.7          |3.2         |1.3          |0.2         |Iris-setosa|
|4  |4.6          |3.1         |1.5          |0.2         |Iris-setosa|
|5  |5.0          |3.6         |1.4          |0.2         |Iris-setosa|
|6  |5.4          |3.9         |1.7          |0.4         |Iris-setosa|
|7  |4.6          |3.4         |1.4          |0.3         |Iris-setosa|
|8  |5.0          |3.4    

In [19]:
from pyspark.ml.feature import StringIndexer

# Preprocessing step 1: encode the string "Species" column as a numeric
# "label" column. The indexer is fitted on the full DataFrame and then
# applied to that same DataFrame.
label_indexer = StringIndexer(inputCol="Species", outputCol="label")
label_indexer_model = label_indexer.fit(iris_df)
data = label_indexer_model.transform(iris_df)

In [20]:
# Preview the first 10 rows, which now include the indexed "label" column.
data.show(10)

+---+-------------+------------+-------------+------------+-----------+-----+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|label|
+---+-------------+------------+-------------+------------+-----------+-----+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|  0.0|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|  0.0|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|  0.0|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|  0.0|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|  0.0|
|  6|          5.4|         3.9|          1.7|         0.4|Iris-setosa|  0.0|
|  7|          4.6|         3.4|          1.4|         0.3|Iris-setosa|  0.0|
|  8|          5.0|         3.4|          1.5|         0.2|Iris-setosa|  0.0|
|  9|          4.4|         2.9|          1.4|         0.2|Iris-setosa|  0.0|
| 10|          4.9|         3.1|          1.5|         0.1|Iris-

In [21]:
# Pack the four measurement columns into a single vector column named
# "features", then keep only the two columns the classifier consumes.
feature_columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = assembler.transform(data)
data_with_features = assembled.select("label", "features")

In [22]:
# Randomly partition the rows into training and test sets using weights
# 0.8 / 0.2; the seed is fixed so the split is repeatable.
split_weights = [0.8, 0.2]
train, test = data_with_features.randomSplit(split_weights, seed=13)

In [23]:
from pyspark.ml.classification import LinearSVC, OneVsRest

# Base estimator: a linear SVM, which on its own is a binary classifier.
lsvc = LinearSVC(regParam=0.01, maxIter=10)

# OneVsRest wraps the binary classifier so it can be used for the
# multi-class Iris problem.
ovr = OneVsRest(classifier=lsvc)

In [25]:
# ----------------------------
# Training
# ----------------------------

# Fit the one-vs-rest estimator on the training split.
ovr_model = ovr.fit(dataset=train)

In [26]:
# ----------------------------
# Predictions
# ----------------------------

# Score the held-out test split with the fitted model.
predictions = ovr_model.transform(dataset=test)

# Display the scored rows (show() with its default row limit).
predictions.show()

+-----+--------------------+--------------------+----------+
|label|            features|       rawPrediction|prediction|
+-----+--------------------+--------------------+----------+
|  0.0|[4.40000009536743...|[1.90099210398048...|       0.0|
|  0.0|[4.59999990463256...|[2.08594433990854...|       0.0|
|  0.0|[4.80000019073486...|[1.76539206613239...|       0.0|
|  0.0|[5.0,3.4000000953...|[2.39387775836876...|       0.0|
|  0.0|[5.0,3.4000000953...|[2.10103921023010...|       0.0|
|  0.0|[5.0,3.5,1.600000...|[2.01860384014608...|       0.0|
|  0.0|[5.0,3.5999999046...|[2.76520695611947...|       0.0|
|  0.0|[5.09999990463256...|[2.56056966826615...|       0.0|
|  0.0|[5.19999980926513...|[3.55461864085416...|       0.0|
|  0.0|[5.69999980926513...|[2.51096786077951...|       0.0|
|  1.0|[5.69999980926513...|[-1.5025526835425...|       1.0|
|  1.0|[5.69999980926513...|[-1.7004614620314...|       1.0|
|  1.0|[5.69999980926513...|[-1.1084968734757...|       1.0|
|  1.0|[5.90000009536743

In [27]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator comparing the "label" column with the "prediction" column;
# the metric is selected per call through a param override.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")


def _score(metric_name):
    """Evaluate `predictions` with `metric_name` supplied as the metricName override."""
    return evaluator.evaluate(predictions, {evaluator.metricName: metric_name})


# Accuracy
accuracy = _score("accuracy")
print(f"Accuracy: {accuracy}")

# Precision (the weighted variant is requested)
precision = _score("weightedPrecision")
print(f"Precision: {precision}")

# Recall (the weighted variant is requested)
recall = _score("weightedRecall")
print(f"Recall: {recall}")

# F1 score
f1 = _score("f1")
print(f"F1 Score: {f1}")

# Shut down the Spark session; cells that use `spark` cannot be re-run
# after this without re-creating the session.
spark.stop()

Accuracy: 0.9629629629629629
Precision: 0.9675925925925926
Recall: 0.9629629629629629
F1 Score: 0.9632228719948018
