In [1]:
# Google Drive is attached under this directory; the Iris CSV read later in the
# notebook lives below it.
DRIVE_MOUNT_POINT = '/content/drive'

from google.colab import drive
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Ign:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy Release [5,713 B]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy Release.gpg [793 B]
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,103 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:13 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
G

In [7]:
# Import necessary modules
from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql.types import StructType, StructField, FloatType, StringType

# --- Spark session -----------------------------------------------------------
# Reuse a running session if there is one, otherwise start a new one with the
# web UI port set to 4040.
spark = (
    SparkSession.builder
    .appName("Decision-Trees-Example")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Set the log level to ERROR so log output does not clutter the cell output.
spark.sparkContext.setLogLevel("ERROR")

# --- Schema ------------------------------------------------------------------
# The four measurement columns are the model inputs; "Id" is read with the same
# float type but is not used as a feature, and "Species" is the string label.
measurement_columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
iris_schema = StructType(
    [StructField(column, FloatType(), True) for column in ["Id", *measurement_columns]]
    + [StructField("Species", StringType(), True)]
)

# --- Load --------------------------------------------------------------------
# Read the CSV (with header row) from the mounted Google Drive using the explicit
# schema above, in permissive parse mode.
iris_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("mode", "permissive")
    .schema(iris_schema)
)
iris_df = iris_reader.load("/content/drive/MyDrive/ColabNotebooks/datasets/Iris.csv")

# --- Label indexing ----------------------------------------------------------
# Fit a StringIndexer on the Species strings and add the numeric "label" column.
indexer = StringIndexer(inputCol="Species", outputCol="label")
indexer_model = indexer.fit(iris_df)
iris_df = indexer_model.transform(iris_df)

# --- Feature vector ----------------------------------------------------------
# Pack the four measurements into one "features" vector column and keep only the
# two columns the classifier needs.
assembler = VectorAssembler(inputCols=measurement_columns, outputCol="features")
data_with_features = assembler.transform(iris_df).select("label", "features")

# --- Train / test split ------------------------------------------------------
# 80% training, 20% testing; the seed is fixed for the split.
train, test = data_with_features.randomSplit([0.8, 0.2], seed=13)

# Preview the full dataset and the training subset (show() prints the top rows).
for title, frame in (("Dataset", data_with_features), ("Train set", train)):
    print(title)
    frame.show()

# --- Model -------------------------------------------------------------------
# Configure the decision tree and fit it on the training subset.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
dt_model = dt.fit(train)

# Print the textual description of the fitted tree.
print("Decision Tree model summary:")
print(dt_model.toDebugString)

# --- Predictions -------------------------------------------------------------
# Apply the fitted model to the held-out test subset and preview the result.
predictions = dt_model.transform(test)
predictions.show()

# --- Evaluator ---------------------------------------------------------------
# Compares the "prediction" column against "label"; the specific metric is
# selected per call in the next cell.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")

Dataset
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[5.09999990463256...|
|  0.0|[4.90000009536743...|
|  0.0|[4.69999980926513...|
|  0.0|[4.59999990463256...|
|  0.0|[5.0,3.5999999046...|
|  0.0|[5.40000009536743...|
|  0.0|[4.59999990463256...|
|  0.0|[5.0,3.4000000953...|
|  0.0|[4.40000009536743...|
|  0.0|[4.90000009536743...|
|  0.0|[5.40000009536743...|
|  0.0|[4.80000019073486...|
|  0.0|[4.80000019073486...|
|  0.0|[4.30000019073486...|
|  0.0|[5.80000019073486...|
|  0.0|[5.69999980926513...|
|  0.0|[5.40000009536743...|
|  0.0|[5.09999990463256...|
|  0.0|[5.69999980926513...|
|  0.0|[5.09999990463256...|
+-----+--------------------+
only showing top 20 rows

Train set
+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[4.30000019073486...|
|  0.0|[4.40000009536743...|
|  0.0|[4.40000009536743...|
|  0.0|[4.5,2.2999999523...|
|  0.0|[4.59999990463256...|
|  0.0|[4.59999990463256...|

In [13]:
# Evaluate the test-set predictions once per metric, overriding the evaluator's
# metricName for each call. The order of names below matches the order of the
# variables they are unpacked into.
metric_names = ("accuracy", "weightedPrecision", "weightedRecall", "f1")
accuracy, precision, recall, f1 = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric_name})
    for metric_name in metric_names
)

In [14]:
# Report the "accuracy" metric the evaluator computed on `predictions`.
print(f"Accuracy: {accuracy}")

Accuracy: 0.9629629629629629


In [15]:
# Report precision; this value comes from the evaluator's "weightedPrecision" metric.
print(f"Precision: {precision}")

Precision: 0.9675925925925926


In [16]:
# Report recall; this value comes from the evaluator's "weightedRecall" metric.
print(f"Recall: {recall}")

Recall: 0.9629629629629629


In [17]:
# Report the "f1" metric the evaluator computed on `predictions`.
print(f"F1 Score: {f1}")

F1 Score: 0.9632228719948018


In [18]:
# Stop the SparkSession (`spark`) now that training and evaluation are finished.
spark.stop()