In [None]:
# Attach Google Drive to the Colab VM so the dataset stored under
# /content/drive/MyDrive can be read with an ordinary file path.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

In [None]:
# Import necessary modules
from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import StructType, StructField, FloatType

# Create the Spark session used by the rest of the notebook (or pick up an
# existing one), naming the application and fixing the Spark UI port.
spark = (
    SparkSession.builder
    .appName("Decision-Trees-Example")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep console output readable: only log messages at ERROR level.
spark.sparkContext.setLogLevel("ERROR")

# Explicit schema for the iris CSV: one label column followed by four feature
# columns, every one of them declared as a nullable float.
# NOTE(review): this assumes the file's first column is a numeric label and
# the remaining four are the measurements -- confirm against the actual CSV.
iris_schema = StructType(
    [
        StructField(column_name, FloatType(), True)
        for column_name in ("label", "feature1", "feature2", "feature3", "feature4")
    ]
)

# Read the iris dataset from the mounted Google Drive folder, treating the
# first line as a header and applying the schema declared above.
iris_df = (
    spark.read.format("csv")
    .options(header="true", mode="permissive")
    .schema(iris_schema)
    .load("/content/drive/MyDrive/ColabNotebooks/datasets/iris.csv")
)

# Combine the four feature columns into one vector column named "features"
# and keep only the label and that vector for modelling.
assembler = VectorAssembler(
    inputCols=["feature1", "feature2", "feature3", "feature4"],
    outputCol="features",
)
data_with_features = (
    assembler
    .transform(iris_df)
    .select("label", "features")
)

# 80/20 train/test split; the fixed seed keeps the split repeatable.
train, test = data_with_features.randomSplit(weights=[0.8, 0.2], seed=13)

# Preview the full assembled dataset first, then the training portion.
for title, frame in (("Dataset", data_with_features), ("Train set", train)):
    print(title)
    frame.show()

# Decision-tree classifier reading the assembled "features" vector and the
# "label" column.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")

# Fit the classifier on the training split to obtain the trained model.
dt_model = dt.fit(train)

# Print a heading followed by the textual description of the learned tree.
print("Decision Tree model summary:", dt_model.toDebugString, sep="\n")

# Score the held-out test split with the trained model.
# The result is cached because it feeds five separate Spark actions below
# (one show() and four evaluate() calls); without the cache every action
# would recompute the whole lineage: CSV read, assembly, split and scoring.
predictions = dt_model.transform(test).cache()

# Show predictions
predictions.show()

# Evaluator comparing the "label" column with the model's "prediction" column.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")


def _evaluate_metric(metric_name):
    """Return the evaluator's score on `predictions` for one metric.

    Args:
        metric_name: Value for the evaluator's `metricName` param,
            e.g. "accuracy" or "f1".

    Returns:
        The metric value computed by `evaluator.evaluate`.
    """
    # The param-map override selects the metric for this call only, so the
    # shared evaluator object itself is left unmodified.
    return evaluator.evaluate(predictions, {evaluator.metricName: metric_name})


# Accuracy plus weighted precision, weighted recall and F1 on the test split.
accuracy = _evaluate_metric("accuracy")
precision = _evaluate_metric("weightedPrecision")
recall = _evaluate_metric("weightedRecall")
f1 = _evaluate_metric("f1")

In [None]:
# Report the accuracy measured on the test split (evaluator metric "accuracy").
print(f"Accuracy: {accuracy}")

In [None]:
# Report precision on the test split; this is the evaluator's
# "weightedPrecision" metric, not a single-class precision.
print(f"Precision: {precision}")

In [None]:
# Report recall on the test split; this is the evaluator's
# "weightedRecall" metric, not a single-class recall.
print(f"Recall: {recall}")

In [None]:
# Report the F1 score on the test split (evaluator metric "f1").
print(f"F1 Score: {f1}")

In [None]:
# Stop the Spark session now that every metric has been reported.
# Run this cell last: the cells above all rely on `spark` being active.
spark.stop()