In [16]:
!pip install pyspark



In [17]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [18]:
# Create a SparkSession
spark = SparkSession.builder.appName("IrisDecisionTree").getOrCreate()

In [19]:
# Load the dataset
df = spark.read.csv("/content/iris.csv", header=True, inferSchema=True)
df.show()

+------------+-----------+------------+-----------+-------+
|Sepal.Length|Sepal.Width|Petal.Length|Petal.Width|Species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [20]:

# Prepare the data
feature_columns = df.columns[:-1]  # Exclude the target variable

# Replace periods in column names with underscores
for i in range(len(feature_columns)):
    feature_columns[i] = feature_columns[i].replace(".", "_")
    df = df.withColumnRenamed(df.columns[i], feature_columns[i])  # Rename columns in DataFrame

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)
df.show()

+------------+-----------+------------+-----------+-------+-----------------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species|         features|
+------------+-----------+------------+-----------+-------+-----------------+
|         5.1|        3.5|         1.4|        0.2| setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4| setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3| setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2| setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2| setosa|[4.4,2.9,1.4,0.2]|
|         4.9|        3.1|         1.5|        0.1| setosa|[4.9,

In [21]:
# Split the data into training and testing sets
(trainingData, testData) = df.randomSplit([0.7, 0.3])

In [22]:
from pyspark.ml.feature import StringIndexer

# Create a StringIndexer to convert the 'Species' column to numeric
indexer = StringIndexer(inputCol="Species", outputCol="Species_index")

# Fit the indexer on the training data and transform both training and testing data
# Get the fitted model (StringIndexerModel)
model = indexer.fit(trainingData)
trainingData = model.transform(trainingData)
testData = model.transform(testData)  # Use the fitted model for consistency

# Update the Decision Tree Classifier to use the new indexed label column
dt = DecisionTreeClassifier(labelCol="Species_index", featuresCol="features")

# Now you can train the model
model = dt.fit(trainingData)

In [24]:

# Make predictions on the test data
predictions = model.transform(testData)
predictions.show()

+------------+-----------+------------+-----------+----------+-----------------+-------------+--------------+-------------+----------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|   Species|         features|Species_index| rawPrediction|  probability|prediction|
+------------+-----------+------------+-----------+----------+-----------------+-------------+--------------+-------------+----------+
|         4.3|        3.0|         1.1|        0.1|    setosa|[4.3,3.0,1.1,0.1]|          1.0|[0.0,36.0,0.0]|[0.0,1.0,0.0]|       1.0|
|         4.5|        2.3|         1.3|        0.3|    setosa|[4.5,2.3,1.3,0.3]|          1.0|[0.0,36.0,0.0]|[0.0,1.0,0.0]|       1.0|
|         4.8|        3.0|         1.4|        0.3|    setosa|[4.8,3.0,1.4,0.3]|          1.0|[0.0,36.0,0.0]|[0.0,1.0,0.0]|       1.0|
|         4.8|        3.4|         1.6|        0.2|    setosa|[4.8,3.4,1.6,0.2]|          1.0|[0.0,36.0,0.0]|[0.0,1.0,0.0]|       1.0|
|         4.9|        2.4|         3.3|        1.0|vers

In [26]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="Species_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

In [27]:
print("Accuracy:", accuracy)

Accuracy: 0.9069767441860465


In [29]:

# You can also view the decision tree structure if needed
print(model.toDebugString)



DecisionTreeClassificationModel: uid=DecisionTreeClassifier_245271efc5d0, depth=4, numNodes=9, numClasses=3, numFeatures=4
  If (feature 2 <= 2.45)
   Predict: 1.0
  Else (feature 2 > 2.45)
   If (feature 3 <= 1.75)
    If (feature 2 <= 5.15)
     If (feature 0 <= 4.95)
      Predict: 2.0
     Else (feature 0 > 4.95)
      Predict: 0.0
    Else (feature 2 > 5.15)
     Predict: 2.0
   Else (feature 3 > 1.75)
    Predict: 2.0

