# Iris Classification with PySpark
This notebook demonstrates how to perform classification using PySpark ML on the Iris dataset.

In [None]:
from pyspark.sql import SparkSession

# Obtain the session used later in the notebook: getOrCreate() hands back an
# existing SparkSession when one is available, otherwise it builds a new one
# with the application name configured here.
spark = (
    SparkSession.builder
    .appName("IrisClassification")
    .getOrCreate()
)

In [None]:
import pandas as pd
from sklearn.datasets import load_iris

# Fetch the Iris dataset from scikit-learn; its `data`, `feature_names` and
# `target` attributes supply the feature matrix, column names and labels.
iris = load_iris()

# Build the pandas frame in one expression: one column per feature name plus
# a trailing `label` column holding the target values.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(label=iris.target)

# Hand the pandas frame to Spark and preview the first five rows.
data = spark.createDataFrame(df)
data.show(5)

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Assemble the feature columns (named by iris.feature_names) into a single
# vector column called "features", which the classifier reads as its input.
assembler = VectorAssembler(inputCols=iris.feature_names, outputCol="features")
assembled_data = assembler.transform(data)

# Index the label column into "indexedLabel".
# StringIndexer's default ordering ("frequencyDesc") assigns indices by how
# often each label occurs, so the label -> index mapping depends on class
# counts. Ordering alphabetically instead pins the mapping: the Iris labels
# (0, 1, 2 in scikit-learn's dataset) become 0.0, 1.0, 2.0, so an indexed
# value always identifies the same class as the original `label` column.
indexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
    stringOrderType="alphabetAsc",
)
final_data = indexer.fit(assembled_data).transform(assembled_data)

# Preview the two columns the model will consume.
final_data.select("features", "indexedLabel").show(5)

In [None]:
# Randomly partition the prepared rows into roughly 70% training and 30% test
# data; the seed is fixed at 42 so the split is seeded identically on each run.
train_data, test_data = final_data.randomSplit(
    weights=[0.7, 0.3],
    seed=42,
)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure a decision-tree estimator that reads the assembled "features"
# vector and learns to predict "indexedLabel"; every other hyperparameter is
# left at its library default.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="indexedLabel",
)

# Fit on the training split only; the result is the trained model.
model = dt.fit(train_data)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Run the trained model over the held-out rows; the resulting DataFrame
# carries the model's "prediction" column used below.
predictions = model.transform(test_data)

# Compare "prediction" against "indexedLabel" using the accuracy metric and
# report the score rounded to two decimals.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy = {accuracy:.2f}")

In [None]:
# Convert Spark DataFrame to Pandas for visualization (only the two columns
# needed for the confusion matrix are collected to the driver).
predictions_pd = predictions.select("prediction", "indexedLabel").toPandas()

import seaborn as sns
import matplotlib.pyplot as plt

# Cross-tabulate actual (rows) against predicted (columns) class indices.
confusion = pd.crosstab(predictions_pd['indexedLabel'], predictions_pd['prediction'])

# crosstab only emits rows/columns for values that actually occur, so a class
# that is never predicted (or is absent from the test split) would be dropped
# and the matrix would no longer be square. Reindex both axes over the union
# of observed classes, filling the missing cells with 0 so the counts stay
# integers (required by fmt='d').
classes = sorted(set(confusion.index) | set(confusion.columns))
confusion = confusion.reindex(index=classes, columns=classes, fill_value=0)

# Draw the annotated heatmap; the explicit axis labels replace seaborn's
# defaults, which would otherwise show the raw column names.
sns.heatmap(confusion, annot=True, fmt='d')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()