### Objective:
In this notebook, we demonstrate how to:
- Load the **Iris** dataset in different formats (CSV, JSON, Parquet)
- Explore the schema
- Write and apply **custom classification logic** using **UDFs (User Defined Functions)** in PySpark

We classify each flower's:
- Petal size, derived from petal length (Small, Medium, Large)
- Sepal width (Narrow, Moderate, Wide)

This is a common use case when transforming raw numeric features into categorical values for reporting or machine learning.

In [None]:
from google.colab import drive

# Mount Google Drive so the dataset files under /content/drive are readable.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pyspark.sql import SparkSession

# Build the SparkSession used by every later cell; getOrCreate() reuses a
# session that is already running in this kernel instead of starting another.
spark = (
    SparkSession.builder
    .appName("Iris Data Comparison")
    .getOrCreate()
)

In [None]:
# Read the Iris data from three storage formats into separate DataFrames.
# CSV carries no type information, so ask Spark to use the header row for
# column names and to infer the column types.
iris_csv_spark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/drive/MyDrive/Datasets/Iris.csv")
)
iris_json_spark = spark.read.format("json").load("/content/drive/MyDrive/Datasets/Iris.json")
iris_parquet_spark = spark.read.format("parquet").load("/content/drive/MyDrive/Datasets/Iris.parquet")

# Preview the first rows of the CSV-backed DataFrame.
print("CSV Sample (Spark):")
iris_csv_spark.show(n=5)

CSV Sample (Spark):
+------------+-----------+------------+-----------+-------+
|sepal.length|sepal.width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [None]:
# Print the schema of each loaded DataFrame under its own heading so the
# three formats can be compared side by side.
schema_views = [
    ("\nSpark CSV Schema:", iris_csv_spark),
    ("\nSpark JSON Schema:", iris_json_spark),
    ("\nSpark Parquet Schema:", iris_parquet_spark),
]
for heading, frame in schema_views:
    print(heading)
    frame.printSchema()


Spark CSV Schema:
root
 |-- sepal.length: double (nullable = true)
 |-- sepal.width: double (nullable = true)
 |-- petal.length: double (nullable = true)
 |-- petal.width: double (nullable = true)
 |-- variety: string (nullable = true)


Spark JSON Schema:
root
 |-- petal.length: double (nullable = true)
 |-- petal.width: double (nullable = true)
 |-- sepal.length: double (nullable = true)
 |-- sepal.width: double (nullable = true)
 |-- variety: string (nullable = true)


Spark Parquet Schema:
root
 |-- sepal.length: double (nullable = true)
 |-- sepal.width: double (nullable = true)
 |-- petal.length: double (nullable = true)
 |-- petal.width: double (nullable = true)
 |-- variety: string (nullable = true)



In [None]:
# List the CSV DataFrame's column names, one per line.
column_names = iris_csv_spark.columns
for name in column_names:
    print(name)

sepal.length
sepal.width
petal.length
petal.width
variety


In [None]:
# Spark parses a dot inside a column reference as nested-field access, so a
# name like "sepal.width" would need backtick quoting everywhere it is used.
# Renaming it to use an underscore avoids that.
iris_csv_spark = iris_csv_spark.withColumnRenamed(
    existing="sepal.width",
    new="sepal_width",
)

In [None]:
# Replace the dots in the remaining measurement column names with
# underscores so they can be referenced without backtick quoting.
for old_name, new_name in (
    ("sepal.length", "sepal_length"),
    ("petal.length", "petal_length"),
    ("petal.width", "petal_width"),
):
    iris_csv_spark = iris_csv_spark.withColumnRenamed(old_name, new_name)

In [None]:
# Pick the CSV-backed DataFrame (with its renamed columns) as the demo input
# and preview its first rows.
df = iris_csv_spark
df.show(n=5)

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows



In [None]:
# Define petal size classification based on petal length
def classify_by_petal_length(petal_length):
    """Bucket a petal length into "Small", "Medium" or "Large".

    Written to be wrapped as a PySpark UDF, so it must tolerate missing
    values: Spark passes SQL NULL to a Python UDF as None.

    Args:
        petal_length: Petal length as a number, or None for a missing value.

    Returns:
        "Small" for values below 2.0, "Medium" for values from 2.0 up to
        (but not including) 5.0, "Large" for values of 5.0 and above, and
        None when the input is None or NaN.
    """
    # None would raise TypeError on `<`, and NaN compares False against
    # everything (it would silently land in "Large"); treat both as missing.
    # `x != x` is True only for NaN.
    if petal_length is None or petal_length != petal_length:
        return None
    if petal_length < 2.0:
        return "Small"
    # The lower bound is already guaranteed by the check above.
    if petal_length < 5.0:
        return "Medium"
    return "Large"

In [None]:
# Define sepal width classification
def classify_by_sepal_width(sepal_width):
    """Bucket a sepal width into "Narrow", "Moderate" or "Wide".

    Written to be wrapped as a PySpark UDF, so it must tolerate missing
    values: Spark passes SQL NULL to a Python UDF as None.

    Args:
        sepal_width: Sepal width as a number, or None for a missing value.

    Returns:
        "Narrow" for values below 3.0, "Moderate" for values from 3.0 up to
        (but not including) 3.5, "Wide" for values of 3.5 and above, and
        None when the input is None or NaN.
    """
    # None would raise TypeError on `<`, and NaN compares False against
    # everything (it would silently land in "Wide"); treat both as missing.
    # `x != x` is True only for NaN.
    if sepal_width is None or sepal_width != sepal_width:
        return None
    if sepal_width < 3.0:
        return "Narrow"
    # The lower bound is already guaranteed by the check above.
    if sepal_width < 3.5:
        return "Moderate"
    return "Wide"


In [None]:
# Register the functions as UDFs
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap the plain Python classifiers as Spark UDFs; both return a label
# string, hence the StringType return type.
petal_size_udf = udf(classify_by_petal_length, returnType=StringType())
sepal_size_udf = udf(classify_by_sepal_width, returnType=StringType())

In [None]:
# Add one label column per UDF to the CSV-backed DataFrame: petal_size is
# computed from petal_length and sepal_size from sepal_width.
iris_classified = (
    iris_csv_spark
    .withColumn("petal_size", petal_size_udf("petal_length"))
    .withColumn("sepal_size", sepal_size_udf("sepal_width"))
)

iris_classified.show(n=10)

+------------+-----------+------------+-----------+-------+----------+----------+
|sepal_length|sepal_width|petal_length|petal_width|variety|petal_size|sepal_size|
+------------+-----------+------------+-----------+-------+----------+----------+
|         5.1|        3.5|         1.4|        0.2| Setosa|     Small|      Wide|
|         4.9|        3.0|         1.4|        0.2| Setosa|     Small|  Moderate|
|         4.7|        3.2|         1.3|        0.2| Setosa|     Small|  Moderate|
|         4.6|        3.1|         1.5|        0.2| Setosa|     Small|  Moderate|
|         5.0|        3.6|         1.4|        0.2| Setosa|     Small|      Wide|
|         5.4|        3.9|         1.7|        0.4| Setosa|     Small|      Wide|
|         4.6|        3.4|         1.4|        0.3| Setosa|     Small|  Moderate|
|         5.0|        3.4|         1.5|        0.2| Setosa|     Small|  Moderate|
|         4.4|        2.9|         1.4|        0.2| Setosa|     Small|    Narrow|
|         4.9|  