**Use Case 5:- Rice Growth Condition Profiling & Prediction Model**

**STEP 1:- Load & Inspect Data**

In [0]:
# Method 1: If you created a table during upload
df = spark.table("cropprediction")

# OR Method 2: If you uploaded to DBFS and know the path
# df = spark.read.csv("dbfs:/FileStore/tables/cropprediction.csv", header=True, inferSchema=True)

# Print schema
df.printSchema()

# Show first 5 rows
df.show(5)

# Check total records
print(f"Total records: {df.count()}")

root
 |-- N: long (nullable = true)
 |-- P: long (nullable = true)
 |-- K: long (nullable = true)
 |-- temperature: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- ph: double (nullable = true)
 |-- rainfall: double (nullable = true)
 |-- label: string (nullable = true)

+---+---+---+-----------+-----------+-----------+-----------+-----+
|  N|  P|  K|temperature|   humidity|         ph|   rainfall|label|
+---+---+---+-----------+-----------+-----------+-----------+-----+
| 90| 42| 43|20.87974371|82.00274423|6.502985292|202.9355362| rice|
| 85| 58| 41|21.77046169|80.31964408|7.038096361|226.6555374| rice|
| 60| 55| 44|23.00445915| 82.3207629|7.840207144|263.9642476| rice|
| 74| 35| 40|26.49109635|80.15836264|6.980400905|242.8640342| rice|
| 78| 42| 42|20.13017482|81.60487287|7.628472891|262.7173405| rice|
+---+---+---+-----------+-----------+-----------+-----------+-----+
only showing top 5 rows
Total records: 49


**STEP 2:- Rename Columns for Consistency**

In [0]:
# Rename columns to uppercase for consistency
df = (df
    .withColumnRenamed("N", "NITROGEN")
    .withColumnRenamed("P", "PHOSPHORUS")
    .withColumnRenamed("K", "POTASSIUM")
    .withColumnRenamed("temperature", "TEMPERATURE")
    .withColumnRenamed("humidity", "HUMIDITY")
    .withColumnRenamed("ph", "PH")
    .withColumnRenamed("rainfall", "RAINFALL")
    .withColumnRenamed("label", "CROP_LABEL")
)

print("Renamed columns:")
print(df.columns)
df.show(5)

Renamed columns:
['NITROGEN', 'PHOSPHORUS', 'POTASSIUM', 'TEMPERATURE', 'HUMIDITY', 'PH', 'RAINFALL', 'CROP_LABEL']
+--------+----------+---------+-----------+-----------+-----------+-----------+----------+
|NITROGEN|PHOSPHORUS|POTASSIUM|TEMPERATURE|   HUMIDITY|         PH|   RAINFALL|CROP_LABEL|
+--------+----------+---------+-----------+-----------+-----------+-----------+----------+
|      90|        42|       43|20.87974371|82.00274423|6.502985292|202.9355362|      rice|
|      85|        58|       41|21.77046169|80.31964408|7.038096361|226.6555374|      rice|
|      60|        55|       44|23.00445915| 82.3207629|7.840207144|263.9642476|      rice|
|      74|        35|       40|26.49109635|80.15836264|6.980400905|242.8640342|      rice|
|      78|        42|       42|20.13017482|81.60487287|7.628472891|262.7173405|      rice|
+--------+----------+---------+-----------+-----------+-----------+-----------+----------+
only showing top 5 rows


**STEP 3:- Clean and Filter Data**

In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

# Cast all numeric columns to Double and remove nulls
df = (df
    .withColumn("NITROGEN", col("NITROGEN").cast(DoubleType()))
    .withColumn("PHOSPHORUS", col("PHOSPHORUS").cast(DoubleType()))
    .withColumn("POTASSIUM", col("POTASSIUM").cast(DoubleType()))
    .withColumn("TEMPERATURE", col("TEMPERATURE").cast(DoubleType()))
    .withColumn("HUMIDITY", col("HUMIDITY").cast(DoubleType()))
    .withColumn("PH", col("PH").cast(DoubleType()))
    .withColumn("RAINFALL", col("RAINFALL").cast(DoubleType()))
    .filter(col("CROP_LABEL").isNotNull())
)

print(f"Total clean records: {df.count()}")
df.show(5)

Total clean records: 49
+--------+----------+---------+-----------+-----------+-----------+-----------+----------+
|NITROGEN|PHOSPHORUS|POTASSIUM|TEMPERATURE|   HUMIDITY|         PH|   RAINFALL|CROP_LABEL|
+--------+----------+---------+-----------+-----------+-----------+-----------+----------+
|    90.0|      42.0|     43.0|20.87974371|82.00274423|6.502985292|202.9355362|      rice|
|    85.0|      58.0|     41.0|21.77046169|80.31964408|7.038096361|226.6555374|      rice|
|    60.0|      55.0|     44.0|23.00445915| 82.3207629|7.840207144|263.9642476|      rice|
|    74.0|      35.0|     40.0|26.49109635|80.15836264|6.980400905|242.8640342|      rice|
|    78.0|      42.0|     42.0|20.13017482|81.60487287|7.628472891|262.7173405|      rice|
+--------+----------+---------+-----------+-----------+-----------+-----------+----------+
only showing top 5 rows


**STEP 4:- Feature Engineering**

In [0]:
# Check the distribution of crops
print("Crop distribution:")
df.groupBy("CROP_LABEL").count().orderBy("count", ascending=False).show()

# Create a temporary view for SQL analysis
df.createOrReplaceTempView("crops")

Crop distribution:
+----------+-----+
|CROP_LABEL|count|
+----------+-----+
|      rice|   49|
+----------+-----+



**STEP 5:- SQL Exploratory Analysis**

In [0]:
# Average conditions per crop
print("Average conditions for each crop:")
spark.sql("""
    SELECT 
        CROP_LABEL,
        COUNT(*) AS total_samples,
        ROUND(AVG(NITROGEN), 2) AS avg_nitrogen,
        ROUND(AVG(PHOSPHORUS), 2) AS avg_phosphorus,
        ROUND(AVG(POTASSIUM), 2) AS avg_potassium,
        ROUND(AVG(TEMPERATURE), 2) AS avg_temp,
        ROUND(AVG(HUMIDITY), 2) AS avg_humidity,
        ROUND(AVG(PH), 2) AS avg_ph,
        ROUND(AVG(RAINFALL), 2) AS avg_rainfall
    FROM crops
    GROUP BY CROP_LABEL
    ORDER BY total_samples DESC
""").show(25)

Average conditions for each crop:
+----------+-------------+------------+--------------+-------------+--------+------------+------+------------+
|CROP_LABEL|total_samples|avg_nitrogen|avg_phosphorus|avg_potassium|avg_temp|avg_humidity|avg_ph|avg_rainfall|
+----------+-------------+------------+--------------+-------------+--------+------------+------+------------+
|      rice|           49|       81.31|         47.33|        40.02|   23.59|       82.13|  6.42|      238.94|
+----------+-------------+------------+--------------+-------------+--------+------------+------+------------+



**STEP 6:- Prepare ML Features**

In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

# Index the crop labels (convert strings to numeric indices)
label_indexer = StringIndexer(inputCol="CROP_LABEL", outputCol="label", handleInvalid="keep")

# Assemble all features into a single vector
assembler = VectorAssembler(
    inputCols=["NITROGEN", "PHOSPHORUS", "POTASSIUM", "TEMPERATURE", "HUMIDITY", "PH", "RAINFALL"],
    outputCol="features"
)

# Scale features (important for Naive Bayes)
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

print("✅ Feature pipeline stages created")

✅ Feature pipeline stages created


**STEP 7:- Model Training (Naive Bayes Classification)**

In [0]:
from pyspark.ml.classification import NaiveBayes

# Create Naive Bayes classifier
nb = NaiveBayes(featuresCol="scaledFeatures", labelCol="label", smoothing=1.0)

# Build the pipeline
pipeline = Pipeline(stages=[label_indexer, assembler, scaler, nb])

# Split data into train and test sets
train, test = df.randomSplit([0.8, 0.2], seed=42)

print(f"Training set: {train.count()} records")
print(f"Test set: {test.count()} records")

# Train the model
print("Training model...")
model = pipeline.fit(train)
print("✅ Model trained successfully!")

# Make predictions
pred = model.transform(test)

# Show sample predictions
print("Sample predictions:")
pred.select("CROP_LABEL", "label", "prediction").show(10)

Training set: 36 records
Test set: 13 records
Training model...
✅ Model trained successfully!
Sample predictions:
+----------+-----+----------+
|CROP_LABEL|label|prediction|
+----------+-----+----------+
|      rice|  0.0|       0.0|
|      rice|  0.0|       0.0|
|      rice|  0.0|       0.0|
|      rice|  0.0|       0.0|
|      rice|  0.0|       0.0|
|      rice|  0.0|       0.0|
|      rice|  0.0|       0.0|
|      rice|  0.0|       0.0|
|      rice|  0.0|       0.0|
|      rice|  0.0|       0.0|
+----------+-----+----------+
only showing top 10 rows


**STEP 8:- Model Evaluation**

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="accuracy"
)
accuracy = accuracy_evaluator.evaluate(pred)
print(f"\n🎯 Model Accuracy: {round(accuracy * 100, 2)}%")

# F1 Score
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", 
    predictionCol="prediction", 
    metricName="f1"
)
f1_score = f1_evaluator.evaluate(pred)
print(f"📊 F1 Score: {round(f1_score, 3)}")

# Confusion Matrix view
pred.createOrReplaceTempView("predictions")
print("\nConfusion Matrix:")
spark.sql("""
    SELECT 
        CROP_LABEL as actual_crop,
        prediction,
        COUNT(*) as count
    FROM predictions
    GROUP BY CROP_LABEL, prediction
    ORDER BY CROP_LABEL, count DESC
""").show(50)


🎯 Model Accuracy: 100.0%
📊 F1 Score: 1.0

Confusion Matrix:
+-----------+----------+-----+
|actual_crop|prediction|count|
+-----------+----------+-----+
|       rice|       0.0|   13|
+-----------+----------+-----+



**STEP 9:- Save Model and Results**

In [0]:
# Save predictions as a table
pred.select(
    "NITROGEN", "PHOSPHORUS", "POTASSIUM", 
    "TEMPERATURE", "HUMIDITY", "PH", "RAINFALL",
    "CROP_LABEL", "label", "prediction"
).write.mode("overwrite").saveAsTable("crop_predictions")

print("✅ Predictions saved as SQL table: crop_predictions")

# Query accuracy per crop
print("\nAccuracy per crop:")
spark.sql("""
    SELECT 
        CROP_LABEL,
        COUNT(*) as total_predictions,
        SUM(CASE WHEN prediction = label THEN 1 ELSE 0 END) as correct_predictions,
        ROUND(AVG(CASE WHEN prediction = label THEN 1 ELSE 0 END) * 100, 2) as accuracy_pct
    FROM predictions
    GROUP BY CROP_LABEL
    ORDER BY accuracy_pct DESC
""").show()

✅ Predictions saved as SQL table: crop_predictions

Accuracy per crop:
+----------+-----------------+-------------------+------------+
|CROP_LABEL|total_predictions|correct_predictions|accuracy_pct|
+----------+-----------------+-------------------+------------+
|      rice|               13|                 13|       100.0|
+----------+-----------------+-------------------+------------+



**STEP 10:- Summary results table**

In [0]:
# Create comprehensive results table
comprehensive_results = spark.sql("""
    SELECT 
        p.CROP_LABEL,
        COUNT(*) as total_samples,
        SUM(CASE WHEN p.prediction = p.label THEN 1 ELSE 0 END) as correct_predictions,
        ROUND(AVG(CASE WHEN p.prediction = p.label THEN 1 ELSE 0 END) * 100, 2) as accuracy_pct,
        ROUND(AVG(c.NITROGEN), 2) as avg_nitrogen,
        ROUND(AVG(c.PHOSPHORUS), 2) as avg_phosphorus,
        ROUND(AVG(c.POTASSIUM), 2) as avg_potassium,
        ROUND(AVG(c.TEMPERATURE), 2) as avg_temp,
        ROUND(AVG(c.HUMIDITY), 2) as avg_humidity,
        ROUND(AVG(c.PH), 2) as avg_ph,
        ROUND(AVG(c.RAINFALL), 2) as avg_rainfall
    FROM predictions p
    JOIN crops c ON p.CROP_LABEL = c.CROP_LABEL
    GROUP BY p.CROP_LABEL
    ORDER BY p.CROP_LABEL
""")

display(comprehensive_results)

CROP_LABEL,total_samples,correct_predictions,accuracy_pct,avg_nitrogen,avg_phosphorus,avg_potassium,avg_temp,avg_humidity,avg_ph,avg_rainfall
rice,637,637,100.0,81.31,47.33,40.02,23.59,82.13,6.42,238.94


Databricks visualization. Run in Databricks to view.