**Use case 3:- Pest Attack Prediction**

**STEP 1:- Load & inspect the data**

In [0]:
# Read the pest table from the catalog, then print its schema and the first 5 rows.
df = spark.read.table("workspace.default.pest")
df.printSchema()
df.show(n=5)



root
 |-- Field_ID: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Crop_Type: string (nullable = true)
 |-- Avg_Temperature_C: double (nullable = true)
 |-- Total_Rainfall_mm: double (nullable = true)
 |-- Avg_Humidity_perc: double (nullable = true)
 |-- Soil_Moisture_perc: double (nullable = true)
 |-- Fertilizer_NPK: string (nullable = true)
 |-- Pest_Risk_Index: double (nullable = true)
 |-- Disease_Outbreak: long (nullable = true)

+--------+-------------------+---------+-----------------+-----------------+-----------------+------------------+--------------+---------------+----------------+
|Field_ID|               Date|Crop_Type|Avg_Temperature_C|Total_Rainfall_mm|Avg_Humidity_perc|Soil_Moisture_perc|Fertilizer_NPK|Pest_Risk_Index|Disease_Outbreak|
+--------+-------------------+---------+-----------------+-----------------+-----------------+------------------+--------------+---------------+----------------+
|    F003|2024-06-01 00:00:00|    Wheat|            

**STEP 2:- Rename key columns for consistency**

In [0]:
# Source column name -> short upper-case name used by the rest of the notebook.
# Field_ID, Date and Pest_Risk_Index keep their original names.
column_renames = {
    "Crop_Type": "CROP",
    "Avg_Temperature_C": "TEMPERATURE",
    "Avg_Humidity_perc": "HUMIDITY",
    "Soil_Moisture_perc": "MOISTURE",
    "Total_Rainfall_mm": "RAINFALL",
    "Fertilizer_NPK": "FERTILIZER",
    "Disease_Outbreak": "OUTBREAK",
}
for source_name, short_name in column_renames.items():
    df = df.withColumnRenamed(source_name, short_name)

print(df.columns)


['Field_ID', 'Date', 'CROP', 'TEMPERATURE', 'RAINFALL', 'HUMIDITY', 'MOISTURE', 'FERTILIZER', 'Pest_Risk_Index', 'OUTBREAK']


**STEP 3:- Clean & filter data**

In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

# Cast the four weather/soil measurements and the label to double.
columns_to_double = ["TEMPERATURE", "HUMIDITY", "MOISTURE", "RAINFALL", "OUTBREAK"]
for column_name in columns_to_double:
    df = df.withColumn(column_name, col(column_name).cast(DoubleType()))

# Keep only rows that have a label value.
df = df.filter(col("OUTBREAK").isNotNull())

display(df.limit(5))


Field_ID,Date,CROP,TEMPERATURE,RAINFALL,HUMIDITY,MOISTURE,FERTILIZER,Pest_Risk_Index,OUTBREAK
F003,2024-06-01T00:00:00.000Z,Wheat,27.7,25.0,78.9,50.3,15-15-15,42.1,0.0
F003,2024-06-02T00:00:00.000Z,Wheat,36.1,40.9,61.8,59.0,15-15-15,44.72,0.0
F007,2024-06-03T00:00:00.000Z,Corn,37.0,52.5,61.6,53.4,20-10-10,61.03,0.0
F002,2024-06-04T00:00:00.000Z,Corn,29.0,30.6,84.9,57.0,10-20-10,55.58,0.0
F004,2024-06-05T00:00:00.000Z,Corn,31.2,40.2,68.7,63.0,10-20-10,46.11,0.0


**STEP 4:- Feature engineering**

In [0]:
from pyspark.sql.functions import when

# RISK_FLAG is 1 when Pest_Risk_Index is above 50 and 0 otherwise.
is_high_risk = col("Pest_Risk_Index") > 50
df = df.withColumn("RISK_FLAG", when(is_high_risk, 1).otherwise(0))

preview_columns = ["CROP", "FERTILIZER", "RISK_FLAG"]
display(df.select(*preview_columns).limit(5))


CROP,FERTILIZER,RISK_FLAG
Wheat,15-15-15,0
Wheat,15-15-15,0
Corn,20-10-10,1
Corn,10-20-10,1
Corn,10-20-10,0


**STEP 5:- SQL exploratory analysis**

In [0]:
# Register the current DataFrame as the temp view "pest" so it can be queried with SQL.
df.createOrReplaceTempView("pest")

# Per-crop record count and mean temperature / humidity / rainfall,
# ordered from the crop with the most records to the one with the fewest.
crop_summary_query = """
  SELECT CROP,
         COUNT(*) AS total_records,
         ROUND(AVG(TEMPERATURE),2) AS avg_temp,
         ROUND(AVG(HUMIDITY),2) AS avg_humidity,
         ROUND(AVG(RAINFALL),2) AS avg_rainfall
  FROM pest
  GROUP BY CROP
  ORDER BY total_records DESC
"""
crop_summary = spark.sql(crop_summary_query)
crop_summary.show(10)


+-------+-------------+--------+------------+------------+
|   CROP|total_records|avg_temp|avg_humidity|avg_rainfall|
+-------+-------------+--------+------------+------------+
|   Corn|           10|   30.62|       76.58|        37.9|
|  Wheat|            7|   29.01|       75.27|        19.8|
|Soybean|            3|    27.8|       86.13|       41.07|
+-------+-------------+--------+------------+------------+



**STEP 6:- Prepare ML features**

In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

# Categorical columns -> numeric indices. handleInvalid="keep" is set on both indexers
# so labels not seen during fitting are kept rather than raising an error.
crop_indexer = StringIndexer(handleInvalid="keep", inputCol="CROP", outputCol="CROP_idx")
fert_indexer = StringIndexer(handleInvalid="keep", inputCol="FERTILIZER", outputCol="FERTILIZER_idx")

# One-hot encode both index columns in a single stage.
indexed_columns = ["CROP_idx", "FERTILIZER_idx"]
encoded_columns = ["CROP_ohe", "FERTILIZER_ohe"]
encoder = OneHotEncoder(inputCols=indexed_columns, outputCols=encoded_columns)

# Feature vector layout: the five numeric columns first, then the two one-hot vectors.
numeric_columns = ["TEMPERATURE", "HUMIDITY", "MOISTURE", "RAINFALL", "Pest_Risk_Index"]
assembler = VectorAssembler(
    inputCols=numeric_columns + encoded_columns,
    outputCol="features",
)

# Scale the assembled vector; the classifier reads "scaledFeatures".
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")


**STEP 7:- Train Model**

In [0]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the scaled feature vector, predicting the OUTBREAK label.
lr = LogisticRegression(labelCol="OUTBREAK", featuresCol="scaledFeatures")

# Full pipeline: index -> one-hot -> assemble -> scale -> classify.
pipeline_stages = [crop_indexer, fert_indexer, encoder, assembler, scaler, lr]
pipeline = Pipeline(stages=pipeline_stages)

# 80/20 split with a fixed seed, then fit on the training part and score the held-out part.
train, test = df.randomSplit(weights=[0.8, 0.2], seed=42)
model = pipeline.fit(train)
pred = model.transform(test)


**STEP 8:- Model evaluation**

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve of the scored rows against the OUTBREAK label.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="OUTBREAK")
auc = evaluator.evaluate(pred)
print("ROC AUC:", round(auc, 3))

# Expose the scored rows to SQL and count each (predicted, actual) combination.
pred.createOrReplaceTempView("predictions")

confusion_query = """
SELECT prediction, OUTBREAK AS actual, COUNT(*) AS count
FROM predictions
GROUP BY prediction, OUTBREAK
ORDER BY count DESC
"""
spark.sql(confusion_query).show()


ROC AUC: 1.0
+----------+------+-----+
|prediction|actual|count|
+----------+------+-----+
|       0.0|   0.0|    2|
|       0.0|   1.0|    1|
+----------+------+-----+



**STEP 9:- Save results to SQL table**

In [0]:
# Persist the model inputs, the true label and the predicted class for every scored row,
# overwriting the pest_predictions table if it already exists.
saved_columns = [
    "CROP", "FERTILIZER", "TEMPERATURE", "HUMIDITY", "MOISTURE",
    "RAINFALL", "OUTBREAK", "prediction",
]
predictions_to_save = pred.select(*saved_columns)
predictions_to_save.write.mode("overwrite").saveAsTable("pest_predictions")

print("✅ Predictions saved as SQL table: pest_predictions")


✅ Predictions saved as SQL table: pest_predictions


In [0]:
%sql
-- Share of saved rows, per crop, that the model classified as an outbreak.
-- `prediction` is the hard 0/1 class label, so its mean is a predicted outbreak
-- rate, not a model probability (the probability vector is not stored in this table).
SELECT CROP, ROUND(AVG(prediction), 2) AS predicted_outbreak_rate
FROM pest_predictions
GROUP BY CROP
ORDER BY predicted_outbreak_rate DESC;


CROP,outbreak_probability
Wheat,0.0
Corn,0.0


In [0]:
# Read the persisted predictions back from the table and render them.
saved_predictions = spark.sql("SELECT * FROM pest_predictions")
display(saved_predictions)


CROP,FERTILIZER,TEMPERATURE,HUMIDITY,MOISTURE,RAINFALL,OUTBREAK,prediction
Wheat,20-10-10,25.5,84.7,64.3,35.7,0.0,0.0
Corn,10-20-10,31.2,68.7,63.0,40.2,0.0,0.0
Corn,20-10-10,34.9,97.8,57.4,53.1,1.0,0.0


In [0]:
# Confusion matrix computed from the persisted pest_predictions table:
# one row per (actual label, predicted class) pair with its row count.
# ORDER BY keeps the row order stable between runs; GROUP BY alone does not
# guarantee any particular order of the groups.
conf = spark.sql("""
    SELECT
        OUTBREAK AS actual,
        prediction,
        COUNT(*) AS count
    FROM pest_predictions
    GROUP BY actual, prediction
    ORDER BY actual, prediction
""")

display(conf)



actual,prediction,count
0.0,0.0,2
1.0,0.0,1


**STEP 10:- Re-split the data 50/50 and refit the model to display more scored rows with all pipeline columns**

In [0]:
# Split the data in half (same seed as before) so that more rows end up in the scored set.
# NOTE: this rebinds `train` and `test`, replacing the 80/20 split created in STEP 7.
train, test = df.randomSplit(weights=[0.5, 0.5], seed=42)



In [0]:
# Refit the same pipeline on the current `train` split and score the current `test` split.
# NOTE: `model` and `pred` are rebound here; the pest_predictions table written in STEP 9
# is not refreshed by this cell.
model = pipeline.fit(train)
pred = model.transform(test)
pred.show(n=5)


+--------+-------------------+-----+-----------+--------+--------+--------+----------+---------------+--------+---------+--------+--------------+-------------+--------------+--------------------+--------------------+--------------------+--------------------+----------+
|Field_ID|               Date| CROP|TEMPERATURE|RAINFALL|HUMIDITY|MOISTURE|FERTILIZER|Pest_Risk_Index|OUTBREAK|RISK_FLAG|CROP_idx|FERTILIZER_idx|     CROP_ohe|FERTILIZER_ohe|            features|      scaledFeatures|       rawPrediction|         probability|prediction|
+--------+-------------------+-----+-----------+--------+--------+--------+----------+---------------+--------+---------+--------+--------------+-------------+--------------+--------------------+--------------------+--------------------+--------------------+----------+
|    F002|2024-06-20 00:00:00|Wheat|       25.5|    35.7|    84.7|    64.3|  20-10-10|          52.82|     0.0|        1|     1.0|           2.0|(3,[1],[1.0])| (3,[2],[1.0])|[25.5,84.7,64.3,

In [0]:
# Render every scored row with all columns: the original fields plus the intermediate
# pipeline outputs (indices, one-hot vectors, features, scaledFeatures) and the model
# outputs (rawPrediction, probability, prediction).
display(pred)


Field_ID,Date,CROP,TEMPERATURE,RAINFALL,HUMIDITY,MOISTURE,FERTILIZER,Pest_Risk_Index,OUTBREAK,RISK_FLAG,CROP_idx,FERTILIZER_idx,CROP_ohe,FERTILIZER_ohe,features,scaledFeatures,rawPrediction,probability,prediction
F002,2024-06-20T00:00:00.000Z,Wheat,25.5,35.7,84.7,64.3,20-10-10,52.82,0.0,1,1.0,2.0,"{""type"":""0"",""size"":""3"",""indices"":[""1""],""values"":[""1.0""]}","{""type"":""0"",""size"":""3"",""indices"":[""2""],""values"":[""1.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""25.5"",""84.7"",""64.3"",""35.7"",""52.82"",""0.0"",""1.0"",""0.0"",""0.0"",""0.0"",""1.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""5.209646160605262"",""7.17671872192945"",""7.445919586425761"",""2.1706704211775905"",""2.5978700353438957"",""0.0"",""2.1330729007701543"",""0.0"",""0.0"",""0.0"",""2.1330729007701543""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""48.1850960958362"",""-48.1850960958362""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""1.0"",""0.0""]}",0.0
F003,2024-06-01T00:00:00.000Z,Wheat,27.7,25.0,78.9,50.3,15-15-15,42.1,0.0,0,1.0,1.0,"{""type"":""0"",""size"":""3"",""indices"":[""1""],""values"":[""1.0""]}","{""type"":""0"",""size"":""3"",""indices"":[""1""],""values"":[""1.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""27.7"",""78.9"",""50.3"",""25.0"",""42.1"",""0.0"",""1.0"",""0.0"",""0.0"",""1.0"",""0.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""5.659105829363362"",""6.685278714996855"",""5.8247240310608985"",""1.5200773257546152"",""2.070623409465695"",""0.0"",""2.1330729007701543"",""0.0"",""0.0"",""2.1330729007701543"",""0.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""-5.934199639949355"",""5.934199639949355""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.002640350824333056"",""0.997359649175667""]}",1.0
F004,2024-06-05T00:00:00.000Z,Corn,31.2,40.2,68.7,63.0,10-20-10,46.11,0.0,0,0.0,0.0,"{""type"":""0"",""size"":""3"",""indices"":[""0""],""values"":[""1.0""]}","{""type"":""0"",""size"":""3"",""indices"":[""0""],""values"":[""1.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""31.2"",""68.7"",""63.0"",""40.2"",""46.11"",""1.0"",""0.0"",""0.0"",""1.0"",""0.0"",""0.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""6.374155302387614"",""5.8210221510809115"",""7.295379999141882"",""2.4442843398134215"",""2.267849059630955"",""1.927248223318863"",""0.0"",""0.0"",""1.947220240924654"",""0.0"",""0.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""59.58309798677948"",""-59.58309798677948""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""1.0"",""0.0""]}",0.0
F004,2024-06-15T00:00:00.000Z,Wheat,22.2,5.7,85.3,56.1,10-20-10,20.64,0.0,0,1.0,0.0,"{""type"":""0"",""size"":""3"",""indices"":[""1""],""values"":[""1.0""]}","{""type"":""0"",""size"":""3"",""indices"":[""0""],""values"":[""1.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""22.2"",""85.3"",""56.1"",""5.7"",""20.64"",""0.0"",""1.0"",""0.0"",""1.0"",""0.0"",""0.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""4.53545665746811"",""7.227557343336269"",""6.496362189712056"",""0.3465776302720523"",""1.0151464886311625"",""0.0"",""2.1330729007701543"",""0.0"",""1.947220240924654"",""0.0"",""0.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""29.780437162661798"",""-29.780437162661798""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.9999999999998834"",""1.1657341758564144E-13""]}",0.0
F005,2024-06-16T00:00:00.000Z,Corn,34.9,53.1,97.8,57.4,20-10-10,96.71,1.0,1,0.0,2.0,"{""type"":""0"",""size"":""3"",""indices"":[""0""],""values"":[""1.0""]}","{""type"":""0"",""size"":""3"",""indices"":[""2""],""values"":[""1.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""34.9"",""97.8"",""57.4"",""53.1"",""96.71"",""1.0"",""0.0"",""0.0"",""0.0"",""0.0"",""1.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""7.130064745298966"",""8.28669528931169"",""6.646901776995937"",""3.2286442399028026"",""4.756531827302313"",""1.927248223318863"",""0.0"",""0.0"",""0.0"",""0.0"",""2.1330729007701543""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""25.144199897201432"",""-25.144199897201432""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.999999999987977"",""1.202304922287567E-11""]}",0.0
F010,2024-06-11T00:00:00.000Z,Corn,28.7,45.8,70.4,74.6,20-10-10,62.21,0.0,1,0.0,2.0,"{""type"":""0"",""size"":""3"",""indices"":[""0""],""values"":[""1.0""]}","{""type"":""0"",""size"":""3"",""indices"":[""2""],""values"":[""1.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""28.7"",""70.4"",""74.6"",""45.8"",""62.21"",""1.0"",""0.0"",""0.0"",""0.0"",""0.0"",""1.0""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""5.863405678798863"",""5.965064911733569"",""8.638656316444196"",""2.7847816607824547"",""3.0597026675263868"",""1.927248223318863"",""0.0"",""0.0"",""0.0"",""0.0"",""2.1330729007701543""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""85.63856726022368"",""-85.63856726022368""]}","{""type"":""1"",""size"":null,""indices"":null,""values"":[""1.0"",""0.0""]}",0.0


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.