In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

# 1. Load Dataset

In [2]:
# Load the crop-recommendation dataset. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv("Backend/ml/data/crop_recommendation_dataset_updated.csv")

# Quick sanity overview: dimensions, column names and per-column null counts.
print(f"\nDataset Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nMissing Values:\n{df.isnull().sum()}")
print("\nDataset Info:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()


Dataset Shape: (4991, 10)
Columns: ['nitrogen', 'phosphorus', 'potassium', 'ph', 'temperature', 'humidity', 'rainfall', 'soil', 'season', 'crop']

Missing Values:
nitrogen       0
phosphorus     0
potassium      0
ph             0
temperature    0
humidity       0
rainfall       0
soil           0
season         0
crop           0
dtype: int64

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4991 entries, 0 to 4990
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   nitrogen     4991 non-null   float64
 1   phosphorus   4991 non-null   float64
 2   potassium    4991 non-null   float64
 3   ph           4991 non-null   float64
 4   temperature  4991 non-null   float64
 5   humidity     4991 non-null   float64
 6   rainfall     4991 non-null   float64
 7   soil         4991 non-null   object 
 8   season       4991 non-null   object 
 9   crop         4991 non-null   object 
dtypes: float64(7), object(

# 2. Feature Engineering

In [3]:
# Numeric input columns, in the order they are fed to the model.
NUMERICAL_FEATURES = [
    "nitrogen", "phosphorus", "potassium",
    "temperature", "humidity", "ph", "rainfall",
]

# Name of the label column.
TARGET = "crop"

# One LabelEncoder per string column: each maps the distinct values of its
# column to integer codes and keeps the mapping in `.classes_`.
soil_encoder, season_encoder, crop_encoder = (
    LabelEncoder(), LabelEncoder(), LabelEncoder()
)

# Add integer-coded copies of the categorical inputs next to the originals,
# and encode the target into its own array.
df['soil_encoded'] = soil_encoder.fit_transform(df['soil'])
df['season_encoded'] = season_encoder.fit_transform(df['season'])
y_encoded = crop_encoder.fit_transform(df[TARGET])

# Final feature matrix: numeric columns followed by the two encoded ones.
ENCODED_FEATURES = [*NUMERICAL_FEATURES, 'soil_encoded', 'season_encoded']
X = df[ENCODED_FEATURES]

print(f"\nFeature Matrix Shape: {X.shape}")
# Report the category vocabulary learned by each encoder.
for vocab_label, fitted_encoder in (
    ("Soil Types", soil_encoder),
    ("Seasons", season_encoder),
    ("Crops", crop_encoder),
):
    vocab = list(fitted_encoder.classes_)
    print(f"  {vocab_label} ({len(vocab)}): {vocab}")


Feature Matrix Shape: (4991, 9)
  Soil Types (9): ['Alluvial', 'Black Soil', 'Clay', 'Clay Loam', 'Coastal Sandy', 'Loamy', 'Red Soil', 'Sandy', 'Sandy Loam']
  Seasons (3): ['Kharif', 'Rabi', 'Zayad']
  Crops (23): ['Apple', 'Banana', 'Blackgram', 'Chickpea', 'Coconut', 'Coffee', 'Cotton', 'Grapes', 'Jute', 'KidneyBeans', 'Lentil', 'Maize', 'Mango', 'MothBeans', 'MungBean', 'Muskmelon', 'Orange', 'Papaya', 'PigeonPeas', 'Pomegranate', 'Rice', 'Watermelon', 'Wheat']


# 3. Train-Test Split

In [4]:
# Hold out 20% of the rows for testing. The split is stratified on the encoded
# crop label and seeded (random_state=42) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Report the size of each partition as a row count and a share of the dataset.
dataset_size = len(X)
print("\nDataset Split:")
for part_label, part in (("Training", X_train), ("Testing", X_test)):
    part_rows = part.shape[0]
    print(f"  {part_label} set: {part_rows} samples ({part_rows / dataset_size * 100:.1f}%)")


Dataset Split:
  Training set: 3992 samples (80.0%)
  Testing set: 999 samples (20.0%)


# 4. Train Model

In [5]:
banner = "=" * 60
print("\n" + banner)
print("Training Random Forest Classifier...")
print(banner)

# Hyper-parameters collected in one place so they are easy to read and tune.
rf_params = dict(
    n_estimators=300,      # number of trees in the forest
    max_depth=20,          # cap on the depth of each tree
    min_samples_split=5,   # minimum samples required to split an internal node
    min_samples_leaf=2,    # minimum samples required at a leaf
    random_state=42,       # fixed seed for repeatable results
    n_jobs=-1,             # use all available CPU cores
)
model = RandomForestClassifier(**rf_params)

# Fit on the training partition only; the test rows stay unseen.
model.fit(X_train, y_train)
print("Model training completed!")


Training Random Forest Classifier...
Model training completed!


# 5. Model Evaluation

In [6]:
banner = "=" * 60
print("\n" + banner)
print("MODEL PERFORMANCE EVALUATION")
print(banner)

# Predict on both partitions first, then score each set of predictions.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# The train/test difference is a rough indicator of overfitting.
print("\nAccuracy Scores:")
print(f"  Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"  Testing Accuracy:  {test_accuracy * 100:.2f}%")
print(f"  Difference:        {(train_accuracy - test_accuracy) * 100:.2f}%")


MODEL PERFORMANCE EVALUATION

Accuracy Scores:
  Training Accuracy: 99.70%
  Testing Accuracy:  97.40%
  Difference:        2.30%


In [7]:
# Overfitting check: classify the train-minus-test accuracy gap into three
# bands (> 0.10, > 0.05, otherwise) and print the matching verdict.
accuracy_gap = train_accuracy - test_accuracy

if accuracy_gap > 0.10:
    verdict_lines = (
        "  WARNING: Model is OVERFITTING",
        "  (Training accuracy is significantly higher than testing accuracy)",
    )
elif accuracy_gap > 0.05:
    verdict_lines = (
        "  MODERATE overfitting detected",
        "  (Training accuracy is slightly higher than testing accuracy)",
    )
else:
    verdict_lines = (
        "  Model is WELL-GENERALIZED",
        "  (Training and testing accuracies are similar)",
    )

print("\nOverfitting Analysis:")
for verdict_line in verdict_lines:
    print(verdict_line)


Overfitting Analysis:
  Model is WELL-GENERALIZED
  (Training and testing accuracies are similar)


In [8]:
# Cross-validation: 5-fold accuracy over the full feature matrix and labels
# (X, y_encoded), i.e. including the rows held out as the test set above.
print("\nCross-Validation (5-fold):")
cv_scores = cross_val_score(model, X, y_encoded, cv=5, scoring='accuracy')

# Per-fold scores as percentages, followed by their mean and spread.
fold_percentages = [f'{fold_score*100:.2f}%' for fold_score in cv_scores]
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

print(f"  CV Scores: {fold_percentages}")
print(f"  Mean CV Accuracy: {cv_mean * 100:.2f}%")
print(f"  Std Deviation: {cv_std * 100:.2f}%")


Cross-Validation (5-fold):
  CV Scores: ['96.50%', '96.89%', '96.39%', '96.69%', '95.99%']
  Mean CV Accuracy: 96.49%
  Std Deviation: 0.30%


In [9]:
# Detailed classification report: per-crop precision, recall, F1 and support
# on the held-out test set.
print("\nClassification Report:")
print("="*60)

# LabelEncoder assigns ids 0..n_classes-1 in the order of `classes_`. Passing
# that full id range as `labels` keeps every row paired with the right name in
# `target_names`, even if some crop is absent from y_test / y_test_pred.
report_class_ids = np.arange(len(crop_encoder.classes_))

print(classification_report(
    y_test,
    y_test_pred,
    labels=report_class_ids,
    target_names=crop_encoder.classes_,
    zero_division=0  # report 0 instead of warning when a class has no predictions
))


Classification Report:
              precision    recall  f1-score   support

       Apple       1.00      1.00      1.00        43
      Banana       1.00      1.00      1.00        43
   Blackgram       0.95      0.95      0.95        43
    Chickpea       0.98      0.95      0.96        43
     Coconut       1.00      1.00      1.00        44
      Coffee       1.00      1.00      1.00        43
      Cotton       1.00      1.00      1.00        44
      Grapes       1.00      1.00      1.00        44
        Jute       1.00      1.00      1.00        43
 KidneyBeans       1.00      0.93      0.96        43
      Lentil       0.96      1.00      0.98        44
       Maize       1.00      1.00      1.00        43
       Mango       1.00      0.95      0.98        43
   MothBeans       1.00      1.00      1.00        43
    MungBean       0.98      1.00      0.99        44
   Muskmelon       0.75      0.95      0.84        43
      Orange       0.98      1.00      0.99        44
   

In [10]:
# Feature importance: rank the model's inputs by the importances exposed on
# the fitted forest, highest first.
print("\nFeature Importance:")
print("="*60)

# Display names, listed in the same order as the training columns
# (numeric features, then the encoded soil and season columns).
feature_names = [*NUMERICAL_FEATURES, 'soil', 'season']

feature_importance = pd.DataFrame(
    {'Feature': feature_names, 'Importance': model.feature_importances_}
)
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# One line per feature: raw importance plus the same value as a percentage.
for feat_name, feat_score in zip(
    feature_importance['Feature'], feature_importance['Importance']
):
    print(f"  {feat_name:15s}: {feat_score:.4f} ({feat_score*100:.2f}%)")


Feature Importance:
  potassium      : 0.1991 (19.91%)
  rainfall       : 0.1808 (18.08%)
  nitrogen       : 0.1689 (16.89%)
  phosphorus     : 0.1414 (14.14%)
  humidity       : 0.0878 (8.78%)
  temperature    : 0.0859 (8.59%)
  season         : 0.0582 (5.82%)
  soil           : 0.0488 (4.88%)
  ph             : 0.0289 (2.89%)


In [11]:
# Confusion Matrix Summary: overall hit/miss counts on the test set.
print("\nConfusion Matrix Summary:")

# Build the matrix over the encoder's full id range (0..n_classes-1) so that
# row/column i always corresponds to crop_encoder.classes_[i]. Without
# `labels`, only classes present in y_test / y_test_pred get a row, and the
# per-class lookup `cm[i]` by class index could silently misalign.
all_class_ids = np.arange(len(crop_encoder.classes_))
cm = confusion_matrix(y_test, y_test_pred, labels=all_class_ids)

# The diagonal holds the correctly classified samples; the matrix total is
# the number of test samples.
test_correct = np.trace(cm)
test_total = cm.sum()

print(f"  Correct Predictions: {test_correct}")
print(f"  Incorrect Predictions: {test_total - test_correct}")
print(f"  Total Test Samples: {test_total}")


Confusion Matrix Summary:
  Correct Predictions: 973
  Incorrect Predictions: 26
  Total Test Samples: 999


In [12]:
# Per-class accuracy: for each crop, the diagonal entry of its confusion-matrix
# row divided by the row total (i.e. the share of that crop's test samples
# that were predicted correctly).
print("\nPer-Class Accuracy:")
for class_idx, crop_name in enumerate(crop_encoder.classes_):
    class_row = cm[class_idx]
    class_support = class_row.sum()
    # Skip crops with no test samples to avoid dividing by zero.
    if not class_support > 0:
        continue
    class_accuracy = class_row[class_idx] / class_support
    print(f"  {crop_name:15s}: {class_accuracy*100:.2f}%")


Per-Class Accuracy:
  Apple          : 100.00%
  Banana         : 100.00%
  Blackgram      : 95.35%
  Chickpea       : 95.35%
  Coconut        : 100.00%
  Coffee         : 100.00%
  Cotton         : 100.00%
  Grapes         : 100.00%
  Jute           : 100.00%
  KidneyBeans    : 93.02%
  Lentil         : 100.00%
  Maize          : 100.00%
  Mango          : 95.35%
  MothBeans      : 100.00%
  MungBean       : 100.00%
  Muskmelon      : 95.35%
  Orange         : 100.00%
  Papaya         : 100.00%
  PigeonPeas     : 97.73%
  Pomegranate    : 100.00%
  Rice           : 100.00%
  Watermelon     : 68.18%
  Wheat          : 100.00%


In [13]:
# Final recap of the headline metrics computed in the evaluation cells.
banner = "=" * 60

# Same 0.05 accuracy-gap threshold as the overfitting analysis above.
if train_accuracy - test_accuracy <= 0.05:
    overfitting_status = 'Well-Generalized'
else:
    overfitting_status = 'Overfitting Detected'

print("\n" + banner)
print("TRAINING SUMMARY")
print(banner)
print("\nModel Training: COMPLETED")
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")
print(f"Cross-Validation Accuracy: {cv_scores.mean() * 100:.2f}%")
print(f"Overfitting Status: {overfitting_status}")


TRAINING SUMMARY

Model Training: COMPLETED
Training Accuracy: 99.70%
Testing Accuracy: 97.40%
Cross-Validation Accuracy: 96.49%
Overfitting Status: Well-Generalized


# 6. Save the Trained Model and Encoders

In [14]:
# Persist the fitted model and the three label encoders with joblib.
MODEL_DIR = Path('Backend/ml/models')

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# (label used in the confirmation message, object to save, file name)
artifacts = [
    ("Model", model, 'crop_recommendation_model.pkl'),
    ("Soil encoder", soil_encoder, 'soil_encoder.pkl'),
    ("Season encoder", season_encoder, 'season_encoder.pkl'),
    ("Crop encoder", crop_encoder, 'crop_encoder.pkl'),
]

for artifact_label, artifact_obj, artifact_file in artifacts:
    joblib.dump(artifact_obj, MODEL_DIR / artifact_file)
    print(f"✓ {artifact_label} saved successfully!")

✓ Model saved successfully!
✓ Soil encoder saved successfully!
✓ Season encoder saved successfully!
✓ Crop encoder saved successfully!
