In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("atharvaingle/crop-recommendation-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/atharvaingle/crop-recommendation-dataset?dataset_version_number=1...


100%|██████████| 63.7k/63.7k [00:00<00:00, 153kB/s]

Extracting files...
Path to dataset files: C:\Users\jjona\.cache\kagglehub\datasets\atharvaingle\crop-recommendation-dataset\versions\1





In [4]:
import pandas as pd
import os
# Read the dataset
df = pd.read_csv(os.path.join(path, "Crop_recommendation.csv"))
print("Dataset shape:", df.shape)
print("Dataset columns:", df.columns)
print("Dataset head:\n", df.head())

Dataset shape: (2200, 8)
Dataset columns: Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')
Dataset head:
     N   P   K  temperature   humidity        ph    rainfall label
0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice
1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice
2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice
3  74  35  40    26.491096  80.158363  6.980401  242.864034  rice
4  78  42  42    20.130175  81.604873  7.628473  262.717340  rice


In [23]:
#unique values in the label column
print("Unique values in label column:", df['label'].unique())

Unique values in label column: ['rice' 'maize' 'chickpea' 'kidneybeans' 'pigeonpeas' 'mothbeans'
 'mungbean' 'blackgram' 'lentil' 'pomegranate' 'banana' 'mango' 'grapes'
 'watermelon' 'muskmelon' 'apple' 'orange' 'papaya' 'coconut' 'cotton'
 'jute' 'coffee']


In [5]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode label
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(df.drop(['label', 'label_encoded'], axis=1))
y = df['label_encoded']


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Split the train set into 80% train and 20% validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print("Train set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Train set shape: (1408, 7) (1408,)
Validation set shape: (352, 7) (352,)
Test set shape: (440, 7) (440,)


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

models = {
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    print(f"{name} Accuracy:", accuracy_score(y_val, preds))


Random Forest Accuracy: 0.9943181818181818
SVM Accuracy: 0.9829545454545454
KNN Accuracy: 0.9659090909090909


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=1, scoring='accuracy')

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Best Cross-Validation Accuracy: 0.9943161454784081


In [19]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Best model from grid search
best_rf = grid_search.best_estimator_

# Predict on validation set
val_preds = best_rf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_preds))
print("Validation Report:\n", classification_report(y_val, val_preds))
print("Validation Confusion Matrix:\n", confusion_matrix(y_val, val_preds))

# Predict on test set
test_preds = best_rf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, test_preds))
print("Test Report:\n", classification_report(y_test, test_preds))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, test_preds))


Validation Accuracy: 0.9943181818181818
Validation Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        15
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00        15
           4       1.00      1.00      1.00        14
           5       1.00      1.00      1.00        21
           6       1.00      1.00      1.00        16
           7       1.00      1.00      1.00        18
           8       0.92      1.00      0.96        23
           9       1.00      1.00      1.00        18
          10       1.00      1.00      1.00        19
          11       1.00      1.00      1.00        21
          12       1.00      1.00      1.00        14
          13       1.00      1.00      1.00         9
          14       1.00      1.00      1.00        18
          15       1.00      1.00      1.00        17
          16       1.

In [22]:
import joblib
joblib.dump(best_rf, 'crop_recommendation_rf_model.pkl')
joblib.dump(scaler, 'scaler.pkl')  # Save the scaler
joblib.dump(le, 'label_encoder.pkl')  # Save the label encoder

['label_encoder.pkl']