In [2]:
# Import necessary libraries for saving models
import joblib

# Import necessary libraries for preprocessing and training
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Load the dataset
crop_data = pd.read_csv("Crop_recommendation.csv")

# 1. Separate the features from the target
X = crop_data.drop('label', axis=1)
y = crop_data['label']

# 2. Impute only the numeric features
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Replace missing values in the numeric columns with the column mean.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split below, and it is not persisted alongside the scaler and
# the models -- confirm whether that is intended.
imputer = SimpleImputer(strategy='mean')
X[numeric_features] = imputer.fit_transform(X[numeric_features])

# Remove duplicated feature rows (keeping the first occurrence) and drop the
# very same rows from the target. Deduplicating X alone would leave y longer
# than X, and train_test_split would then fail on inconsistent sample counts.
# A positional boolean mask is used so both objects are filtered identically.
keep_rows = ~X.duplicated().to_numpy()
X = X.loc[keep_rows]
y = y.loc[keep_rows]

# 3. Encode the target variable (string labels -> integer class ids)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Save the label encoder so predictions can be mapped back to label names
joblib.dump(label_encoder, 'label_encoder.pkl')

# 4. Train Test Split (80% train / 20% test, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Feature Scaling
# Both scalers are fitted on the training split only and then applied to the
# training and test splits, so no test-set statistics enter the fit.

# Min-max scaling
scaler_minmax = MinMaxScaler().fit(X_train)
X_train_scaled_minmax = scaler_minmax.transform(X_train)
X_test_scaled_minmax = scaler_minmax.transform(X_test)

# Standardisation (computed for comparison; not used further in this section)
scaler_standard = StandardScaler().fit(X_train)
X_train_scaled_standard = scaler_standard.transform(X_train)
X_test_scaled_standard = scaler_standard.transform(X_test)

# The min-max scaled features are the ones used from here on
X_train_scaled, X_test_scaled = X_train_scaled_minmax, X_test_scaled_minmax

# Persist the scaler that was chosen. If the StandardScaler is selected
# instead, save `scaler_standard` to 'scaler_standard.pkl' rather than this one.
joblib.dump(scaler_minmax, 'scaler_minmax.pkl')

# 6. Define and Train Models
# Candidate estimators, keyed by display name and created with default settings
models = {
    label: estimator_cls()
    for label, estimator_cls in (
        ("Random Forest", RandomForestClassifier),
        ("Gradient Boosting", GradientBoostingClassifier),
        ("XGBoost", XGBClassifier),
    )
}

# Hyperparameter search spaces, one per estimator
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

param_grid_gb = dict(
    n_estimators=[100, 200],
    learning_rate=[0.1, 0.01],
    max_depth=[3, 5],
)

param_grid_xgb = dict(
    n_estimators=[100, 200],
    learning_rate=[0.1, 0.01],
    max_depth=[3, 5],
    gamma=[0, 0.1, 0.2],
)

# Look up the search space by the same display name used in `models`
grids = dict(zip(models, (param_grid_rf, param_grid_gb, param_grid_xgb)))

# Tune every candidate with a 5-fold, accuracy-scored grid search on the
# scaled training data, keep the best estimator and persist it to disk.
best_models = {}
for name, model in models.items():
    print(f"Training and tuning {name}...")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=grids[name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train_scaled, y_train)
    print(f"Best parameters for {name}: {grid_search.best_params_}")

    # Keep the winner and write it to "<name in lower snake case>_model.pkl"
    best_models[name] = grid_search.best_estimator_
    model_path = name.replace(" ", "_").lower() + "_model.pkl"
    joblib.dump(best_models[name], model_path)

# 7. Evaluate the models
# Score each tuned model on the held-out test split, print its metrics and
# collect them in `evaluation_results`, keyed by model name.
evaluation_results = {}
for name, model in best_models.items():
    print(f"Evaluating {name}...")
    y_pred = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    evaluation_results[name] = dict(
        accuracy=accuracy,
        classification_report=classification_rep,
        confusion_matrix=conf_matrix,
    )

    # One item per line, closed by a 50-character "=" rule between blank lines
    print(
        f"Accuracy for {name}: {accuracy:.4f}",
        classification_rep,
        conf_matrix,
        "\n" + "=" * 50 + "\n",
        sep="\n",
    )

# Final output: evaluation_results holds every metric for each model


Training and tuning Random Forest...
Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Training and tuning Gradient Boosting...
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Training and tuning XGBoost...
Best parameters for XGBoost: {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Evaluating Random Forest...
Accuracy for Random Forest: 0.9932
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        20
           3       1.00      1.00      1.00        26
           4       1.00      1.00      1.00        27
           5       1.00      1.00      1.00        17
           6       1.00      1.00      1.00        17
           7       1.00      1.00      1.00        14
           8      