# Model Training

This notebook trains three regression models to predict crop suitability scores (0-100):
- Random Forest
- XGBoost
- Gradient Boosting

Each model's hyperparameters are tuned with a grid search, and the tuned model with the lowest RMSE on the validation set is saved as the best-performing model.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
import joblib
import json
from pathlib import Path

# Read the training and validation splits from the models directory
train_df = pd.read_csv("../models/training_dataset.csv")
val_df = pd.read_csv("../models/validation_dataset.csv")

# Report (rows, columns) of each split, one per line
print(
    f"Training dataset shape: {train_df.shape}",
    f"Validation dataset shape: {val_df.shape}",
    sep="\n",
)


In [None]:
# Model inputs: the nine columns below; target: 'suitability_score'
feature_columns = [
    'npk_match',
    'ph_proximity',
    'temp_suitability',
    'rainfall_suitability',
    'humidity_suitability',
    'soil_match',
    'historical_yield',
    'season_alignment',
    'regional_success',
]

# Split each dataframe into its feature matrix and target series
X_train, y_train = train_df[feature_columns], train_df['suitability_score']
X_val, y_val = val_df[feature_columns], val_df['suitability_score']

print(f"Training features shape: {X_train.shape}")
print(f"Validation features shape: {X_val.shape}")


## Hyperparameter Tuning

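We'll use `GridSearchCV` (3-fold cross-validation, scored by negative mean squared error) to find the best hyperparameters for each model, then evaluate each tuned model on the validation set using RMSE, MAE and R².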

In [None]:
def train_and_evaluate(model, param_grid, X_train, y_train, X_val, y_val,
                       cv=3, scoring='neg_mean_squared_error', n_jobs=-1):
    """Tune ``model`` with a grid search and score the winner on the validation set.

    Parameters
    ----------
    model : estimator
        Unfitted regressor handed to ``GridSearchCV`` as ``estimator``.
    param_grid : dict
        Hyperparameter grid forwarded to ``GridSearchCV``.
    X_train, y_train
        Data the cross-validated search is fitted on.
    X_val, y_val
        Held-out data used only to compute the returned metrics.
    cv : int, default 3
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default 'neg_mean_squared_error'
        Metric the search uses to rank hyperparameter combinations.
    n_jobs : int, default -1
        Parallelism setting forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_model, rmse, mae, r2)`` where ``best_model`` is the search's
        ``best_estimator_`` and the three metrics are computed from its
        predictions on ``X_val`` against ``y_val``.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=n_jobs,
        scoring=scoring,
    )
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_val)

    # RMSE is reported in the target's own units, hence the square root of MSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    return best_model, rmse, mae, r2


In [None]:
# Random Forest
# Grid of 2 x 2 x 2 = 8 hyperparameter combinations
rf_param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20],
    "min_samples_split": [2, 5],
}

# Tune on the training data, then score the best estimator on the validation data
rf_model, rf_rmse, rf_mae, rf_r2 = train_and_evaluate(
    model=RandomForestRegressor(random_state=42),
    param_grid=rf_param_grid,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)
print(f"Random Forest - RMSE: {rf_rmse:.2f}, MAE: {rf_mae:.2f}, R²: {rf_r2:.4f}")


In [None]:
# XGBoost
# Grid of 2 x 2 x 2 = 8 hyperparameter combinations
xgb_param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10],
    "learning_rate": [0.05, 0.1],
}

# Tune on the training data, then score the best estimator on the validation data
xgb_model, xgb_rmse, xgb_mae, xgb_r2 = train_and_evaluate(
    model=xgb.XGBRegressor(random_state=42),
    param_grid=xgb_param_grid,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)
print(f"XGBoost - RMSE: {xgb_rmse:.2f}, MAE: {xgb_mae:.2f}, R²: {xgb_r2:.4f}")


In [None]:
# Gradient Boosting
# Grid of 2 x 2 x 2 = 8 hyperparameter combinations
gb_param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10],
    "learning_rate": [0.05, 0.1],
}

# Tune on the training data, then score the best estimator on the validation data
gb_model, gb_rmse, gb_mae, gb_r2 = train_and_evaluate(
    model=GradientBoostingRegressor(random_state=42),
    param_grid=gb_param_grid,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)
print(f"Gradient Boosting - RMSE: {gb_rmse:.2f}, MAE: {gb_mae:.2f}, R²: {gb_r2:.4f}")


In [None]:
# Save the best model
# Each candidate maps its name to (tuned estimator, validation RMSE)
models = {
    "RandomForest": (rf_model, rf_rmse),
    "XGBoost": (xgb_model, xgb_rmse),
    "GradientBoosting": (gb_model, gb_rmse),
}

# Lowest validation RMSE wins; on a tie the earliest entry above is kept
best_model_name, (best_model, best_rmse) = min(
    models.items(), key=lambda entry: entry[1][1]
)

print(f"\nBest model: {best_model_name} (RMSE: {best_rmse:.2f})")

# Serialize the winning estimator next to the datasets
output_dir = Path("../models")
joblib.dump(best_model, output_dir / "crop_suitability_model.pkl")
print(f"Model saved to {output_dir / 'crop_suitability_model.pkl'}")

# Record the feature names and the winning model's name in a JSON file beside the pickle
model_info = {"feature_names": feature_columns, "model_type": best_model_name}
with (output_dir / "model_info.json").open("w") as info_file:
    json.dump(model_info, info_file, indent=2)
print("Model info saved successfully!")
