# Model Training

This notebook trains a regression model to predict crop suitability scores (0-100).
It fits both a Random Forest and an XGBoost regressor, compares them on a held-out test set, and saves the one with the lower RMSE (Option 2: direct suitability score prediction).


In [2]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Read the prepared training table (one row per sample, engineered features plus
# the suitability_score target) and show its size and first rows as a sanity check.
train_df = pd.read_csv("../models/training_dataset.csv")
print(
    f"Training dataset shape: {train_df.shape}",
    train_df.head(),
    sep="\n",
)


Training dataset shape: (4884, 14)
   npk_match  ph_proximity  temp_suitability  rainfall_suitability  \
0   0.000000      0.801245          0.956212              0.510209   
1   0.000000      0.547619          0.780627              0.000000   
2   0.666667      0.819444          0.607525              0.124888   
3   0.666667      0.802083          0.913970              0.000000   
4   0.666667      0.895717          0.729678              0.000000   

   humidity_suitability  soil_match  historical_yield  season_alignment  \
0              0.941970         0.3          1.000000               1.0   
1              0.821305         1.0          0.416671               1.0   
2              0.782908         1.0          1.000000               1.0   
3              0.581100         1.0          0.630325               1.0   
4              0.767406         0.3          1.000000               1.0   

   regional_success  suitability_score                    crop_name  \
0               1.0   

In [3]:
# Model inputs: the nine engineered suitability features, listed one per line.
# This order is the column order of X.
feature_columns = [
    'npk_match',
    'ph_proximity',
    'temp_suitability',
    'rainfall_suitability',
    'humidity_suitability',
    'soil_match',
    'historical_yield',
    'season_alignment',
    'regional_success',
]

# Feature matrix and regression target taken from the loaded training table
y = train_df['suitability_score']
X = train_df[feature_columns]

# Shapes first, then summary statistics for a quick look at value ranges
print(f"Features shape: {X.shape}", f"Target shape: {y.shape}", sep="\n")
print("\nFeature statistics:", X.describe(), sep="\n")
print("\nTarget statistics:", y.describe(), sep="\n")


Features shape: (4884, 9)
Target shape: (4884,)

Feature statistics:
         npk_match  ph_proximity  temp_suitability  rainfall_suitability  \
count  4884.000000   4884.000000       4884.000000           4884.000000   
mean      0.243243      0.572344          0.801956              0.188675   
std       0.260368      0.312759          0.142623              0.235216   
min       0.000000      0.000000          0.339717              0.000000   
25%       0.000000      0.345811          0.730741              0.000000   
50%       0.333333      0.656339          0.799291              0.077739   
75%       0.333333      0.833333          0.924239              0.339243   
max       1.000000      1.000000          0.999844              0.988756   

       humidity_suitability   soil_match  historical_yield  season_alignment  \
count           4884.000000  4884.000000       4884.000000       4884.000000   
mean               0.784451     0.678522          0.518785          0.942192   
std   

In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition the same across reruns.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")


Training set: 3907 samples
Test set: 977 samples


In [5]:
# Fit a Random Forest regressor on the training split and score it on the test split
print("Training Random Forest model...")
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,  # fixed seed for repeatable fits
    n_jobs=-1,        # let scikit-learn use all available cores
)
rf_model = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Test-set metrics; rmse_rf is read again later when the two models are compared
y_pred_rf = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)

print(
    "\nRandom Forest Results:",
    f"RMSE: {rmse_rf:.2f}",
    f"MAE: {mae_rf:.2f}",
    f"R²: {r2_rf:.4f}",
    sep="\n",
)


Training Random Forest model...

Random Forest Results:
RMSE: 15.59
MAE: 8.47
R²: 0.7661


In [6]:
# Fit an XGBoost regressor on the same training split and score it on the test split
print("\nTraining XGBoost model...")
xgb_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,  # fixed seed for repeatable fits
    n_jobs=-1,        # use all available cores
)
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Test-set metrics; rmse_xgb is read again later when the two models are compared
y_pred_xgb = xgb_model.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)

print(
    "\nXGBoost Results:",
    f"RMSE: {rmse_xgb:.2f}",
    f"MAE: {mae_xgb:.2f}",
    f"R²: {r2_xgb:.4f}",
    sep="\n",
)



Training XGBoost model...

XGBoost Results:
RMSE: 16.02
MAE: 8.47
R²: 0.7530


In [7]:
# Choose the best model (lower test-set RMSE wins) and persist it with its metadata.
# NOTE: the winner is picked on the same test split that produced the reported
# metrics, so the winner's RMSE is a slightly optimistic estimate. Use a separate
# validation split or cross-validation if an unbiased figure is needed.
# The comparison is strict, so an exact tie selects XGBoost.
if rmse_rf < rmse_xgb:
    best_model = rf_model
    best_model_name = "RandomForest"
    print(f"\nBest model: Random Forest (RMSE: {rmse_rf:.2f})")
else:
    best_model = xgb_model
    best_model_name = "XGBoost"
    print(f"\nBest model: XGBoost (RMSE: {rmse_xgb:.2f})")

# Save model; create the output directory first so a fresh checkout does not fail
model_path = Path("../models/crop_yield_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
print(f"\nModel saved to {model_path}")

# Save the feature names (in training column order) and the winning model type
# next to the model file, for reference by whoever loads the model
feature_info = {
    'feature_names': feature_columns,
    'model_type': best_model_name
}
info_path = model_path.with_name("model_info.json")
with open(info_path, "w", encoding="utf-8") as f:
    json.dump(feature_info, f, indent=2)
# Report the path that was actually written, in the same form as the model path above
print(f"Model info saved to {info_path}")



Best model: Random Forest (RMSE: 15.59)

Model saved to ..\models\crop_yield_model.pkl
Model info saved to models/model_info.json
