In [1]:
import pandas as pd

# Read the feature table, then the three target tables (Kharif yield, Rabi yield, market price)
X = pd.read_csv("final_input_features.csv")
y_kharif, y_rabi, y_price = (
    pd.read_csv(target_csv)
    for target_csv in (
        "target_kharif_yield.csv",
        "target_rabi_yield.csv",
        "target_market_price.csv",
    )
)


In [2]:
#Train-Test Split
from sklearn.model_selection import train_test_split

# Every call below passes the same X, test_size and random_state, so the
# train/test row partition should be the same for all three targets; only the
# target table differs.

# Kharif yield: 80% train / 20% test
X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(
    X,
    y_kharif,
    random_state=42,
    test_size=0.2,
)

# Rabi yield: 80% train / 20% test
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X,
    y_rabi,
    random_state=42,
    test_size=0.2,
)

# Market price: 80% train / 20% test
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X,
    y_price,
    random_state=42,
    test_size=0.2,
)


In [8]:
from xgboost import XGBRegressor

# Hyperparameters shared by all three regressors. Keeping them in one place
# means the models cannot silently drift apart when a value is tuned.
XGB_PARAMS = {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42}

# Kharif Yield Model
xgb_kharif = XGBRegressor(**XGB_PARAMS)
xgb_kharif.fit(X_train_k, y_train_k)

# Rabi Yield Model
xgb_rabi = XGBRegressor(**XGB_PARAMS)
xgb_rabi.fit(X_train_r, y_train_r)

# Market Price Model
xgb_price = XGBRegressor(**XGB_PARAMS)
xgb_price.fit(X_train_p, y_train_p)


In [9]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

def evaluate(model, X_test, y_test, name):
    """Print the R² score and RMSE of a fitted model on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features passed to ``model.predict``.
        y_test: True target values the predictions are compared against.
        name: Label placed at the start of the printed line.

    Returns:
        None. The two metrics are only printed, each formatted to three decimals.
    """
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # Taking the square root turns the MSE into an RMSE, which is in the target's own units.
    rmse = np.sqrt(mse)
    message = f"{name} — R² Score: {score:.3f}, RMSE: {rmse:.3f}"
    print(message)

# Score each model on the test split that was held out for its own target.
evaluate(model=xgb_kharif, X_test=X_test_k, y_test=y_test_k, name="Kharif Yield")
evaluate(model=xgb_rabi, X_test=X_test_r, y_test=y_test_r, name="Rabi Yield")
evaluate(model=xgb_price, X_test=X_test_p, y_test=y_test_p, name="Market Price")


Kharif Yield — R² Score: 0.655, RMSE: 1.150
Rabi Yield — R² Score: 0.738, RMSE: 1.255
Market Price — R² Score: 0.465, RMSE: 2315.721


 **XGBoost Model Evaluation Results**
| Target                | R² Score | RMSE      | Interpretation |
|-----------------------|----------|-----------|----------------|
| **Kharif Yield**      | 0.655    | 1.150     | Moderate fit; slightly below Random Forest (R² 0.688) |
| **Rabi Yield**        | 0.738    | 1.255     | Best of the three targets; essentially tied with Random Forest (R² 0.739) |
| **Market Price**      | 0.465    | 2315.721  | Weak fit, well below Random Forest (R² 0.676); the model explains less than half of the variance in market price |

---

**Comparison with Random Forest**
| Model           | Target           | R² Score (RF) | R² Score (XGB) | Winner     |
|----------------|------------------|---------------|----------------|------------|
| Random Forest  | Kharif Yield     | **0.688**     | 0.655          | 🏆 RF       |
| Random Forest  | Rabi Yield       | 0.739         | 0.738          | ⚖️ Similar  |
| Random Forest  | Market Price     | **0.676**     | 0.465          | 🏆 RF       |

---

**What This Tells You**
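- **Random Forest** performs better on this dataset and train/test split, especially for **Market Price** (R² 0.676 vs 0.465) and, to a lesser extent, **Kharif Yield** (0.688 vs 0.655).
- **XGBoost** was trained with fixed, untuned settings (`n_estimators=100`, `learning_rate=0.1`), so it may benefit from **hyperparameter tuning** (learning rate, max depth, etc.) or **more data cleaning/feature engineering**.
- **Rabi Yield** is predicted about equally well by both models (R² ≈ 0.74), which suggests the current features carry more signal for that target than for the other two.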

---


In [10]:
# Fitted regressors keyed by a target label; the save cell below derives each
# output file name from its key.
xgb_models = dict([
    ('Kharif_Yield_(Tonne/Hectare)', xgb_kharif),
    ('Rabi_Yield_(Tonne/Hectare)', xgb_rabi),
    ('Average_Market_Price', xgb_price),
])


In [11]:
import os
import joblib

# Create the output folder; exist_ok=True makes re-running this cell safe.
os.makedirs("saved_models_xgb", exist_ok=True)

# Characters in the model keys that are awkward in a file name:
# spaces and '/' become '_', parentheses are dropped (None deletes the character).
filename_cleanup = str.maketrans({' ': '_', '(': None, ')': None, '/': '_'})

# NOTE(review): joblib pickles the whole Python object. XGBoost's Model IO guide
# recommends save_model() for files that must load across library versions.
for name, model in xgb_models.items():
    safe_name = name.translate(filename_cleanup)
    filename = f"saved_models_xgb/{safe_name}_XGB_model.pkl"
    joblib.dump(model, filename)
    print(f"✅ Saved: {filename}")


✅ Saved: saved_models_xgb/Kharif_Yield_Tonne_Hectare_XGB_model.pkl
✅ Saved: saved_models_xgb/Rabi_Yield_Tonne_Hectare_XGB_model.pkl
✅ Saved: saved_models_xgb/Average_Market_Price_XGB_model.pkl
