# Crop Yield Prediction – Final Model Pipeline (Rev9)

## 1. Setup & Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import os
from scipy.signal import detrend
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna
import shap

np.random.seed(42)
torch.manual_seed(42)
sns.set_style("whitegrid")
warnings.filterwarnings('ignore')
print("All libraries loaded.")

  from .autonotebook import tqdm as notebook_tqdm


All libraries loaded.


## 2. Model Selection Menu

In [2]:
RUN_MODELS = {
    'LR': True,   # Baseline Linear Regression
    'RF': True,   # Random Forest
    'XGB': True,  # XGBoost
    'LSTM': True, # LSTM
    'CNN': True   # CNN
}

RUN_OPTUNA = True

## 3. Load & Preprocess Data

In [3]:
class CropDataPipeline:
    def __init__(self, data_path):
        self.df = pd.read_csv(data_path)
        self.target = 'hg/ha_yield'
        self.time_col = 'Year'
        self.cat_cols = ['Area', 'Item']
        self.numeric_cols = ['average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp', 'fertilizer_kg/ha', 'solar_radiation_MJ/m2-day']
        self.trend_models = {}
        self.le_area = LabelEncoder()
        self.le_item = LabelEncoder()
        self.scaler = StandardScaler()
        self.scaler_dl = StandardScaler()
        self.target_scaler = StandardScaler()
        self.train_end = 2007
        self.val_end = 2010
        self.lookback = 5
        self.lag_cols = []  # Will set after detrend
        self.lags = [1, 2]
        self.ml_feats = []  # Will set later
        self.dl_feats = self.numeric_cols + ['Area_Encoded', 'Item_Encoded']
    
    def split_data(self):
        self.train_df = self.df[self.df[self.time_col] <= self.train_end].copy()
        self.val_df = self.df[(self.df[self.time_col] > self.train_end) & (self.df[self.time_col] <= self.val_end)].copy()
        self.test_df = self.df[self.df[self.time_col] > self.val_end].copy()
    
    def fit_trend(self):
        for key, group in self.train_df.groupby(self.cat_cols):
            if len(group) < 2:
                continue
            X = group[[self.time_col]]
            y = group[self.target]
            model = LinearRegression().fit(X, y)
            self.trend_models[tuple(key)] = model
    
    def add_detrend(self):
        for d in [self.train_df, self.val_df, self.test_df]:
            d['yield_trend'] = np.nan
            d['yield_detrended'] = np.nan
            for idx in d.index:
                row = d.loc[idx]
                key = (row[self.cat_cols[0]], row[self.cat_cols[1]])
                if key in self.trend_models:
                    year = [[row[self.time_col]]]
                    trend = self.trend_models[key].predict(year)[0]
                    d.loc[idx, 'yield_trend'] = trend
                    d.loc[idx, 'yield_detrended'] = row[self.target] - trend
        self.lag_cols = ['yield_detrended'] + self.numeric_cols
    
    def create_lags(self):
        self.df_full = pd.concat([self.train_df, self.val_df, self.test_df]).sort_values(self.cat_cols + [self.time_col])
        for col in self.lag_cols:
            for lag in self.lags:
                self.df_full[f'{col}_lag{lag}'] = self.df_full.groupby(self.cat_cols)[col].shift(lag)
        self.df_ml = self.df_full.dropna().copy()
        self.lagged_cols = [c for c in self.df_ml.columns if '_lag' in c]
        self.ml_feats = self.numeric_cols + self.lagged_cols + ['Area_Encoded', 'Item_Encoded']
    
    def resplit(self):
        self.train_df = self.df_ml[self.df_ml[self.time_col] <= self.train_end].copy()
        self.val_df = self.df_ml[(self.df_ml[self.time_col] > self.train_end) & (self.df_ml[self.time_col] <= self.val_end)].copy()
        self.test_df = self.df_ml[self.df_ml[self.time_col] > self.val_end].copy()
    
    def fit_encode_scale(self):
        all_cats = pd.concat([self.train_df, self.val_df, self.test_df])
        self.le_area.fit(all_cats['Area'])
        self.le_item.fit(all_cats['Item'])
        for d in [self.train_df, self.val_df, self.test_df]:
            d['Area_Encoded'] = self.le_area.transform(d['Area'])
            d['Item_Encoded'] = self.le_item.transform(d['Item'])
        scale_cols = self.numeric_cols + self.lagged_cols
        self.scaler.fit(self.train_df[scale_cols])
        self.train_df[scale_cols] = self.scaler.transform(self.train_df[scale_cols])
        self.val_df[scale_cols] = self.scaler.transform(self.val_df[scale_cols])
        self.test_df[scale_cols] = self.scaler.transform(self.test_df[scale_cols])
    
    def prepare_dl(self):
        self.df_dl = self.df_full.copy()  # Use full for sequences (no dropna)
        self.train_df_dl = self.df_dl[self.df_dl[self.time_col] <= self.train_end].copy()
        self.val_df_dl = self.df_dl[(self.df_dl[self.time_col] > self.train_end) & (self.df_dl[self.time_col] <= self.val_end)].copy()
        self.test_df_dl = self.df_dl[self.df_dl[self.time_col] > self.val_end].copy()
        for d in [self.train_df_dl, self.val_df_dl, self.test_df_dl]:
            d['Area_Encoded'] = self.le_area.transform(d['Area'])
            d['Item_Encoded'] = self.le_item.transform(d['Item'])
        self.scaler_dl.fit(self.train_df_dl[self.numeric_cols])
        self.train_df_dl[self.numeric_cols] = self.scaler_dl.transform(self.train_df_dl[self.numeric_cols])
        self.val_df_dl[self.numeric_cols] = self.scaler_dl.transform(self.val_df_dl[self.numeric_cols])
        self.test_df_dl[self.numeric_cols] = self.scaler_dl.transform(self.test_df_dl[self.numeric_cols])
        self.target_scaler.fit(self.train_df_dl[['yield_detrended']])
        self.train_df_dl['yield_detrended'] = self.target_scaler.transform(self.train_df_dl[['yield_detrended']])
        self.val_df_dl['yield_detrended'] = self.target_scaler.transform(self.val_df_dl[['yield_detrended']])
        self.test_df_dl['yield_detrended'] = self.target_scaler.transform(self.test_df_dl[['yield_detrended']])
    
    def save_transformers(self):
        joblib.dump(self.scaler, 'scaler.joblib')
        joblib.dump(self.le_area, 'le_area.joblib')
        joblib.dump(self.le_item, 'le_item.joblib')
        joblib.dump(self.trend_models, 'trend_models.joblib')
        joblib.dump(self.scaler_dl, 'scaler_dl.joblib')
        joblib.dump(self.target_scaler, 'target_scaler.joblib')

# Usage in notebook (replaces sections 3 and 4)
pipeline = CropDataPipeline('cleaned_crop_data.csv')
pipeline.split_data()
pipeline.fit_trend()
pipeline.add_detrend()
pipeline.create_lags()
pipeline.resplit()
pipeline.fit_encode_scale()
pipeline.prepare_dl()
pipeline.save_transformers()

N_AREAS = len(pipeline.le_area.classes_)
N_ITEMS = len(pipeline.le_item.classes_)


ValueError: y contains previously unseen labels: 'Sudan'

## 4. Prepare Inputs

In [None]:
# ML inputs
X_train_ml = pipeline.train_df[pipeline.ml_feats]
y_train_ml = pipeline.train_df['yield_detrended']
X_val_ml = pipeline.val_df[pipeline.ml_feats]
y_val_ml = pipeline.val_df['yield_detrended']
X_test_ml = pipeline.test_df[pipeline.ml_feats]
y_test_ml = pipeline.test_df['yield_detrended']

def create_sequences(data, lookback, feats, target):
    X, y = [], []
    for _, group in data.groupby(CAT_COLS):
        if len(group) < lookback:
            continue
        gf = group[feats].values
        gt = group[target].values
        for i in range(len(group) - lookback + 1):
            X.append(gf[i:i+lookback])
            y.append(gt[i+lookback-1])
    return np.array(X), np.array(y)

def split_dl(X):
    return [
        torch.tensor(X[..., :-2], dtype=torch.float32),
        torch.tensor(X[..., -2], dtype=torch.long),
        torch.tensor(X[..., -1], dtype=torch.long)
    ]

# DL sequences (use existing create_sequences and split_dl functions)
X_train_seq, y_train_seq = create_sequences(pipeline.train_df_dl, pipeline.lookback, pipeline.dl_feats, 'yield_detrended')
X_val_seq, y_val_seq = create_sequences(pipeline.val_df_dl, pipeline.lookback, pipeline.dl_feats, 'yield_detrended')
X_test_seq, y_test_seq = create_sequences(pipeline.test_df_dl, pipeline.lookback, pipeline.dl_feats, 'yield_detrended')

X_train_dl = split_dl(X_train_seq)
X_val_dl = split_dl(X_val_seq)
X_test_dl = split_dl(X_test_seq)

y_train_t = torch.tensor(y_train_seq, dtype=torch.float32).unsqueeze(1)
y_val_t = torch.tensor(y_val_seq, dtype=torch.float32).unsqueeze(1)
y_test_t = torch.tensor(y_test_seq, dtype=torch.float32).unsqueeze(1)

## 5. Metrics

In [None]:
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(((y_true - y_pred) / (y_true + 1e-8)) ** 2)) * 100

def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / (y_true + 1e-8))) * 100

## 6. Optuna Objectives

In [None]:
def objective_lr(trial):
    # No hyperparameters to tune for Linear Regression
    return 0

def objective_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 400),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('max_features', 0.5, 1.0)
    }
    model = RandomForestRegressor(random_state=42, n_jobs=-1, **params)
    tscv = TimeSeriesSplit(n_splits=5)
    scores = []
    for train_idx, val_idx in tscv.split(X_train_ml):
        model.fit(X_train_ml.iloc[train_idx], y_train_ml.iloc[train_idx])
        pred = model.predict(X_train_ml.iloc[val_idx])
        scores.append(rmspe(y_train_ml.iloc[val_idx], pred))
    return np.mean(scores)

def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 600),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5)
    }
    model = xgb.XGBRegressor(random_state=42, **params)
    tscv = TimeSeriesSplit(n_splits=5)
    scores = []
    for train_idx, val_idx in tscv.split(X_train_ml):
        model.fit(X_train_ml.iloc[train_idx], y_train_ml.iloc[train_idx])
        pred = model.predict(X_train_ml.iloc[val_idx])
        scores.append(rmspe(y_train_ml.iloc[val_idx], pred))
    return np.mean(scores)

class LSTMModel(nn.Module):
    def __init__(self, n_areas, n_items, lstm_units, dense_units, dropout):
        super().__init__()
        self.embed_area = nn.Embedding(n_areas, 10)
        self.embed_item = nn.Embedding(n_items, 5)
        self.lstm = nn.LSTM(len(NUMERIC_COLS) + 10 + 5, lstm_units, batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.fc1 = nn.Linear(lstm_units, dense_units)
        self.fc2 = nn.Linear(dense_units, 1)
    def forward(self, num, area, item):
        e_area = self.embed_area(area)
        e_item = self.embed_item(item)
        x = torch.cat([num, e_area, e_item], dim=-1)
        out, _ = self.lstm(x)
        out = self.drop(out[:, -1])
        out = torch.relu(self.fc1(out))
        return self.fc2(out)

def train_dl(model, opt, loss_fn, train_loader, val_loader, epochs=100, patience=10, is_final=False):
    best = float('inf')
    wait = 0
    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for x1, x2, x3, y in train_loader:
            opt.zero_grad()
            pred = model(x1, x2, x3)
            loss = loss_fn(pred, y)
            loss.backward()
            opt.step()
            train_loss += loss.item()
        train_losses.append(train_loss / len(train_loader))
        model.eval()
        with torch.no_grad():
            val_inputs = [x.to(next(model.parameters()).device) for x in val_loader.dataset.tensors[:3]]
            val_y = val_loader.dataset.tensors[3]
            val_pred = model(*val_inputs)
            val_mse = loss_fn(val_pred, val_y).item()
            val_rmspe = rmspe(val_y.numpy().flatten(), val_pred.numpy().flatten())
        val_losses.append(val_mse)
        if val_rmspe < best:
            best = val_rmspe
            wait = 0
            if is_final:
                torch.save(model.state_dict(), f'model_{{model.__class__.__name__}}.pth')
        else:
            wait += 1
            if wait >= patience:
                break
    return train_losses, val_losses

def objective_lstm(trial):
    params = {
        'lstm_units': trial.suggest_categorical('lstm_units', [64, 128]),
        'dense_units': trial.suggest_categorical('dense_units', [32, 64]),
        'dropout': trial.suggest_float('dropout', 0.1, 0.4),
        'lr': trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    }
    lr = params.pop('lr')
    model = LSTMModel(N_AREAS, N_ITEMS, **params)
    opt = optim.Adam(model.parameters(), lr=lr)
    train_ds = TensorDataset(*X_train_dl, y_train_t)
    val_ds = TensorDataset(*X_val_dl, y_val_t)
    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=64)
    _, val_losses = train_dl(model, opt, nn.MSELoss(), train_loader, val_loader)
    return val_losses[-1]

class CNNModel(nn.Module):
    def __init__(self, n_areas, n_items, filters, kernel, dense_units): 
        super().__init__()
        self.embed_area = nn.Embedding(n_areas, 10)
        self.embed_item = nn.Embedding(n_items, 5)
        self.conv = nn.Conv1d(len(NUMERIC_COLS) + 10 + 5, filters, kernel)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(filters, dense_units)
        self.fc2 = nn.Linear(dense_units, 1)
    def forward(self, num, area, item):
        e_area = self.embed_area(area)
        e_item = self.embed_item(item)
        x = torch.cat([num, e_area, e_item], dim=-1).transpose(1, 2)
        x = torch.relu(self.conv(x))
        x = self.pool(x).squeeze(-1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

def objective_cnn(trial):
    params = {
        'filters': trial.suggest_categorical('filters', [64, 128]),
        'kernel': trial.suggest_categorical('kernel', [2, 3]),
        'dense_units': trial.suggest_categorical('dense_units', [32, 64]),
        'lr': trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    }
    lr = params.pop('lr')
    model = CNNModel(N_AREAS, N_ITEMS, **params)
    opt = optim.Adam(model.parameters(), lr=lr)
    train_ds = TensorDataset(*X_train_dl, y_train_t)
    val_ds = TensorDataset(*X_val_dl, y_val_t)
    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=64)
    _, val_losses = train_dl(model, opt, nn.MSELoss(), train_loader, val_loader)
    return val_losses[-1]

## 7. Run Optuna

In [None]:
if os.path.exists('best_params_optuna.joblib'):
    best_params = joblib.load('best_params_optuna.joblib')
else:
    best_params = {}

if RUN_OPTUNA:
    objectives = {
        'LR': objective_lr,
        'RF': objective_rf,
        'XGB': objective_xgb,
        'LSTM': objective_lstm,
        'CNN': objective_cnn
    }
    for name, run in RUN_MODELS.items():
        if run:
            print(f'--- Tuning {name} ---')
            study = optuna.create_study(direction='minimize')
            n_trials = 50 if name in ['RF', 'XGB'] else 30
            if name == 'LR':
                n_trials = 1
            study.optimize(objectives[name], n_trials=n_trials, show_progress_bar=True)
            best_params[name] = study.best_params
            joblib.dump(best_params, 'best_params_optuna.joblib') # Save after each study
            print(f'Best params for {name}: {study.best_params}')
else:
    print('Skipping Optuna tuning.')

## 8. Final Training

In [None]:
# Combine train+val
X_train_full_ml = pd.concat([X_train_ml, X_val_ml])
y_train_full_ml = pd.concat([y_train_ml, y_val_ml])
X_train_full_seq = np.concatenate([X_train_seq, X_val_seq])
y_train_full_seq = np.concatenate([y_train_seq, y_val_seq])
X_train_full_dl = split_dl(X_train_full_seq)
y_train_full_t = torch.tensor(y_train_full_seq, dtype=torch.float32).unsqueeze(1)

models = {}

if RUN_MODELS['LR']:
    model_lr = LinearRegression()
    model_lr.fit(X_train_full_ml, y_train_full_ml)
    models['LR'] = model_lr
    joblib.dump(model_lr, 'model_lr.joblib')

if RUN_MODELS['RF']:
    model_rf = RandomForestRegressor(random_state=42, n_jobs=-1, **best_params['RF'])
    model_rf.fit(X_train_full_ml, y_train_full_ml)
    models['RF'] = model_rf
    joblib.dump(model_rf, 'model_rf.joblib')

if RUN_MODELS['XGB']:
    model_xgb = xgb.XGBRegressor(random_state=42, **best_params['XGB'])
    model_xgb.fit(X_train_full_ml, y_train_full_ml)
    models['XGB'] = model_xgb
    joblib.dump(model_xgb, 'model_xgb.joblib')

# DL
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_full_ds = TensorDataset(*[x.to(device) for x in X_train_full_dl], y_train_full_t.to(device))
test_ds = TensorDataset(*[x.to(device) for x in X_test_dl], y_test_t.to(device))
train_loader = DataLoader(train_full_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64)

if RUN_MODELS['LSTM']:
    lstm_params = best_params['LSTM']
    lr = lstm_params.pop('lr')
    model_lstm = LSTMModel(N_AREAS, N_ITEMS, **lstm_params).to(device)
    opt_lstm = optim.Adam(model_lstm.parameters(), lr=lr)
    train_losses_lstm, val_losses_lstm = train_dl(model_lstm, opt_lstm, nn.MSELoss(), train_loader, test_loader, epochs=150, patience=15, is_final=True)
    models['LSTM'] = model_lstm

if RUN_MODELS['CNN']:
    cnn_params = best_params['CNN']
    lr = cnn_params.pop('lr')
    model_cnn = CNNModel(N_AREAS, N_ITEMS, **cnn_params).to(device)
    opt_cnn = optim.Adam(model_cnn.parameters(), lr=lr)
    train_losses_cnn, val_losses_cnn = train_dl(model_cnn, opt_cnn, nn.MSELoss(), train_loader, test_loader, epochs=150, patience=15, is_final=True)
    models['CNN'] = model_cnn

## 9. Plot DL Loss Curves

In [None]:
if RUN_MODELS['LSTM'] and RUN_MODELS['CNN']:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 7))
    ax1.plot(train_losses_lstm, label='Train Loss')
    ax1.plot(val_losses_lstm, label='Validation (Test) Loss')
    ax1.set_title('LSTM Model Loss', fontsize=16)
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Mean Squared Error')
    ax1.legend()
    ax2.plot(train_losses_cnn, label='Train Loss')
    ax2.plot(val_losses_cnn, label='Validation (Test) Loss')
    ax2.set_title('CNN Model Loss', fontsize=16)
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Mean Squared Error')
    ax2.legend()
    plt.suptitle('Deep Learning Training Curves', fontsize=20)
    plt.savefig("loss_curves.png")
    plt.show()

## 10. Final Evaluation

In [None]:
# Align test sets
matched_test_df = pd.concat([group.iloc[LOOKBACK-1:] for _, group in test_df_dl.groupby(CAT_COLS) if len(group) >= LOOKBACK])
matched_test_ml_df = pd.concat([group.iloc[LOOKBACK-1:] for _, group in test_df.groupby(CAT_COLS) if len(group) >= LOOKBACK])
X_test_ml_matched = matched_test_ml_df[ML_FEATS]
X_test_dl_matched, y_test_det_matched = create_sequences(matched_test_df, 1, DL_FEATS, TARGET_DET)
y_test_det_matched = y_test_det_matched.flatten()
trend_test = matched_test_df['yield_trend'].values
y_true_original = matched_test_df[TARGET].values

# Predictions
test_preds = {}
for name in models:
    if name in ['LR', 'RF', 'XGB']:
        test_preds[name] = models[name].predict(X_test_ml_matched)
    elif name in ['LSTM', 'CNN']:
        X_test_dl_inputs_m = split_dl(X_test_seq)
        models[name].eval()
        with torch.no_grad():
            test_preds[name] = models[name](*[x.to(device) for x in X_test_dl_inputs_m]).cpu().numpy().flatten()

# Evaluate
results = []
y_preds_original = {}
for name, pred_det in test_preds.items():
    pred_orig = pred_det + trend_test
    y_preds_original[name] = pred_orig
    mae = mean_absolute_error(y_true_original, pred_orig)
    rmse = np.sqrt(mean_squared_error(y_true_original, pred_orig))
    map_e = mape(y_true_original, pred_orig)
    rms_pe = rmspe(y_true_original, pred_orig)
    r_2 = r2_score(y_true_original, pred_orig)
    results.append({'Model': name, 'MAE': mae, 'RMSE': rmse, 'MAPE (%)': map_e, 'RMSPE (%)': rms_pe, 'R²': r_2})

results_df = pd.DataFrame(results).set_index('Model').sort_values('RMSPE (%)')
print("\n--- Final Performance (Test Set) ---")
print(results_df.round(2))
results_df.to_csv("final_model_performance.csv")

## 11. Plot Model Performances

In [None]:
fig, axs = plt.subplots(1, 4, figsize=(24, 6))
sns.barplot(data=results_df.reset_index(), x='Model', y='RMSE', ax=axs[0])
axs[0].set_title('RMSE Comparison')
sns.barplot(x='Model', y='MAE', data=results_df.reset_index(), ax=axs[1])
axs[1].set_title('MAE Comparison')
sns.barplot(x='Model', y='MAPE (%)', data=results_df.reset_index(), ax=axs[2])
axs[2].set_title('MAPE Comparison')
sns.barplot(x='Model', y='R²', data=results_df.reset_index(), ax=axs[3])
axs[3].set_title('R² Comparison')
plt.tight_layout()
plt.savefig("model_performance_comparison.png")
plt.show()

## 12. Per-Crop Reporting (Best Model)

In [None]:
best_model_name = results_df.index[0]
print(f"Per-crop report for best model: {best_model_name}")
crop_results = []
items = matched_test_df['Item'].values
for crop in np.unique(items):
    mask = items == crop
    true = y_true_original[mask]
    pred = y_preds_original[best_model_name][mask]
    crop_results.append({
        'Crop': crop,
        'RMSPE (%)': rmspe(true, pred),
        'MAPE (%)': mape(true, pred),
        'RMSE': np.sqrt(mean_squared_error(true, pred)),
        'R²': r2_score(true, pred)
    })
crop_df = pd.DataFrame(crop_results).sort_values('RMSPE (%)')
print(crop_df.round(2))
crop_df.to_csv('per_crop_performance.csv', index=False)

## 13. SHAP Analysis (If Tree Model)

In [None]:
best_model_name = results_df.index[0]
if best_model_name in models and best_model_name in ['RF', 'XGB']:
    best_model = models[best_model_name]
    print(f"Running SHAP on {best_model_name}")
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test_ml_matched)
    shap.summary_plot(shap_values, X_test_ml_matched, plot_type="beeswarm", show=False)
    plt.title(f"SHAP Beeswarm ({best_model_name})", fontsize=16)
    plt.savefig("shap_beeswarm.png", bbox_inches='tight')
    plt.show()
    shap.summary_plot(shap_values, X_test_ml_matched, plot_type="bar", show=False)
    plt.title(f"Feature Importance ({best_model_name})", fontsize=16)
    plt.savefig("shap_importance.png", bbox_inches='tight')
    plt.show()
else:
    print("SHAP skipped for non-tree model.")

## 14. Export Predictions

In [None]:
final_predictions_df = matched_test_df.copy()
final_predictions_df['true_yield_original'] = y_true_original
for name in test_preds:
    final_predictions_df[f'predicted_{name}'] = y_preds_original[name]
export_cols = ['Year', 'Area', 'Item', 'true_yield_original'] + [f'predicted_{name}' for name in test_preds]
final_predictions_df[export_cols].to_csv("final_test_predictions.csv", index=False)
print("Exported predictions.")
print("\n--- Complete ---")