# Crop Yield Prediction – Final Model Pipeline (Rev7)

## 1. Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import os
from scipy.signal import detrend
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna
import shap

# Seed the global NumPy and PyTorch RNGs so repeated runs draw the same numbers.
np.random.seed(42)
torch.manual_seed(42)
sns.set_style("whitegrid")
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation and convergence warnings - confirm that is intended.
warnings.filterwarnings('ignore')
print("All libraries loaded.")

## 2. Model Selection Menu

In [None]:
# Per-model switches: a model set to False is skipped by the tuning loop and by
# the final-training cell.
RUN_MODELS = {
    'LR': True,   # Baseline Linear Regression
    'RF': True,   # Random Forest
    'XGB': True,  # XGBoost
    'LSTM': True, # LSTM
    'CNN': True   # CNN
}

# When True the Optuna cell runs one study per enabled model; when False tuning is
# skipped and only previously saved parameters (if any) are available.
RUN_OPTUNA = True

## 3. Load & Preprocess Data

In [None]:
# Load the cleaned dataset written by the EDA notebook.
# Only a missing file is turned into the "Run EDA first!" hint; any other failure
# (malformed CSV, permission error, interrupt) propagates with its real cause.
try:
    df = pd.read_csv("cleaned_crop_data.csv")
except FileNotFoundError as err:
    raise FileNotFoundError("Run EDA first!") from err
print(f"Loaded: {df.shape}")

# Column roles shared by every later cell.
TARGET = 'hg/ha_yield'
TIME_COL = 'Year'
CAT_COLS = ['Area', 'Item']
NUMERIC_COLS = ['average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp', 'fertilizer_kg/ha', 'solar_radiation_MJ/m2-day']

# De-trend per group: scipy's detrend removes a least-squares linear fit from each
# (Area, Item) series; the removed part is stored so predictions can be mapped back
# to the original scale by adding it again.
# NOTE(review): the fit uses each group's full history, including validation/test
# years - confirm this look-ahead is acceptable for the reported test metrics.
df = df.sort_values(CAT_COLS + [TIME_COL])
df['yield_detrended'] = df.groupby(CAT_COLS)[TARGET].transform(detrend)
df['yield_trend'] = df[TARGET] - df['yield_detrended']
TARGET_DET = 'yield_detrended'

# Lags for ML: values from the previous one and two rows of the same group
# (rows were put in year order just above).
LAG_COLS = [TARGET_DET] + NUMERIC_COLS
for col in LAG_COLS:
    for lag in [1, 2]:
        df[f'{col}_lag{lag}'] = df.groupby(CAT_COLS)[col].shift(lag)
# Rows without a full set of lags (and any row with a NaN elsewhere) are dropped.
df_ml = df.dropna().copy()

# Split chronologically: train <= TRAIN_END < validation <= VAL_END < test.
TRAIN_END = 2007
VAL_END = 2010
train_df = df_ml[df_ml[TIME_COL] <= TRAIN_END].copy()
val_df = df_ml[(df_ml[TIME_COL] > TRAIN_END) & (df_ml[TIME_COL] <= VAL_END)].copy()
test_df = df_ml[df_ml[TIME_COL] > VAL_END].copy()

# Encode
# Fit on the full frame rather than df_ml: the sequence pipeline encodes slices of
# `df`, which can contain Area/Item labels whose rows were all removed by dropna().
# Fitting on df_ml would make those transforms fail with "unseen labels".
le_area = LabelEncoder().fit(df['Area'])
le_item = LabelEncoder().fit(df['Item'])
for d in [train_df, val_df, test_df]:
    d['Area_Encoded'] = le_area.transform(d['Area'])
    d['Item_Encoded'] = le_item.transform(d['Item'])

# Scale: statistics come from the training rows only and are reused for val/test.
lagged_cols = [c for c in df_ml.columns if '_lag' in c]
scale_cols = NUMERIC_COLS + lagged_cols
scaler = StandardScaler()
train_df[scale_cols] = scaler.fit_transform(train_df[scale_cols])
val_df[scale_cols] = scaler.transform(val_df[scale_cols])
test_df[scale_cols] = scaler.transform(test_df[scale_cols])

# Persist the fitted preprocessing objects next to the notebook.
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(le_area, 'le_area.joblib')
joblib.dump(le_item, 'le_item.joblib')

# Vocabulary sizes for the embedding layers of the DL models.
N_AREAS = len(le_area.classes_)
N_ITEMS = len(le_item.classes_)

## 4. Prepare Inputs

In [None]:
# Tabular inputs for LR / RF / XGB: current numerics, their lags and the two
# label-encoded categoricals; the target is the detrended yield.
ML_FEATS = [*NUMERIC_COLS, *lagged_cols, 'Area_Encoded', 'Item_Encoded']
X_train_ml, y_train_ml = train_df[ML_FEATS], train_df[TARGET_DET]
X_val_ml, y_val_ml = val_df[ML_FEATS], val_df[TARGET_DET]
X_test_ml, y_test_ml = test_df[ML_FEATS], test_df[TARGET_DET]

# Sequence inputs for LSTM / CNN: window length and per-step features.
# The two encoded columns must stay last - split_dl peels them off by position.
LOOKBACK = 5
DL_FEATS = [*NUMERIC_COLS, 'Area_Encoded', 'Item_Encoded']

def create_sequences(data, lookback, feats, target):
    """Build sliding windows of length `lookback` inside each (Area, Item) group.

    Rows are taken in the order they appear in `data`; every window of `lookback`
    consecutive rows of a group becomes one sample whose label is the `target`
    value of the window's last row. Groups with fewer than `lookback` rows
    contribute nothing.

    Parameters
    ----------
    data : pandas.DataFrame containing the CAT_COLS columns, `feats` and `target`.
    lookback : int, number of rows per window.
    feats : list of feature column names.
    target : name of the label column.

    Returns
    -------
    (X, y) : X has shape (n_windows, lookback, len(feats)) and y has shape
        (n_windows,). When no group is long enough, correctly shaped empty
        arrays are returned so that positional slicing downstream still works.
    """
    windows, labels = [], []
    for _, group in data.groupby(CAT_COLS):
        n_rows = len(group)
        if n_rows < lookback:
            continue
        gf = group[feats].values
        gt = group[target].values
        for start in range(n_rows - lookback + 1):
            stop = start + lookback
            windows.append(gf[start:stop])
            labels.append(gt[stop - 1])
    if not windows:
        # np.array([]) would be 1-D and break X[..., -2]-style indexing later on.
        return np.empty((0, lookback, len(feats)), dtype=float), np.empty((0,), dtype=float)
    return np.array(windows), np.array(labels)

# Separate scalers for the sequence pipeline: one for the numeric inputs and one
# for the detrended target, both fitted on the training period only.
scaler_dl = StandardScaler()
target_scaler = StandardScaler()

# Chronological slices of the full (un-lagged) frame.
_years = df[TIME_COL]
train_df_dl = df[_years <= TRAIN_END].copy()
val_df_dl = df[(_years > TRAIN_END) & (_years <= VAL_END)].copy()
test_df_dl = df[_years > VAL_END].copy()
_dl_frames = (train_df_dl, val_df_dl, test_df_dl)

# Integer codes for the embedding layers.
for _frame in _dl_frames:
    _frame['Area_Encoded'] = le_area.transform(_frame['Area'])
    _frame['Item_Encoded'] = le_item.transform(_frame['Item'])

# Fit scaler on train target only (and likewise for the numeric inputs),
# then apply the same statistics to all three slices.
scaler_dl.fit(train_df_dl[NUMERIC_COLS])
target_scaler.fit(train_df_dl[[TARGET_DET]])
for _frame in _dl_frames:
    _frame[NUMERIC_COLS] = scaler_dl.transform(_frame[NUMERIC_COLS])
    _frame[TARGET_DET] = target_scaler.transform(_frame[[TARGET_DET]])

# Windows are built inside each slice, so a group needs at least LOOKBACK rows
# within a slice to contribute any sample to it.
X_train_seq, y_train_seq = create_sequences(train_df_dl, LOOKBACK, DL_FEATS, TARGET_DET)
X_val_seq, y_val_seq = create_sequences(val_df_dl, LOOKBACK, DL_FEATS, TARGET_DET)
X_test_seq, y_test_seq = create_sequences(test_df_dl, LOOKBACK, DL_FEATS, TARGET_DET)

def split_dl(X):
    """Split a stacked window array into the three tensors the DL models take.

    The last axis of `X` is expected to end with the area code and the item code
    (in that order); everything before them is treated as numeric input.

    Returns a list ``[numeric float32, area long, item long]``.
    """
    numeric_part = X[..., :-2]
    area_codes = X[..., -2]
    item_codes = X[..., -1]
    return [
        torch.tensor(numeric_part, dtype=torch.float32),
        torch.tensor(area_codes, dtype=torch.long),
        torch.tensor(item_codes, dtype=torch.long),
    ]

# Model-ready inputs: [numeric, area codes, item codes] per split.
X_train_dl, X_val_dl, X_test_dl = (
    split_dl(seq) for seq in (X_train_seq, X_val_seq, X_test_seq)
)

# Targets as float32 column vectors so they line up with the (batch, 1) model output.
y_train_t, y_val_t, y_test_t = (
    torch.tensor(labels, dtype=torch.float32).unsqueeze(1)
    for labels in (y_train_seq, y_val_seq, y_test_seq)
)

## 5. Metrics

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error, expressed in percent.

    Inputs may be lists, NumPy arrays or pandas objects; they are compared
    position by position. The denominator is ``|y_true|`` floored at 1e-8, so a
    true value of (or next to) zero cannot cause a division by zero, and negative
    true values are handled symmetrically.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), 1e-8)
    return np.sqrt(np.mean(((y_true - y_pred) / denom) ** 2)) * 100

def mape(y_true, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Inputs may be lists, NumPy arrays or pandas objects; they are compared
    position by position. The denominator is ``|y_true|`` floored at 1e-8, so a
    true value of (or next to) zero cannot cause a division by zero, and negative
    true values are handled symmetrically.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), 1e-8)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100

## 6. Optuna Objectives

In [None]:
def objective_lr(trial):
    """Placeholder Optuna objective for the Linear Regression baseline.

    It suggests no parameters and always returns 0, so a study run with it ends
    up with an empty ``best_params`` dict. `trial` is accepted only to match the
    signature Optuna calls objectives with.
    """
    # No hyperparameters to tune for Linear Regression
    return 0

def objective_rf(trial):
    """Optuna objective for the Random Forest: mean RMSE over chronological CV folds.

    Reads the module-level training data (`X_train_ml`, `y_train_ml`, `train_df`).
    Returns the average validation RMSE (in detrended-yield units) across five
    expanding-window folds; lower is better.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 400),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_float('max_features', 0.5, 1.0)
    }
    model = RandomForestRegressor(random_state=42, n_jobs=-1, **params)

    # TimeSeriesSplit cuts folds purely by row position, and the training frame is
    # ordered by Area/Item before Year. Re-order by year (stable, so ties keep
    # their relative order) so each fold really trains on the past and validates
    # on the future.
    order = np.argsort(train_df[TIME_COL].to_numpy(), kind='stable')
    X_sorted = X_train_ml.iloc[order]
    y_sorted = y_train_ml.iloc[order]

    tscv = TimeSeriesSplit(n_splits=5)
    scores = []
    for train_idx, val_idx in tscv.split(X_sorted):
        model.fit(X_sorted.iloc[train_idx], y_sorted.iloc[train_idx])
        pred = model.predict(X_sorted.iloc[val_idx])
        # RMSE, not RMSPE: the target is a detrended residual that sits around
        # zero, so a percentage error is dominated by near-zero denominators.
        scores.append(np.sqrt(mean_squared_error(y_sorted.iloc[val_idx], pred)))
    return np.mean(scores)

def objective_xgb(trial):
    """Optuna objective for XGBoost: mean RMSE over chronological CV folds.

    Reads the module-level training data (`X_train_ml`, `y_train_ml`, `train_df`).
    Returns the average validation RMSE (in detrended-yield units) across five
    expanding-window folds; lower is better.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 600),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5)
    }
    model = xgb.XGBRegressor(random_state=42, **params)

    # TimeSeriesSplit cuts folds purely by row position, and the training frame is
    # ordered by Area/Item before Year. Re-order by year (stable, so ties keep
    # their relative order) so each fold really trains on the past and validates
    # on the future.
    order = np.argsort(train_df[TIME_COL].to_numpy(), kind='stable')
    X_sorted = X_train_ml.iloc[order]
    y_sorted = y_train_ml.iloc[order]

    tscv = TimeSeriesSplit(n_splits=5)
    scores = []
    for train_idx, val_idx in tscv.split(X_sorted):
        model.fit(X_sorted.iloc[train_idx], y_sorted.iloc[train_idx])
        pred = model.predict(X_sorted.iloc[val_idx])
        # RMSE, not RMSPE: the target is a detrended residual that sits around
        # zero, so a percentage error is dominated by near-zero denominators.
        scores.append(np.sqrt(mean_squared_error(y_sorted.iloc[val_idx], pred)))
    return np.mean(scores)

class LSTMModel(nn.Module):
    """LSTM regressor over per-step numeric features plus area/item embeddings.

    Each time step is the concatenation of the numeric inputs, a 10-d area
    embedding and a 5-d item embedding. The LSTM output at the last step goes
    through dropout, a ReLU dense layer and a single-unit output layer.
    """

    def __init__(self, n_areas, n_items, lstm_units, dense_units, dropout):
        super().__init__()
        area_dim, item_dim = 10, 5
        # Creation order fixes how a seeded RNG initialises the weights; keep it stable.
        self.embed_area = nn.Embedding(n_areas, area_dim)
        self.embed_item = nn.Embedding(n_items, item_dim)
        self.lstm = nn.LSTM(
            len(NUMERIC_COLS) + area_dim + item_dim,
            lstm_units,
            batch_first=True,
        )
        self.drop = nn.Dropout(dropout)
        self.fc1 = nn.Linear(lstm_units, dense_units)
        self.fc2 = nn.Linear(dense_units, 1)

    def forward(self, num, area, item):
        """Return a (batch, 1) prediction from numeric inputs and integer codes."""
        step_features = torch.cat(
            [num, self.embed_area(area), self.embed_item(item)], dim=-1
        )
        hidden_seq, _ = self.lstm(step_features)
        last_step = self.drop(hidden_seq[:, -1])
        return self.fc2(torch.relu(self.fc1(last_step)))

def train_dl(model, opt, loss_fn, train_loader, val_loader, target_scaler, epochs=100, patience=10, is_final=False):
    """Train `model` with early stopping and leave it holding its best weights.

    Each epoch runs one pass over `train_loader`, then evaluates `loss_fn` once on
    the whole dataset behind `val_loader`. Training stops after `epochs` epochs or
    after `patience` consecutive epochs without a new best validation loss.

    Parameters
    ----------
    model : nn.Module called as ``model(numeric, area, item)``.
    opt : optimiser already bound to `model`'s parameters.
    loss_fn : callable ``loss_fn(pred, target)`` returning a scalar tensor.
    train_loader : DataLoader yielding ``(numeric, area, item, target)`` batches.
    val_loader : DataLoader over a TensorDataset with the same four tensors.
    target_scaler : not used by the training loop; kept so existing calls that
        pass it positionally keep working.
    epochs, patience : training length and early-stopping tolerance.
    is_final : when True, every new best state is also written to
        ``model_<ClassName>.pth``.

    Returns
    -------
    (train_losses, val_losses, best_val_loss) : per-epoch mean training loss,
        per-epoch validation loss, and the lowest validation loss seen
        (``inf`` if no epoch produced a comparable loss).
    """
    device = next(model.parameters()).device
    # Move the evaluation tensors once; inputs AND targets must sit with the model.
    val_tensors = [t.to(device) for t in val_loader.dataset.tensors]
    val_inputs, val_y = val_tensors[:3], val_tensors[3]

    best_val_loss = float('inf')
    best_state = None
    wait = 0
    train_losses = []
    val_losses = []
    for _ in range(epochs):
        model.train()
        train_loss = 0
        for x1, x2, x3, y in train_loader:
            x1, x2, x3, y = (t.to(device) for t in (x1, x2, x3, y))
            opt.zero_grad()
            pred = model(x1, x2, x3)
            loss = loss_fn(pred, y)
            loss.backward()
            opt.step()
            train_loss += loss.item()
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        train_losses.append(train_loss / max(len(train_loader), 1))

        model.eval()
        with torch.no_grad():
            val_mse = loss_fn(model(*val_inputs), val_y).item()
        val_losses.append(val_mse)

        if val_mse < best_val_loss:
            best_val_loss = val_mse
            # Snapshot by value: state_dict() tensors alias the live parameters.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
            wait = 0
            if is_final:
                torch.save(best_state, f'model_{model.__class__.__name__}.pth')
        else:
            wait += 1
            if wait >= patience:
                break

    # Without this the caller would get the weights from `patience` epochs after
    # the best one, not the checkpoint that early stopping selected.
    if best_state is not None:
        model.load_state_dict(best_state)
    return train_losses, val_losses, best_val_loss

def objective_lstm(trial):
    """Optuna objective for the LSTM: best validation MSE reached during training.

    Samples the architecture and optimiser settings, trains on the module-level
    training sequences with early stopping on the validation sequences, and
    returns the lowest validation loss reported by `train_dl`.
    """
    # Architecture hyperparameters (passed straight to the model constructor).
    arch = {
        'lstm_units': trial.suggest_categorical('lstm_units', [64, 128]),
        'dense_units': trial.suggest_categorical('dense_units', [32, 64]),
        'dropout': trial.suggest_float('dropout', 0.1, 0.4),
    }
    # Optimiser hyperparameters.
    learning_rate = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    l2_penalty = trial.suggest_float('weight_decay', 1e-5, 1e-3, log=True)

    net = LSTMModel(N_AREAS, N_ITEMS, **arch)
    optimiser = optim.Adam(net.parameters(), lr=learning_rate, weight_decay=l2_penalty)

    train_batches = DataLoader(TensorDataset(*X_train_dl, y_train_t), batch_size=64, shuffle=True)
    val_batches = DataLoader(TensorDataset(*X_val_dl, y_val_t), batch_size=64)

    history = train_dl(net, optimiser, nn.MSELoss(), train_batches, val_batches, target_scaler)
    return history[2]

class CNNModel(nn.Module):
    """1-D CNN regressor over per-step numeric features plus area/item embeddings.

    Each time step is the concatenation of the numeric inputs, a 10-d area
    embedding and a 5-d item embedding. A single convolution runs along the time
    axis, is max-pooled to one value per filter, and feeds a ReLU dense layer
    followed by a single-unit output layer.
    """

    def __init__(self, n_areas, n_items, filters, kernel, dense_units):
        super().__init__()
        area_dim, item_dim = 10, 5
        # Creation order fixes how a seeded RNG initialises the weights; keep it stable.
        self.embed_area = nn.Embedding(n_areas, area_dim)
        self.embed_item = nn.Embedding(n_items, item_dim)
        self.conv = nn.Conv1d(len(NUMERIC_COLS) + area_dim + item_dim, filters, kernel)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(filters, dense_units)
        self.fc2 = nn.Linear(dense_units, 1)

    def forward(self, num, area, item):
        """Return a (batch, 1) prediction from numeric inputs and integer codes."""
        step_features = torch.cat(
            [num, self.embed_area(area), self.embed_item(item)], dim=-1
        )
        # Conv1d wants (batch, channels, time); the inputs arrive as (batch, time, channels).
        channels_first = step_features.transpose(1, 2)
        pooled = self.pool(torch.relu(self.conv(channels_first))).squeeze(-1)
        return self.fc2(torch.relu(self.fc1(pooled)))

def objective_cnn(trial):
    """Optuna objective for the CNN: best validation MSE reached during training.

    Samples the architecture and optimiser settings, trains on the module-level
    training sequences with early stopping on the validation sequences, and
    returns the lowest validation loss reported by `train_dl`.
    """
    # Architecture hyperparameters (passed straight to the model constructor).
    arch = {
        'filters': trial.suggest_categorical('filters', [64, 128]),
        'kernel': trial.suggest_categorical('kernel', [2, 3]),
        'dense_units': trial.suggest_categorical('dense_units', [32, 64]),
    }
    # Optimiser hyperparameters.
    learning_rate = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    l2_penalty = trial.suggest_float('weight_decay', 1e-5, 1e-3, log=True)

    net = CNNModel(N_AREAS, N_ITEMS, **arch)
    optimiser = optim.Adam(net.parameters(), lr=learning_rate, weight_decay=l2_penalty)

    train_batches = DataLoader(TensorDataset(*X_train_dl, y_train_t), batch_size=64, shuffle=True)
    val_batches = DataLoader(TensorDataset(*X_val_dl, y_val_t), batch_size=64)

    history = train_dl(net, optimiser, nn.MSELoss(), train_batches, val_batches, target_scaler)
    return history[2]

## 7. Run Optuna

In [None]:
# Start from hyperparameters cached by an earlier tuning run, if there is one.
if os.path.exists('best_params_optuna.joblib'):
    best_params = joblib.load('best_params_optuna.joblib')
else:
    best_params = {}

if RUN_OPTUNA:
    studies = {}
    objectives = {
        'LR': objective_lr,
        'RF': objective_rf,
        'XGB': objective_xgb,
        'LSTM': objective_lstm,
        'CNN': objective_cnn
    }
    # Trial budget per model; anything not listed gets 30. LR has nothing to tune,
    # so a single trial is enough to record its (empty) parameter set.
    trial_budget = {'LR': 1, 'RF': 50, 'XGB': 50}
    for name, run in RUN_MODELS.items():
        if not run:
            continue
        print(f'--- Tuning {name} ---')
        study = optuna.create_study(direction='minimize')
        n_trials = trial_budget.get(name, 30)
        study.optimize(objectives[name], n_trials=n_trials, show_progress_bar=True)
        best_params[name] = study.best_params
        studies[name] = study
        joblib.dump(best_params, 'best_params_optuna.joblib') # Save after each study
        print(f'Best params for {name}: {study.best_params}')
else:
    print('Skipping Optuna tuning.')
    # Final training indexes best_params[...] for every enabled tuned model, so
    # check now and fail with an actionable message rather than a bare KeyError
    # later. LR is exempt because it is trained without tuned parameters.
    missing = [
        name for name, run in RUN_MODELS.items()
        if run and name != 'LR' and name not in best_params
    ]
    if missing:
        raise RuntimeError(
            f"RUN_OPTUNA is False but no saved hyperparameters were found for {missing}. "
            "Set RUN_OPTUNA = True, or provide 'best_params_optuna.joblib' containing them."
        )

## 7a. Visualize Optuna Results

In [None]:
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_param_importances

if RUN_OPTUNA and 'studies' in locals():
    # (plotting function, title suffix, file-name suffix) for the plots that are
    # always produced; parameter importance is handled separately because it can fail.
    _standard_plots = (
        (plot_optimization_history, 'Optimization History', 'history'),
        (plot_parallel_coordinate, 'Parallel Coordinate', 'parallel_coordinate'),
        (plot_slice, 'Slice Plot', 'slice'),
    )
    for name, study in studies.items():
        # LR has no tunable parameters, and an empty study has nothing to draw.
        if name == 'LR' or not study.trials:
            continue
        print(f'--- Visualizing Optuna results for {name} ---')

        for _plot_fn, _label, _suffix in _standard_plots:
            fig = _plot_fn(study)
            fig.update_layout(title=f'{name} {_label}')
            fig.write_image(f'optuna_{name}_{_suffix}.png')
            fig.show()

        # Parameter Importance
        try:
            fig = plot_param_importances(study)
            fig.update_layout(title=f'{name} Parameter Importance')
            fig.write_image(f'optuna_{name}_param_importance.png')
            fig.show()
        except (ValueError, RuntimeError) as e:
            print(f'Could not plot parameter importance for {name}: {e}')

## 8. Final Training

In [None]:
# Combine train+val
X_train_full_ml = pd.concat([X_train_ml, X_val_ml])
y_train_full_ml = pd.concat([y_train_ml, y_val_ml])
X_train_full_seq = np.concatenate([X_train_seq, X_val_seq])
y_train_full_seq = np.concatenate([y_train_seq, y_val_seq])
X_train_full_dl = split_dl(X_train_full_seq)
y_train_full_t = torch.tensor(y_train_full_seq, dtype=torch.float32).unsqueeze(1)

models = {}

if RUN_MODELS['LR']:
    model_lr = LinearRegression()
    model_lr.fit(X_train_full_ml, y_train_full_ml)
    models['LR'] = model_lr
    joblib.dump(model_lr, 'model_lr.joblib')

if RUN_MODELS['RF']:
    model_rf = RandomForestRegressor(random_state=42, n_jobs=-1, **best_params['RF'])
    model_rf.fit(X_train_full_ml, y_train_full_ml)
    models['RF'] = model_rf
    joblib.dump(model_rf, 'model_rf.joblib')

if RUN_MODELS['XGB']:
    model_xgb = xgb.XGBRegressor(random_state=42, **best_params['XGB'])
    model_xgb.fit(X_train_full_ml, y_train_full_ml)
    models['XGB'] = model_xgb
    joblib.dump(model_xgb, 'model_xgb.joblib')

# DL
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_full_ds = TensorDataset(*[x.to(device) for x in X_train_full_dl], y_train_full_t.to(device))
test_ds = TensorDataset(*[x.to(device) for x in X_test_dl], y_test_t.to(device))
train_loader = DataLoader(train_full_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64)

if RUN_MODELS['LSTM']:
    lstm_params = best_params['LSTM'].copy()
    lr = lstm_params.pop('lr')
    weight_decay = lstm_params.pop('weight_decay')
    model_lstm = LSTMModel(N_AREAS, N_ITEMS, **lstm_params).to(device)
    opt_lstm = optim.Adam(model_lstm.parameters(), lr=lr, weight_decay=weight_decay)
    train_losses_lstm, val_losses_lstm, _ = train_dl(model_lstm, opt_lstm, nn.MSELoss(), train_loader, test_loader, target_scaler, epochs=150, patience=15, is_final=True)
    models['LSTM'] = model_lstm

if RUN_MODELS['CNN']:
    cnn_params = best_params['CNN'].copy()
    lr = cnn_params.pop('lr')
    weight_decay = cnn_params.pop('weight_decay')
    model_cnn = CNNModel(N_AREAS, N_ITEMS, **cnn_params).to(device)
    opt_cnn = optim.Adam(model_cnn.parameters(), lr=lr, weight_decay=weight_decay)
    train_losses_cnn, val_losses_cnn, _ = train_dl(model_cnn, opt_cnn, nn.MSELoss(), train_loader, test_loader, target_scaler, epochs=150, patience=15, is_final=True)
    models['CNN'] = model_cnn

## 9. Plot DL Loss Curves

In [None]:
# Collect the curves of whichever DL models were trained; each panel is
# independent, so one enabled model is enough to produce a figure.
_curves = []
if RUN_MODELS['LSTM']:
    _curves.append(('LSTM', train_losses_lstm, val_losses_lstm))
if RUN_MODELS['CNN']:
    _curves.append(('CNN', train_losses_cnn, val_losses_cnn))

if _curves:
    # squeeze=False keeps `axes` 2-D even for a single panel, so the loop is uniform.
    fig, axes = plt.subplots(1, len(_curves), figsize=(10 * len(_curves), 7), squeeze=False)
    for ax, (_label, _train_curve, _val_curve) in zip(axes[0], _curves):
        ax.plot(_train_curve, label='Train Loss')
        ax.plot(_val_curve, label='Validation (Test) Loss')
        ax.set_title(f'{_label} Model Loss', fontsize=16)
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Mean Squared Error')
        ax.legend()
    plt.suptitle('Deep Learning Training Curves', fontsize=20)
    plt.savefig("loss_curves.png")
    plt.show()

## 10. Final Evaluation

In [None]:
# Align test sets
matched_test_df = pd.concat([group.iloc[LOOKBACK-1:] for _, group in test_df_dl.groupby(CAT_COLS) if len(group) >= LOOKBACK])
matched_test_ml_df = pd.concat([group.iloc[LOOKBACK-1:] for _, group in test_df.groupby(CAT_COLS) if len(group) >= LOOKBACK])
X_test_ml_matched = matched_test_ml_df[ML_FEATS]
X_test_dl_matched, y_test_det_matched = create_sequences(matched_test_df, 1, DL_FEATS, TARGET_DET)
y_test_det_matched = y_test_det_matched.flatten()
trend_test = matched_test_df['yield_trend'].values
y_true_original = matched_test_df[TARGET].values

# Predictions
test_preds = {}
for name in models:
    if name in ['LR', 'RF', 'XGB']:
        test_preds[name] = models[name].predict(X_test_ml_matched)
    elif name in ['LSTM', 'CNN']:
        X_test_dl_inputs_m = split_dl(X_test_seq)
        models[name].eval()
        with torch.no_grad():
            preds_scaled = models[name](*[x.to(device) for x in X_test_dl_inputs_m]).cpu().numpy()
            test_preds[name] = target_scaler.inverse_transform(preds_scaled).flatten()

# Evaluate
results = []
y_preds_original = {}
for name, pred_det in test_preds.items():
    pred_orig = pred_det + trend_test
    y_preds_original[name] = pred_orig
    mae = mean_absolute_error(y_true_original, pred_orig)
    rmse = np.sqrt(mean_squared_error(y_true_original, pred_orig))
    map_e = mape(y_true_original, pred_orig)
    rms_pe = rmspe(y_true_original, pred_orig)
    r_2 = r2_score(y_true_original, pred_orig)
    results.append({'Model': name, 'MAE': mae, 'RMSE': rmse, 'MAPE (%)': map_e, 'RMSPE (%)': rms_pe, 'R²': r_2})

results_df = pd.DataFrame(results).set_index('Model').sort_values('RMSPE (%)')
print("\n--- Final Performance (Test Set) ---")
print(results_df.round(2))
results_df.to_csv("final_model_performance.csv")

## 11. Plot Model Performances

In [None]:
# One bar chart per metric, left to right: (results column, panel title).
_metric_panels = [
    ('RMSE', 'RMSE Comparison'),
    ('MAE', 'MAE Comparison'),
    ('MAPE (%)', 'MAPE Comparison'),
    ('R²', 'R² Comparison'),
]
_plot_data = results_df.reset_index()

fig, axs = plt.subplots(1, 4, figsize=(24, 6))
for _ax, (_metric, _title) in zip(axs, _metric_panels):
    sns.barplot(data=_plot_data, x='Model', y=_metric, ax=_ax)
    _ax.set_title(_title)
plt.tight_layout()
plt.savefig("model_performance_comparison.png")
plt.show()

## 12. Per-Crop Reporting (Best Model)

In [None]:
# results_df is sorted by RMSPE, so its first row is the best model.
best_model_name = results_df.index[0]
print(f"Per-crop report for best model: {best_model_name}")

items = matched_test_df['Item'].values
_best_pred = y_preds_original[best_model_name]


def _crop_metrics(crop):
    """Score the best model on the test rows belonging to a single crop."""
    rows = items == crop
    actual, predicted = y_true_original[rows], _best_pred[rows]
    return {
        'Crop': crop,
        'RMSPE (%)': rmspe(actual, predicted),
        'MAPE (%)': mape(actual, predicted),
        'RMSE': np.sqrt(mean_squared_error(actual, predicted)),
        'R²': r2_score(actual, predicted)
    }


crop_results = [_crop_metrics(crop) for crop in np.unique(items)]
crop_df = pd.DataFrame(crop_results).sort_values('RMSPE (%)')
print(crop_df.round(2))
crop_df.to_csv('per_crop_performance.csv', index=False)

## 13. SHAP Analysis (If Tree Model)

In [None]:
# results_df is sorted by RMSPE, so its first row is the best model.
best_model_name = results_df.index[0]
# TreeExplainer is only used here for the tree ensembles.
if best_model_name in models and best_model_name in ['RF', 'XGB']:
    best_model = models[best_model_name]
    print(f"Running SHAP on {best_model_name}")
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test_ml_matched)
    # summary_plot draws the beeswarm under plot_type="dot"; "beeswarm" is not
    # one of its documented plot types.
    shap.summary_plot(shap_values, X_test_ml_matched, plot_type="dot", show=False)
    plt.title(f"SHAP Beeswarm ({best_model_name})", fontsize=16)
    plt.savefig("shap_beeswarm.png", bbox_inches='tight')
    plt.show()
    shap.summary_plot(shap_values, X_test_ml_matched, plot_type="bar", show=False)
    plt.title(f"Feature Importance ({best_model_name})", fontsize=16)
    plt.savefig("shap_importance.png", bbox_inches='tight')
    plt.show()
else:
    print("SHAP skipped for non-tree model.")

## 14. Export Predictions

In [None]:
# One 'predicted_<model>' column per evaluated model, on the original yield scale.
_pred_cols = {f'predicted_{name}': y_preds_original[name] for name in test_preds}

final_predictions_df = matched_test_df.copy().assign(
    true_yield_original=y_true_original,
    **_pred_cols,
)
export_cols = ['Year', 'Area', 'Item', 'true_yield_original', *_pred_cols]
final_predictions_df[export_cols].to_csv("final_test_predictions.csv", index=False)
print("Exported predictions.")
print("\n--- Complete ---")