# Crop Yield Prediction – Final Model Pipeline (Rev9)

## 1. Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import os
from scipy.signal import detrend
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, TensorDataset
import optuna
import shap

# Notebook-wide setup: silence warnings, pick the seaborn theme, and seed the
# NumPy and PyTorch global generators with the same fixed value.
warnings.filterwarnings('ignore')
sns.set_style("whitegrid")
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(42)
print("All libraries loaded.")

## 2. Model Selection Menu

In [None]:
# Per-model switches. A model is processed only when its flag is True; the
# tuning loop walks this mapping in insertion order.
RUN_MODELS = dict(
    LR=True,    # Baseline Linear Regression
    RF=True,    # Random Forest
    XGB=True,   # XGBoost
    LSTM=True,  # LSTM
    CNN=True,   # CNN
)

# Master switch for the hyper-parameter search section.
RUN_OPTUNA = True

## 3. Load & Preprocess Data (Corrected)

In [None]:
# Corrected Data Preprocessing
# Only the read itself is guarded, so an unrelated error cannot be mistaken for
# a missing file; the original exception is kept as the explicit cause.
try:
    df = pd.read_csv("cleaned_crop_data.csv")
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Ensure 'cleaned_crop_data.csv' is present. Run the EDA notebook first."
    ) from err
else:
    print(f"Loaded initial data: {df.shape}")

TARGET = 'hg/ha_yield'
TIME_COL = 'Year'
CAT_COLS = ['Area', 'Item']
NUMERIC_COLS = ['average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp', 'fertilizer_kg/ha', 'solar_radiation_MJ/m2-day']
TARGET_DET = 'yield_detrended'

# Fail early, with the full list, if the file lacks a column the pipeline reads.
_missing_cols = [c for c in [TARGET, TIME_COL] + CAT_COLS + NUMERIC_COLS if c not in df.columns]
if _missing_cols:
    raise KeyError(f"'cleaned_crop_data.csv' is missing required columns: {_missing_cols}")

# 1. Split data chronologically
# Train: Year <= TRAIN_END, validation: TRAIN_END < Year <= VAL_END, test: Year > VAL_END.
TRAIN_END = 2007
VAL_END = 2010
train_df_orig = df[df[TIME_COL] <= TRAIN_END].copy()
val_df_orig = df[(df[TIME_COL] > TRAIN_END) & (df[TIME_COL] <= VAL_END)].copy()
test_df_orig = df[df[TIME_COL] > VAL_END].copy()
print(f"1. Initial data split: Train: {train_df_orig.shape}, Val: {val_df_orig.shape}, Test: {test_df_orig.shape}")

# 2. Fit encoders ON TRAINING DATA ONLY
le_area = LabelEncoder().fit(train_df_orig['Area'])
le_item = LabelEncoder().fit(train_df_orig['Item'])

# A label's code is its position in the encoder's classes_ array, so a dict
# built once per encoder gives the same codes as transform() without calling
# transform() and scanning classes_ for every single row.
_area_codes = {label: code for code, label in enumerate(le_area.classes_)}
_item_codes = {label: code for code, label in enumerate(le_item.classes_)}
for d in [train_df_orig, val_df_orig, test_df_orig]:
    # Labels that never occur in the training split get the sentinel code -1.
    # NOTE(review): these codes are later used as nn.Embedding indices, and -1 is
    # not a valid embedding index -- confirm that val/test contain no unseen
    # Area/Item values, or reserve a dedicated "unknown" index.
    d['Area_Encoded'] = d['Area'].map(_area_codes).fillna(-1).astype('int64')
    d['Item_Encoded'] = d['Item'].map(_item_codes).fillna(-1).astype('int64')
print("2. Encoders fitted on train set and applied to all sets.")

# 3. Fit trend models ON TRAINING DATA ONLY
print("3. Fitting trend models on training data...")
# One straight-line fit of yield against year for every (Area, Item) pair that
# appears in the training split.
trend_models = {
    key: LinearRegression().fit(frame[[TIME_COL]], frame[TARGET])
    for key, frame in train_df_orig.groupby(CAT_COLS)
}

# Pooled fit over all training rows; used for pairs that have no model of their own.
global_trend_model = LinearRegression()
global_trend_model.fit(train_df_orig[[TIME_COL]], train_df_orig[TARGET])
print(f"   Fitted {len(trend_models)} group-specific trend models and 1 global model.")

# 4. Apply detrending to all datasets
for split_frame in (train_df_orig, val_df_orig, test_df_orig):
    split_frame['yield_trend'] = 0.0
    for key, rows in split_frame.groupby(CAT_COLS):
        fitted = trend_models.get(key, global_trend_model)
        split_frame.loc[rows.index, 'yield_trend'] = fitted.predict(rows[[TIME_COL]])
    # The residual around the fitted trend is what the models learn to predict.
    split_frame['yield_detrended'] = split_frame[TARGET] - split_frame['yield_trend']
print("   Detrending applied to all datasets.")

# 5. Create lags and finalize split for ML models
# The three splits are stacked and ordered by (Area, Item, Year) so that a shift
# inside a group can reach back across a split boundary.
full_df_ml = pd.concat([train_df_orig, val_df_orig, test_df_orig]).sort_values(CAT_COLS + [TIME_COL])
lag_cols = ['yield_detrended'] + NUMERIC_COLS
# shift(k) takes the value k rows earlier within the group.
# NOTE(review): that equals "k years earlier" only if every group has exactly
# one row per consecutive year -- confirm against the data.
for col, lag in [(c, k) for c in lag_cols for k in (1, 2)]:
    full_df_ml[f'{col}_lag{lag}'] = full_df_ml.groupby(CAT_COLS)[col].shift(lag)

# dropna() removes every row holding a missing value in any column, which
# includes the first rows of each group where the lags are undefined.
df_ml = full_df_ml.dropna().copy()
_years_ml = df_ml[TIME_COL]
train_df = df_ml[_years_ml <= TRAIN_END].copy()
val_df = df_ml[(_years_ml > TRAIN_END) & (_years_ml <= VAL_END)].copy()
test_df = df_ml[_years_ml > VAL_END].copy()
print(f"4. Lags created for ML models: Train: {train_df.shape}, Val: {val_df.shape}, Test: {test_df.shape}")

# 6. Fit scalers for ML models ON TRAINING DATA ONLY
lagged_cols = [name for name in df_ml.columns if '_lag' in name]
scale_cols = NUMERIC_COLS + lagged_cols
# Encoded categoricals are model inputs but are left unscaled.
ml_features = scale_cols + ['Area_Encoded', 'Item_Encoded']

# Feature scaler: statistics come from the training rows, then every split is transformed.
x_scaler = StandardScaler().fit(train_df[scale_cols])
for frame in (train_df, val_df, test_df):
    frame[scale_cols] = x_scaler.transform(frame[scale_cols])

# Target scaler: same scheme for the detrended yield.
y_scaler = StandardScaler().fit(train_df[[TARGET_DET]])
for frame in (train_df, val_df, test_df):
    frame[TARGET_DET] = y_scaler.transform(frame[[TARGET_DET]])
print("5. X and y scalers for ML models fitted and applied.")

# 7. Save transformers
# File name -> fitted preprocessing object, written in this order.
_artifacts = {
    'scaler.joblib': x_scaler,
    'y_scaler.joblib': y_scaler,
    'le_area.joblib': le_area,
    'le_item.joblib': le_item,
    'trend_models.joblib': trend_models,
    'global_trend_model.joblib': global_trend_model,
}
for _path, _obj in _artifacts.items():
    joblib.dump(_obj, _path)
print("6. All transformers saved to disk.")

# Number of distinct training labels per categorical column; used as the
# embedding table sizes of the sequence models.
N_AREAS, N_ITEMS = (len(enc.classes_) for enc in (le_area, le_item))

## 4. Prepare Inputs (Corrected)

In [None]:
# ML Inputs
# Feature matrix and (scaled, detrended) target vector for each chronological split.
X_train_ml, y_train_ml = train_df[ml_features], train_df[TARGET_DET]
X_val_ml, y_val_ml = val_df[ml_features], val_df[TARGET_DET]
X_test_ml, y_test_ml = test_df[ml_features], test_df[TARGET_DET]
print("ML inputs prepared.")

# DL Inputs
LOOKBACK = 5
DL_FEATS = NUMERIC_COLS + ['Area_Encoded', 'Item_Encoded']

# Correctly scale DL features
# The sequence models get their own feature scaler, fitted on the un-lagged
# training frame. NOTE(review): the *_orig frames are overwritten in place, so
# executing this cell a second time scales already-scaled values again.
scaler_dl_x = StandardScaler().fit(train_df_orig[NUMERIC_COLS])
for frame in (train_df_orig, val_df_orig, test_df_orig):
    frame[NUMERIC_COLS] = scaler_dl_x.transform(frame[NUMERIC_COLS])
    # Use the already fitted y_scaler for the target
    frame[TARGET_DET] = y_scaler.transform(frame[[TARGET_DET]])
print("DL features and target scaled.")

def create_sequences(data, lookback, feats, target, group_cols=None, time_col=None):
    """Build fixed-length sliding windows per group for the sequence models.

    Each group is ordered by ``time_col`` and every run of ``lookback``
    consecutive rows becomes one sample whose label is the target value of the
    window's last row. Groups with fewer than ``lookback`` rows are skipped.

    Args:
        data: DataFrame holding ``feats``, ``target``, the grouping columns and
            the time column.
        lookback: Number of rows per window.
        feats: Feature column names, in the order they appear on the last axis.
        target: Name of the label column.
        group_cols: Columns that define a group; defaults to the module-level
            ``CAT_COLS``.
        time_col: Column used to order rows inside a group; defaults to the
            module-level ``TIME_COL``.

    Returns:
        ``(X, y, y_indices)`` where ``X`` has shape
        ``(n_samples, lookback, len(feats))``, ``y`` has shape ``(n_samples,)``
        and ``y_indices`` lists the ``data`` index label of each label row.
        With no samples the arrays are empty but keep those ranks, so callers
        can still slice the feature axis.
    """
    if group_cols is None:
        group_cols = CAT_COLS
    if time_col is None:
        time_col = TIME_COL

    X, y = [], []
    # Keep track of original index
    y_indices = []
    for _, group in data.groupby(group_cols):
        if len(group) < lookback:
            continue
        # Windows must be chronological; a stable sort leaves already-ordered
        # input (and ties) exactly as they were.
        group = group.sort_values(time_col, kind='stable')
        gf = group[feats].values
        gt = group[target].values
        indices = group.index
        for end in range(lookback, len(group) + 1):
            X.append(gf[end - lookback:end])
            y.append(gt[end - 1])
            y_indices.append(indices[end - 1])

    if not X:
        return np.empty((0, lookback, len(feats))), np.empty((0,)), y_indices
    return np.array(X), np.array(y), y_indices

def _sequences_for(frame):
    """Window one split with the notebook-wide lookback, feature list and target."""
    return create_sequences(frame, LOOKBACK, DL_FEATS, TARGET_DET)


# Each split is windowed on its own, so a group contributes to a split only if
# it has at least LOOKBACK rows inside that split.
X_train_seq, y_train_seq, _ = _sequences_for(train_df_orig)
X_val_seq, y_val_seq, _ = _sequences_for(val_df_orig)
X_test_seq, y_test_seq, y_test_indices = _sequences_for(test_df_orig)
print("DL sequences created.")

# Create a reference dataframe for test set evaluation
# Rows are the label rows of the test windows, in window order.
test_df_dl_seq_ref = test_df_orig.loc[y_test_indices]

def split_dl(X):
    """Split a windowed feature array into the three model input tensors.

    The last axis of ``X`` is read as: the first ``len(NUMERIC_COLS)`` entries
    are numeric features, the next entry is the area code and the one after it
    the item code.

    Returns:
        ``[numeric (float32), area codes (long), item codes (long)]``.
    """
    n_num = len(NUMERIC_COLS)
    numeric_part = torch.tensor(X[..., :n_num], dtype=torch.float32)
    area_ids = torch.tensor(X[..., n_num], dtype=torch.long)
    item_ids = torch.tensor(X[..., n_num + 1], dtype=torch.long)
    return [numeric_part, area_ids, item_ids]

# Model inputs per split, each a [numeric, area codes, item codes] tensor list.
X_train_dl, X_val_dl, X_test_dl = (
    split_dl(seq) for seq in (X_train_seq, X_val_seq, X_test_seq)
)


def _as_column(values):
    """Return ``values`` as a float32 tensor with a trailing length-1 axis."""
    return torch.tensor(values, dtype=torch.float32).unsqueeze(1)


y_train_t = _as_column(y_train_seq)
y_val_t = _as_column(y_val_seq)
y_test_t = _as_column(y_test_seq)
print("DL tensors created.")

## 5. Metrics

In [None]:
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error, in percent.

    Both inputs are coerced to float arrays, so lists and pandas Series are
    accepted and compared position by position.

    A constant 1e-8 is added to the denominator; it avoids an exact division by
    zero but targets at or near zero still produce very large values.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(((y_true - y_pred) / (y_true + 1e-8)) ** 2)) * 100

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both inputs are coerced to float arrays, so lists and pandas Series are
    accepted and compared position by position.

    A constant 1e-8 is added to the denominator; it avoids an exact division by
    zero but targets at or near zero still produce very large values.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / (y_true + 1e-8))) * 100

## 6. Optuna Objectives (Corrected)

In [None]:
def objective_lr(trial):
    """Optuna objective for the linear-regression baseline.

    ``trial`` is accepted for interface parity only: nothing is sampled from it.
    Returns the RMSE of the fitted model on the validation split.
    """
    # No hyperparameters to tune for Linear Regression
    baseline = LinearRegression().fit(X_train_ml, y_train_ml)
    val_mse = mean_squared_error(y_val_ml, baseline.predict(X_val_ml))
    return np.sqrt(val_mse)  # Return RMSE

def objective_rf(trial):
    """Optuna objective for the random forest: validation RMSE of one sampled configuration."""
    n_estimators = trial.suggest_int('n_estimators', 50, 400)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_float('max_features', 0.5, 1.0)

    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(X_train_ml, y_train_ml)
    val_mse = mean_squared_error(y_val_ml, forest.predict(X_val_ml))
    return np.sqrt(val_mse)  # Return RMSE

def objective_xgb(trial):
    """Optuna objective for XGBoost: validation RMSE of one sampled configuration.

    The validation split doubles as the early-stopping evaluation set.
    """
    # Hyper-parameters sampled by the study.
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
    }
    # Settings held constant across trials.
    fixed = {
        'early_stopping_rounds': 50,  # Increase patience for lower LR
        'eval_metric': 'rmse',
    }

    booster = xgb.XGBRegressor(random_state=42, **searched, **fixed)
    booster.fit(
        X_train_ml,
        y_train_ml,
        eval_set=[(X_val_ml, y_val_ml)],
        verbose=False,
    )

    val_mse = mean_squared_error(y_val_ml, booster.predict(X_val_ml))
    return np.sqrt(val_mse)  # Return RMSE

class LSTMModel(nn.Module):
    """LSTM regressor over numeric features plus learned area/item embeddings.

    Args:
        n_areas: Size of the area embedding table.
        n_items: Size of the item embedding table.
        lstm_units: Hidden size of the LSTM.
        dense_units: Width of the hidden fully connected layer.
        dropout: Dropout probability applied to the last LSTM output.
        n_numeric: Number of numeric input features; defaults to
            ``len(NUMERIC_COLS)`` so existing callers are unaffected.
        area_emb_dim: Width of the area embedding (default 10).
        item_emb_dim: Width of the item embedding (default 5).
    """

    def __init__(self, n_areas, n_items, lstm_units, dense_units, dropout,
                 n_numeric=None, area_emb_dim=10, item_emb_dim=5):
        super().__init__()
        if n_numeric is None:
            n_numeric = len(NUMERIC_COLS)
        self.embed_area = nn.Embedding(n_areas, area_emb_dim)
        self.embed_item = nn.Embedding(n_items, item_emb_dim)
        self.lstm = nn.LSTM(n_numeric + area_emb_dim + item_emb_dim, lstm_units, batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.fc1 = nn.Linear(lstm_units, dense_units)
        self.fc2 = nn.Linear(dense_units, 1)

    def forward(self, num, area, item):
        """Return one prediction per sample.

        ``num`` is expected as (batch, steps, n_numeric) floats; ``area`` and
        ``item`` as (batch, steps) integer codes.
        """
        e_area = self.embed_area(area)
        e_item = self.embed_item(item)
        # Numeric features and both embeddings are joined on the feature axis.
        x = torch.cat([num, e_area, e_item], dim=-1)
        out, _ = self.lstm(x)
        # Only the output of the final time step feeds the regression head.
        out = self.drop(out[:, -1])
        out = torch.relu(self.fc1(out))
        return self.fc2(out)

def train_dl(model, opt, loss_fn, train_loader, val_loader, target_scaler, epochs=100, patience=10, is_final=False):
    """Train a sequence model with early stopping on the monitored set.

    Args:
        model: Module called as ``model(x1, x2, x3)``.
        opt: Optimizer over ``model``'s parameters.
        loss_fn: Loss function. The square root of its value on the monitored
            set is tracked as RMSE, so a mean-squared-error loss is expected.
        train_loader: Yields ``(x1, x2, x3, y)`` batches.
        val_loader: Loader whose ``dataset.tensors`` are ``(x1, x2, x3, y)``;
            the whole set is scored in a single forward pass each epoch.
        target_scaler: Not used; kept so existing calls keep working.
        epochs: Maximum number of epochs.
        patience: Epochs without improvement tolerated before stopping.
        is_final: When True, the weights are written to
            ``model_<ClassName>.pth`` every time the monitored RMSE improves.

    Returns:
        ``(train_losses, val_losses, best_val_rmse)``: mean training loss per
        epoch, monitored loss per epoch (not square-rooted) and the lowest
        monitored RMSE. On return ``model`` holds the weights of that best
        epoch, provided at least one epoch improved on the initial value.
    """
    device = next(model.parameters()).device
    scheduler = ReduceLROnPlateau(opt, 'min', patience=5, factor=0.5)
    best_val_rmse = float('inf')  # Optimize for RMSE
    best_state = None
    wait = 0
    train_losses = []
    val_losses = []

    # The monitored tensors do not change between epochs: move them once, and
    # move the target too so it always sits on the same device as the output.
    val_tensors = val_loader.dataset.tensors
    val_inputs = [t.to(device) for t in val_tensors[:3]]
    val_y = val_tensors[3].to(device)

    for _ in range(epochs):
        model.train()
        train_loss = 0
        for x1, x2, x3, y in train_loader:
            # No-op when the batch already lives on the model's device.
            x1, x2, x3, y = (t.to(device) for t in (x1, x2, x3, y))
            opt.zero_grad()
            pred = model(x1, x2, x3)
            loss = loss_fn(pred, y)
            loss.backward()
            opt.step()
            train_loss += loss.item()
        train_losses.append(train_loss / len(train_loader))

        model.eval()
        with torch.no_grad():
            val_mse = loss_fn(model(*val_inputs), val_y).item()
        val_rmse = np.sqrt(val_mse)  # Calculate RMSE
        val_losses.append(val_mse)

        scheduler.step(val_rmse)  # Step based on validation RMSE

        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            wait = 0
            # Snapshot by value: state_dict() tensors alias the live parameters.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
            if is_final:
                torch.save(model.state_dict(), f'model_{model.__class__.__name__}.pth')
        else:
            wait += 1
            if wait >= patience:
                break

    # Without this the caller would keep the weights of the last epoch run,
    # i.e. `patience` epochs past the best one after an early stop.
    if best_state is not None:
        model.load_state_dict(best_state)
    return train_losses, val_losses, best_val_rmse

def objective_lstm(trial):
    """Optuna objective for the LSTM: best validation RMSE reached by one sampled configuration."""
    # Architecture choices.
    lstm_units = trial.suggest_categorical('lstm_units', [64, 128])
    dense_units = trial.suggest_categorical('dense_units', [32, 64])
    dropout = trial.suggest_float('dropout', 0.1, 0.4)
    # Optimizer settings (not constructor arguments of the model).
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-3, log=True)

    model = LSTMModel(N_AREAS, N_ITEMS, lstm_units=lstm_units, dense_units=dense_units, dropout=dropout)
    opt = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    train_loader = DataLoader(TensorDataset(*X_train_dl, y_train_t), batch_size=64, shuffle=True)
    val_loader = DataLoader(TensorDataset(*X_val_dl, y_val_t), batch_size=64)
    outcome = train_dl(model, opt, nn.MSELoss(), train_loader, val_loader, y_scaler)
    return outcome[2]

class CNNModel(nn.Module):
    """1-D convolutional regressor over numeric features plus area/item embeddings.

    Args:
        n_areas: Size of the area embedding table.
        n_items: Size of the item embedding table.
        filters: Number of convolution output channels.
        kernel: Convolution kernel size along the time axis.
        dense_units: Width of the hidden fully connected layer.
        n_numeric: Number of numeric input features; defaults to
            ``len(NUMERIC_COLS)`` so existing callers are unaffected.
        area_emb_dim: Width of the area embedding (default 10).
        item_emb_dim: Width of the item embedding (default 5).
    """

    def __init__(self, n_areas, n_items, filters, kernel, dense_units,
                 n_numeric=None, area_emb_dim=10, item_emb_dim=5):
        super().__init__()
        if n_numeric is None:
            n_numeric = len(NUMERIC_COLS)
        self.embed_area = nn.Embedding(n_areas, area_emb_dim)
        self.embed_item = nn.Embedding(n_items, item_emb_dim)
        self.conv = nn.Conv1d(n_numeric + area_emb_dim + item_emb_dim, filters, kernel)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(filters, dense_units)
        self.fc2 = nn.Linear(dense_units, 1)

    def forward(self, num, area, item):
        """Return one prediction per sample.

        ``num`` is expected as (batch, steps, n_numeric) floats; ``area`` and
        ``item`` as (batch, steps) integer codes.
        """
        e_area = self.embed_area(area)
        e_item = self.embed_item(item)
        # Conv1d convolves over the last axis, so features are moved to axis 1.
        x = torch.cat([num, e_area, e_item], dim=-1).transpose(1, 2)
        x = torch.relu(self.conv(x))
        # Max over the remaining time axis leaves one value per filter.
        x = self.pool(x).squeeze(-1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

def objective_cnn(trial):
    """Optuna objective for the CNN: best validation RMSE reached by one sampled configuration."""
    # Architecture choices.
    filters = trial.suggest_categorical('filters', [64, 128])
    kernel = trial.suggest_categorical('kernel', [2, 3])
    dense_units = trial.suggest_categorical('dense_units', [32, 64])
    # Optimizer settings (not constructor arguments of the model).
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-3, log=True)

    model = CNNModel(N_AREAS, N_ITEMS, filters=filters, kernel=kernel, dense_units=dense_units)
    opt = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    train_loader = DataLoader(TensorDataset(*X_train_dl, y_train_t), batch_size=64, shuffle=True)
    val_loader = DataLoader(TensorDataset(*X_val_dl, y_val_t), batch_size=64)
    outcome = train_dl(model, opt, nn.MSELoss(), train_loader, val_loader, y_scaler)
    return outcome[2]

## 7. Run Optuna

In [None]:
# Start from the parameters of an earlier search when a saved file exists, so
# entries for models not tuned in this run are kept.
best_params = (
    joblib.load('best_params_optuna.joblib')
    if os.path.exists('best_params_optuna.joblib')
    else {}
)

if not RUN_OPTUNA:
    print('Skipping Optuna tuning.')
else:
    studies = {}
    objectives = {
        'LR': objective_lr,
        'RF': objective_rf,
        'XGB': objective_xgb,
        'LSTM': objective_lstm,
        'CNN': objective_cnn
    }
    # Trials per model; anything not listed gets 30. LR has nothing to tune.
    trial_budget = {'LR': 1, 'RF': 50, 'XGB': 50}
    for name, run in RUN_MODELS.items():
        if not run:
            continue
        print(f'--- Tuning {name} ---')
        study = optuna.create_study(direction='minimize')
        n_trials = trial_budget.get(name, 30)
        study.optimize(objectives[name], n_trials=n_trials, show_progress_bar=True)
        best_params[name] = study.best_params
        studies[name] = study
        joblib.dump(best_params, 'best_params_optuna.joblib')  # Save after each study
        print(f'Best params for {name}: {study.best_params}')

## 7a. Visualize Optuna Results

In [None]:
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_param_importances

if RUN_OPTUNA and 'studies' in locals():
    # (plot function, figure title suffix, file name suffix) for the plots that
    # need no special error handling.
    standard_plots = [
        (plot_optimization_history, 'Optimization History', 'history'),
        (plot_parallel_coordinate, 'Parallel Coordinate', 'parallel_coordinate'),
        (plot_slice, 'Slice Plot', 'slice'),
    ]
    for name, study in studies.items():
        # LR has no searched parameters, and a study without trials has nothing to show.
        if name == 'LR' or not study.trials:
            continue
        print(f'--- Visualizing Optuna results for {name} ---')

        for make_plot, title, suffix in standard_plots:
            fig = make_plot(study)
            fig.update_layout(title=f'{name} {title}')
            fig.write_image(f'optuna_{name}_{suffix}.png')
            fig.show()

        # Parameter Importance
        try:
            fig = plot_param_importances(study)
            fig.update_layout(title=f'{name} Parameter Importance')
            fig.write_image(f'optuna_{name}_param_importance.png')
            fig.show()
        except (ValueError, RuntimeError) as e:
            print(f'Could not plot parameter importance for {name}: {e}')

## 8. Final Training (Corrected)

In [None]:
# Combine train+val for final ML model training
X_train_full_ml = pd.concat([X_train_ml, X_val_ml])
y_train_full_ml = pd.concat([y_train_ml, y_val_ml])

models = {}
print("--- Final Model Training ---")


def _keep_and_save(key, fitted, path):
    """Register a fitted model under ``key`` and write it to ``path``."""
    models[key] = fitted
    joblib.dump(fitted, path)


if RUN_MODELS['LR']:
    print("Training Linear Regression...")
    model_lr = LinearRegression().fit(X_train_full_ml, y_train_full_ml)
    _keep_and_save('LR', model_lr, 'model_lr.joblib')

if RUN_MODELS['RF']:
    print("Training Random Forest...")
    # Use best params from Optuna, or default if not run
    rf_params = best_params.get('RF', {'n_estimators': 100, 'max_depth': 10})
    model_rf = RandomForestRegressor(random_state=42, n_jobs=-1, **rf_params).fit(X_train_full_ml, y_train_full_ml)
    _keep_and_save('RF', model_rf, 'model_rf.joblib')

if RUN_MODELS['XGB']:
    print("Training XGBoost...")
    # Only the searched parameters are stored in best_params, so this fit runs
    # without the early stopping that the tuning objective used.
    xgb_params = best_params.get('XGB', {'n_estimators': 200, 'learning_rate': 0.05})
    model_xgb = xgb.XGBRegressor(random_state=42, **xgb_params).fit(X_train_full_ml, y_train_full_ml)
    _keep_and_save('XGB', model_xgb, 'model_xgb.joblib')

# Final DL model training.
# train_dl uses the set it monitors for early stopping, learning-rate
# scheduling and checkpoint selection. Monitoring the test split would therefore
# tune the models on the data they are scored on afterwards, so the models are
# fitted on the training split and monitored on the validation split -- the
# same regime the Optuna objectives use. To also fit on the validation rows,
# first take the epoch count from a run like this one, then refit on train+val
# for that many epochs without a monitored set.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_ds = TensorDataset(*[x.to(device) for x in X_train_dl], y_train_t.to(device))
val_ds = TensorDataset(*[x.to(device) for x in X_val_dl], y_val_t.to(device))
test_ds = TensorDataset(*[x.to(device) for x in X_test_dl], y_test_t.to(device))
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)
test_loader = DataLoader(test_ds, batch_size=64)  # for scoring only; never passed to train_dl

if RUN_MODELS['LSTM']:
    print("Training LSTM...")
    # Copy first: .get() returns the dict stored in best_params, and popping
    # from it would strip 'lr'/'weight_decay' from the saved search results, so
    # a re-run of this cell would silently fall back to the defaults.
    lstm_params = dict(best_params.get('LSTM', {'lstm_units': 64, 'dense_units': 32, 'dropout': 0.2}))
    lr = lstm_params.pop('lr', 0.001)
    weight_decay = lstm_params.pop('weight_decay', 1e-4)
    model_lstm = LSTMModel(N_AREAS, N_ITEMS, **lstm_params).to(device)
    opt_lstm = optim.Adam(model_lstm.parameters(), lr=lr, weight_decay=weight_decay)
    train_losses_lstm, val_losses_lstm, _ = train_dl(model_lstm, opt_lstm, nn.MSELoss(), train_loader, val_loader, y_scaler, epochs=150, patience=15, is_final=True)
    models['LSTM'] = model_lstm

if RUN_MODELS['CNN']:
    print("Training CNN...")
    # Copied for the same reason as the LSTM parameters.
    cnn_params = dict(best_params.get('CNN', {'filters': 64, 'kernel': 2, 'dense_units': 32}))
    lr = cnn_params.pop('lr', 0.001)
    weight_decay = cnn_params.pop('weight_decay', 1e-4)
    model_cnn = CNNModel(N_AREAS, N_ITEMS, **cnn_params).to(device)
    opt_cnn = optim.Adam(model_cnn.parameters(), lr=lr, weight_decay=weight_decay)
    train_losses_cnn, val_losses_cnn, _ = train_dl(model_cnn, opt_cnn, nn.MSELoss(), train_loader, val_loader, y_scaler, epochs=150, patience=15, is_final=True)
    models['CNN'] = model_cnn

## 9. Plot DL Loss Curves

In [None]:
# One panel per enabled sequence model, so the curves are also drawn when only
# the LSTM or only the CNN was trained.
dl_curves = []
if RUN_MODELS['LSTM']:
    dl_curves.append(('LSTM Model Loss', train_losses_lstm, val_losses_lstm))
if RUN_MODELS['CNN']:
    dl_curves.append(('CNN Model Loss', train_losses_cnn, val_losses_cnn))

if dl_curves:
    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(1, len(dl_curves), figsize=(10 * len(dl_curves), 7), squeeze=False)
    for ax, (title, train_curve, val_curve) in zip(axes[0], dl_curves):
        ax.plot(train_curve, label='Train Loss')
        ax.plot(val_curve, label='Validation (Test) Loss')
        ax.set_title(title, fontsize=16)
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Mean Squared Error')
        ax.legend()
    plt.suptitle('Deep Learning Training Curves', fontsize=20)
    plt.savefig("loss_curves.png")
    plt.show()

## 10. Final Evaluation (Corrected)

In [None]:
def reconstruct_yield(y_pred_scaled, df_ref, y_scaler_obj):
    """Inverse transforms a prediction to the original yield scale.

    The scaled prediction is passed through ``y_scaler_obj.inverse_transform``
    and the row-wise trend from ``df_ref['yield_trend']`` is added back.

    Args:
        y_pred_scaled: Predictions as a 1-D or single-column array-like.
        df_ref: DataFrame with a ``yield_trend`` column, one row per
            prediction, in the same order as the predictions.
        y_scaler_obj: Object providing ``inverse_transform`` for a
            single-column 2-D array.

    Returns:
        1-D array of predictions on the original yield scale.

    Raises:
        ValueError: If the number of predictions differs from the number of
            reference rows (which would otherwise broadcast silently when one
            side has a single row).
    """
    y_pred_scaled = np.asarray(y_pred_scaled).reshape(-1, 1)
    trend = df_ref['yield_trend'].values.reshape(-1, 1)
    if len(y_pred_scaled) != len(trend):
        raise ValueError(
            f"Got {len(y_pred_scaled)} predictions for {len(trend)} reference rows."
        )

    y_pred_detrended = y_scaler_obj.inverse_transform(y_pred_scaled)
    y_pred_actual = y_pred_detrended + trend

    return y_pred_actual.flatten()

# --- Evaluation ---
results = []
y_preds_original = {}

print("\n--- Final Performance (Test Set) ---")

for name, model in models.items():
    if name in ('LR', 'RF', 'XGB'):
        # Tabular models are scored against the lag-complete test rows.
        scaled_out = model.predict(X_test_ml)
        pred_orig = reconstruct_yield(scaled_out, test_df, y_scaler)
        y_true_orig = test_df[TARGET].values
        # Store for later use
        y_preds_original[name] = pred_orig

    elif name in ('LSTM', 'CNN'):
        # Sequence models are scored against the label rows of the test windows.
        model.eval()
        with torch.no_grad():
            scaled_out = model(*[x.to(device) for x in X_test_dl]).cpu().numpy()
            pred_orig = reconstruct_yield(scaled_out, test_df_dl_seq_ref, y_scaler)
            y_true_orig = test_df_dl_seq_ref[TARGET].values
            # Store for later use
            y_preds_original[name] = pred_orig

    # Calculate metrics (all on the original yield scale)
    results.append({
        'Model': name,
        'MAE': mean_absolute_error(y_true_orig, pred_orig),
        'RMSE': np.sqrt(mean_squared_error(y_true_orig, pred_orig)),
        'MAPE (%)': mape(y_true_orig, pred_orig),
        'RMSPE (%)': rmspe(y_true_orig, pred_orig),
        'R²': r2_score(y_true_orig, pred_orig),
    })

# Lowest RMSE first; later cells treat the first row as the best model.
results_df = pd.DataFrame(results).set_index('Model').sort_values('RMSE')
print(results_df.round(2))
results_df.to_csv("final_model_performance.csv")

## 10a. Visualize Final Predictions

In [None]:
# Plot 1: Predicted vs Actual Scatter Plots
print("--- Visualizing Predicted vs. Actual ---")
n_cols = 3
n_models = len(y_preds_original)
n_rows = -(-n_models // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows))
axes = axes.flatten()

for i, (name, y_pred) in enumerate(y_preds_original.items()):
    ax = axes[i]
    # Ground truth must come from the same rows the model was scored on.
    truth_frame = test_df if name in ('LR', 'RF', 'XGB') else test_df_dl_seq_ref
    y_true = truth_frame[TARGET].values
    lo, hi = y_true.min(), y_true.max()

    r2 = r2_score(y_true, y_pred)
    ax.scatter(y_true, y_pred, alpha=0.5, label='Predictions')
    # Diagonal on which prediction equals truth.
    ax.plot([lo, hi], [lo, hi], 'r--', lw=2, label='Ideal')
    ax.set_xlabel("Actual Yield")
    ax.set_ylabel("Predicted Yield")
    ax.set_title(f'{name}: Predicted vs Actual (R² = {r2:.3f})')
    ax.legend()
    ax.grid(True)

# Hide unused subplots
for spare_ax in axes[i + 1:]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.savefig("predicted_vs_true.png")
plt.show()

# Plot 2: Time Series Predictions per Crop
print("\n--- Visualizing Time Series Predictions per Crop ---")
unique_crops = df['Item'].unique()

# Attach each model's predictions to its reference rows once, instead of copying
# the whole test frame for every crop and every model.
prediction_frames = {}
for name, y_pred in y_preds_original.items():
    ref_frame = test_df if name in ('LR', 'RF', 'XGB') else test_df_dl_seq_ref
    prediction_frames[name] = ref_frame[['Item', 'Year']].assign(prediction=y_pred)

for crop in unique_crops:
    plt.figure(figsize=(18, 8))
    crop_rows = df[df['Item'] == crop]

    # Rows of one crop are selected by Item only, so several rows can share a
    # year (the data is grouped by Area and Item elsewhere). Every series is
    # averaged per year so each line has exactly one point per year.
    # Plot historical training data
    train_mean = crop_rows[crop_rows['Year'] <= TRAIN_END].groupby('Year')[TARGET].mean()
    if not train_mean.empty:
        plt.plot(train_mean.index, train_mean.values, 'k-', label='Historical Train Data', linewidth=1.5)

    # Plot actual test data
    test_mean = crop_rows[crop_rows['Year'] > VAL_END].groupby('Year')[TARGET].mean()
    if not test_mean.empty:
        plt.plot(test_mean.index, test_mean.values, 'b-o', label='Actual Test Data', markersize=5)

    # Plot predictions from each model
    # NOTE(review): a model's mean covers only the rows it produced predictions
    # for, which can be fewer than the rows behind the "Actual" line.
    for name, pred_df in prediction_frames.items():
        pred_mean = pred_df[pred_df['Item'] == crop].groupby('Year')['prediction'].mean()
        if not pred_mean.empty:
            plt.plot(pred_mean.index, pred_mean.values, '--', label=f'Predicted - {name}')

    plt.title(f'Yield Prediction for {crop}', fontsize=16)
    plt.xlabel('Year')
    plt.ylabel('Mean Yield across Areas (hg/ha)')
    plt.legend()
    plt.grid(True)
    plt.show()


## 11. Plot Model Performances

In [None]:
# (results column, panel title) for each bar chart, left to right.
metric_panels = [
    ('RMSE', 'RMSE Comparison'),
    ('MAE', 'MAE Comparison'),
    ('MAPE (%)', 'MAPE Comparison'),
    ('R²', 'R² Comparison'),
]
fig, axs = plt.subplots(1, 4, figsize=(24, 6))
for ax, (column, title) in zip(axs, metric_panels):
    sns.barplot(data=results_df.reset_index(), x='Model', y=column, ax=ax)
    ax.set_title(title)
plt.tight_layout()
plt.savefig("model_performance_comparison.png")
plt.show()

## 12. Per-Crop Reporting (Best Model)

In [None]:
# results_df is sorted by RMSE, so its first index entry is the best model.
best_model_name = results_df.index[0]
print(f"Per-crop report for best model: {best_model_name}")
# Use the correctly aligned test dataframe based on the best model
reporting_df = test_df if best_model_name in ('LR', 'RF', 'XGB') else test_df_dl_seq_ref
y_true_original = reporting_df[TARGET].values
items = reporting_df['Item'].values

crop_results = []
for crop in np.unique(items):
    mask = items == crop
    true = y_true_original[mask]
    pred = y_preds_original[best_model_name][mask]
    if len(true) == 0:
        continue
    crop_results.append({
        'Crop': crop,
        'RMSPE (%)': rmspe(true, pred),
        'MAPE (%)': mape(true, pred),
        'RMSE': np.sqrt(mean_squared_error(true, pred)),
        'R²': r2_score(true, pred)
    })
crop_df = pd.DataFrame(crop_results).sort_values('RMSPE (%)')
print(crop_df.round(2))
crop_df.to_csv('per_crop_performance.csv', index=False)

## 13. SHAP Analysis (If Tree Model)

In [None]:
# SHAP is run only when the best model (first row of results_df) is tree based.
best_model_name = results_df.index[0]
if best_model_name in models and best_model_name in ['RF', 'XGB']:
    best_model = models[best_model_name]
    print(f"Running SHAP on {best_model_name}")
    # For SHAP, we need to use the correctly aligned test features
    X_test_shap = X_test_ml
    explainer = shap.TreeExplainer(best_model)
    shap_values = explainer.shap_values(X_test_shap)
    # summary_plot draws the beeswarm view under plot_type="dot"; "beeswarm" is
    # not one of its documented plot_type values.
    shap.summary_plot(shap_values, X_test_shap, plot_type="dot", show=False)
    plt.title(f"SHAP Beeswarm ({best_model_name})", fontsize=16)
    plt.savefig("shap_beeswarm.png", bbox_inches='tight')
    plt.show()
    shap.summary_plot(shap_values, X_test_shap, plot_type="bar", show=False)
    plt.title(f"Feature Importance ({best_model_name})", fontsize=16)
    plt.savefig("shap_importance.png", bbox_inches='tight')
    plt.show()
else:
    print("SHAP skipped for non-tree model.")

## 14. Export Predictions

In [None]:
# Create a base dataframe for predictions. ML models have more test samples than DL models.
final_predictions_df = test_df.copy()
final_predictions_df['true_yield_original'] = final_predictions_df[TARGET]

# Add predictions. Note that DL predictions will have NaNs for non-sequenced rows.
for name, preds in y_preds_original.items():
    column = f'predicted_{name}'
    if name in ('LR', 'RF', 'XGB'):
        # Same rows and order as test_df, so a positional assignment is enough.
        final_predictions_df[column] = preds
        continue
    # Align DL predictions with the main test dataframe
    # (left join on the row index of the windows' label rows).
    final_predictions_df = final_predictions_df.join(
        pd.Series(preds, index=test_df_dl_seq_ref.index, name=column)
    )

id_cols = ['Year', 'Area', 'Item', 'true_yield_original']
export_cols = id_cols + [f'predicted_{key}' for key in models]
final_predictions_df[export_cols].to_csv("final_test_predictions.csv", index=False)
print("Exported predictions.")
print("\n--- Complete ---")