# Crop Yield Prediction: PyTorch Neural Network with Optuna (Part 5)

## Overview
Here I'm training a **Feedforward Neural Network (PyTorch)** to predict crop yields. I've set it up so we can easily swap the target crop in the data loading part.

## My Plan
1.  **Pick a Crop:** Choose what we want to predict (like Rice).
2.  **Clean Up:** Remove some outliers (min/max yields per country) to help the model learn better.
3.  **Split Data:** Divide by year (Train, Val, Test) so we aren't cheating by predicting the past with future data.
4.  **Scale:** Normalize everything so the Neural Net doesn't get confused by big numbers.
5.  **Initial Model:** Try a basic network first to see how it does.
6.  **Tune It:** Use **Optuna** to automatically find the best settings (hyperparameters).
7.  **Final Test:** See if the tuned model is actually better.

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

# Optuna Visualization stuff
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_slice
from optuna.visualization import plot_param_importances

# Global plot styling: seaborn grid theme plus a wide default figure size.
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 6))

# Run on the GPU when PyTorch reports CUDA support, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

### 1. Loading Data & Choosing a Crop
Grab the dataset and decide what to predict. I'm sticking with **Rice** for now. I'll also drop rows where the target yield is missing.

In [None]:
# Load the modelling table from disk.
df = pd.read_parquet('Parquet/XY_v3.parquet')

# --- SEE WHAT CROPS WE HAVE ---
# Target columns are the ones carrying this prefix; everything after it is the crop name.
TARGET_PREFIX = 'Y_'
target_columns = [col for col in df.columns if col.startswith(TARGET_PREFIX)]
# Slice off only the leading prefix. str.replace would also delete any later
# 'Y_' occurrence inside the crop name and print a name that cannot be chosen.
available_crops = [col[len(TARGET_PREFIX):] for col in target_columns]

print("--- Crops available ---")
print(available_crops)
print("-" * 40)

# --- PICK YOUR CROP HERE ---
CHOSEN_CROP = 'rice'  # <--- Change to 'lettuce', 'pepper', etc. if you want
# ---------------------------

TARGET_COL = f'{TARGET_PREFIX}{CHOSEN_CROP}'

# Fail fast with a readable message instead of a KeyError further down.
if TARGET_COL not in df.columns:
    raise ValueError(f"Target {TARGET_COL} not found. Typo?")

print(f"Target: {TARGET_COL}")

# Drop missing targets (rows without a value for the chosen crop cannot be trained on)
df_model = df.dropna(subset=[TARGET_COL])
print(f"Rows with valid target: {len(df_model)}")

### 2. Cleaning & Preparing Features
Before I set up my features, I need to remove some extreme outliers. Specifically, I'm removing the min and max yield rows for each country to make the model more robust.

Then I'll split the data by year (Train < 2014, Val 2014-2018, Test >= 2019) and scale everything.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import torch
import pandas as pd

# --- Remove min/max TARGET_COL rows per country ---
df_model = df_model.copy()

# idxmin/idxmax return the index LABELS (not positions) of each country's
# lowest- and highest-yield row.
# NOTE(review): the extremes are taken over all years, so rows from the
# validation/test years can be removed too - confirm this is intended.
idx_min = df_model.groupby('area')[TARGET_COL].idxmin().values
idx_max = df_model.groupby('area')[TARGET_COL].idxmax().values

# Combine both label sets into one array
rows_to_drop = np.concatenate([idx_min, idx_max])

# Filter with a label-based boolean mask, then renumber the rows
keep_mask = ~df_model.index.isin(rows_to_drop)
df_model = df_model.loc[keep_mask].reset_index(drop=True)
print(f"Rows after dropping min/max outliers: {len(df_model)}")

# --- DROP UNUSED COLUMNS ---
# Keep only the avg_yield_* columns that mention the chosen crop.
cols_to_drop = [c for c in df_model.columns
                if c.startswith("avg_yield_") and CHOSEN_CROP not in c]
df_model = df_model.drop(columns=cols_to_drop)

# --- DEFINE FEATURES ---
# Everything except the targets (Y_*) and the country identifier.
feature_cols = [c for c in df_model.columns
                if not c.startswith('Y_') and c not in ['area']]

# --- SPLIT DATA BY YEAR ---
TRAIN_END_YEAR = 2014
VAL_END_YEAR = 2019

# 1. Train (< 2014)   2. Validation (2014 - 2018)   3. Test (>= 2019)
mask_train = df_model['year'] < TRAIN_END_YEAR
mask_val = (df_model['year'] >= TRAIN_END_YEAR) & (df_model['year'] < VAL_END_YEAR)
mask_test = df_model['year'] >= VAL_END_YEAR

# SimpleImputer discards columns that have no observed value when it is fitted,
# which would make the imputed array narrower than feature_cols and break the
# DataFrame construction below. Remove such columns explicitly up front.
empty_in_train = [c for c in feature_cols
                  if df_model.loc[mask_train, c].isna().all()]
if empty_in_train:
    print(f"Dropping features with no training data: {empty_in_train}")
    feature_cols = [c for c in feature_cols if c not in empty_in_train]

print(f"Total Features: {len(feature_cols)}")

X_train_raw = df_model.loc[mask_train, feature_cols]
y_train = df_model.loc[mask_train, TARGET_COL]

X_val_raw = df_model.loc[mask_val, feature_cols]
y_val = df_model.loc[mask_val, TARGET_COL]

X_test_raw = df_model.loc[mask_test, feature_cols]
y_test = df_model.loc[mask_test, TARGET_COL]

# --- IMPUTE NANS ---
# Column means are learned from the training split only and reused for val/test.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_raw), columns=feature_cols)
X_val_imputed = pd.DataFrame(imputer.transform(X_val_raw), columns=feature_cols)
X_test_imputed = pd.DataFrame(imputer.transform(X_test_raw), columns=feature_cols)

# --- SCALE DATA ---
# Same rule: the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_val = scaler.transform(X_val_imputed)
X_test = scaler.transform(X_test_imputed)

# Make Tensors (targets reshaped to a single column to match the network output)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)

X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1).to(device)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

print(f"\nTrain size: {len(X_train)}")
print(f"Val size:   {len(X_val)}")
print(f"Test size:  {len(X_test)}")

### 3. Initial Model Testing
I'll try a standard Feedforward Network first to check for any weird errors. I'll also check the metrics for Train, Validation, and Test to see how it generalizes.

In [None]:
# --- BUILD NETWORK ---
class SimpleNN(nn.Module):
    """Fixed baseline regressor: input -> 64 -> 32 -> 1 with ReLU activations.

    Dropout (p=0.1) is applied after the first hidden layer only.
    """

    def __init__(self, input_dim):
        super().__init__()
        stack = [
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return one prediction per input row."""
        prediction = self.layers(x)
        return prediction

# --- TRAIN FUNCTION ---
def train_model(model, X_t, y_t, X_v, y_v, lr=0.001, epochs=150, batch_size=32, verbose=True):
    """Train `model` with Adam on MSE loss and track RMSE per epoch.

    Args:
        model: network to optimise in place.
        X_t, y_t: training features / targets (tensors, batched via DataLoader).
        X_v, y_v: validation features / targets, evaluated in one forward pass.
        lr: Adam learning rate.
        epochs: number of passes over the training data.
        batch_size: mini-batch size for the shuffled training loader.
        verbose: print progress every 50 epochs and on the last epoch.

    Returns:
        (train_losses, val_losses): two lists with one RMSE value per epoch.
    """
    loss_fn = nn.MSELoss()
    opt = optim.Adam(model.parameters(), lr=lr)
    batches = DataLoader(TensorDataset(X_t, y_t), batch_size=batch_size, shuffle=True)

    n_samples = len(X_t)
    last_epoch = epochs - 1
    train_losses, val_losses = [], []

    for epoch in range(epochs):
        # --- optimisation pass ---
        model.train()
        squared_error_sum = 0
        for xb, yb in batches:
            opt.zero_grad()
            batch_loss = loss_fn(model(xb), yb)
            batch_loss.backward()
            opt.step()
            # MSELoss is a batch mean; weight by batch size to get a true epoch mean.
            squared_error_sum += batch_loss.item() * xb.size(0)
        train_rmse = np.sqrt(squared_error_sum / n_samples)

        # --- validation pass (dropout disabled, no gradients) ---
        model.eval()
        with torch.no_grad():
            val_rmse = np.sqrt(loss_fn(model(X_v), y_v).item())

        train_losses.append(train_rmse)
        val_losses.append(val_rmse)

        should_log = verbose and (epoch % 50 == 0 or epoch == last_epoch)
        if should_log:
            print(f"Epoch {epoch}/{epochs} | Train RMSE: {train_rmse:.2f} | Val RMSE: {val_rmse:.2f}")

    return train_losses, val_losses

# --- TRAIN INITIAL MODEL ---
# The network width at the input equals the number of scaled feature columns.
input_dim = X_train.shape[1]
model_init = SimpleNN(input_dim).to(device)

# Baseline hyperparameters for the first, untuned run
baseline_settings = {'lr': 0.001, 'epochs': 150, 'batch_size': 32}
train_hist, val_hist = train_model(
    model_init,
    X_train_tensor, y_train_tensor,
    X_val_tensor, y_val_tensor,
    **baseline_settings,
)

# --- PLOT LEARNING CURVE ---
# One RMSE value per epoch for each split, drawn on a shared axis.
plt.figure(figsize=(10, 6))
for history, curve_label, curve_color in (
    (train_hist, 'Training RMSE', 'blue'),
    (val_hist, 'Validation RMSE', 'red'),
):
    plt.plot(history, label=curve_label, color=curve_color)
plt.title(f'Initial Model Learning Curve ({CHOSEN_CROP})')
plt.xlabel('Epochs')
plt.ylabel('RMSE')
plt.legend()
plt.show()

# --- METRICS TABLE ---
def get_metrics(model, X, y_true):
    """Score `model` on (X, y_true).

    Args:
        model: trained network; switched to eval mode before predicting.
        X: feature tensor fed to the model.
        y_true: targets, either a torch tensor (moved to CPU and flattened)
            or any array-like that the sklearn metrics accept as-is.

    Returns:
        (rmse, r2, preds) where preds is the flattened NumPy prediction array.
    """
    model.eval()
    with torch.no_grad():
        preds = model(X).cpu().numpy().flatten()

    # Tensors must be converted for sklearn; everything else is passed through untouched.
    if isinstance(y_true, torch.Tensor):
        targets = y_true.cpu().numpy().flatten()
    else:
        targets = y_true

    rmse = np.sqrt(mean_squared_error(targets, preds))
    r2 = r2_score(targets, preds)
    return rmse, r2, preds

# Score the initial model on every split; only the test predictions are kept for later plots.
rmse_t, r2_t, _ = get_metrics(model_init, X_train_tensor, y_train_tensor)
rmse_v, r2_v, _ = get_metrics(model_init, X_val_tensor, y_val_tensor)
rmse_test, r2_test, preds_init_test = get_metrics(model_init, X_test_tensor, y_test)

# One row per metric, one column per split
metrics_data = {
    'Metric': ['RMSE', 'R²'],
    'Train': [rmse_t, r2_t],
    'Validation': [rmse_v, r2_v],
    'Test': [rmse_test, r2_test],
}
initial_metrics_table = pd.DataFrame(metrics_data)

print("\n--- Initial Model Metrics ---")
display(initial_metrics_table)

### 4. Tuning with Optuna
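Now I'm using Optuna to hunt for the best architecture. The search space is deliberately narrow: 3 hidden layers (fixed), 32–48 units per layer, dropout between 0.14 and 0.18, learning rate between 0.0006 and 0.0010, batch size 8 or 16, and ReLU vs. LeakyReLU activations.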

In [None]:
# --- DYNAMIC MODEL BUILDER ---
class DynamicNN(nn.Module):
    """Feedforward regressor whose depth, width, dropout and activation are configurable.

    Builds `n_layers` blocks of Linear -> activation -> Dropout (all hidden
    layers have `n_units` units) followed by a single-output Linear layer.
    Any `activation_name` other than "ReLU" / "LeakyReLU" falls back to ReLU.
    """

    def __init__(self, input_dim, n_layers, n_units, dropout, activation_name):
        super().__init__()

        # Activation lookup with ReLU as the fallback for unknown names
        activation_classes = {"ReLU": nn.ReLU, "LeakyReLU": nn.LeakyReLU}
        act_fn = activation_classes.get(activation_name, nn.ReLU)()

        # Layer widths: input, then n_units repeated once per hidden layer
        widths = [input_dim] + [n_units] * n_layers

        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            # The same (parameter-free) activation instance is reused in every block.
            blocks.extend([nn.Linear(fan_in, fan_out), act_fn, nn.Dropout(dropout)])

        # Regression head
        blocks.append(nn.Linear(widths[-1], 1))
        self.model = nn.Sequential(*blocks)

    def forward(self, x):
        """Return one prediction per input row."""
        prediction = self.model(x)
        return prediction

# --- OPTUNA OBJECTIVE ---
def objective(trial):
    """Optuna objective: train one DynamicNN configuration and return its validation RMSE.

    Samples the hyperparameters from `trial`, trains on the training tensors
    for a fixed number of epochs, reports the validation RMSE after every
    epoch so the study's pruner can stop unpromising trials, and returns the
    RMSE of the last epoch.

    Args:
        trial: the optuna trial used to sample hyperparameters and report progress.

    Returns:
        Validation RMSE after the final epoch.

    Raises:
        optuna.exceptions.TrialPruned: when the pruner asks to stop this trial.
    """
    # Scoped Search Space
    n_layers = trial.suggest_int("n_layers", 3, 3)   # Fixed
    n_units = trial.suggest_int("n_units", 32, 48)   # Band around 42
    dropout = trial.suggest_float("dropout", 0.14, 0.18)
    lr = trial.suggest_float("lr", 0.0006, 0.0010, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16])
    activation = trial.suggest_categorical("activation", ["LeakyReLU", "ReLU"])
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam"])

    # Build & Setup
    model = DynamicNN(input_dim, n_layers, n_units, dropout, activation).to(device)
    criterion = nn.MSELoss()
    # Resolve the optimizer from the sampled name so the "optimizer" parameter
    # actually drives training (it used to be sampled and then ignored).
    optimizer_cls = getattr(optim, optimizer_name)
    optimizer = optimizer_cls(model.parameters(), lr=lr)

    train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor),
                              batch_size=batch_size, shuffle=True)

    # Training Loop
    epochs = 40  # Slightly fewer for tuning speed
    for epoch in range(epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

        # Check Validation (dropout off, no gradients)
        model.eval()
        with torch.no_grad():
            val_pred = model(X_val_tensor)
            val_rmse = np.sqrt(criterion(val_pred, y_val_tensor).item())

        # Pruning: hand the intermediate score to Optuna and stop early if asked to
        trial.report(val_rmse, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return val_rmse

# --- RUN STUDY ---
# Lower validation RMSE is better, so the study minimises the objective.
N_TUNING_TRIALS = 20
study = optuna.create_study(study_name='Crop_Yield_NN_Optuna', direction='minimize')
study.optimize(objective, n_trials=N_TUNING_TRIALS)

# Report the winning hyperparameter set
best_found = study.best_params
print("\nBest Params:")
print(best_found)

### 5. Visualizing the Tuning Process
Just some quick plots to see what Optuna found interesting.

In [None]:
# Optimization History
fig = plot_optimization_history(study)
fig.show()

# Parameter Importance
# Best-effort plot: skip it if the importance evaluation fails, but say why.
# Catch Exception rather than using a bare except so that KeyboardInterrupt /
# SystemExit still stop the cell.
try:
    fig = plot_param_importances(study)
except Exception as exc:
    print("Couldn't plot importance.")
    print(f"Reason: {exc!r}")
else:
    fig.show()

### 6. Final Model
Now I'll build the final model using the best settings. I'll train it on the combined Train + Validation data to give it as much info as possible before the final test.

In [None]:
# Combine Train + Val (both were imputed and scaled with statistics fitted on Train only)
X_train_full = np.vstack((X_train, X_val))
y_train_full = np.concatenate((y_train, y_val))

X_train_full_tensor = torch.tensor(X_train_full, dtype=torch.float32).to(device)
y_train_full_tensor = torch.tensor(y_train_full, dtype=torch.float32).view(-1, 1).to(device)

# Best hyperparameters found by the Optuna study
bp = study.best_params

final_model = DynamicNN(
    input_dim,
    bp['n_layers'],
    bp['n_units'],
    bp['dropout'],
    bp['activation']
).to(device)

# Use the optimizer the study selected instead of hard-coding one; fall back
# to Adam when the study did not record an "optimizer" parameter.
optimizer_cls = getattr(optim, bp.get('optimizer', 'Adam'))
optimizer = optimizer_cls(final_model.parameters(), lr=bp['lr'])
criterion = nn.MSELoss()
train_loader = DataLoader(TensorDataset(X_train_full_tensor, y_train_full_tensor),
                          batch_size=bp['batch_size'], shuffle=True)

# NOTE(review): the tuning trials ran for 40 epochs while this run uses 150 -
# confirm the tuned settings are meant to be used with the longer schedule.
print("Training Final Model...")
final_model.train()
for epoch in range(150):
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = final_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

print("Done.")

### 7. Results & Analysis
Let's see the numbers! I'll compare the Initial Model vs the Tuned Model.

In [None]:
# Score the tuned model on the original three splits.
# The 'Train' and 'Validation' columns use the original splits so the table lines
# up with the initial model's table - keep in mind the final model was trained on
# the validation rows as well, so only the 'Test' column is truly held out.
rmse_f_t, r2_f_t, _ = get_metrics(final_model, X_train_tensor, y_train_tensor)
rmse_f_v, r2_f_v, _ = get_metrics(final_model, X_val_tensor, y_val_tensor)
rmse_f_test, r2_f_test, preds_final_test = get_metrics(final_model, X_test_tensor, y_test)

metrics_final = {
    'Metric': ['RMSE', 'R²'],
    'Train': [rmse_f_t, r2_f_t],
    'Validation': [rmse_f_v, r2_f_v],
    'Test': [rmse_f_test, r2_f_test],
}

print("--- Final Tuned Model Metrics ---")
display(pd.DataFrame(metrics_final))

# --- COMPARE PLOTS ---
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

# Shared limits for the identity line so both panels are directly comparable
all_preds = np.concatenate([preds_init_test, preds_final_test])
all_true = np.concatenate([y_test, y_test])
min_val, max_val = min(all_preds.min(), all_true.min()), max(all_preds.max(), all_true.max())

# (axis, predictions, point colour, title) for the Initial and Tuned panels
panels = [
    (axes[0], preds_init_test, 'orange', f'Initial Model\nTest RMSE: {rmse_test:.2f}'),
    (axes[1], preds_final_test, 'green', f'Tuned Model\nTest RMSE: {rmse_f_test:.2f}'),
]
for ax, panel_preds, point_color, panel_title in panels:
    ax.scatter(y_test, panel_preds, alpha=0.4, color=point_color)
    # Dashed diagonal marks perfect predictions
    ax.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2)
    ax.set_title(panel_title)

plt.show()

In [None]:
# --- FULL TIMELINE PLOT ---

# 1. Predictions
# Run every row through the SAME preprocessing chain used for training:
# impute first, then scale. Scaling the raw frame directly would let NaN
# features reach the network, produce NaN predictions, and those rows would
# then silently drop out of the yearly 'Predicted' mean below.
X_all_imputed = pd.DataFrame(imputer.transform(df_model[feature_cols]), columns=feature_cols)
X_all = scaler.transform(X_all_imputed)
X_all_tensor = torch.tensor(X_all, dtype=torch.float32).to(device)

final_model.eval()
with torch.no_grad():
    all_predictions = final_model(X_all_tensor).cpu().numpy().flatten()

# 2. Build DataFrame (one row per original df_model row)
df_full_trend = pd.DataFrame({
    'Year': df_model['year'],
    'Actual': df_model[TARGET_COL],
    'Predicted': all_predictions
})

# 3. Aggregate yearly: mean actual and mean predicted yield across all rows of a year
yearly_trend = df_full_trend.groupby('Year').mean()

# 4. Plot
plt.figure(figsize=(14, 7))

# Plot lines
plt.plot(yearly_trend.index, yearly_trend['Actual'],
         marker='o', label='Actual Yield', linewidth=2, color='blue')

plt.plot(yearly_trend.index, yearly_trend['Predicted'],
         marker='x', linestyle='--', label='Predicted Yield', linewidth=2, color='orange')

# --- Boundaries ---
MIN_YEAR = yearly_trend.index.min()
MAX_YEAR = yearly_trend.index.max()

# Put the split lines halfway between the last year of one split and the first of the next
train_boundary = TRAIN_END_YEAR - 0.5
val_boundary = VAL_END_YEAR - 0.5

# --- Shaded Regions ---
plt.axvspan(MIN_YEAR - 0.5, train_boundary, color='green', alpha=0.1,
            label=f'Train (<{TRAIN_END_YEAR})')

plt.axvspan(train_boundary, val_boundary, color='yellow', alpha=0.1,
            label=f'Validation ({TRAIN_END_YEAR}-{VAL_END_YEAR - 1})')

plt.axvspan(val_boundary, MAX_YEAR + 0.5, color='red', alpha=0.1,
            label=f'Test (>={VAL_END_YEAR})')

# --- Split Lines ---
plt.axvline(train_boundary, color='grey', linestyle=':', alpha=0.5)
plt.axvline(val_boundary, color='grey', linestyle=':', alpha=0.5)

# --- Labels ---
# Region captions sit slightly above the highest actual yearly mean
y_max = yearly_trend['Actual'].max()
text_y = y_max * 1.05

plt.text((MIN_YEAR + train_boundary)/2, text_y, 'TRAINING',
         ha='center', fontsize=12, fontweight='bold', color='green')

plt.text((train_boundary + val_boundary)/2, text_y, 'VALIDATION',
         ha='center', fontsize=12, fontweight='bold', color='#D4AC0D')

plt.text((val_boundary + MAX_YEAR)/2, text_y, 'TESTING',
         ha='center', fontsize=12, fontweight='bold', color='red')

# --- Final Styling ---
plt.title(f'Full Timeline Analysis: Actual vs. Predicted Yield ({CHOSEN_CROP})', fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Yield (hg/ha)', fontsize=12)

plt.legend(loc='upper left')
plt.grid(True, alpha=0.3)

plt.xticks(np.arange(MIN_YEAR, MAX_YEAR + 1, 2))
plt.xlim(MIN_YEAR - 0.5, MAX_YEAR + 0.5)

plt.tight_layout()
plt.show()

### 8. Geographic Error Map
Checking where the model messes up the most.

In [None]:
import plotly.express as px

# Re-attach country names to the test rows (same year filter as the test split)
mask_test = df_model['year'] >= VAL_END_YEAR
test_countries = df_model[mask_test]['area'].values

comp_df = pd.DataFrame({
    'area': test_countries,
    'Actual': y_test,
    'Predicted': preds_final_test
})

# Map dataset spellings to the names the choropleth's 'country names' mode is given
name_fixes = {
    'United_States_of_America': 'United States',
    'Viet_Nam': 'Vietnam',
    'China,_mainland': 'China',
    # Add more if needed...
}
comp_df['area'] = comp_df['area'].replace(name_fixes)

# RMSPE per country: sqrt(mean(relative error^2)) * 100.
# The small constant in the denominator guards against division by zero.
relative_error = (comp_df['Actual'] - comp_df['Predicted']) / (comp_df['Actual'] + 1e-6)
comp_df['Sq_Err_Pct'] = relative_error ** 2
mean_sq_err = comp_df.groupby('area')['Sq_Err_Pct'].mean()
map_data = mean_sq_err.apply(np.sqrt).mul(100).reset_index(name='RMSPE')

choropleth_options = dict(
    locations='area',
    locationmode='country names',
    color='RMSPE',
    title='Prediction Error by Country (RMSPE %)',
    color_continuous_scale=['green', 'red'],
)
fig = px.choropleth(map_data, **choropleth_options)
fig.show()

### 9. Feature Importance
Since Neural Nets are "black boxes", I'm using Permutation Importance to see which variables actually matter.

In [None]:
# Wrapper for Permutation Importance
class PyTorchEstimator:
    """Minimal scikit-learn style wrapper around an already-trained PyTorch model.

    Exposes the `fit` / `predict` pair so the model can be passed to
    scikit-learn utilities such as `permutation_importance`.
    """

    def __init__(self, model, device):
        # model: trained torch module; device: where its weights live
        self.model = model
        self.device = device

    def fit(self, X, y):
        """Do nothing (the model is already trained) and return self.

        Returning self follows the scikit-learn estimator convention, so
        chained calls like `est.fit(X, y).predict(X)` keep working.
        """
        return self

    def predict(self, X):
        """Return a flat NumPy array with one prediction per row of X.

        X may be a NumPy array, a nested list or a pandas DataFrame; it is
        converted through NumPy first because torch.tensor cannot consume a
        DataFrame directly.
        """
        self.model.eval()
        features = np.asarray(X)
        with torch.no_grad():
            X_t = torch.tensor(features, dtype=torch.float32).to(self.device)
            return self.model(X_t).cpu().numpy().flatten()

estimator = PyTorchEstimator(final_model, device)

# Measure importance on the held-out TEST split: final_model was trained on
# Train + Validation, so the validation rows are no longer unseen data.
res = permutation_importance(estimator, X_test, y_test, scoring='neg_root_mean_squared_error', n_repeats=5)

# Keep the sign of the importance. A positive value means shuffling the feature
# made RMSE worse (the feature matters); a negative value means shuffling it
# made RMSE better, i.e. the feature is noise. Taking the absolute value
# would rank such noise features as if they were important.
imps = pd.DataFrame({'Feature': feature_cols, 'Importance': res.importances_mean})
imps = imps.sort_values('Importance', ascending=False).head(15)

sns.barplot(x='Importance', y='Feature', data=imps, palette='viridis')
plt.title('Top 15 Features')
plt.show()