In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import copy
import plotly.express as px
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

# Optuna Visualization Tools
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_slice
from optuna.visualization import plot_param_importances

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Reproducibility: weight initialisation, dropout and DataLoader shuffling all
# draw from the global RNGs, so seed them once before any model/data code runs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)  # seeds the CPU and (if present) CUDA generators

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# ==============================
# 1. LOAD DATA & BASIC CONFIG
# ==============================

# Crop to model; has to equal the part after 'Y_' in one of the df columns
CHOSEN_CROP = "rice"  # e.g. "rice", "wheat", "maize_corn", ...
TARGET_COL = f"Y_{CHOSEN_CROP}"

# Number of timesteps in each LSTM input window
SEQ_LEN = 3

# Location of the input table
PARQUET_PATH = "Parquet/XY_v2.parquet"

# Read the table into memory
df = pd.read_parquet(PARQUET_PATH)
print("Raw data shape:", df.shape)

# ==============================
# 2. BASIC FILTERING & TARGET HANDLING
# ==============================

# Fail fast if a column the rest of the notebook relies on is absent
assert "year" in df.columns, "'year' column not found!"
assert "area" in df.columns, "'area' column not found!"
assert TARGET_COL in df.columns, f"{TARGET_COL} not found in df columns!"

# Restrict to years 1982..2023 (both bounds inclusive)
in_year_range = df["year"].between(1982, 2023)
df = df[in_year_range].copy()

# Rows without a value for the chosen target cannot serve as labels
has_target = df[TARGET_COL].notna()
df = df[has_target].copy()
print("After dropping NaN targets:", df.shape)


In [None]:
# 3. FEATURE SELECTION
# ==============================
# Resulting feature set:
# - every column that is not a yield target and not a lagged yield of another crop
# - lagged average yields of the chosen crop only: avg_yield_<CHOSEN_CROP>_*
# - no Y_* column at all (TARGET_COL is retained in the frame, but only as label)
# - 'area' and 'year' stay in the frame for grouping / ordering / splitting,
#   but are NOT fed to the model as numeric features.

# All yield target columns, and those belonging to crops other than the chosen one
all_target_cols = [col for col in df.columns if col.startswith("Y_")]
other_targets = [col for col in all_target_cols if col != TARGET_COL]

# Lagged average-yield columns, partitioned into chosen crop vs. other crops
avg_yield_cols = [col for col in df.columns if col.startswith("avg_yield_")]
chosen_prefix = f"avg_yield_{CHOSEN_CROP}_"
keep_yield_lag_cols = [col for col in avg_yield_cols if col.startswith(chosen_prefix)]
drop_yield_lag_cols = sorted(
    {col for col in avg_yield_cols if not col.startswith(chosen_prefix)}
)

print(f"Kept lagged yield columns for crop '{CHOSEN_CROP}':")
print(keep_yield_lag_cols)
print(f"Dropped {len(drop_yield_lag_cols)} lagged yield columns of other crops.")

# Remove the unwanted columns, then order rows by area and year so that every
# area's rows are contiguous and chronological; `df` itself is left untouched.
df_model = (
    df.drop(columns=other_targets + drop_yield_lag_cols)
    .sort_values(["area", "year"])
    .reset_index(drop=True)
)

# Everything that is neither an identifier, the label, nor a Y_* column is a feature
non_feature_cols = {"area", "year", TARGET_COL}
feature_cols = [
    col
    for col in df_model.columns
    if col not in non_feature_cols and not col.startswith("Y_")
]

print("Number of feature columns:", len(feature_cols))

# ==============================
# 4. TIME-BASED SPLIT BY YEAR
# ==============================
# Rules:
# Train: year < 2014
# Val:   2014 <= year <= 2018
# Test:  2019 <= year <= 2023
#
# The masks are defined BEFORE the NaN filling below, because the backward
# fill must know the split boundaries in order not to cross them.

train_mask = df_model["year"] < 2014
val_mask   = (df_model["year"] >= 2014) & (df_model["year"] <= 2018)
test_mask  = df_model["year"] >= 2019

# ==============================
# 5. GROUPED FILLING OF NaNs (FFILL + SPLIT-LOCAL BFILL) ON FEATURES
# ==============================
# df_model is sorted by (area, year), so fills run in time order within an area.
# Target column is NOT touched here (we already dropped NaNs in target).
#
# Step 1: forward-fill per 'area' over the whole timeline. This only carries
#         past values forward, so it cannot leak future information.
# Step 2: backward-fill per ('area', split). A backward fill copies FUTURE
#         values into earlier rows; confining it to one split guarantees that
#         val/test values are never written into train rows (and test values
#         never into val rows). NaNs that survive both steps are handled by the
#         train-mean imputation further down.

# 0 = train, 1 = val, 2 = test; aligned to df_model's index for use as group key
split_id = pd.Series(
    np.select([train_mask.to_numpy(), val_mask.to_numpy()], [0, 1], default=2),
    index=df_model.index,
    name="_split",
)

df_model[feature_cols] = df_model.groupby("area")[feature_cols].ffill()
df_model[feature_cols] = df_model.groupby(["area", split_id])[feature_cols].bfill()

def print_split_info(df_, mask, name):
    """Print the distinct years and the number of rows selected by a split mask.

    Args:
        df_: DataFrame that has a 'year' column.
        mask: Boolean mask over the rows of ``df_`` selecting one split.
        name: Label prefixed to both printed lines (e.g. "TRAIN").
    """
    selected_years = df_.loc[mask, "year"]
    years = list(selected_years.unique())
    years.sort()
    print(f"{name} years: {years}")
    print(f"{name} rows: {mask.sum()}")

# Report which years and how many rows ended up in each split
for _split_name, _split_mask in (
    ("TRAIN", train_mask),
    ("VAL", val_mask),
    ("TEST", test_mask),
):
    print_split_info(df_model, _split_mask, _split_name)

# Per-split feature frames (independent copies) ...
X_train_raw, X_val_raw, X_test_raw = (
    df_model.loc[split_mask, feature_cols].copy()
    for split_mask in (train_mask, val_mask, test_mask)
)

# ... and per-split targets as float32 arrays
y_train, y_val, y_test = (
    df_model.loc[split_mask, TARGET_COL].values.astype(np.float32)
    for split_mask in (train_mask, val_mask, test_mask)
)

# ==============================
# 6. REMAINING NaNs → TRAIN MEAN IMPUTATION
# ==============================
# Per-column means are taken from the TRAIN rows only (after the grouped
# filling above), and the same values are then used for every split.
col_means = X_train_raw.mean(axis="index")

X_train_filled = X_train_raw.fillna(value=col_means)
X_val_filled   = X_val_raw.fillna(value=col_means)
X_test_filled  = X_test_raw.fillna(value=col_means)

# Optional sanity check: each line should print False
print("Any NaNs remaining in train features?", X_train_filled.isna().to_numpy().any())
print("Any NaNs remaining in val features?",   X_val_filled.isna().to_numpy().any())
print("Any NaNs remaining in test features?",  X_test_filled.isna().to_numpy().any())

In [None]:
# ==============================
# 7. SCALING (StandardScaler on FEATURES ONLY)
# ==============================

# Scaling statistics are learned from the training features only and then
# applied unchanged to all three splits.
scaler = StandardScaler()
scaler.fit(X_train_filled.values)

X_train_scaled = scaler.transform(X_train_filled.values)
X_val_scaled   = scaler.transform(X_val_filled.values)
X_test_scaled  = scaler.transform(X_test_filled.values)


def _scaled_frame(scaled_values, source_frame):
    """Wrap a scaled array in a DataFrame that reuses the source frame's index."""
    return pd.DataFrame(scaled_values, index=source_frame.index, columns=feature_cols)


# Scaled features as DataFrames, so rows stay aligned with area/year via the index
X_train_scaled_df = _scaled_frame(X_train_scaled, X_train_filled)
X_val_scaled_df   = _scaled_frame(X_val_scaled,   X_val_filled)
X_test_scaled_df  = _scaled_frame(X_test_scaled,  X_test_filled)

# Per-split frames holding area / year / target plus the scaled features;
# these are the inputs to the sequence builder.
_id_and_target_cols = ["area", "year", TARGET_COL]
train_df = df_model.loc[train_mask, _id_and_target_cols].join(X_train_scaled_df)
val_df   = df_model.loc[val_mask,   _id_and_target_cols].join(X_val_scaled_df)
test_df  = df_model.loc[test_mask,  _id_and_target_cols].join(X_test_scaled_df)


In [None]:
# ==============================
# 8. SEQUENCE BUILDING FUNCTION
# ==============================
def build_sequences_panel(df_split, feature_cols, target_col, seq_len=3,
                          require_consecutive=False):
    """
    Build rolling feature windows per area, using only the rows of df_split.

    For each 'area' group:
    - rows are sorted by 'year'
    - every run of seq_len successive rows becomes one window of features
    - the label of a window is the target at its last row

    Windows never mix areas, and they only contain rows present in df_split,
    so a frame that holds a single split yields windows from that split only.

    Args:
        df_split: DataFrame with 'area', 'year', target_col and feature_cols.
        feature_cols: Names of the feature columns, in the desired order.
        target_col: Name of the label column.
        seq_len: Number of rows per window; must be >= 1.
        require_consecutive: If True, skip windows whose years are not exactly
            one apart (i.e. windows spanning a gap in an area's time series).
            The default False keeps every window of successive rows.

    Returns:
        X_seq: float32 array of shape (num_sequences, seq_len, num_features)
        y_seq: float32 array of shape (num_sequences,)
        Both have zero sequences when no area provides a usable window.

    Raises:
        ValueError: If seq_len is smaller than 1.
    """
    if seq_len < 1:
        # seq_len == 0 would otherwise index the target at position -1.
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")

    feature_cols = list(feature_cols)
    X_list = []
    y_list = []

    for _, g in df_split.groupby("area"):
        n_steps = len(g)
        if n_steps < seq_len:
            # Not enough timesteps in this split for this area
            continue

        g = g.sort_values("year")
        feat_vals = g[feature_cols].values
        targ_vals = g[target_col].values
        year_vals = g["year"].values

        for start in range(n_steps - seq_len + 1):
            stop = start + seq_len
            if require_consecutive and not np.all(np.diff(year_vals[start:stop]) == 1):
                # At least one year is missing inside this window
                continue
            X_list.append(feat_vals[start:stop])
            y_list.append(targ_vals[stop - 1])  # target at last time step

    if not X_list:
        return (
            np.empty((0, seq_len, len(feature_cols)), dtype=np.float32),
            np.empty((0,), dtype=np.float32),
        )

    X_seq = np.stack(X_list).astype(np.float32)
    y_seq = np.array(y_list, dtype=np.float32)
    return X_seq, y_seq

# One (features, labels) pair per split, built in the order train, val, test
(X_train_seq, y_train_seq), (X_val_seq, y_val_seq), (X_test_seq, y_test_seq) = (
    build_sequences_panel(split_frame, feature_cols, TARGET_COL, seq_len=SEQ_LEN)
    for split_frame in (train_df, val_df, test_df)
)

# Shapes are (num_sequences, SEQ_LEN, num_features) and (num_sequences,)
print("Train sequence shape:", X_train_seq.shape, y_train_seq.shape)
print("Val sequence shape:  ", X_val_seq.shape, y_val_seq.shape)
print("Test sequence shape: ", X_test_seq.shape, y_test_seq.shape)



In [None]:
# ==============================
# 9. DATALOADERS
# ==============================

BATCH_SIZE = 64


def _as_tensor_dataset(features, labels):
    """Pair two NumPy arrays as a TensorDataset (tensors share the arrays' memory)."""
    return TensorDataset(torch.from_numpy(features), torch.from_numpy(labels))


train_dataset = _as_tensor_dataset(X_train_seq, y_train_seq)
val_dataset   = _as_tensor_dataset(X_val_seq,   y_val_seq)
test_dataset  = _as_tensor_dataset(X_test_seq,  y_test_seq)

# Only the training loader reshuffles; val/test keep their order
train_loader = DataLoader(train_dataset, shuffle=True,  batch_size=BATCH_SIZE)
val_loader   = DataLoader(val_dataset,   shuffle=False, batch_size=BATCH_SIZE)
test_loader  = DataLoader(test_dataset,  shuffle=False, batch_size=BATCH_SIZE)


In [None]:
# ==============================
# 10. LSTM MODEL DEFINITION
# ==============================

class LSTMRegressor(nn.Module):
    """Stacked LSTM followed by a linear layer that emits one value per sequence.

    Args:
        input_dim: Number of features per timestep.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between LSTM layers. It is only passed on when
            num_layers > 1; with a single layer 0.0 is used instead.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int = 64,
        num_layers: int = 2,
        dropout: float = 0.2,
    ):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
        )

        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict one scalar per input sequence.

        Args:
            x: Tensor of shape (batch, seq_len, input_dim).

        Returns:
            Tensor of shape (batch,).
        """
        # No explicit (h0, c0): nn.LSTM starts from zero states when none are
        # given, and creates them with the input's device and dtype. Allocating
        # them by hand with torch.zeros() used the default dtype, which broke
        # the model as soon as it was cast with .double() or .half().
        out, _ = self.lstm(x)  # out: (batch, seq_len, hidden_dim)

        # Use output at last time step
        last_out = out[:, -1, :]  # (batch, hidden_dim)
        y_hat = self.fc(last_out).squeeze(-1)  # (batch,)

        return y_hat

# Model hyperparameters
HIDDEN_DIM = 64
NUM_LAYERS = 2
DROPOUT = 0.2

# One input unit per feature column
input_dim = len(feature_cols)

# Build the network, then move its parameters to the selected device
model = LSTMRegressor(input_dim, HIDDEN_DIM, NUM_LAYERS, DROPOUT)
model = model.to(device)

print(model)

In [None]:
# ==============================
# 11. TRAINING SETUP
# ==============================

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

NUM_EPOCHS = 120
PATIENCE = 15  # epochs without validation improvement before stopping early

best_val_rmse = float("inf")
best_state = None
no_improve_epochs = 0

train_rmse_history = []
val_rmse_history = []

# Without at least one batch per loader the per-epoch concatenation below has
# nothing to work on; fail here with a message that names the actual cause.
if len(train_loader) == 0 or len(val_loader) == 0:
    raise ValueError(
        "train_loader and val_loader must each contain at least one batch; "
        "check that the train/val splits produced sequences."
    )

# ==============================
# 12. TRAINING LOOP WITH EARLY STOPPING
# ==============================
for epoch in range(1, NUM_EPOCHS + 1):
    # ---- TRAIN ----
    model.train()
    train_preds = []
    train_trues = []

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        # Predictions are collected while the weights are still being updated
        # (and in train mode), so this is a running training score.
        train_preds.append(y_pred.detach().cpu().numpy())
        train_trues.append(y_batch.detach().cpu().numpy())

    train_preds = np.concatenate(train_preds)
    train_trues = np.concatenate(train_trues)
    # mean_squared_error returns the MSE; take the square root to get the RMSE
    # that is stored, printed and plotted under that name.
    train_rmse = float(np.sqrt(mean_squared_error(train_trues, train_preds)))

    # ---- VALIDATION ----
    model.eval()
    val_preds = []
    val_trues = []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            y_pred = model(X_batch)

            val_preds.append(y_pred.cpu().numpy())
            val_trues.append(y_batch.cpu().numpy())

    val_preds = np.concatenate(val_preds)
    val_trues = np.concatenate(val_trues)
    val_rmse = float(np.sqrt(mean_squared_error(val_trues, val_preds)))

    train_rmse_history.append(train_rmse)
    val_rmse_history.append(val_rmse)

    print(f"Epoch {epoch:03d}/{NUM_EPOCHS} - Train RMSE: {train_rmse:.4f}, Val RMSE: {val_rmse:.4f}")

    # Early stopping check
    if val_rmse < best_val_rmse - 1e-6:  # small tolerance
        best_val_rmse = val_rmse
        # Deep copy: state_dict() returns references to the live parameters,
        # which later optimizer steps would otherwise overwrite.
        best_state = copy.deepcopy(model.state_dict())
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        if no_improve_epochs >= PATIENCE:
            print(f"Early stopping at epoch {epoch} (no improvement for {PATIENCE} epochs).")
            break

# Restore best model weights
if best_state is not None:
    model.load_state_dict(best_state)

# ==============================
# 13. FINAL EVALUATION (TRAIN / VAL / TEST)
# ==============================
def evaluate_loader(model, data_loader):
    """Run the model over a loader in eval mode and compute regression metrics.

    Args:
        model: Module mapping a feature batch to one prediction per sample.
        data_loader: Iterable of (X_batch, y_batch) tensor pairs.

    Returns:
        Tuple (rmse, mae, r2, y_true, y_pred). rmse, mae and r2 are scalars;
        y_true and y_pred are NumPy arrays with all batches concatenated in
        iteration order. If the loader yields no batches, all five entries
        are None.
    """
    model.eval()
    all_preds = []
    all_trues = []
    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            # Batches are moved to the module-level `device`
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            y_pred = model(X_batch)

            all_preds.append(y_pred.cpu().numpy())
            all_trues.append(y_batch.cpu().numpy())
    if len(all_preds) == 0:
        return None, None, None, None, None

    y_pred = np.concatenate(all_preds)
    y_true = np.concatenate(all_trues)

    # mean_squared_error returns the MSE; the square root turns it into the
    # RMSE this function promises.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae  = mean_absolute_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)

    return rmse, mae, r2, y_true, y_pred

# Score the restored model on each split, in the order train, val, test
(
    (train_rmse, train_mae, train_r2, y_train_true, y_train_pred),
    (val_rmse,   val_mae,   val_r2,   y_val_true,   y_val_pred),
    (test_rmse,  test_mae,  test_r2,  y_test_true,  y_test_pred),
) = (
    evaluate_loader(model, split_loader)
    for split_loader in (train_loader, val_loader, test_loader)
)

print("\n===== FINAL METRICS (in original target units) =====")
print(f"TRAIN - RMSE: {train_rmse:.4f}, MAE: {train_mae:.4f}, R²: {train_r2:.4f}")
print(f"VAL   - RMSE: {val_rmse:.4f}, MAE: {val_mae:.4f}, R²: {val_r2:.4f}")
print(f"TEST  - RMSE: {test_rmse:.4f}, MAE: {test_mae:.4f}, R²: {test_r2:.4f}")

# ==============================
# 14. PLOTS
# ==============================

# ---- RMSE vs Epoch ----
# One point per completed epoch; the x axis is the 0-based position in the history
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(train_rmse_history, label="Train RMSE")
ax.plot(val_rmse_history,   label="Val RMSE")
ax.set_xlabel("Epoch")
ax.set_ylabel("RMSE")
ax.set_title(f"Train & Validation RMSE (Crop: {CHOSEN_CROP})")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()