In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import copy
import plotly.express as px
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

# Optuna Visualization Tools
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_slice
from optuna.visualization import plot_param_importances

# Global plot appearance: seaborn grid theme and a wide default canvas
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Seed the RNGs the notebook relies on so that weight initialisation and
# dropout masks are repeatable after "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)  # seeds the CPU generator and every CUDA device

# Set device: use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# ==============================
# 1. LOAD & CONFIG
# ==============================
# Experiment configuration
CHOSEN_CROP = "rice"
TARGET_COL = f"Y_{CHOSEN_CROP}"
SEQ_LEN = 3
PARQUET_PATH = "Parquet/XY_v2.parquet"

# Read the panel, keep the 1982-2023 window (both ends inclusive) and
# discard every row that has no value for the chosen target.
df = pd.read_parquet(PARQUET_PATH)
in_study_period = df["year"].between(1982, 2023)
df = df.loc[in_study_period].dropna(subset=[TARGET_COL]).copy()
print(f"--> Filtered years (1982-2023) and dropped NaN targets. New Shape: {df.shape}")

In [None]:
# ==============================
# 2. FEATURE SELECTION
# ==============================
# Work on a copy so the loaded frame stays untouched
df_model = df.copy()

# Keep the chosen crop's target only; every other "Y_*" column is removed
all_targets = [col for col in df_model.columns if col.startswith("Y_")]
other_targets = [col for col in all_targets if col != TARGET_COL]
df_model = df_model.drop(columns=other_targets)

# Keep the average-yield features of the chosen crop only
chosen_prefix = f"avg_yield_{CHOSEN_CROP}_"
avg_yield_cols = [col for col in df_model.columns if col.startswith("avg_yield_")]
keep_cols = [col for col in avg_yield_cols if col.startswith(chosen_prefix)]
drop_cols = [col for col in avg_yield_cols if col not in keep_cols]
df_model = df_model.drop(columns=drop_cols)

# Chronological order inside each area (required by the sequence builder)
df_model = df_model.sort_values(["area", "year"]).reset_index(drop=True)

# Inputs = every column that is neither an identifier nor a target
non_feature_cols = ["area", "year", TARGET_COL]
feature_cols = [
    col for col in df_model.columns
    if not (col in non_feature_cols or col.startswith("Y_"))
]
print(f"--> Selected {len(feature_cols)} input features for prediction.")

# Gap filling per area: forward fill, then backward fill what is still missing.
# NOTE(review): this runs before the train/val/test split, so the backward
# fill can copy values from later years into earlier rows - confirm that this
# is acceptable for the experiment.
features_by_area = df_model.groupby("area", group_keys=False)[feature_cols]
df_model[feature_cols] = features_by_area.apply(lambda block: block.ffill().bfill())

In [None]:
# ==============================
# 3. SPLIT & SCALE
# ==============================
# Chronological split: train < 2014, validation 2014-2018, test >= 2019
train_mask = df_model["year"] < 2014
val_mask   = (df_model["year"] >= 2014) & (df_model["year"] <= 2018)
test_mask  = df_model["year"] >= 2019

print(f"--> Train samples: {train_mask.sum()}")
print(f"--> Val samples:   {val_mask.sum()}")
print(f"--> Test samples:  {test_mask.sum()}")

# Separate features and target as *writable float64* arrays.
# `.values` may hand back an integer/object array (scaled floats would then be
# truncated or rejected) or, with pandas Copy-on-Write, a read-only view (the
# in-place imputation below would then fail). An explicit float copy avoids both.
X_raw = df_model[feature_cols].to_numpy(dtype=np.float64, na_value=np.nan, copy=True)
y_raw = df_model[[TARGET_COL]].to_numpy(dtype=np.float64, na_value=np.nan, copy=True)

# Plain boolean array for NumPy indexing
train_rows = train_mask.to_numpy()

# Impute remaining NaNs (if any) with the per-column TRAIN mean.
# The mean is computed on training rows only to avoid leakage.
train_mean = np.nanmean(X_raw[train_rows], axis=0)
inds = np.where(np.isnan(X_raw))
X_raw[inds] = np.take(train_mean, inds[1])

# Scalers: the target is standardised as well, which keeps the loss well
# conditioned for the LSTM.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

# Fit on TRAIN rows only, then apply the same transform to every row.
# The three masks above partition the year range, so this is the same as
# transforming each split separately.
scaler_X.fit(X_raw[train_rows])
scaler_y.fit(y_raw[train_rows])
X_scaled = scaler_X.transform(X_raw)
y_scaled = scaler_y.transform(y_raw)

# Reconstruct a DataFrame (identifiers + scaled target + scaled features)
# for sequence building
df_scaled = df_model[["area", "year"]].copy()
df_scaled["target_scaled"] = y_scaled.ravel()
X_df = pd.DataFrame(X_scaled, columns=feature_cols, index=df_model.index)
df_final = pd.concat([df_scaled, X_df], axis=1)
df_final

In [None]:
# ==============================
# 4. SEQUENCE BUILDING
# ==============================
def build_sequences(df, feat_cols, target_col, seq_len=3):
    """Turn a long (area, year) table into sliding-window samples.

    For every area the rows are sorted by ``year`` and each run of
    ``seq_len`` consecutive rows becomes one input window whose label is the
    target of the row immediately after the window
    (features of rows ``[i, i + seq_len)`` -> target of row ``i + seq_len``).
    Windows never cross area boundaries. Windows are taken over consecutive
    *rows*; the function does not check that the years are contiguous.

    Args:
        df: DataFrame with an ``"area"`` column, a ``"year"`` column, the
            feature columns and the target column.
        feat_cols: List of feature column names.
        target_col: Name of the target column.
        seq_len: Number of time steps per input window.

    Returns:
        Tuple ``(X, y)`` of float32 arrays with shapes
        ``(n_samples, seq_len, len(feat_cols))`` and ``(n_samples,)``.
        When no area has more than ``seq_len`` rows, both arrays are empty
        but keep those shapes (``n_samples == 0``).
    """
    X_list, y_list = [], []

    # Group by area to prevent data leakage between different regions
    for _, g in df.groupby("area"):
        g = g.sort_values("year")
        feats = g[feat_cols].to_numpy()
        targs = g[target_col].to_numpy()

        # Areas with <= seq_len rows yield an empty range and add nothing
        for start in range(len(g) - seq_len):
            stop = start + seq_len
            X_list.append(feats[start:stop])
            y_list.append(targs[stop])

    if not X_list:
        # Keep the 3-D layout so downstream shape checks still work
        empty_X = np.empty((0, seq_len, len(feat_cols)), dtype=np.float32)
        empty_y = np.empty((0,), dtype=np.float32)
        return empty_X, empty_y

    return np.array(X_list, dtype=np.float32), np.array(y_list, dtype=np.float32)

# Generate sequences for each split
def get_split_data(mask):
    """Window the rows of ``df_final`` selected by ``mask`` into (X, y) sequences.

    ``mask`` is a boolean row selector for ``df_final``; the windows are built
    from the global ``feature_cols`` with ``SEQ_LEN`` steps and the
    ``"target_scaled"`` column as label.
    """
    split_rows = df_final.loc[mask].copy()
    X_seq, y_seq = build_sequences(split_rows, feature_cols, "target_scaled", SEQ_LEN)
    return X_seq, y_seq

# Each split is windowed on its own, so no window spans two splits
X_train, y_train = get_split_data(train_mask)
X_val, y_val     = get_split_data(val_mask)
X_test, y_test   = get_split_data(test_mask)

print(f"--> Train Sequences shape: {X_train.shape}")
print(f"--> Val Sequences shape:   {X_val.shape}")
print(f"--> Test Sequences shape:  {X_test.shape}")

# DataLoaders
BATCH_SIZE = 32

# The training windows come out grouped by area. Served in that fixed order,
# almost every mini-batch would hold a single area, so the train loader
# reshuffles the (already self-contained) windows each epoch. A dedicated,
# seeded generator keeps the batch order reproducible. Shuffling is skipped
# for a degenerate split because a random sampler rejects an empty dataset.
train_loader = DataLoader(
    TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train)),
    batch_size=BATCH_SIZE,
    shuffle=len(X_train) > 1,
    generator=torch.Generator().manual_seed(42),
)

# Validation and test keep their deterministic order
val_loader = DataLoader(
    TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val)),
    batch_size=BATCH_SIZE,
    shuffle=False,
)
test_loader = DataLoader(
    TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test)),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
import copy

# ==============================
# 5. MODEL DEFINITION
# ==============================
class LSTMRegressor(nn.Module):
    """Stacked LSTM followed by a small dense head that outputs one value per sequence.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state and of the intermediate
            dense layer in the head.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied between stacked LSTM layers.
            It is forced to 0 when ``num_layers == 1`` because ``nn.LSTM``
            only applies dropout between layers and warns otherwise.
        activation: Name of an activation class in ``torch.nn``
            (e.g. ``"ReLU"``, ``"Tanh"``, ``"Sigmoid"``).
    """

    def __init__(self, input_dim, hidden_dim=64, num_layers=2, dropout=0.2, activation="ReLU"):
        super().__init__()

        # 1. Recurrent encoder
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )

        # 2. Output head (Linear -> Activation -> Linear): the activation
        #    makes the mapping from the LSTM state to the prediction non-linear.
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),  # Intermediate dense layer
            getattr(nn, activation)(),          # Activation looked up by name
            nn.Linear(hidden_dim, 1),           # Final output layer
        )

    def forward(self, x):
        """Predict one value per sequence.

        Args:
            x: Tensor of shape ``(batch, seq_len, features)``.

        Returns:
            Tensor of shape ``(batch,)``.
        """
        out, _ = self.lstm(x)

        # Use the LSTM output of the last time step as the sequence summary
        last_step_out = out[:, -1, :]

        prediction = self.fc(last_step_out)

        # Drop only the trailing singleton dimension. A bare squeeze() would
        # also remove the batch dimension when the batch holds one sample,
        # producing a 0-d tensor that breaks loss shapes and np.concatenate.
        return prediction.squeeze(-1)

# Build the network with the class defaults for depth and dropout.
# The head activation is selected by name ("ReLU" here; "Tanh" or "Sigmoid"
# are alternatives).
model = LSTMRegressor(input_dim=len(feature_cols), hidden_dim=64, activation="ReLU")
model = model.to(device)

# Mean-squared error on the scaled target, optimised with Adam
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# ==============================
# 6. TRAINING (Modified to Track Train RMSE)
# ==============================
epochs = 100
patience = 15            # epochs without validation improvement before stopping
best_rmse = float('inf')
counter = 0              # epochs since the last validation improvement
best_weights = None      # state_dict snapshot of the best epoch

# Per-epoch RMSE in ORIGINAL target units
train_rmse_history = []
val_rmse_history = []

print(f"Starting training on {len(train_loader)} batches...")

if len(train_loader) == 0:
    raise ValueError("train_loader is empty: no training sequences were built.")

for epoch in range(epochs):
    # --- TRAIN LOOP ---
    model.train()
    train_sq_err = 0.0   # running sum of squared errors (scaled units)
    train_count = 0      # number of samples seen this epoch
    for X_b, y_b in train_loader:
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        # Align prediction and label shapes explicitly so a batch that holds
        # a single sample cannot broadcast inside the loss.
        pred = model(X_b).reshape(y_b.shape)
        loss = criterion(pred, y_b)
        loss.backward()
        optimizer.step()
        # criterion is a batch *mean*; weight it by the batch size so a short
        # final batch does not count as much as a full one.
        n_b = y_b.numel()
        train_sq_err += loss.item() * n_b
        train_count += n_b

    # 1. Train RMSE on the scaled target (sample-weighted over the epoch)
    avg_train_loss_scaled = train_sq_err / train_count
    train_rmse_scaled = np.sqrt(avg_train_loss_scaled)

    # 2. Convert to original units: the target scaling is affine, so errors
    #    scale by the scaler's standard deviation.
    train_rmse_orig = train_rmse_scaled * scaler_y.scale_[0]
    train_rmse_history.append(train_rmse_orig)

    # --- VALIDATION LOOP ---
    model.eval()
    val_preds = []
    val_trues = []
    with torch.no_grad():
        for X_b, y_b in val_loader:
            pred = model(X_b.to(device)).cpu().numpy()
            # reshape(-1) keeps every batch 1-D, so np.concatenate also works
            # when a batch holds a single sample.
            val_preds.append(pred.reshape(-1))
            val_trues.append(y_b.numpy().reshape(-1))

    if val_preds:
        vp = np.concatenate(val_preds).reshape(-1, 1)
        vt = np.concatenate(val_trues).reshape(-1, 1)

        # INVERSE TRANSFORM (scale back to original units)
        vp_inv = scaler_y.inverse_transform(vp)
        vt_inv = scaler_y.inverse_transform(vt)

        # Validation RMSE in original units
        val_rmse = np.sqrt(mean_squared_error(vt_inv, vp_inv))
        val_rmse_history.append(val_rmse)

        # PRINT: Train RMSE vs Val RMSE (both in original units)
        print(f"Epoch {epoch+1:03d}: Train RMSE {train_rmse_orig:.4f} | Val RMSE {val_rmse:.4f}")

        # Early stopping: snapshot the weights on improvement, otherwise
        # count towards the patience budget.
        if val_rmse < best_rmse:
            best_rmse = val_rmse
            best_weights = copy.deepcopy(model.state_dict())
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}!")
                break

print("Training finished.")

# ==============================
# PLOT: Train RMSE vs Val RMSE
# ==============================
# One axes, two curves: per-epoch train and validation RMSE
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(train_rmse_history, label='Train RMSE (Original Units)', color='blue')
ax.plot(val_rmse_history, label='Val RMSE (Original Units)', color='orange')
ax.set_xlabel("Epochs")
ax.set_ylabel(f"RMSE (Yield: {CHOSEN_CROP})")
ax.set_title("Learning Curve: Train vs Validation Error")
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# ==============================
# 7. FINAL EVALUATION
# ==============================
# Restore the snapshot taken at the epoch with the lowest validation RMSE.
# best_weights is still None when no validation epoch ever improved on the
# initial best_rmse; in that case the current weights are kept.
if best_weights:
    model.load_state_dict(best_weights)
    
# Put the model in evaluation mode before computing the final metrics
model.eval()

def evaluate(loader):
    """Score the global ``model`` on ``loader`` in original target units.

    Predictions and labels are mapped back through ``scaler_y`` before the
    metrics are computed.

    Args:
        loader: Iterable that yields ``(X_batch, y_batch)`` tensor pairs.

    Returns:
        Tuple ``(rmse, mae, r2, preds, trues)`` where ``preds`` and ``trues``
        are ``(n_samples, 1)`` arrays in original units. For a loader that
        yields no batches the result is ``(0, 0, 0, empty, empty)``; those
        zeros are placeholders, not scores.
    """
    # Make sure dropout is inactive even if the caller left the model in
    # training mode.
    model.eval()

    preds, trues = [], []
    with torch.no_grad():
        for X_b, y_b in loader:
            batch_pred = model(X_b.to(device)).cpu().numpy()
            # reshape(-1) keeps every batch 1-D, so np.concatenate also works
            # when a batch holds a single sample.
            preds.append(batch_pred.reshape(-1))
            trues.append(y_b.numpy().reshape(-1))

    if not preds:
        return 0, 0, 0, np.array([]), np.array([])

    p_inv = scaler_y.inverse_transform(np.concatenate(preds).reshape(-1, 1))
    t_inv = scaler_y.inverse_transform(np.concatenate(trues).reshape(-1, 1))

    rmse = np.sqrt(mean_squared_error(t_inv, p_inv))
    mae = mean_absolute_error(t_inv, p_inv)
    r2 = r2_score(t_inv, p_inv)
    return rmse, mae, r2, p_inv, t_inv

# Score the held-out test split and print a short report
test_rmse, test_mae, test_r2, p_test, t_test = evaluate(test_loader)

report_lines = [
    "\n=== Final Test Results ===",
    f"RMSE: {test_rmse:.2f}",
    f"MAE:  {test_mae:.2f}",
    f"R^2:  {test_r2:.4f}",
]
print("\n".join(report_lines))