In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

# Optuna Visualization Tools
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_slice
from optuna.visualization import plot_param_importances

# Plotting defaults shared by every figure in the notebook
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Seed the NumPy and PyTorch global RNGs so that weight initialisation and
# DataLoader shuffling are repeatable after "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer a CUDA device when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


In [None]:
# Load dataset
df = pd.read_parquet('Parquet/XY_v2.parquet')

# --- LIST AVAILABLE CROPS ---
# Target columns are the ones prefixed with 'Y_'; the crop name is whatever
# follows the prefix. The prefix is sliced off (rather than str.replace'd) so
# a later 'Y_' inside a crop name is left untouched.
TARGET_PREFIX = 'Y_'
target_columns = [col for col in df.columns if col.startswith(TARGET_PREFIX)]
available_crops = [col[len(TARGET_PREFIX):] for col in target_columns]

print("--- Available Crops found in Dataset ---")
print(available_crops)
print("-" * 40)

# --- CONFIGURATION: SET CROP HERE ---
CHOSEN_CROP = 'rice'  # <--- CHANGE THIS to 'lettuce', 'pepper', etc. based on list above
# ------------------------------------

# Define Target and Dynamic Lag Features
TARGET_COL = f'{TARGET_PREFIX}{CHOSEN_CROP}'
LAG_1_FEATURE = f'avg_yield_{CHOSEN_CROP}_1y'

# Fail fast on a misspelled crop: both columns are required further down
# (the target for training, the 1-year lag for the baseline cell).
if TARGET_COL not in df.columns:
    raise ValueError(f"Target {TARGET_COL} not found in dataset. Check spelling.")
if LAG_1_FEATURE not in df.columns:
    raise ValueError(
        f"Lag feature {LAG_1_FEATURE} not found in dataset; "
        "the baseline model needs it."
    )

print(f"Predicting Target: {TARGET_COL}")
print(f"Using Lag 1 Feature: {LAG_1_FEATURE}")

# Keep only rows that have a target value for the chosen crop
df_model = df.dropna(subset=[TARGET_COL])

print(f"Data Loaded. Rows with valid target: {len(df_model)}")

In [None]:

# SimpleImputer is the only name this cell needs that the notebook's first
# import cell does not already provide.
from sklearn.impute import SimpleImputer

# --- DROP UNWANTED COLUMNS ---
# Drop every "avg_yield_*" column whose name does not contain the chosen crop
cols_to_drop = [c for c in df_model.columns
                if c.startswith("avg_yield_") and CHOSEN_CROP not in c]

df_model = df_model.drop(columns=cols_to_drop)

# --- TIME-SERIES SPLIT ---
# train: year < TRAIN_END_YEAR
# val:   TRAIN_END_YEAR <= year < VAL_END_YEAR
# test:  year >= VAL_END_YEAR
TRAIN_END_YEAR = 2014
VAL_END_YEAR = 2019

mask_train = df_model['year'] < TRAIN_END_YEAR
mask_val = (df_model['year'] >= TRAIN_END_YEAR) & (df_model['year'] < VAL_END_YEAR)
mask_test = df_model['year'] >= VAL_END_YEAR

if not mask_train.any():
    raise ValueError(f"No training rows found before year {TRAIN_END_YEAR}.")

# --- FEATURE SELECTION ---
# Independent variables: everything except the 'Y_' targets and the 'area' metadata column
candidate_cols = [c for c in df_model.columns
                  if not c.startswith('Y_') and c not in ['area']]

# A mean imputer has no statistic for a column with no observed training value and
# discards it by default, which would make the imputed array narrower than the column
# list and break the DataFrame construction below. Remove such columns up front.
all_nan_in_train = df_model.loc[mask_train, candidate_cols].isna().all()
empty_train_cols = [c for c in candidate_cols if all_nan_in_train[c]]
feature_cols = [c for c in candidate_cols if not all_nan_in_train[c]]
if empty_train_cols:
    print(f"Dropping features with no training values: {empty_train_cols}")

# --- DISPLAY FEATURES TABLE ---
print(f"Total Features Used: {len(feature_cols)}")
print("-" * 30)
feature_preview = pd.DataFrame(feature_cols, columns=['Feature Name']).T
display(feature_preview)

# --- SPLIT FEATURES / TARGETS ---
X_train_raw = df_model.loc[mask_train, feature_cols]
y_train = df_model.loc[mask_train, TARGET_COL]

X_val_raw = df_model.loc[mask_val, feature_cols]
y_val = df_model.loc[mask_val, TARGET_COL]

X_test_raw = df_model.loc[mask_test, feature_cols]
y_test = df_model.loc[mask_test, TARGET_COL]

# --- IMPUTATION ---
# Column means are learned from the training split only and reused for val/test.
# The original row index is kept so each imputed frame stays aligned with its targets.
imputer = SimpleImputer(strategy='mean')  # Or 'median' if data is skewed
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train_raw),
                               columns=feature_cols, index=X_train_raw.index)
X_val_imputed = pd.DataFrame(imputer.transform(X_val_raw),
                             columns=feature_cols, index=X_val_raw.index)
X_test_imputed = pd.DataFrame(imputer.transform(X_test_raw),
                              columns=feature_cols, index=X_test_raw.index)

# Sanity check: every count should be 0 after imputation
print("NaNs in X_train_imputed:", X_train_imputed.isnull().sum().sum())
print("NaNs in X_val_imputed:", X_val_imputed.isnull().sum().sum())
print("NaNs in X_test_imputed:", X_test_imputed.isnull().sum().sum())

X_train_imputed.head(5)


In [None]:
# Persistence baseline: predict this year's yield with the 1-year lag feature.
# The lag column is read from df_model, i.e. the values are not imputed.
y_pred_baseline = df_model.loc[mask_test, LAG_1_FEATURE]

# Score only the rows where both the lag value and the target are present
mask_valid = y_pred_baseline.notna() & y_test.notna()
y_test_clean, y_pred_clean = y_test[mask_valid], y_pred_baseline[mask_valid]

baseline_mse = mean_squared_error(y_test_clean, y_pred_clean)
rmse_baseline = np.sqrt(baseline_mse)
r2_baseline = r2_score(y_test_clean, y_pred_clean)

print(f"Baseline RMSE: {rmse_baseline:.2f}")

In [None]:
# =========================
# 1. Torch Datasets & Loaders
# =========================

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Make sure these exist from previous cells:
# X_train_imputed, X_val_imputed, X_test_imputed
# y_train, y_val, y_test

class TabularDataset(Dataset):
    """In-memory dataset that pairs each feature row with one regression target.

    Both inputs are converted to float32 tensors once, at construction time.
    """

    def __init__(self, X, y):
        """
        X: pandas DataFrame or numpy array (n_samples, n_features)
        y: pandas Series, single-column DataFrame or numpy array (n_samples,)

        Raises:
            ValueError: if X and y do not contain the same number of rows.
        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, (pd.Series, pd.DataFrame)):
            y = y.values

        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

        # A single-column 2-D target (e.g. a one-column DataFrame) would come out
        # as (n, 1); compared against (batch,) predictions a loss such as MSELoss
        # broadcasts that to (batch, batch). Collapse it to the documented (n,).
        if self.y.dim() == 2 and self.y.shape[1] == 1:
            self.y = self.y.squeeze(1)

        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError(
                f"X has {self.X.shape[0]} rows but y has {self.y.shape[0]}"
            )

    def __len__(self):
        """Number of samples (rows of X)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the (features, target) pair stored at position idx."""
        return self.X[idx], self.y[idx]


# Create dataset objects
train_dataset = TabularDataset(X_train_imputed, y_train)
val_dataset   = TabularDataset(X_val_imputed, y_val)
test_dataset  = TabularDataset(X_test_imputed, y_test)

# Dataloaders
batch_size = 16 
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False)

input_dim = X_train_imputed.shape[1]
print(f"Input dim: {input_dim}, Train samples: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")


In [None]:
# =========================
# 2. Transformer Regression Model (TFT-style)
# =========================

class TransformerRegressor(nn.Module):
    """Transformer-encoder regressor for flat feature vectors.

    The whole feature vector is linearly embedded as a single token, passed
    through a stack of encoder layers, and mapped to one scalar per sample.

    Args:
        input_dim: number of input features.
        d_model: width of the token embedding / encoder.
        nhead: number of attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        dim_feedforward: hidden width of each layer's feed-forward sub-block.
        dropout: dropout rate used inside the encoder and before the output head.
    """

    def __init__(
        self,
        input_dim,
        d_model=64,
        nhead=4,
        num_layers=2,
        dim_feedforward=128,
        dropout=0.1,
    ):
        super().__init__()

        self.input_dim, self.d_model = input_dim, d_model

        # Embed the raw feature vector into the encoder's working dimension
        self.input_proj = nn.Linear(input_dim, d_model)

        # Stack of identical batch-first encoder layers with ReLU feed-forward blocks
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
                activation="relu",
            ),
            num_layers=num_layers,
        )

        # Regression head: dropout followed by a single linear unit
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(d_model, 1)

    def forward(self, x):
        """Map a (batch_size, input_dim) batch to (batch_size,) predictions.

        Each sample becomes a sequence of exactly one token, so the mean over
        the sequence axis simply removes that axis again.
        """
        token = self.input_proj(x).unsqueeze(1)   # (B, F) -> (B, 1, d_model)
        encoded = self.encoder(token)             # (B, 1, d_model)
        pooled = encoded.mean(dim=1)              # (B, d_model)
        return self.fc_out(self.dropout(pooled)).squeeze(-1)  # (B,)

In [None]:
# =========================
# 3. Training & Evaluation
# =========================

# Resolve the compute device again so this cell can run without the first one
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Hand-picked (pre-tuning) architecture
model = TransformerRegressor(
    input_dim,
    d_model=64,
    nhead=4,
    num_layers=2,
    dim_feedforward=128,
    dropout=0.1,
)
model = model.to(device)

# MSE objective, Adam optimiser, and a scheduler that halves the learning rate
# after 5 scheduler steps without a decrease in the monitored value
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=5,
)

print(f"Model parameters: {sum(weights.numel() for weights in model.parameters()):,}")


from sklearn.metrics import mean_squared_error

# ---------- helper: RMSE ----------
def compute_rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# ---------- helper: one epoch ----------
def run_epoch(loader, model, criterion, optimizer=None, device=None):
    """Run one full pass over `loader`.

    If `optimizer` is provided the model is put in training mode and updated
    after every batch; otherwise the model is put in evaluation mode and
    gradients are disabled.

    Args:
        loader: iterable yielding (X_batch, y_batch) tensor pairs.
        model: module mapping X_batch to one prediction per row.
        criterion: loss callable taking (predictions, targets).
        optimizer: optional optimizer; its presence selects training mode.
        device: device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU for a parameter-free model), so the
            function does not depend on a notebook-level `device` variable.

    Returns:
        (avg_loss, all_preds, all_targets): the sample-weighted mean loss and
        the concatenated predictions / targets as numpy arrays.

    Raises:
        ValueError: if the loader yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    n_seen = 0
    all_preds = []
    all_targets = []

    with torch.set_grad_enabled(is_training):
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            preds = model(X_batch)
            loss = criterion(preds, y_batch)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Weight each batch loss by its size so a short final batch
            # does not distort the epoch average.
            batch_n = X_batch.size(0)
            total_loss += loss.item() * batch_n
            n_seen += batch_n
            all_preds.append(preds.detach().cpu().numpy())
            all_targets.append(y_batch.detach().cpu().numpy())

    if n_seen == 0:
        raise ValueError("run_epoch received a loader that yielded no samples")

    # Divide by the samples actually processed rather than len(loader.dataset),
    # which over-counts when the loader drops or subsamples rows.
    avg_loss = total_loss / n_seen
    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    return avg_loss, all_preds, all_targets


# =========================
# TRAINING LOOP (Transformer model)
# =========================

# Per-epoch metrics, kept for the learning-curve plot
history = {
    "train_loss": [],
    "val_loss": [],
    "train_rmse": [],
    "val_rmse": []
}

num_epochs = 150
best_val_loss = float("inf")
best_state = None

for epoch in range(num_epochs):

    # -------- TRAIN --------
    train_loss, train_preds, train_targets = run_epoch(
        train_loader, model, criterion, optimizer
    )
    train_rmse = compute_rmse(train_targets, train_preds)

    # -------- VAL --------
    val_loss, val_preds, val_targets = run_epoch(
        val_loader, model, criterion, optimizer=None
    )
    val_rmse = compute_rmse(val_targets, val_preds)

    # The scheduler is created earlier in this cell, so step it directly; a
    # missing scheduler should surface as an error instead of being swallowed.
    scheduler.step(val_loss)

    # Save history
    history["train_loss"].append(train_loss)
    history["val_loss"].append(val_loss)
    history["train_rmse"].append(train_rmse)
    history["val_rmse"].append(val_rmse)

    # Track best model by val_loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # state_dict() returns references to the live tensors, which later
        # optimizer steps keep overwriting. Clone them so the snapshot really
        # holds the weights of the best epoch.
        best_state = {
            key: tensor.detach().clone()
            for key, tensor in model.state_dict().items()
        }

    # -------- PRINT in your format --------
    if epoch == 0 or (epoch % 20 == 0) or epoch == num_epochs - 1:
        print(
            f"Epoch {epoch}/{num_epochs} | "
            f"Train RMSE: {train_rmse:.2f} | Val RMSE: {val_rmse:.2f}"
        )

# Load best weights
if best_state is not None:
    model.load_state_dict(best_state)

# best_val_loss is the criterion value (MSE), not an RMSE
print(f"\nBest validation loss (MSE): {best_val_loss:.4f}")

In [None]:

import matplotlib.pyplot as plt

# Learning curves of the initial model: one point per completed epoch
epochs = range(len(history["train_loss"]))

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(epochs, history["train_rmse"], label="Train Loss (RMSE)")
ax.plot(epochs, history["val_rmse"], label="Val Loss (RMSE)")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss (RMSE)")
ax.set_title("Transformer Model – Train vs Validation Loss")
ax.legend()
ax.grid(alpha=0.3)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Score the initial model on the held-out test split (no optimizer -> no updates)
model.eval()
test_loss, test_preds, test_targets = run_epoch(
    test_loader, model, criterion, optimizer=None
)

# Metrics expect 1-D arrays
test_preds, test_targets = test_preds.reshape(-1), test_targets.reshape(-1)

test_mse_check = mean_squared_error(test_targets, test_preds)
test_rmse = np.sqrt(test_mse_check)
test_r2 = r2_score(test_targets, test_preds)

print(f"Test MSE:  {test_loss:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R²:   {test_r2:.4f}")

In [None]:
import optuna
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

def compute_rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    NOTE(review): same definition as the compute_rmse in the training cell
    further up; this one shadows it once the cell runs.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def run_epoch(loader, model, criterion, optimizer=None, device=None):
    """Run one full pass over `loader`.

    If `optimizer` is provided the model is put in training mode and updated
    after every batch; otherwise the model is put in evaluation mode and
    gradients are disabled.

    NOTE(review): run_epoch is also defined in the training cell further up;
    this definition shadows it once the cell runs.

    Args:
        loader: iterable yielding (X_batch, y_batch) tensor pairs.
        model: module mapping X_batch to one prediction per row.
        criterion: loss callable taking (predictions, targets).
        optimizer: optional optimizer; its presence selects training mode.
        device: device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU for a parameter-free model), so the
            function does not depend on a notebook-level `device` variable.

    Returns:
        (avg_loss, all_preds, all_targets): the sample-weighted mean loss and
        the concatenated predictions / targets as numpy arrays.

    Raises:
        ValueError: if the loader yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    n_seen = 0
    all_preds = []
    all_targets = []

    with torch.set_grad_enabled(is_training):
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            preds = model(X_batch)
            loss = criterion(preds, y_batch)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Weight each batch loss by its size so a short final batch
            # does not distort the epoch average.
            batch_n = X_batch.size(0)
            total_loss += loss.item() * batch_n
            n_seen += batch_n
            all_preds.append(preds.detach().cpu().numpy())
            all_targets.append(y_batch.detach().cpu().numpy())

    if n_seen == 0:
        raise ValueError("run_epoch received a loader that yielded no samples")

    # Divide by the samples actually processed rather than len(loader.dataset),
    # which over-counts when the loader drops or subsamples rows.
    avg_loss = total_loss / n_seen
    all_preds = np.concatenate(all_preds)
    all_targets = np.concatenate(all_targets)
    return avg_loss, all_preds, all_targets

# convenience: create loaders for different batch sizes
def create_dataloaders(batch_size):
    """Build (train, val, test) DataLoaders over the notebook's three datasets.

    Only the training loader shuffles; validation and test keep row order.
    """
    def _make(dataset, shuffle):
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return (
        _make(train_dataset, True),
        _make(val_dataset, False),
        _make(test_dataset, False),
    )

# Feature count, read from the length of the first training sample's feature tensor
input_dim = len(train_dataset[0][0])
print("Input dim:", input_dim)


In [None]:
def objective(trial):
    """Optuna objective: train one candidate model, return its best validation RMSE.

    Samples architecture and optimiser settings from `trial`, trains for at
    most 80 epochs with early stopping (patience 10, minimum improvement 1e-3),
    and reports the validation RMSE after every epoch so the study's pruner can
    stop the trial early.

    Raises:
        optuna.exceptions.TrialPruned: when d_model is not divisible by nhead,
            or when the pruner asks to stop the trial.
    """
    # Suggestions are drawn in a fixed order: architecture, then optimisation
    hp = {
        "d_model": trial.suggest_categorical("d_model", [32, 64, 128]),
        "nhead": trial.suggest_categorical("nhead", [2, 4, 8]),
        "num_layers": trial.suggest_categorical("num_layers", [1, 2, 3]),
        "dim_feedforward": trial.suggest_categorical("dim_feedforward", [64, 128, 256]),
        "dropout": trial.suggest_categorical("dropout", [0.05, 0.1, 0.2]),
    }
    lr = trial.suggest_float("lr", 5e-4, 5e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)

    # Multi-head attention needs d_model to split evenly across the heads
    if hp["d_model"] % hp["nhead"] != 0:
        raise optuna.exceptions.TrialPruned()

    # The test loader is built as well but deliberately ignored during tuning
    train_loader, val_loader, _ = create_dataloaders(batch_size)

    model = TransformerRegressor(input_dim=input_dim, **hp).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    max_epochs = 80       # shorter than the full 150-epoch run
    patience = 10
    best_val_rmse = float("inf")
    stale_epochs = 0

    for epoch in range(max_epochs):
        run_epoch(train_loader, model, criterion, optimizer)
        _, val_preds, val_targets = run_epoch(
            val_loader, model, criterion, optimizer=None
        )
        val_rmse = compute_rmse(val_targets, val_preds)

        # Intermediate value for Optuna's pruning / logging
        trial.report(val_rmse, step=epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        # Early stopping: only an improvement larger than 1e-3 resets the counter
        if val_rmse < best_val_rmse - 1e-3:
            best_val_rmse = val_rmse
            stale_epochs = 0
            continue

        stale_epochs += 1
        if stale_epochs >= patience:
            break

    # Objective value = best validation RMSE seen in this trial
    return best_val_rmse


In [None]:
# Minimise the validation RMSE returned by objective() over 10 trials
study = optuna.create_study(study_name="Transformer_Optuna", direction="minimize")
study.optimize(objective, show_progress_bar=True, n_trials=10)

# Summary of the search
print("Number of finished trials:", len(study.trials))
print("Best trial value (Val RMSE):", study.best_trial.value)
print("Best trial params:")
for param_name, param_value in study.best_trial.params.items():
    print(f"  {param_name}: {param_value}")


In [None]:
from optuna.visualization.matplotlib import (
    plot_optimization_history,
    plot_parallel_coordinate,
    plot_slice,
    plot_param_importances,
)

def set_title_safe(ax, title):
    """Set `title` on `ax`, or on its first element when `ax` is a container.

    An object exposing `set_title` is titled directly; anything else is
    expected to provide `.flat` (as numpy arrays do), and only its first
    element receives the title.
    """
    target = ax if hasattr(ax, "set_title") else ax.flat[0]
    target.set_title(title)

# Title prefix; falls back to a generic name if the crop constant is missing
if "CHOSEN_CROP" in globals():
    name = f"{CHOSEN_CROP.capitalize()}_Yield_Transformer"
else:
    name = "Yield_Transformer"

# 1-3. Optimization history, parallel coordinate and slice plots
for plot_fn, plot_label in (
    (plot_optimization_history, "Optimization History"),
    (plot_parallel_coordinate, "Parallel Coordinate"),
    (plot_slice, "Slice Plot"),
):
    ax = plot_fn(study)
    set_title_safe(ax, f"{name} – {plot_label}")
    plt.tight_layout()
    plt.show()

# 4. Parameter importance: a failure is reported instead of aborting the cell
try:
    ax = plot_param_importances(study)
    set_title_safe(ax, f"{name} – Hyperparameter Importance")
    plt.tight_layout()
    plt.show()
except (ValueError, RuntimeError) as e:
    print(f"Could not plot parameter importance: {e}")


In [None]:
best_params = study.best_trial.params
print("Best params for final model:", best_params)

# dataloaders using best batch size
batch_size_best = best_params.get("batch_size", 128)
train_loader, val_loader, test_loader = create_dataloaders(batch_size_best)

# final model
final_model = TransformerRegressor(
    input_dim=input_dim,
    d_model=best_params["d_model"],
    nhead=best_params["nhead"],
    num_layers=best_params["num_layers"],
    dim_feedforward=best_params["dim_feedforward"],
    dropout=best_params["dropout"],
).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=best_params["lr"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=5
)

history = {
    "train_loss": [],
    "val_loss": [],
    "train_rmse": [],
    "val_rmse": []
}

num_epochs = 150
best_val_loss = float("inf")
best_state = None

for epoch in range(num_epochs):

    # ---- train ----
    train_loss, train_preds, train_targets = run_epoch(
        train_loader, model, criterion, optimizer
    )
    train_rmse = compute_rmse(train_targets, train_preds)

    # ---- validate ----
    val_loss, val_preds, val_targets = run_epoch(
        val_loader, model, criterion, optimizer=None
    )
    val_rmse = compute_rmse(val_targets, val_preds)

    scheduler.step(val_loss)

    # store history
    history["train_loss"].append(train_loss)
    history["val_loss"].append(val_loss)
    history["train_rmse"].append(train_rmse)
    history["val_rmse"].append(val_rmse)

    # track best
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_state = model.state_dict()

    # print in your preferred style
    if epoch == 0 or (epoch % 20 == 0) or epoch == num_epochs - 1:
        print(
            f"Epoch {epoch}/{num_epochs} | "
            f"Train RMSE: {train_rmse:.2f} | Val RMSE: {val_rmse:.2f}"
        )

# load best weights
if best_state is not None:
    model.load_state_dict(best_state)

print(f"\nBest validation loss (MSE): {best_val_loss:.4f}")


In [None]:
# ---- RMSE curves ----
epochs = range(len(history["train_rmse"]))

plt.figure(figsize=(8, 5))
plt.plot(epochs, history["train_rmse"], label="Train RMSE", linewidth=2)
plt.plot(epochs, history["val_rmse"], label="Validation RMSE", linewidth=2)
plt.xlabel("Epoch")
plt.ylabel("RMSE")
plt.title("Transformer – Train vs Validation RMSE (Best Hyperparams)")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# ---- Test evaluation ----
test_loss, final_test_preds, final_test_targets = run_epoch(
    test_loader, model, criterion, optimizer=None
)

final_test_predstest_preds = test_preds.reshape(-1)
final_test_targets = test_targets.reshape(-1)

final_test_rmse = compute_rmse(test_targets, test_preds)
final_test_r2   = r2_score(test_targets, test_preds)

print(f"Test MSE:  {test_loss:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R²:   {test_r2:.4f}")

# optional: predicted vs true scatter
plt.figure(figsize=(6, 6))
plt.scatter(test_targets, test_preds, alpha=0.3)
lim_min = min(test_targets.min(), test_preds.min())
lim_max = max(test_targets.max(), test_preds.max())
plt.plot([lim_min, lim_max], [lim_min, lim_max], linestyle="--")
plt.xlabel("True Yield")
plt.ylabel("Predicted Yield")
title_crop = CHOSEN_CROP if "CHOSEN_CROP" in globals() else "Crop"
plt.title(f"Transformer (Optuna Best) – {title_crop} – Test Set")
plt.grid(alpha=0.3)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Relative RMSE improvement of the tuned model over the baseline, in percent
imp_final = (rmse_baseline - final_test_rmse) / rmse_baseline * 100

print("--- Final Performance Report (Test Set, Transformer) ---")
print(f"Baseline Model:     RMSE={rmse_baseline:.2f}, R2={r2_baseline:.4f}")
print(f"Initial TFT Model:  RMSE={test_rmse:.2f}, R2={test_r2:.4f}")
print(f"Tuned TFT Model:    RMSE={final_test_rmse:.2f}, R2={final_test_r2:.4f} "
      f"(RMSE Improved {imp_final:.2f}%)")

# --- PLOTTING RESULTS ---
fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)

# Axis limits over everything that is actually drawn. The baseline contributes
# its NaN-filtered predictions (y_pred_clean); the unfiltered y_pred_baseline
# can contain NaNs, which would turn both limits into NaN.
all_preds = np.concatenate([
    np.asarray(y_pred_clean, dtype=float),
    np.ravel(test_preds),
    np.ravel(final_test_preds),
])
all_true = np.concatenate([
    np.asarray(y_test_clean, dtype=float),
    np.asarray(y_test, dtype=float),
])

# nanmin / nanmax so a stray NaN elsewhere cannot blank the identity line
min_val = min(np.nanmin(all_preds), np.nanmin(all_true))
max_val = max(np.nanmax(all_preds), np.nanmax(all_true))

# 1. Baseline Plot
axes[0].scatter(y_test_clean, y_pred_clean, alpha=0.4)
axes[0].plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2)
axes[0].set_title(f'Baseline Model\nRMSE: {rmse_baseline:.2f} | R2: {r2_baseline:.3f}')
axes[0].set_xlabel("True")
axes[0].set_ylabel("Predicted")

# 2. Initial Transformer Plot
axes[1].scatter(y_test, test_preds, alpha=0.4)
axes[1].plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2)
axes[1].set_title(f'Initial TFT Model\nRMSE: {test_rmse:.2f} | R2: {test_r2:.3f}')
axes[1].set_xlabel("True")

# 3. Tuned Transformer Plot
axes[2].scatter(y_test, final_test_preds, alpha=0.4)
axes[2].plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2)
axes[2].set_title(f'Tuned TFT Model\nRMSE: {final_test_rmse:.2f} | R2: {final_test_r2:.3f}')
axes[2].set_xlabel("True")

title_crop = CHOSEN_CROP if "CHOSEN_CROP" in globals() else "Crop"
plt.suptitle(f'{title_crop.capitalize()} Yield – Transformer Baseline vs Initial vs Tuned',
             fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# --- FULL TIMELINE PLOT (FILTER BY COUNTRY) ---
import matplotlib.pyplot as plt

# Country to plot (parameter)
TARGET_COUNTRY = "Thailand"    # <<--- Change here anytime

# 1. Generate predictions for all rows.
# The networks are trained on mean-imputed (not scaled) features, so the same
# fitted imputer is applied here; no scaler is ever fitted in this notebook.
X_all_imputed = imputer.transform(df_model[feature_cols])
X_all_tensor = torch.tensor(X_all_imputed, dtype=torch.float32).to(device)

final_model.eval()
with torch.no_grad():
    all_predictions = final_model(X_all_tensor).cpu().numpy().flatten()

# 2. Create DataFrame with Area column
df_full_trend = pd.DataFrame({
    'Year': df_model['year'],
    'Area': df_model['area'],
    'Actual': df_model[TARGET_COL],
    'Predicted': all_predictions
})

# 3. Filter for selected country
country_trend = df_full_trend[df_full_trend['Area'] == TARGET_COUNTRY]
if country_trend.empty:
    # Without rows the year range below would be NaN and the plot would fail
    raise ValueError(
        f"No rows found for area '{TARGET_COUNTRY}'. "
        "Check the spelling against df_model['area'].unique()."
    )

# 4. Aggregate by Year
yearly_trend = country_trend.groupby('Year')[['Actual', 'Predicted']].mean()

# 5. Plotting
plt.figure(figsize=(14, 7))

plt.plot(yearly_trend.index, yearly_trend['Actual'],
         marker='o', label=f'Actual Yield ({TARGET_COUNTRY})', linewidth=2)
plt.plot(yearly_trend.index, yearly_trend['Predicted'],
         marker='x', linestyle='--', label=f'Predicted Yield ({TARGET_COUNTRY})', linewidth=2)

# Split boundaries sit half a year before the first validation / test year
MIN_YEAR = yearly_trend.index.min()
MAX_YEAR = yearly_trend.index.max()
train_boundary = TRAIN_END_YEAR - 0.5
val_boundary   = VAL_END_YEAR - 0.5

# Highlight training / validation / testing
plt.axvspan(MIN_YEAR - 0.5, train_boundary, color='green',  alpha=0.1)
plt.axvspan(train_boundary, val_boundary,   color='yellow', alpha=0.1)
plt.axvspan(val_boundary, MAX_YEAR + 0.5,   color='red',    alpha=0.1)

# Text labels, placed 5% above the highest actual value
y_max = yearly_trend['Actual'].max()
text_y = y_max * 1.05

plt.text((MIN_YEAR + train_boundary)/2, text_y, 'TRAINING',   ha='center', fontsize=12, fontweight='bold', color='green')
plt.text((train_boundary + val_boundary)/2, text_y, 'VALIDATION', ha='center', fontsize=12, fontweight='bold', color='#D4AC0D')
plt.text((val_boundary + MAX_YEAR)/2, text_y, 'TESTING',     ha='center', fontsize=12, fontweight='bold', color='red')

# Final formatting
plt.title(f'Full Timeline Analysis: Actual vs Predicted Yield ({CHOSEN_CROP}, {TARGET_COUNTRY})',
          fontsize=16)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Yield (hg/ha)', fontsize=12)
plt.legend(loc='upper left')
plt.grid(True, alpha=0.3)
plt.xticks(np.arange(MIN_YEAR, MAX_YEAR + 1, 2))
plt.xlim(MIN_YEAR - 0.5, MAX_YEAR + 0.5)

plt.tight_layout()
plt.show()
