In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, TensorDataset
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

# Optuna Visualization Tools
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_slice
from optuna.visualization import plot_param_importances

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Seed the random number generators so that weight initialisation, dropout
# and DataLoader shuffling repeat across "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set device: use the GPU when one is visible to torch, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


In [None]:

# Load dataset
df = pd.read_parquet('Parquet/XY_v2.parquet')

# --- LIST AVAILABLE CROPS ---
# Assumes targets start with 'Y_'
target_columns = [col for col in df.columns if col.startswith('Y_')]
# Slice off only the leading prefix. str.replace('Y_', '') would also delete
# any later "Y_" inside a crop name and report a crop that does not exist.
available_crops = [col[len('Y_'):] for col in target_columns]

print("--- Available Crops found in Dataset ---")
print(available_crops)
print("-" * 40)


# ==========================================
# 1. CHOOSE CROP DYNAMICALLY
# ==========================================
# Change CHOSEN_CROP to any entry of available_crops to model another crop.
CHOSEN_CROP = "rice"
TARGET_COL = f"Y_{CHOSEN_CROP}"
print("Chosen crop:", CHOSEN_CROP)
print("Target column:", TARGET_COL)

# Safety check: make sure the target exists
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column {TARGET_COL} not found in df.columns")

# ==========================================
# 2. BUILD BASE df_model
# ==========================================
# Work on a copy so the raw frame `df` stays untouched.
df_model = df.copy()

# ==========================================
# 3. DROP avg_yield_* FOR OTHER CROPS
#    Keep only avg_yield_<CHOSEN_CROP>_*
# ==========================================
crop_prefix = f"avg_yield_{CHOSEN_CROP}_"


def _is_other_crop_avg_yield(name):
    """True for avg_yield_* columns that do not carry the chosen crop's prefix."""
    if not name.startswith("avg_yield_"):
        return False
    return not name.startswith(crop_prefix)


cols_to_drop = list(filter(_is_other_crop_avg_yield, df_model.columns))

df_model = df_model.drop(columns=cols_to_drop)
print(f"Dropped {len(cols_to_drop)} avg_yield_* columns not for {CHOSEN_CROP}")

# ==========================================
# 4. DEFINE META + TARGETS + FEATURES
# ==========================================
meta_cols = ["year", "area"]

# Every Y_* column is a target for some crop and must never be a feature.
target_cols = list(filter(lambda name: name.startswith("Y_"), df_model.columns))

# Features are whatever is left once meta and target columns are removed.
_non_feature_cols = set(meta_cols).union(target_cols)
FEATURE_COLS = [name for name in df_model.columns if name not in _non_feature_cols]

print("Number of features:", len(FEATURE_COLS))
print("Example features:", FEATURE_COLS[:22])

# Reorder df_model for clarity: meta, then features, then the single chosen target.
_ordered_cols = meta_cols + FEATURE_COLS + [TARGET_COL]
df_model = df_model[_ordered_cols].copy()



In [None]:


# ==========================================
# 1. TRAIN / VAL / TEST SPLIT + IMPUTATION + SCALING
# ==========================================
TRAIN_END_YEAR = 2014
VAL_END_YEAR   = 2019

# Boolean row masks: train < TRAIN_END_YEAR <= val < VAL_END_YEAR <= test
_year = df_model["year"]
mask_train = _year.lt(TRAIN_END_YEAR)
mask_val   = _year.ge(TRAIN_END_YEAR) & _year.lt(VAL_END_YEAR)
mask_test  = _year.ge(VAL_END_YEAR)
_split_masks = (mask_train, mask_val, mask_test)

# Raw (unimputed, unscaled) features and targets per split
X_train_raw, X_val_raw, X_test_raw = (
    df_model.loc[m, FEATURE_COLS] for m in _split_masks
)
y_train, y_val, y_test = (
    df_model.loc[m, TARGET_COL] for m in _split_masks
)

# Report the year span that ended up in each split
for _label, _mask in zip(("Train years:", "Val years  :", "Test years :"), _split_masks):
    _span = df_model.loc[_mask, "year"]
    print(_label, _span.min(), "->", _span.max())


def _as_feature_frame(values):
    """Wrap a transformer's array output back into a frame labelled with FEATURE_COLS."""
    return pd.DataFrame(values, columns=FEATURE_COLS)


# Impute NaNs with column means learned on the TRAIN split only
imputer = SimpleImputer(strategy="mean")
X_train_imputed = _as_feature_frame(imputer.fit_transform(X_train_raw))
X_val_imputed   = _as_feature_frame(imputer.transform(X_val_raw))
X_test_imputed  = _as_feature_frame(imputer.transform(X_test_raw))

# Standardise with mean/std learned on the TRAIN split only
scaler = StandardScaler()
X_train_scaled = _as_feature_frame(scaler.fit_transform(X_train_imputed))
X_val_scaled   = _as_feature_frame(scaler.transform(X_val_imputed))
X_test_scaled  = _as_feature_frame(scaler.transform(X_test_imputed))


In [None]:

# ==========================================
# 2. REBUILD train_df / val_df / test_df (CLEAN)
# ==========================================
def _assemble_split(mask, X_scaled, y):
    """Place meta columns, scaled features and target side by side.

    Each piece gets a fresh 0..n-1 index first so the column-wise concat
    lines rows up by position rather than by their original labels.
    """
    pieces = (df_model.loc[mask, ["year", "area"]], X_scaled, y)
    return pd.concat([piece.reset_index(drop=True) for piece in pieces], axis=1)


train_df = _assemble_split(mask_train, X_train_scaled, y_train)
val_df   = _assemble_split(mask_val,   X_val_scaled,   y_val)
test_df  = _assemble_split(mask_test,  X_test_scaled,  y_test)

print("Before cleaning:")
for _label, _frame in (
    ("train_df shape:", train_df),
    ("val_df shape  :", val_df),
    ("test_df shape :", test_df),
):
    print(_label, _frame.shape)
print("Columns in train_df:", train_df.columns.tolist())

# Final cleaning helper
def clean_df(df):
    """Return a new frame without rows that contain NaN or +/-inf.

    Infinite values are first turned into NaN, then every row holding at
    least one NaN is dropped and the index is renumbered from 0. The input
    frame is not modified.
    """
    return (
        df.replace([np.inf, -np.inf], np.nan)
        .dropna(axis=0, how="any")
        .reset_index(drop=True)
    )

# Clean the three splits in one pass (the input tuple is built before any name is rebound).
train_df, val_df, test_df = (
    clean_df(part) for part in (train_df, val_df, test_df)
)

print("After cleaning:")
for _label, _frame in (
    ("train_df shape:", train_df),
    ("val_df shape  :", val_df),
    ("test_df shape :", test_df),
):
    print(_label, _frame.shape)

# Total count of missing cells per split; each should print 0 after cleaning.
for _label, _frame in (
    ("NaNs in train_df:", train_df),
    ("NaNs in val_df  :", val_df),
    ("NaNs in test_df :", test_df),
):
    print(_label, _frame.isna().sum().sum())


In [None]:

# ==========================================
# 3. CONCAT CLEANED DFS INTO full_df FOR SEQUENCE DATASET
# ==========================================
# Stack the cleaned splits row-wise and pin the column order to
# meta -> features -> target in a single expression.
_full_columns = ["year", "area", *FEATURE_COLS, TARGET_COL]
full_df = pd.concat([train_df, val_df, test_df], ignore_index=True)[_full_columns].copy()

print("full_df shape:", full_df.shape)

# Sanity check: every cell must be populated
assert full_df.notna().all().all(), "NaNs remain in full_df!"


In [None]:

# ==========================================
# 4. TFT-STYLE SEQUENCE DATASET
# ==========================================
class RiceYieldTFTDatasetFull(Dataset):
    """Per-group sliding-window dataset for one-step-ahead regression.

    For every group (``group_col``) the rows are ordered by ``time_col`` and
    each run of ``seq_len`` consecutive rows becomes one input window; the
    label is the ``target_col`` value of the row right after the window.
    A window belongs to the train / val / test split according to the
    ``time_col`` value of that label row.

    NOTE: windows are built from consecutive *rows*. Gaps in ``time_col``
    (e.g. rows dropped during cleaning) are not detected, so a window may
    span non-adjacent time steps.
    """

    VALID_MODES = ("train", "val", "test")

    def __init__(
        self,
        df,
        seq_len,
        feature_cols,
        target_col="Y_rice",
        group_col="area",
        time_col="year",
        mode="train",
        train_end=2014,
        val_end=2019,
    ):
        """
        df: DataFrame with columns [time_col, group_col] + feature_cols + [target_col]
        seq_len: input sequence length (number of rows per window, >= 1)
        feature_cols: column names used as model inputs
        target_col: column holding the value to predict
        group_col / time_col: columns identifying the series and its ordering
        mode: "train", "val", or "test" → determined by TARGET YEAR
        train_end: label rows with time < train_end go to "train"
        val_end: label rows with train_end <= time < val_end go to "val",
                 label rows with time >= val_end go to "test"

        Raises ValueError for an unknown mode or a non-positive seq_len.
        """
        # An unknown mode used to match no branch and silently yield an
        # empty dataset; fail loudly instead.
        if mode not in self.VALID_MODES:
            raise ValueError(
                f"mode must be one of {self.VALID_MODES}, got {mode!r}"
            )
        if seq_len < 1:
            raise ValueError(f"seq_len must be a positive integer, got {seq_len!r}")

        self.seq_len    = seq_len
        self.feature_cols = feature_cols
        self.target_col = target_col
        self.group_col  = group_col
        self.time_col   = time_col
        self.mode       = mode
        self.train_end  = train_end
        self.val_end    = val_end

        # Sort globally
        self.df = df.sort_values([group_col, time_col]).reset_index(drop=True)

        self.groups = {}
        self.index  = []  # list of (group_key, start_idx)

        # Per-group float32 arrays so __getitem__ is plain NumPy slicing
        # instead of pandas indexing on every sample.
        self._feature_arrays = {}
        self._target_arrays = {}

        for g, gdf in self.df.groupby(group_col):
            gdf = gdf.sort_values(time_col).reset_index(drop=True)
            self.groups[g] = gdf
            self._feature_arrays[g] = gdf[self.feature_cols].to_numpy(dtype=np.float32)
            self._target_arrays[g] = gdf[target_col].to_numpy(dtype=np.float32)

            years = gdf[time_col].values
            # range() is empty when the group has <= seq_len rows, so no
            # separate length guard is needed.
            for start in range(len(gdf) - seq_len):
                if self._in_split(years[start + seq_len]):
                    self.index.append((g, start))

        print(f"[{mode}] Built {len(self.index)} windows for seq_len={seq_len}")

    def _in_split(self, target_year):
        """Return True when a label row at ``target_year`` belongs to ``self.mode``."""
        if self.mode == "train":
            return target_year < self.train_end
        if self.mode == "val":
            return self.train_end <= target_year < self.val_end
        return target_year >= self.val_end  # "test"

    def __len__(self):
        """Number of windows in this split."""
        return len(self.index)

    def __getitem__(self, idx):
        """Return ``(x, y)``: x is (seq_len, n_features), y is (1,), both float32."""
        g, start = self.index[idx]
        stop = start + self.seq_len

        # x: seq_len rows of features
        x_tensor = torch.tensor(
            self._feature_arrays[g][start:stop], dtype=torch.float32
        )  # (T, F)

        # y: target at next time step (one-step-ahead); the length-1 slice
        # keeps shape (1,) so a batch collates to (B, 1)
        y_tensor = torch.tensor(
            self._target_arrays[g][stop:stop + 1], dtype=torch.float32
        )
        return x_tensor, y_tensor


In [None]:

# ==========================================
# 5. CREATE DATASETS + DATALOADERS
# ==========================================
SEQ_LEN = 5

# Everything except `mode` is shared by the three splits.
_window_kwargs = dict(
    seq_len=SEQ_LEN,
    feature_cols=FEATURE_COLS,
    target_col=TARGET_COL,
    train_end=TRAIN_END_YEAR,
    val_end=VAL_END_YEAR,
)

train_dataset = RiceYieldTFTDatasetFull(full_df, mode="train", **_window_kwargs)
val_dataset   = RiceYieldTFTDatasetFull(full_df, mode="val",   **_window_kwargs)
test_dataset  = RiceYieldTFTDatasetFull(full_df, mode="test",  **_window_kwargs)

batch_size = 16

# Only the training loader is shuffled; val/test keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True,  batch_size=batch_size)
val_loader   = DataLoader(val_dataset,   shuffle=False, batch_size=batch_size)
test_loader  = DataLoader(test_dataset,  shuffle=False, batch_size=batch_size)

# One model input per feature column
input_dim = len(FEATURE_COLS)
print("Input dim:", input_dim)
print(
    "Train samples:", len(train_dataset),
    "Val samples:", len(val_dataset),
    "Test samples:", len(test_dataset),
)


In [None]:

# ==========================================
# 6. TEMPORAL FUSION TRANSFORMER-LIKE MODEL
# ==========================================
class TemporalFusionTransformer(nn.Module):
    """Sequence regressor: linear projection, LSTM encoder, self-attention,
    position-wise feed-forward, LSTM decoder, and an MLP head on the last step.

    Input is (batch, seq_len, input_size); output is (batch, 1).
    """

    def __init__(self, input_size, hidden_size=64, num_heads=4, dropout=0.1):
        super().__init__()

        # Both LSTMs share one configuration. Sub-modules are created in the
        # order below; keep it, since creation order determines which random
        # draws each layer's initial weights receive.
        lstm_cfg = dict(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=dropout,
        )
        ffn_width = hidden_size * 2
        head_width = hidden_size // 2

        self.input_proj = nn.Linear(input_size, hidden_size)

        self.encoder_lstm = nn.LSTM(**lstm_cfg)

        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_size,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True,
        )

        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, ffn_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(ffn_width, hidden_size),
        )

        self.decoder_lstm = nn.LSTM(**lstm_cfg)

        self.output_proj = nn.Sequential(
            nn.Linear(hidden_size, head_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_width, 1),
        )

        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)

    def forward(self, x):
        """
        x: (batch, seq_len, input_size)
        returns: (batch, 1) → predicted Y at t+1
        """
        projected = self.input_proj(x)                          # (B, T, H)

        # Encode; keep the final (h, c) to initialise the decoder later.
        encoded, enc_state = self.encoder_lstm(projected)       # (B, T, H)

        # Self-attention over the encoder outputs, then residual + norm.
        attended, _ = self.attention(encoded, encoded, encoded)  # (B, T, H)
        fused = self.norm1(encoded + attended)

        # Position-wise feed-forward, then a second residual + norm.
        fused = self.norm2(fused + self.ffn(fused))

        # Decode, starting from the encoder's final hidden/cell state.
        decoded, _ = self.decoder_lstm(fused, enc_state)        # (B, T, H)

        # The last time step summarises the window; map it to a scalar.
        return self.output_proj(decoded[:, -1, :])              # (B, 1)

# Instantiate model
_use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if _use_cuda else "cpu")
print("Using device:", device)

# Build on the default device first, then move the whole module across.
model = TemporalFusionTransformer(
    input_size=input_dim,
    hidden_size=64,
    num_heads=4,
    dropout=0.1,
)
model = model.to(device)

# Total number of scalar parameters across every tensor in the model
print("Model parameters:", sum(map(lambda param: param.numel(), model.parameters())))


In [None]:

# ==========================================
# 7. METRICS + TRAINING LOOP
# ==========================================
def _flatten_pair(y_true, y_pred):
    """Convert both inputs to NumPy arrays and flatten each to 1-D."""
    return np.array(y_true).reshape(-1), np.array(y_pred).reshape(-1)


def compute_rmse(y_true, y_pred):
    """Root mean squared error between flattened targets and predictions."""
    truth, guess = _flatten_pair(y_true, y_pred)
    mse = mean_squared_error(truth, guess)
    return np.sqrt(mse)


def compute_mae(y_true, y_pred):
    """Mean absolute error between flattened targets and predictions."""
    truth, guess = _flatten_pair(y_true, y_pred)
    return mean_absolute_error(truth, guess)


def compute_r2(y_true, y_pred):
    """Coefficient of determination (R²) between flattened targets and predictions."""
    truth, guess = _flatten_pair(y_true, y_pred)
    return r2_score(truth, guess)

# Loss, optimiser and LR schedule for the baseline (untuned) run.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
# Halve the learning rate when the monitored value (mean validation MSE,
# passed to scheduler.step below) has not improved for 5 epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=5
)

num_epochs = 100

# Per-epoch RMSE curves; one entry is appended to each list every epoch.
history = {
    "train_rmse": [],
    "val_rmse":   [],
}

for epoch in range(num_epochs):
    # ------ TRAIN ------
    model.train()
    # NOTE: train_losses is filled below but not read again in this cell.
    train_losses = []
    train_y_true = []
    train_y_pred = []

    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)           # (B, T, F)
        y_batch = y_batch.to(device)           # (B, 1)

        optimizer.zero_grad()
        outputs = model(x_batch)               # (B, 1)

        loss = criterion(outputs, y_batch)
        loss.backward()

        # Gradient clipping (helps stability)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        train_losses.append(loss.item())
        train_y_true.append(y_batch.detach().cpu().numpy())
        train_y_pred.append(outputs.detach().cpu().numpy())

    # Train RMSE is computed from predictions made in train mode while the
    # weights were still being updated within the epoch.
    train_y_true = np.concatenate(train_y_true).reshape(-1)
    train_y_pred = np.concatenate(train_y_pred).reshape(-1)
    train_rmse = compute_rmse(train_y_true, train_y_pred)

    # ------ VALIDATION ------
    model.eval()
    val_losses = []
    val_y_true = []
    val_y_pred = []

    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)

            val_losses.append(loss.item())
            val_y_true.append(y_batch.cpu().numpy())
            val_y_pred.append(outputs.cpu().numpy())

    # An empty validation loader leaves the lists empty; record NaN rather
    # than letting np.concatenate fail on an empty list.
    if len(val_y_true) > 0:
        val_y_true = np.concatenate(val_y_true).reshape(-1)
        val_y_pred = np.concatenate(val_y_pred).reshape(-1)
        val_rmse = compute_rmse(val_y_true, val_y_pred)
        # Unweighted mean of the per-batch MSE values.
        val_loss = np.mean(val_losses)
    else:
        val_rmse = np.nan
        val_loss = np.nan

    history["train_rmse"].append(train_rmse)
    history["val_rmse"].append(val_rmse)

    # Skip the scheduler update when there was nothing to validate on.
    if not np.isnan(val_loss):
        scheduler.step(val_loss)

    # Log every 10th epoch and the final one.
    if epoch % 10 == 0 or epoch == num_epochs - 1:
        print(
            f"Epoch {epoch}/{num_epochs-1} | "
            f"Train RMSE: {train_rmse:.2f} | "
            f"Val RMSE: {val_rmse:.2f}"
        )
# ==========================================
# 8. PLOT TRAIN vs VALIDATION RMSE
# ==========================================
# Build the x-axis from what was actually recorded rather than from
# num_epochs: the two differ if training was interrupted or num_epochs was
# edited after the loop ran, and plotting would then fail on a length mismatch.
epochs = range(1, len(history["train_rmse"]) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epochs, history["train_rmse"], label="Train Loss (RMSE)", linewidth=2)
ax.plot(epochs, history["val_rmse"], label="Val Loss (RMSE)", linewidth=2)

ax.set_title("Transformer Model – Train vs Validation Loss", fontsize=14)
ax.set_xlabel("Epoch", fontsize=12)
ax.set_ylabel("Loss (RMSE)", fontsize=12)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)

plt.show()



In [None]:
# ==========================================
# 8. EVALUATE ON TEST SET
# ==========================================
model.eval()
test_y_true, test_y_pred = [], []

# Inference only: no gradients are tracked while scoring the held-out years.
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        outputs = model(x_batch)          # (B, 1)

        test_y_true += [y_batch.cpu().numpy()]
        test_y_pred += [outputs.cpu().numpy()]

# Stitch the per-batch arrays together and flatten to 1-D
test_y_true = np.concatenate(test_y_true).ravel()
test_y_pred = np.concatenate(test_y_pred).ravel()

test_rmse, test_mae, test_r2 = (
    metric(test_y_true, test_y_pred)
    for metric in (compute_rmse, compute_mae, compute_r2)
)

print("\n=== Test Set Performance ===")
print("Test RMSE: {:.2f}".format(test_rmse))
print("Test MAE : {:.2f}".format(test_mae))
print("Test R²  : {:.4f}".format(test_r2))


In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(test_y_true, test_y_pred, alpha=0.4)

# Dashed y = x reference line spanning the joint range of both series
min_val = min(test_y_true.min(), test_y_pred.min())
max_val = max(test_y_true.max(), test_y_pred.max())
_diagonal = [min_val, max_val]
ax.plot(_diagonal, _diagonal, "r--", linewidth=2)

ax.set_xlabel("Actual Yield")
ax.set_ylabel("Predicted Yield")
ax.set_title("Transformer – Test Set: Actual vs Predicted")
ax.grid(True, alpha=0.3)
plt.show()


In [None]:
# ==========================================
# 8. OPTUNA TUNING FOR RMSE
# ==========================================
import optuna
import math
import gc

def create_dataloaders(seq_len, batch_size=16):
    """Build fresh (train, val, test) DataLoaders over full_df for one window length.

    Only the training loader shuffles; val and test iterate in dataset order.
    """
    loaders = []
    for split, reshuffle in (("train", True), ("val", False), ("test", False)):
        windows = RiceYieldTFTDatasetFull(
            full_df,
            seq_len=seq_len,
            feature_cols=FEATURE_COLS,
            target_col=TARGET_COL,
            mode=split,
            train_end=TRAIN_END_YEAR,
            val_end=VAL_END_YEAR,
        )
        loaders.append(DataLoader(windows, batch_size=batch_size, shuffle=reshuffle))

    return tuple(loaders)


def build_model_from_trial(trial, input_dim):
    """Draw architecture hyperparameters from `trial` and return the model on `device`.

    Sampled, in this order: "hidden_size", "num_heads", "dropout".
    """
    width = trial.suggest_categorical("hidden_size", [32, 64, 128])

    # Attention needs the head count to divide the hidden width evenly.
    head_choices = list(filter(lambda heads: width % heads == 0, (2, 4, 8)))
    n_heads = trial.suggest_categorical("num_heads", head_choices)

    drop_p = trial.suggest_float("dropout", 0.05, 0.3)

    net = TemporalFusionTransformer(
        input_size=input_dim,
        hidden_size=width,
        num_heads=n_heads,
        dropout=drop_p,
    )
    return net.to(device)


def objective(trial, max_epochs=60):
    """Optuna objective: train one sampled configuration, return its best validation RMSE.

    trial: the Optuna trial used to sample seq_len, lr, weight_decay,
           batch_size and (via build_model_from_trial) the architecture.
    max_epochs: number of training epochs per trial (fewer than the baseline
                run to keep tuning affordable).

    Returns the lowest validation RMSE seen over the epochs, or math.inf
    when the sampled seq_len leaves no validation windows.
    Raises optuna.TrialPruned when the study's pruner stops the trial early.
    """
    # Release memory left over from the previous trial before allocating again.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # --------- Hyperparameters to tune ----------
    seq_len = trial.suggest_categorical("seq_len", [3, 5, 7])
    lr      = trial.suggest_float("lr", 1e-4, 3e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
    batch_size   = trial.suggest_categorical("batch_size", [8,16,32])

    # Rebuild loaders for this seq_len / batch_size
    train_loader, val_loader, _ = create_dataloaders(seq_len=seq_len, batch_size=batch_size)

    # Build model
    model = build_model_from_trial(trial, input_dim=input_dim)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # No validation windows (shouldn't happen, but just in case): there is
    # nothing to score, so bail out before spending an epoch on training.
    if len(val_loader) == 0:
        return math.inf

    best_val_rmse = math.inf

    for epoch in range(max_epochs):
        # ---------- TRAIN ----------
        model.train()

        for x_batch, y_batch in train_loader:
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(x_batch)

            loss = criterion(outputs, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            # Training predictions are not collected: only validation RMSE
            # is scored, and copying every batch to the CPU is wasted work.

        # ---------- VALIDATION ----------
        model.eval()
        val_y_true = []
        val_y_pred = []

        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)

                outputs = model(x_batch)
                val_y_true.append(y_batch.cpu().numpy())
                val_y_pred.append(outputs.cpu().numpy())

        val_y_true = np.concatenate(val_y_true).reshape(-1)
        val_y_pred = np.concatenate(val_y_pred).reshape(-1)
        val_rmse = compute_rmse(val_y_true, val_y_pred)

        # Track best RMSE for this trial
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse

        # Report to Optuna & allow pruning
        trial.report(val_rmse, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return best_val_rmse


# ==========================================
# 9. RUN OPTUNA STUDY
# ==========================================
# Lower validation RMSE is better, hence "minimize".
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20, timeout=None)  # adjust n_trials as you like

print("Best trial:")
best_trial = study.best_trial
print("  Value (best Val RMSE):", best_trial.value)
print("  Params:")
for _name, _value in best_trial.params.items():
    print("    " + f"{_name}: {_value}")
