# ANN Forecasting of Weekly Hydro Energy Generation in New Zealand

This notebook addresses:
- **RQ1**: How accurately can ANN forecast weekly hydro energy compared to SARIMA?
- **RQ2**: Do lagged climate features improve ANN forecasting performance?

Hydro generation data is merged with NASA climate variables and aggregated to weekly frequency.

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# 'seaborn-vintage' is not a style sheet shipped with matplotlib, so calling
# plt.style.use() on it directly raises OSError on a fresh kernel. Use it only
# if the local install really provides it (e.g. as a user style file); otherwise
# fall back to a seaborn-like built-in style, and finally to matplotlib's default.
_STYLE_CANDIDATES = ('seaborn-vintage', 'seaborn-v0_8', 'seaborn')
_chosen_style = next(
    (name for name in _STYLE_CANDIDATES if name in plt.style.available),
    'default',
)
plt.style.use(_chosen_style)

## Load and Prepare Weekly Aggregated Hydro + Climate Dataset

In [None]:
# Read the pre-aggregated weekly hydro + climate table (assumed to be prepared
# the same way as for the SARIMA notebook), index it by the parsed DATE column
# and discard every row that contains a missing value.
weekly_df = (
    pd.read_csv('weekly_hydro_climate.csv', index_col='DATE', parse_dates=['DATE'])
    .dropna()
)
weekly_df.head()

In [None]:
# Only derive the Total_TP diagnostics/features when the column is present.
has_total_tp = 'Total_TP' in weekly_df.columns

if not has_total_tp:
    print("'Total_TP' not found in the dataset.")
else:
    # Pairwise correlation between the target and Total_TP
    corr_tp = weekly_df.loc[:, ['GENERATION', 'Total_TP']].corr()
    print('Correlation between GENERATION and Total_TP:')
    print(corr_tp)

    # Optional smoothing features: rolling means of Total_TP over 2 and 4 rows (weeks)
    rolling_windows = {'TP_rolling_avg_2w': 2, 'TP_rolling_avg_4w': 4}
    for feature_name, n_weeks in rolling_windows.items():
        weekly_df[feature_name] = weekly_df['Total_TP'].rolling(window=n_weeks).mean()

    # The first rows of each rolling window are NaN; remove them in place
    weekly_df.dropna(inplace=True)

## Feature Scaling and Train-Test Split

In [None]:
# Number of most recent weeks held out for testing
TEST_WEEKS = 10

# Define features and target: every column except GENERATION is an input
X = weekly_df.drop(columns='GENERATION')
y = weekly_df['GENERATION']

# Scale features.
# The min/max are learned from the training weeks only. Fitting on the full
# frame would let the scaler see the test weeks (data leakage) and make the
# reported test errors optimistic. Test rows may therefore fall slightly
# outside [0, 1], which is expected.
scaler = MinMaxScaler()
scaler.fit(X.iloc[:-TEST_WEEKS])
X_scaled = scaler.transform(X)

# Chronological split into train/test (last TEST_WEEKS weeks for test);
# .iloc makes the positional intent explicit on the date-indexed Series.
X_train, X_test = X_scaled[:-TEST_WEEKS], X_scaled[-TEST_WEEKS:]
y_train, y_test = y.iloc[:-TEST_WEEKS], y.iloc[-TEST_WEEKS:]

## 🧠 ANN Model without Lagged Climate Features (RQ1)

In [None]:
# Build simple ANN
# Seed NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout and batch shuffling are repeatable across
# Restart & Run All. Without a seed the reported errors change on every run,
# which makes the comparison between models unreliable.
# NOTE(review): some GPU kernels may still add small run-to-run differences.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))  # one input per feature column
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))  # single linear output for the regression target

model.compile(optimizer='adam', loss='mse')
# Stop when validation loss has not improved for 20 epochs and restore the best weights seen.
es = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

# Train model
# validation_split=0.2: Keras takes the validation rows from the end of the
# training arrays (before shuffling), i.e. the most recent training weeks.
history = model.fit(X_train, y_train, validation_split=0.2, epochs=200, batch_size=8, callbacks=[es], verbose=0)

## Forecast Evaluation and RQ1 Analysis

In [None]:
# Forecast the held-out weeks and collapse the (n, 1) prediction matrix to 1-D
y_pred = model.predict(X_test).reshape(-1)

# Error metrics on the test weeks
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
relative_error = (y_test - y_pred) / y_test  # per-week error as a fraction of the actual value
mape = np.mean(np.abs(relative_error)) * 100

print(
    f'ANN MAE: {mae:.2f}',
    f'ANN RMSE: {rmse:.2f}',
    f'ANN MAPE: {mape:.2f}%',
    sep='\n',
)

In [None]:
# Actual vs. predicted generation for the test weeks, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(y_test.values, label='Actual')
ax.plot(y_pred, label='Predicted')
ax.set_title('ANN Prediction vs Actual (Weekly Hydro Generation)')
ax.legend()
plt.show()

### 🔍 Interpretation (RQ1)
The ANN's error metrics (MAE, RMSE, MAPE) on the held-out weeks are compared against those of the SARIMA and SARIMAX models.
Lower errors for the ANN, provided all models are evaluated under similar data conditions, would support it as the better forecasting tool.

## Add Lagged Climate Features for RQ2

In [None]:
# Build 1-week-lagged copies of the climate variables on a copy of the weekly
# frame, so weekly_df itself is left untouched by this cell.
lag_sources = {
    'T2M_lag1': 'T2M',
    'PS_lag1': 'PS',
    'WS50M_lag1': 'WS50M',
    'RH2M_lag1': 'RH2M',
    'PRECTOTCORR_lag1': 'PRECTOTCORR',
    'EVLAND_lag1': 'EVLAND',
}

lagged_df = weekly_df.copy()
for lag_name, source_name in lag_sources.items():
    # shift(1) moves each value down one row, i.e. to the following week
    lagged_df[lag_name] = lagged_df[source_name].shift(1)

# Keep only the target and the lagged columns; the first row has no lagged
# values (NaN) and is dropped.
lagged_df = lagged_df[['GENERATION', *lag_sources]].dropna()

## Preprocessing Lagged Inputs

In [None]:
# Number of most recent weeks held out for testing (same length as the original hard-coded split)
TEST_WEEKS = 10

# Features are the lagged climate columns; the target is GENERATION
X_lag = lagged_df.drop(columns='GENERATION')
y_lag = lagged_df['GENERATION']

# Learn the scaling range from the training weeks only. Calling fit_transform
# on the full frame would leak the test weeks' min/max into the inputs.
# fit() discards whatever this shared scaler had learned before.
scaler.fit(X_lag.iloc[:-TEST_WEEKS])
X_lag_scaled = scaler.transform(X_lag)

# Chronological split; .iloc makes the positional slicing of the date-indexed Series explicit
X_train_lag, X_test_lag = X_lag_scaled[:-TEST_WEEKS], X_lag_scaled[-TEST_WEEKS:]
y_train_lag, y_test_lag = y_lag.iloc[:-TEST_WEEKS], y_lag.iloc[-TEST_WEEKS:]

## 🌦️ ANN Model with Lagged Climate Features (RQ2)

In [None]:
# Seed NumPy and TensorFlow before building the network so that weight
# initialisation, dropout and batch shuffling are repeatable across
# Restart & Run All. Without a seed the lagged model's errors change on every
# run, so any gain or loss versus the non-lagged model cannot be trusted.
# NOTE(review): some GPU kernels may still add small run-to-run differences.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

model_lag = Sequential()
model_lag.add(Dense(64, activation='relu', input_shape=(X_train_lag.shape[1],)))  # one input per lagged feature
model_lag.add(Dropout(0.2))
model_lag.add(Dense(32, activation='relu'))
model_lag.add(Dense(1))  # single linear output for the regression target

model_lag.compile(optimizer='adam', loss='mse')
# Stop when validation loss has not improved for 20 epochs and restore the best weights seen.
es_lag = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

# validation_split=0.2: Keras takes the validation rows from the end of the
# training arrays (before shuffling), i.e. the most recent training weeks.
history_lag = model_lag.fit(X_train_lag, y_train_lag, validation_split=0.2,
                            epochs=200, batch_size=8, callbacks=[es_lag], verbose=0)

## Forecast Evaluation for RQ2 (Lagged ANN)

In [None]:
# Forecast the held-out weeks with the lagged-feature model (flattened to 1-D)
y_pred_lag = model_lag.predict(X_test_lag).reshape(-1)

# Error metrics on the test weeks
mae_lag = mean_absolute_error(y_test_lag, y_pred_lag)
rmse_lag = np.sqrt(mean_squared_error(y_test_lag, y_pred_lag))
relative_error_lag = (y_test_lag - y_pred_lag) / y_test_lag  # error as a fraction of the actual value
mape_lag = np.mean(np.abs(relative_error_lag)) * 100

print(
    f'Lagged ANN MAE: {mae_lag:.2f}',
    f'Lagged ANN RMSE: {rmse_lag:.2f}',
    f'Lagged ANN MAPE: {mape_lag:.2f}%',
    sep='\n',
)

In [None]:
# Actual vs. predicted generation for the test weeks (lagged-feature model)
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(y_test_lag.values, label='Actual')
ax.plot(y_pred_lag, label='Predicted (Lagged)')
ax.set_title('ANN with Lagged Climate Features')
ax.legend()
plt.show()

### 🧠 Interpretation (RQ2)
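This ANN model uses **1-week lagged climate inputs**. If its errors (MAPE, MAE, RMSE) are lower than those of the non-lagged version,
it indicates that hydro forecasting benefits from incorporating delayed climate effects — thus supporting **RQ2**. Note that the non-lagged model is trained on every column of the weekly dataset, whereas this model sees only the six lagged climate variables, so part of any difference may come from the feature set rather than from the lag itself.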

## 🌦️ ANN Model with Lagged Climate Features (RQ2)

In [None]:
# Create 1-week-lagged climate features.
# NOTE: the lag columns are written onto weekly_df itself, so from this cell
# onward weekly_df carries six extra columns (with NaN in the first row).
lag_sources = {
    'T2M_lag1': 'T2M',
    'PS_lag1': 'PS',
    'WS50M_lag1': 'WS50M',
    'RH2M_lag1': 'RH2M',
    'PRECTOTCORR_lag1': 'PRECTOTCORR',
    'EVLAND_lag1': 'EVLAND',
}
for lag_name, source_name in lag_sources.items():
    # shift(1) moves each value down one row, i.e. to the following week
    weekly_df[lag_name] = weekly_df[source_name].shift(1)

# Keep only the target and the lagged features; rows without a lagged value are dropped
lagged_df = weekly_df[['GENERATION', *lag_sources]].dropna()

In [None]:
# Number of most recent weeks held out for testing (same length as the original hard-coded split)
TEST_WEEKS = 10

# Define features and target
X_lag = lagged_df.drop(columns='GENERATION')
y_lag = lagged_df['GENERATION']

# Scale
# Learn the scaling range from the training weeks only. Calling fit_transform
# on the full frame would leak the test weeks' min/max into the inputs.
# fit() discards whatever this shared scaler had learned before.
scaler.fit(X_lag.iloc[:-TEST_WEEKS])
X_lag_scaled = scaler.transform(X_lag)

# Train-test split (chronological); .iloc makes the positional slicing explicit
X_lag_train, X_lag_test = X_lag_scaled[:-TEST_WEEKS], X_lag_scaled[-TEST_WEEKS:]
y_lag_train, y_lag_test = y_lag.iloc[:-TEST_WEEKS], y_lag.iloc[-TEST_WEEKS:]

In [None]:
# ANN for lagged features
# Seed NumPy and TensorFlow before building the network so that weight
# initialisation, dropout and batch shuffling are repeatable across
# Restart & Run All. Without a seed the lagged model's errors change on every
# run, so any gain or loss versus the non-lagged model cannot be trusted.
# NOTE(review): some GPU kernels may still add small run-to-run differences.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

model_lag = Sequential()
model_lag.add(Dense(64, activation='relu', input_shape=(X_lag_train.shape[1],)))  # one input per lagged feature
model_lag.add(Dropout(0.2))
model_lag.add(Dense(32, activation='relu'))
model_lag.add(Dense(1))  # single linear output for the regression target

model_lag.compile(optimizer='adam', loss='mse')
# Stop when validation loss has not improved for 20 epochs and restore the best weights seen.
es_lag = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

# validation_split=0.2: Keras takes the validation rows from the end of the
# training arrays (before shuffling), i.e. the most recent training weeks.
history_lag = model_lag.fit(X_lag_train, y_lag_train, validation_split=0.2, epochs=200, batch_size=8, callbacks=[es_lag], verbose=0)

In [None]:
# Forecast the held-out weeks with the lagged-feature model (flattened to 1-D)
y_lag_pred = model_lag.predict(X_lag_test).reshape(-1)

# Error metrics on the test weeks
mae_lag_ann = mean_absolute_error(y_lag_test, y_lag_pred)
rmse_lag_ann = np.sqrt(mean_squared_error(y_lag_test, y_lag_pred))
relative_error_lag = (y_lag_test - y_lag_pred) / y_lag_test  # error as a fraction of the actual value
mape_lag_ann = np.mean(np.abs(relative_error_lag)) * 100

print(
    f'ANN (Lagged) MAE: {mae_lag_ann:.2f}',
    f'ANN (Lagged) RMSE: {rmse_lag_ann:.2f}',
    f'ANN (Lagged) MAPE: {mape_lag_ann:.2f}%',
    sep='\n',
)

In [None]:
# Actual vs. predicted generation for the test weeks (lagged climate features)
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(y_lag_test.values, label='Actual')
ax.plot(y_lag_pred, label='Predicted')
ax.set_title('ANN Prediction vs Actual (Lagged Climate Features)')
ax.legend()
plt.show()

### 🔍 Interpretation (RQ2)
By using lagged climate variables, the ANN model may better anticipate how prior weather affects current hydro generation.
If its MAPE is lower than that of the non-lagged ANN model, this supports the hypothesis that **lagged climate inputs enhance forecast accuracy**, answering **RQ2**.