In [2]:
import os, numpy as np, pandas as pd
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import ParameterGrid
import holidays                         # still used for is_holiday feature
import matplotlib.pyplot as plt         # optional – for future plotting

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
df = pd.read_csv("df2.csv")
df["unique_id"] = 0
df["time"] = pd.to_datetime(df["time"])
df.set_index("time", inplace=True, drop=False)

# drop unused cols
df = df.drop(["Wind", "unique_id"], axis=1)

In [4]:
# spike flags + magnitude (MW scale)
df["mu_24h"]  = df["DAP"].rolling(24, min_periods=1).mean()
df["std_24h"] = df["DAP"].rolling(24, min_periods=1).std().fillna(0)

k = 2.0
df["is_spike"]  = (df["DAP"] > df["mu_24h"] + k*df["std_24h"]).astype(int)
df["spike_mag"] = np.where(df["is_spike"], df["DAP"] - df["mu_24h"], 0.0)

df.drop(["mu_24h", "std_24h"], axis=1, inplace=True)

In [5]:
df

Unnamed: 0_level_0,time,DAP,SCED,F_TLF,NG Price,Non-Renew,is_holiday,Wind_Delta,is_spike,spike_mag
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-02 00:00:00,2019-01-02 00:00:00,23.9250,26.216109,0.016499,3.250000,36508.703164,0,0.000000,0,0.0
2019-01-02 01:00:00,2019-01-02 01:00:00,23.3140,25.671136,0.016441,3.227917,35983.092128,0,-138.755164,0,0.0
2019-01-02 02:00:00,2019-01-02 02:00:00,23.3475,24.970952,0.016411,3.205833,35550.176684,0,275.613272,0,0.0
2019-01-02 03:00:00,2019-01-02 03:00:00,23.0595,24.787712,0.016441,3.183750,35511.693684,0,435.428944,0,0.0
2019-01-02 04:00:00,2019-01-02 04:00:00,25.2672,24.887786,0.016534,3.161667,36565.635248,0,160.403476,0,0.0
...,...,...,...,...,...,...,...,...,...,...
2023-12-31 19:00:00,2023-12-31 19:00:00,19.3715,14.422107,0.021356,2.580000,29417.400000,0,3754.400000,0,0.0
2023-12-31 20:00:00,2023-12-31 20:00:00,16.7739,12.597410,0.021367,2.580000,26543.200000,0,1858.700000,0,0.0
2023-12-31 21:00:00,2023-12-31 21:00:00,15.0035,10.170172,0.021383,2.580000,24706.800000,0,1062.300000,0,0.0
2023-12-31 22:00:00,2023-12-31 22:00:00,15.7699,9.872549,0.021408,2.580000,23992.100000,0,-124.300000,0,0.0


In [6]:
log_cols = ["DAP", "SCED", "F_TLF", "NG Price", "Non-Renew", "spike_mag"]
offset   = {c: df[c].min() for c in log_cols}

log_df = df.copy()
for c in log_cols:
    log_df[c] = np.log(df[c] - offset[c] + 1)

feature_cols = ["DAP", "SCED", "F_TLF", "NG Price", "Non-Renew", "is_holiday"]

y_cls = df["is_spike"]
y_reg = log_df["spike_mag"]

train_mask = (df.index <= "2021-12-31 23:55:00")
val_mask   = (df.index >= "2022-01-01") & (df.index <= "2022-12-31 23:55:00")
test_mask  = (df.index >= "2023-01-01")

X_train, X_val, X_test = log_df[feature_cols][train_mask], \
                         log_df[feature_cols][val_mask],   \
                         log_df[feature_cols][test_mask]

y_tr_cls, y_va_cls, y_te_cls = y_cls[train_mask], y_cls[val_mask], y_cls[test_mask]
y_tr_reg, y_va_reg, y_te_reg = y_reg[train_mask], y_reg[val_mask], y_reg[test_mask]

# standardise features with *train* statistics
x_mean, x_std = X_train.mean(), X_train.std() + 1e-6
X_train_std = (X_train - x_mean)/x_std
X_val_std   = (X_val   - x_mean)/x_std
X_test_std  = (X_test  - x_mean)/x_std

# numpy views (all rows)
X_tr_all = X_train_std.values
X_va_all = X_val_std.values
X_te_all = X_test_std.values

In [7]:
# ---------------------------------------------------------------------
# 3. ***Spike-only*** masks for the **regressor**
# ---------------------------------------------------------------------
mask_tr_reg = y_tr_reg.values > 0      # True only where spike_mag > 0
mask_va_reg = y_va_reg.values > 0

X_tr_reg = X_tr_all[mask_tr_reg]
y_tr_reg = y_tr_reg.values[mask_tr_reg]

X_va_reg = X_va_all[mask_va_reg]
y_va_reg = y_va_reg.values[mask_va_reg]

In [8]:
# ---------------------------------------------------------------------
# 4. Model helpers
# ---------------------------------------------------------------------
class MLP(nn.Module):
    def __init__(self, sizes, final_act=None):
        super().__init__()
        layers = []
        for i in range(len(sizes)-1):
            layers.append(nn.Linear(sizes[i], sizes[i+1]))
            if i < len(sizes)-2:
                layers.append(nn.ReLU())
        if final_act is not None:
            layers.append(final_act)
        self.net = nn.Sequential(*layers)
    def forward(self, x): return self.net(x)

def run_cfg(cfg, Xtr, ytr, Xva, yva, is_cls):
    in_dim = Xtr.shape[1]
    model  = MLP([in_dim, *cfg["hidden_sizes"], 1],
                 nn.Sigmoid() if is_cls else None).to(device)
    loss_fn = nn.BCELoss() if is_cls else nn.MSELoss()
    opt     = optim.Adam(model.parameters(), lr=cfg["lr"])

    ds_tr = TensorDataset(torch.from_numpy(Xtr).float(),
                          torch.from_numpy(ytr).float().unsqueeze(1))
    ds_va = TensorDataset(torch.from_numpy(Xva).float(),
                          torch.from_numpy(yva).float().unsqueeze(1))
    dl_tr = DataLoader(ds_tr, batch_size=cfg["batch_size"], shuffle=True)
    dl_va = DataLoader(ds_va, batch_size=cfg["batch_size"])

    # train
    for _ in range(cfg["epochs"]):
        model.train()
        for xb,yb in dl_tr:
            xb,yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = loss_fn(model(xb), yb)
            loss.backward(); opt.step()

    # validate
    model.eval()
    tot_loss, correct, tot = 0., 0, 0
    with torch.no_grad():
        for xb,yb in dl_va:
            xb,yb = xb.to(device), yb.to(device)
            pred   = model(xb)
            tot_loss += loss_fn(pred, yb).item()*xb.size(0)
            if is_cls:
                correct += ((pred>0.5)==yb).sum().item()
            tot += xb.size(0)

    if is_cls:
        return correct/tot
    else:
        return np.sqrt(tot_loss/tot)     # RMSE (log-space)

In [11]:
# ---------------------------------------------------------------------
# 5. Hyper-parameter grid search
# ---------------------------------------------------------------------
param_grid = {
    "hidden_sizes": [[64], [128], [64,32], [128,64]],
    "lr":           [1e-3, 1e-4],
    "batch_size":   [64,128],
    "epochs":       [20]
}

# (a) classifier – uses **all** rows
best_acc, best_cfg_cls = 0., None
for cfg in ParameterGrid(param_grid):
    acc = run_cfg(cfg, X_tr_all, y_tr_cls.values, X_va_all, y_va_cls.values, True)
    if acc > best_acc: best_acc, best_cfg_cls = acc, cfg
print(f"Best classifier: {best_cfg_cls} | val acc = {best_acc:.3f}")


# (b) regressor – uses **spike rows only**
best_rmse, best_cfg_reg = np.inf, None
for cfg in ParameterGrid(param_grid):
    rmse = run_cfg(cfg, X_tr_reg, y_tr_reg, X_va_reg, y_va_reg, False)
    if rmse < best_rmse: best_rmse, best_cfg_reg = rmse, cfg
print(f"Best regressor : {best_cfg_reg} | val RMSE (log) = {best_rmse:.3f}")

Best classifier: {'batch_size': 128, 'epochs': 20, 'hidden_sizes': [64], 'lr': 0.001} | val acc = 0.945
Best regressor : {'batch_size': 64, 'epochs': 20, 'hidden_sizes': [128, 64], 'lr': 0.001} | val RMSE (log) = 0.300


In [12]:
# ---------------------------------------------------------------------
# 6. Retrain on train+val (same split philosophy)
# ---------------------------------------------------------------------
def retrain(cfg, Xtr, ytr, is_cls):
    in_dim = Xtr.shape[1]
    model  = MLP([in_dim, *cfg["hidden_sizes"], 1],
                 nn.Sigmoid() if is_cls else None).to(device)
    loss_fn = nn.BCELoss() if is_cls else nn.MSELoss()
    opt     = optim.Adam(model.parameters(), lr=cfg["lr"])

    ds = TensorDataset(torch.from_numpy(Xtr).float(),
                       torch.from_numpy(ytr).float().unsqueeze(1))
    dl = DataLoader(ds, batch_size=cfg["batch_size"], shuffle=True)

    for _ in range(cfg["epochs"]):
        model.train()
        for xb,yb in dl:
            xb,yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = loss_fn(model(xb), yb)
            loss.backward(); opt.step()
    return model

In [13]:
# ---- (a) classifier on ALL rows
X_full_cls = np.vstack([X_tr_all, X_va_all])
y_full_cls = np.concatenate([y_tr_cls.values, y_va_cls.values])
final_clf  = retrain(best_cfg_cls, X_full_cls, y_full_cls, is_cls=True)

# ---- (b) regressor on SPIKE rows only
mask_full_reg = np.concatenate([y_tr_reg, y_va_reg]) > 0
X_full_reg    = np.vstack([X_tr_reg, X_va_reg])
y_full_reg    = np.concatenate([y_tr_reg, y_va_reg])
final_reg     = retrain(best_cfg_reg, X_full_reg, y_full_reg, is_cls=False)


In [14]:
# ---------------------------------------------------------------------
# 7. Inference on **every** 2023 hour
# ---------------------------------------------------------------------
with torch.no_grad():
    probs = final_clf(torch.from_numpy(X_te_all).float().to(device)) \
                .cpu().numpy().flatten()
    preds_log = final_reg(torch.from_numpy(X_te_all).float().to(device)) \
                .cpu().numpy().flatten()

preds_mag = np.expm1(preds_log)                # back to MW scale
actual_mag = df.loc[test_mask, "spike_mag"].values


In [15]:
# ---------------------------------------------------------------------
# 8. Quick metrics & dataframe
# ---------------------------------------------------------------------
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

pred_labels = (probs > 0.5).astype(int)
tn, fp, fn, tp = confusion_matrix(y_te_cls, pred_labels).ravel()
print(f"2023  |  TN:{tn} FP:{fp} FN:{fn} TP:{tp}")
print(f"      |  Precision:{precision_score(y_te_cls, pred_labels):.3f} "
      f"Recall:{recall_score(y_te_cls, pred_labels):.3f} "
      f"F1:{f1_score(y_te_cls, pred_labels):.3f}")

rmse_mw = np.sqrt(((preds_mag - actual_mag)**2).mean())
mae_mw  = np.abs(preds_mag - actual_mag).mean()
print(f"Regressor 2023 | MAE:{mae_mw:.2f}  RMSE:{rmse_mw:.2f}")

2023  |  TN:7430 FP:718 FN:134 TP:478
      |  Precision:0.400 Recall:0.781 F1:0.529
Regressor 2023 | MAE:23.84  RMSE:119.63


In [16]:
# --- dataframe of results --------------------------------------------
pred_df = pd.DataFrame({
    "time": df.loc[test_mask].index,
    "pred_spike_prob": probs,
    "pred_mag_MW":     preds_mag,
    "true_spike":      y_te_cls.values,
    "true_mag_MW":     actual_mag
}).set_index("time")

pred_df

Unnamed: 0_level_0,pred_spike_prob,pred_mag_MW,true_spike,true_mag_MW
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-01-01 00:00:00,0.000269,11.166585,0,0.0
2023-01-01 01:00:00,0.000272,12.847394,0,0.0
2023-01-01 02:00:00,0.000188,12.488766,0,0.0
2023-01-01 03:00:00,0.000166,11.972310,0,0.0
2023-01-01 04:00:00,0.000165,11.738667,0,0.0
...,...,...,...,...
2023-12-31 19:00:00,0.001989,9.345063,0,0.0
2023-12-31 20:00:00,0.001092,9.124728,0,0.0
2023-12-31 21:00:00,0.000679,9.029976,0,0.0
2023-12-31 22:00:00,0.001197,9.482147,0,0.0


In [17]:
k = 10
top_spikes = pred_df.nlargest(k, "pred_spike_prob")   # or .sort_values(...).head(k)

print(top_spikes)

                     pred_spike_prob  pred_mag_MW  true_spike  true_mag_MW
time                                                                      
2023-09-06 15:00:00         0.945887   261.696198           1   250.698834
2023-09-06 16:00:00         0.942342   365.656555           1   390.268814
2023-03-27 18:00:00         0.929506    95.228607           1   102.282305
2023-03-27 19:00:00         0.923741   201.895050           1   264.448420
2023-08-04 16:00:00         0.909495   289.380646           1   330.878052
2023-06-20 20:00:00         0.908939   668.973633           0     0.000000
2023-06-20 19:00:00         0.899911  1734.672607           0     0.000000
2023-08-28 19:00:00         0.899480   319.945007           0     0.000000
2023-09-06 19:00:00         0.890731  1033.202393           1  1143.474698
2023-12-08 17:00:00         0.889641    27.504089           1    31.110119


In [18]:
pred_df.to_csv("Final Spike Predictions.csv")