Basic features: day of the week, weekend, ourtages, seasons, special evants, etc.

In [1]:
# - Removes "Market Demand" entirely from the pipeline.
# - Builds only features available at prediction time (known-future calendar + observed-past target lags/rollings).
# - Compatible with LightGBM (testing) and TFT (final model).

import pandas as pd
import numpy as np

# -----------------------------
# Config
# -----------------------------
DATA_PATH = "rawCSV/combinedDatasets/combined_demand_2002_2025.csv"
TARGET_COL = "Ontario Demand"  # prediction target

# -----------------------------
# Load
# -----------------------------
df = pd.read_csv(DATA_PATH)
df.columns = [c.strip() for c in df.columns]

# Drop Market Demand entirely if present
df = df.drop(columns=["Market Demand"], errors="ignore")

# Parse timestamp from Date + Hour (Hour is 1–24 hour-ending => start-of-hour = Hour-1)
df["Date"] = pd.to_datetime(df["Date"])
df["Hour"] = pd.to_numeric(df["Hour"], errors="coerce").astype("Int64")
df = df.dropna(subset=["Date", "Hour"])
df["timestamp"] = df["Date"] + pd.to_timedelta(df["Hour"] - 1, unit="h")

# Sort by time
df = df.sort_values("timestamp").reset_index(drop=True)

# Ensure numeric target
if TARGET_COL in df.columns:
    df[TARGET_COL] = pd.to_numeric(df[TARGET_COL], errors="coerce")

# -----------------------------
# Time calendar features (known-future)
# -----------------------------
ts = df["timestamp"]
df["hour"]        = ts.dt.hour.astype("int16")
df["dow"]         = ts.dt.dayofweek.astype("int8")          # Monday=0
df["is_weekend"]  = df["dow"].isin([5, 6]).astype("int8")
df["day"]         = ts.dt.day.astype("int16")
df["week"]        = ts.dt.isocalendar().week.astype("int16")
df["month"]       = ts.dt.month.astype("int8")
df["quarter"]     = ts.dt.quarter.astype("int8")
df["dayofyear"]   = ts.dt.dayofyear.astype("int16")
df["is_month_start"] = ts.dt.is_month_start.astype("int8")
df["is_month_end"]   = ts.dt.is_month_end.astype("int8")
df["is_qtr_end"]     = ts.dt.is_quarter_end.astype("int8")
df["is_year_end"]    = ts.dt.is_year_end.astype("int8")

# Seasons (meteorological)
season_map = {
    12: "winter", 1: "winter", 2: "winter",
    3: "spring",  4: "spring",  5: "spring",
    6: "summer",  7: "summer",  8: "summer",
    9: "autumn", 10: "autumn", 11: "autumn"
}
df["season"] = df["month"].map(season_map)
df["season_code"] = df["season"].map({"winter":0, "spring":1, "summer":2, "autumn":3}).astype("Int8")

# -----------------------------
# Holidays (Ontario) — special events (known-future at daily level)
# -----------------------------
try:
    import holidays
    years = np.arange(ts.dt.year.min(), ts.dt.year.max() + 1).tolist()
    on_holidays = holidays.Canada(prov="ON", years=years)
    dates_norm = ts.dt.normalize()
    df["is_holiday"]        = dates_norm.dt.date.map(lambda d: d in on_holidays).astype("int8")
    df["is_holiday_eve"]    = (dates_norm + pd.Timedelta(days=1)).dt.date.map(lambda d: d in on_holidays).astype("int8")
    df["is_holiday_morrow"] = (dates_norm - pd.Timedelta(days=1)).dt.date.map(lambda d: d in on_holidays).astype("int8")
except Exception:
    df["is_holiday"]        = 0
    df["is_holiday_eve"]    = 0
    df["is_holiday_morrow"] = 0

df["is_business_day"] = ((df["dow"] < 5) & (df["is_holiday"] == 0)).astype("int8")
df["weekend_or_holiday"] = ((df["is_weekend"] == 1) | (df["is_holiday"] == 1)).astype("int8")

# -----------------------------
# DST flag (America/Toronto)
# -----------------------------
try:
    localized = df["timestamp"].dt.tz_localize("America/Toronto", nonexistent="NaT", ambiguous="NaT")
    df["is_dst"] = localized.dt.dst().ne(pd.Timedelta(0)).fillna(False).astype("int8")
except Exception:
    df["is_dst"] = 0

# -----------------------------
# Cyclical encodings (known-future)
# -----------------------------
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24.0)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24.0)
df["dow_sin"]  = np.sin(2 * np.pi * df["dow"]  / 7.0)
df["dow_cos"]  = np.cos(2 * np.pi * df["dow"]  / 7.0)
df["doy_sin"]  = np.sin(2 * np.pi * (df["dayofyear"] - 1) / 366.0)
df["doy_cos"]  = np.cos(2 * np.pi * (df["dayofyear"] - 1) / 366.0)

# -----------------------------
# Simple domain flags (known-future)
# -----------------------------
df["is_peak_hour"] = df["hour"].between(7, 20).astype("int8")

# -----------------------------
# Target lags and rollings (observed-past only)
# -----------------------------
if TARGET_COL in df.columns:
    for lag in [1, 24, 168]:  # 1h, 1d, 1w
        df[f"{TARGET_COL}_lag_{lag}h"] = df[TARGET_COL].shift(lag)

    # Rolling windows on historical data only (causal)
    df[f"{TARGET_COL}_rollmean_24h"]  = df[TARGET_COL].rolling(window=24, min_periods=1).mean()
    df[f"{TARGET_COL}_rollstd_24h"]   = df[TARGET_COL].rolling(window=24, min_periods=2).std()
    df[f"{TARGET_COL}_rollmean_168h"] = df[TARGET_COL].rolling(window=168, min_periods=1).mean()
    df[f"{TARGET_COL}_rollstd_168h"]  = df[TARGET_COL].rolling(window=168, min_periods=2).std()

    # Exponential moving averages
    df[f"{TARGET_COL}_ewm_24h"]  = df[TARGET_COL].ewm(span=24,  adjust=False).mean()
    df[f"{TARGET_COL}_ewm_168h"] = df[TARGET_COL].ewm(span=168, adjust=False).mean()

# -----------------------------
# TFT helpers
# -----------------------------
df["series_id"] = "ontario"
df["time_idx"] = ((df["timestamp"] - df["timestamp"].min()) / pd.Timedelta(hours=1)).astype(int)

# -----------------------------
# Column selection
# -----------------------------
known_future = [
    "hour","dow","day","week","month","quarter","dayofyear",
    "is_weekend","is_business_day","is_holiday","is_holiday_eve","is_holiday_morrow",
    "season_code","is_month_start","is_month_end","is_qtr_end","is_year_end",
    "is_dst","is_peak_hour",
    "hour_sin","hour_cos","dow_sin","dow_cos","doy_sin","doy_cos"
]

observed_past = []
if TARGET_COL in df.columns:
    observed_past += [
        f"{TARGET_COL}_lag_1h", f"{TARGET_COL}_lag_24h", f"{TARGET_COL}_lag_168h",
        f"{TARGET_COL}_rollmean_24h", f"{TARGET_COL}_rollstd_24h",
        f"{TARGET_COL}_rollmean_168h", f"{TARGET_COL}_rollstd_168h",
        f"{TARGET_COL}_ewm_24h", f"{TARGET_COL}_ewm_168h"
    ]

base_cols = ["timestamp","series_id","time_idx","Date","Hour", TARGET_COL]
feature_cols = base_cols + known_future + observed_past
feature_cols = [c for c in feature_cols if c in df.columns]

df_features = df[feature_cols].copy()

# Example save:
df_features.to_parquet("engineeredDatasets/ontario_demand_basic_features.parquet", index=False)


In [4]:
# Show all columns and rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

df_features.tail()

Unnamed: 0,timestamp,series_id,time_idx,Date,Hour,Ontario Demand,hour,dow,day,week,month,quarter,dayofyear,is_weekend,is_business_day,is_holiday,is_holiday_eve,is_holiday_morrow,season_code,is_month_start,is_month_end,is_qtr_end,is_year_end,is_dst,is_peak_hour,hour_sin,hour_cos,dow_sin,dow_cos,doy_sin,doy_cos,Ontario Demand_lag_1h,Ontario Demand_lag_24h,Ontario Demand_lag_168h,Ontario Demand_rollmean_24h,Ontario Demand_rollstd_24h,Ontario Demand_rollmean_168h,Ontario Demand_rollstd_168h,Ontario Demand_ewm_24h,Ontario Demand_ewm_168h
205675,2025-10-16 20:00:00,ontario,205676,2025-10-16,21,16204,20,3,16,42,10,4,289,0,1,0,0,0,3,0,0,0,0,0,1,-0.866025,0.5,0.433884,-0.900969,-0.973264,0.229688,16622.0,15757.0,16293.0,14801.25,1227.987156,14437.946429,1280.683824,15305.026363,14742.813152
205676,2025-10-16 21:00:00,ontario,205677,2025-10-16,22,15274,21,3,16,42,10,4,289,0,1,0,0,0,3,0,0,0,0,0,0,-0.707107,0.707107,0.433884,-0.900969,-0.973264,0.229688,16204.0,14996.0,15317.0,14812.833333,1231.210986,14437.690476,1280.511373,15302.544254,14749.099387
205677,2025-10-16 22:00:00,ontario,205678,2025-10-16,23,14615,22,3,16,42,10,4,289,0,1,0,0,0,3,0,0,0,0,0,0,-0.5,0.866025,0.433884,-0.900969,-0.973264,0.229688,15274.0,14228.0,14446.0,14828.958333,1225.740574,14438.696429,1280.58432,15247.540714,14747.512412
205678,2025-10-16 23:00:00,ontario,205679,2025-10-16,24,13987,23,3,16,42,10,4,289,0,1,0,0,0,3,0,0,0,0,0,0,-0.258819,0.965926,0.433884,-0.900969,-0.973264,0.229688,14615.0,13499.0,13784.0,14849.291667,1206.61764,14439.904762,1280.058528,15146.697457,14738.512265
205679,2025-10-17 00:00:00,ontario,205680,2025-10-17,1,13927,0,4,17,42,10,4,290,0,1,0,0,0,3,0,0,0,0,0,0,0.0,1.0,-0.433884,-0.900969,-0.969178,0.246361,13987.0,13165.0,13370.0,14881.041667,1169.836313,14443.220238,1277.990448,15049.12166,14728.908569


IESSO performance

In [None]:
import requests
import pandas as pd

# 1) Fetch JSON once
endpoint = "https://www.ieso.ca/ieso/api/heatmapWebApi/getDemandChartData"
params = {"startDate": "2025-01-22", "endDate": "2025-10-23"}
headers = {
    "Accept": "*/*",
    "Referer": "https://www.ieso.ca/power-data",
    "User-Agent": "Mozilla/5.0"
}
data = requests.get(endpoint, params=params, headers=headers, timeout=30).json()
print(list(data.keys()))  # e.g., ['DayAhead', 'RealTime', 'Dam']

# 2) Predicted (Ontario) from Dam if present else DayAhead
pred_source = "Dam" if data.get("Dam") else "DayAhead"
pred_rows = []
for day in data.get(pred_source, []):
    d = day["ReportForDate"][:10]
    for h in day.get("HourlyForecastDemand", []):
        pred_rows.append({"date": d, "hour": h["Hour"], "predicted_mw": h["EnergyMW"]})
pred_df = pd.DataFrame(pred_rows)

# 3) Actual (Ontario) by summing RealTime zonal hourly AvgDemand
act_rows = []
for day in data.get("RealTime", []):
    d = day["ReportForDate"][:10]
    # Sum across zones per hour
    # Build a dict hour -> sum
    sums = {}
    for zone, hours in day.get("HourlyData", {}).items():
        for rec in hours:
            he = rec["Hour"]
            sums[he] = sums.get(he, 0) + rec["AvgDemand"]
    for he, val in sums.items():
        act_rows.append({"date": d, "hour": he, "actual_mw": val})
act_df = pd.DataFrame(act_rows)

# 4) Join predicted vs actual on date + hour
ontario_hourly = pred_df.merge(act_df, on=["date","hour"], how="left").sort_values(["date","hour"])

import numpy as np
import pandas as pd

# Start from your joined DataFrame
df = ontario_hourly.copy()

# Keep hours where both predicted and actual are present
df_valid = df.dropna(subset=["predicted_mw", "actual_mw"]).copy()

y_true = df_valid["actual_mw"].astype(float).to_numpy()
y_pred = df_valid["predicted_mw"].astype(float).to_numpy()
diff = y_pred - y_true

# MAE
mae = np.mean(np.abs(diff))

# RMSE
rmse = np.sqrt(np.mean(diff**2))

# sMAPE (%), handle zero denominator safely
den = np.abs(y_true) + np.abs(y_pred)
ratio = np.zeros_like(den)
mask = den > 0
ratio[mask] = 2.0 * np.abs(diff[mask]) / den[mask]
smape = ratio.mean() * 100.0

print(f"n={len(df_valid)} matched rows")
print(f"MAE: {mae:.2f} MW")
print(f"RMSE: {rmse:.2f} MW")
print(f"sMAPE: {smape:.3f}%")



['DayAhead', 'RealTime', 'Dam']
n=4152 matched rows
MAE: 312.19 MW
RMSE: 456.84 MW
sMAPE: 1.821%


In [None]:
import requests
import pandas as pd
import numpy as np

# 1) Fetch JSON once
endpoint = "https://www.ieso.ca/ieso/api/heatmapWebApi/getDemandChartData"
params = {"startDate": "2025-10-29", "endDate": "2025-10-29"}
headers = {
    "Accept": "*/*",
    "Referer": "https://www.ieso.ca/power-data",
    "User-Agent": "Mozilla/5.0"
}
data = requests.get(endpoint, params=params, headers=headers, timeout=30).json()

# 2) Predicted (Ontario) from Dam if present else DayAhead
pred_source = "Dam" if data.get("Dam") else "DayAhead"
pred_rows = []
for day in data.get(pred_source, []):
    d = day["ReportForDate"][:10]
    for h in day.get("HourlyForecastDemand", []):
        pred_rows.append({"date": d, "hour": h["Hour"], "predicted_mw": h["EnergyMW"]})
pred_df = pd.DataFrame(pred_rows)

# 3) Actual (Ontario) by summing RealTime zonal hourly AvgDemand
act_rows = []
for day in data.get("RealTime", []):
    d = day["ReportForDate"][:10]
    # Sum across zones per hour
    sums = {}
    for zone, hours in day.get("HourlyData", {}).items():
        for rec in hours:
            he = rec["Hour"]
            sums[he] = sums.get(he, 0) + rec["AvgDemand"]
    for he, val in sums.items():
        act_rows.append({"date": d, "hour": he, "actual_mw": val})
act_df = pd.DataFrame(act_rows)

# Handle case where act_df might be empty
if len(act_df) == 0:
    act_df = pd.DataFrame(columns=["date", "hour", "actual_mw"])

# 4) Join predicted vs actual on date + hour (keep all predictions)
ontario_hourly = pred_df.merge(act_df, on=["date","hour"], how="left").sort_values(["date","hour"]).reset_index(drop=True)

# Add a datetime column for easier reading
ontario_hourly['datetime'] = pd.to_datetime(ontario_hourly['date']) + pd.to_timedelta(ontario_hourly['hour'] - 1, unit='h')

# Add error columns for historical data
ontario_hourly['error_mw'] = ontario_hourly['predicted_mw'] - ontario_hourly['actual_mw']
ontario_hourly['abs_error_mw'] = ontario_hourly['error_mw'].abs()
ontario_hourly['pct_error'] = (ontario_hourly['error_mw'] / ontario_hourly['actual_mw'] * 100).round(2)

# Add a flag to identify historical vs future
ontario_hourly['is_future'] = ontario_hourly['actual_mw'].isna()

# Separate into historical and future
historical_data = ontario_hourly[~ontario_hourly['is_future']].copy()
future_data = ontario_hourly[ontario_hourly['is_future']].copy()

# Display side-by-side comparison
print(f"{'='*90}")
print(f"COMPLETE DATA - ACTUAL vs PREDICTED DEMAND (Side by Side)")
print(f"{'='*90}\n")
print(ontario_hourly[['datetime', 'date', 'hour', 'actual_mw', 'predicted_mw', 'error_mw', 'is_future']].to_string(index=False))

print(f"\n{'='*90}")
print(f"SUMMARY")
print(f"{'='*90}")
print(f"Total predictions: {len(ontario_hourly)}")
print(f"Historical (with actuals): {len(historical_data)}")
print(f"Future (predictions only): {len(future_data)}")

# Calculate error metrics on historical data
if len(historical_data) > 0:
    y_true = historical_data["actual_mw"].astype(float).to_numpy()
    y_pred = historical_data["predicted_mw"].astype(float).to_numpy()
    diff = y_pred - y_true
    
    mae = np.mean(np.abs(diff))
    rmse = np.sqrt(np.mean(diff**2))
    
    den = np.abs(y_true) + np.abs(y_pred)
    ratio = np.zeros_like(den)
    mask = den > 0
    ratio[mask] = 2.0 * np.abs(diff[mask]) / den[mask]
    smape = ratio.mean() * 100.0
    
    print(f"\nHISTORICAL PERFORMANCE METRICS:")
    print(f"MAE: {mae:.2f} MW")
    print(f"RMSE: {rmse:.2f} MW")
    print(f"sMAPE: {smape:.3f}%")
else:
    print("\nNo historical data available yet.")


COMPLETE DATA - ACTUAL vs PREDICTED DEMAND (Side by Side)

           datetime       date  hour actual_mw  predicted_mw error_mw  is_future
2025-10-30 00:00:00 2025-10-30     1       NaN       14138.0      NaN       True
2025-10-30 01:00:00 2025-10-30     2       NaN       13925.0      NaN       True
2025-10-30 02:00:00 2025-10-30     3       NaN       13779.0      NaN       True
2025-10-30 03:00:00 2025-10-30     4       NaN       13842.0      NaN       True
2025-10-30 04:00:00 2025-10-30     5       NaN       14730.0      NaN       True
2025-10-30 05:00:00 2025-10-30     6       NaN       15805.0      NaN       True
2025-10-30 06:00:00 2025-10-30     7       NaN       16438.0      NaN       True
2025-10-30 07:00:00 2025-10-30     8       NaN       16857.0      NaN       True
2025-10-30 08:00:00 2025-10-30     9       NaN       16799.0      NaN       True
2025-10-30 09:00:00 2025-10-30    10       NaN       16637.0      NaN       True
2025-10-30 10:00:00 2025-10-30    11       NaN    

==========================================================================================
COMPLETE DATA - ACTUAL vs PREDICTED DEMAND (Side by Side)
==========================================================================================

           datetime       date  hour actual_mw  predicted_mw error_mw  is_future
2025-10-30 00:00:00 2025-10-30     1       NaN       14138.0      NaN       True
2025-10-30 01:00:00 2025-10-30     2       NaN       13925.0      NaN       True
2025-10-30 02:00:00 2025-10-30     3       NaN       13779.0      NaN       True
2025-10-30 03:00:00 2025-10-30     4       NaN       13842.0      NaN       True
2025-10-30 04:00:00 2025-10-30     5       NaN       14730.0      NaN       True
2025-10-30 05:00:00 2025-10-30     6       NaN       15805.0      NaN       True
2025-10-30 06:00:00 2025-10-30     7       NaN       16438.0      NaN       True
2025-10-30 07:00:00 2025-10-30     8       NaN       16857.0      NaN       True
2025-10-30 08:00:00 2025-10-30     9       NaN       16799.0      NaN       True
2025-10-30 09:00:00 2025-10-30    10       NaN       16637.0      NaN       True
2025-10-30 10:00:00 2025-10-30    11       NaN       16454.0      NaN       True
2025-10-30 11:00:00 2025-10-30    12       NaN       16419.0      NaN       True
2025-10-30 12:00:00 2025-10-30    13       NaN       16402.0      NaN       True
2025-10-30 13:00:00 2025-10-30    14       NaN       16407.0      NaN       True
2025-10-30 14:00:00 2025-10-30    15       NaN       16435.0      NaN       True
2025-10-30 15:00:00 2025-10-30    16       NaN       16717.0      NaN       True
2025-10-30 16:00:00 2025-10-30    17       NaN       17016.0      NaN       True
2025-10-30 17:00:00 2025-10-30    18       NaN       17458.0      NaN       True
2025-10-30 18:00:00 2025-10-30    19       NaN       17495.0      NaN       True
2025-10-30 19:00:00 2025-10-30    20       NaN       17157.0      NaN       True
2025-10-30 20:00:00 2025-10-30    21       NaN       16589.0      NaN       True
2025-10-30 21:00:00 2025-10-30    22       NaN       15737.0      NaN       True
2025-10-30 22:00:00 2025-10-30    23       NaN       14983.0      NaN       True
2025-10-30 23:00:00 2025-10-30    24       NaN       14373.0      NaN       True

==========================================================================================
SUMMARY
==========================================================================================
Total predictions: 24
Historical (with actuals): 0
Future (predictions only): 24

No historical data available yet.