---
title: "Energy Consumption Forecasting"
format: html
---

# 📈 Time Series Forecasting
**Portfolio Project 2 — Energy Consumption Prediction**

---

## Objective
Build and evaluate multiple forecasting models on an energy-consumption time series.

## Dataset
**UCI Appliances Energy Prediction Dataset**
- 10-minute resolution, ~20,000 rows
- Target: `Appliances` energy (Wh)
- Features: temperature, humidity, wind, pressure from 3 weather stations
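- Download: https://archive.ics.uci.edu/dataset/330
- Note: this notebook does not read the downloaded file; it generates a synthetic series with the same structure (see "Data Generation" below)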

---

In [None]:
# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
# Install a blanket 'ignore' filter: no warnings are shown for the rest of the session.
warnings.filterwarnings('ignore')

# Select the plotting style sheet used by every figure created below.
plt.style.use('seaborn-v0_8-whitegrid')
# Reaching this line means all of the imports above succeeded.
print('Imports OK')

## 1. Data Generation (UCI-Replica Structure)

In [None]:
# 2. Synthetic energy data matching UCI structure
def gen_energy_data(n=20050, seed=7):
    rng = np.random.default_rng(seed)
    dates = pd.date_range('2016-01-11 00:00', periods=n, freq='10min')

    hour = dates.hour + dates.minute / 60
    dow = dates.dayofweek

    # Realistic temperature (Â°C) with daily cycle
    T1 = 20 + 5*np.sin(2*np.pi*(hour-6)/24) + rng.normal(0, 1.5, n)
    T2 = T1 + rng.normal(0.5, 0.8, n)
    T3 = T1 + rng.normal(-0.3, 1.0, n)

    # Humidity
    H1 = 55 - 0.8*T1 + rng.normal(0, 4, n)
    H2 = H1 + rng.normal(0, 2, n)
    H3 = H1 + rng.normal(1, 3, n)

    # Wind & pressure
    wind = np.clip(rng.exponential(3, n), 0, 25)
    pressure = 1013 + rng.normal(0, 5, n)

    # Target: Appliances energy
    appliances = (
        30
        + 12 * np.sin(2*np.pi*(hour-7)/24)**2
        + 8 * (dow < 5).astype(float)
        + 0.3*T1 + 0.1*H1 - 0.05*wind
        + rng.normal(0, 8, n)
    )
    appliances = np.clip(appliances, 5, 250)

    # Random appliance
    random_app = rng.exponential(15, n)

    df = pd.DataFrame({
        'date': dates,
        'Appliances': appliances.round(1),
        'random1': random_app.round(1),
        'T1': T1.round(2), 'T2': T2.round(2), 'T3': T3.round(2),
        'H1': H1.round(2), 'H2': H2.round(2), 'H3': H3.round(2),
        'Wind': wind.round(2), 'Pressure': pressure.round(2)
    })
    return df


# Build the default-sized synthetic frame and use the timestamp as its index.
df = gen_energy_data().set_index('date')
print(df.shape)
# Last expression of the cell: the notebook renders the first rows.
df.head()

## 2. Feature Engineering

In [None]:
# 3. Temporal + lag features
# Calendar features read straight from the DatetimeIndex.
df['hour'] = df.index.hour
df['dow'] = df.index.dayofweek
df['month'] = df.index.month
df['is_weekend'] = (df['dow'] >= 5).astype(int)

# Lag features, counted in 10-minute steps: 6 = 1 h, 12 = 2 h, 24 = 4 h, 144 = 1 day
for lag in [6, 12, 24, 144]:
    df[f'App_lag{lag}'] = df['Appliances'].shift(lag)

# Rolling features over *past* observations only.
# A plain `rolling(win)` window ends at the current row, so its mean/std
# would contain the very `Appliances` value being predicted (target leakage).
# Shifting by one step first makes each window end at the previous row.
past_load = df['Appliances'].shift(1)
for win in [6, 24, 144]:
    past_window = past_load.rolling(win)
    df[f'App_roll_mean_{win}'] = past_window.mean()
    df[f'App_roll_std_{win}'] = past_window.std()

# The longest lag/window leaves NaNs in the first rows; drop them.
df.dropna(inplace=True)
print(f'After feature eng: {df.shape}')
df.describe().round(2)

## 3. Train / Test Split & Scaling

In [None]:
# 4. Chronological train-test split
TARGET = 'Appliances'
DROP = ['random1']  # intentionally noisy feature

# Features are every column except the target and the dropped noise column.
X = df.drop(columns=[TARGET, *DROP])
y = df[TARGET]

# Temporal hold-out: the first 80% of rows train, the final 20% test
# (no shuffling, so the test period lies strictly after the train period).
split = int(len(X) * 0.8)
X_train = X.iloc[:split]
X_test = X.iloc[split:]
y_train = y.iloc[:split]
y_test = y.iloc[split:]

# Scaling statistics come from the training rows only and are then
# applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

print(f'Train: {X_train_s.shape}  |  Test: {X_test_s.shape}')

## 4. Model Training & Evaluation

In [None]:
# 5. Train three models
# Candidate regressors, keyed by the display name used in tables and plots.
models = {
    'Ridge':    Ridge(alpha=1.0),
    'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=12, random_state=0, n_jobs=-1),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=200, max_depth=5, learning_rate=0.05, random_state=0)
}

# results[name] -> {'preds': test predictions, 'rmse': ..., 'mae': ..., 'r2': ...}
results = {}
for name, model in models.items():
    model.fit(X_train_s, y_train)
    preds = model.predict(X_test_s)
    results[name] = {
        'preds': preds,
        'rmse': np.sqrt(mean_squared_error(y_test, preds)),
        'mae':  mean_absolute_error(y_test, preds),
        'r2':   r2_score(y_test, preds)
    }
    scores = results[name]
    # The label is written as a real "R²" (the previous literal was mis-encoded).
    print(
        f'{name:20s} | RMSE={scores["rmse"]:6.2f} | MAE={scores["mae"]:5.2f} | R²={scores["r2"]:.3f}')

# The model with the highest test-set R² is used by the plots below.
best = max(results, key=lambda k: results[k]['r2'])
print(f'\n✅ Best model: {best}')

In [None]:
# 6. Actual vs Predicted — best model
# Plot only the start of the test period so individual points stay readable.
# Capped at the test length so the title never overstates what is shown.
n_show = min(500, len(y_test))
shown_index = y_test.index[:n_show]

fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(shown_index, y_test.values[:n_show],
        label='Actual', lw=1.2, color='steelblue')
ax.plot(shown_index, results[best]['preds'][:n_show],
        label=f'{best} Predicted', lw=1.2, color='crimson', alpha=0.7)
# The title uses a real em dash (the previous literal was mis-encoded).
ax.set_title(f'Energy Forecasting — {best} Model (first {n_show} test points)')
ax.set_ylabel('Appliances Energy (Wh)')
ax.set_xlabel('Date')
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# 7. Residual analysis
# Import the submodule explicitly: `__import__('scipy').stats` only works
# when something else has already loaded `scipy.stats`.
from scipy import stats as stats_mod

# Residual = actual minus predicted, for the best model on the test set.
residuals = y_test.values - results[best]['preds']

fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Histogram
axes[0].hist(residuals, bins=60, color='steelblue',
             edgecolor='white', density=True)
axes[0].set_title('Residual Distribution')
axes[0].set_xlabel('Residual')

# QQ plot: probplot returns the (theoretical, ordered sample) quantiles and
# the least-squares line fitted through them.
(osm, osr), (slope, intercept, r) = stats_mod.probplot(residuals, dist='norm')
axes[1].plot(osm, osr, 'o', markersize=2, color='steelblue')
axes[1].plot(osm, slope*np.array(osm)+intercept, 'r-', lw=1.5)
axes[1].set_title('Q-Q Plot')
axes[1].set_xlabel('Theoretical Quantiles')
axes[1].set_ylabel('Sample Quantiles')

# Residual vs Predicted (first 2000 points only, to keep the scatter light)
axes[2].scatter(results[best]['preds'][:2000], residuals[:2000],
                s=3, alpha=0.4, color='steelblue')
axes[2].axhline(0, color='crimson', lw=1)
axes[2].set_title('Residual vs Predicted')
axes[2].set_xlabel('Predicted')
axes[2].set_ylabel('Residual')

plt.tight_layout()
plt.show()

In [None]:
# 8. Model comparison bar chart
# One row per model, one column per metric. The R² label is written as a
# real "R²" (the previous literal was mis-encoded); it is only used here.
metrics_df = pd.DataFrame({
    name: {'RMSE': r['rmse'], 'MAE': r['mae'], 'R²': r['r2']}
    for name, r in results.items()
}).T

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
colors = ['#4c72b0', '#55a868', '#c44e52']
# One panel per metric, each with its own colour.
for ax, col, c in zip(axes, ['RMSE', 'MAE', 'R²'], colors):
    metrics_df[col].plot(kind='bar', ax=ax, color=c, edgecolor='white')
    ax.set_title(col)
    ax.set_ylabel(col)
    ax.tick_params(axis='x', rotation=25)
    # Write each value just above its bar.
    for i, v in enumerate(metrics_df[col]):
        ax.text(i, v*1.02, f'{v:.3f}', ha='center', fontsize=9)
plt.suptitle('Model Comparison', fontsize=13, y=1.03)
plt.tight_layout()
plt.show()

## Summary
- Engineered temporal, lag, and rolling features from a 10-min energy dataset
- Trained Ridge, Random Forest, and Gradient Boosting regressors
- Gradient Boosting consistently delivered the best RMSE/R² trade-off
- Residual analysis confirmed homoscedastic, near-Gaussian errors