# Energy Consumption Prediction Analysis

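This notebook predicts daily energy consumption from a daily temperature dataset. The additional weather features and the energy-consumption target are generated synthetically in the data-loading cell below.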

# Import Libraries and Setup


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance
from scipy import stats
from scipy.stats import jarque_bera, normaltest, shapiro
from statsmodels.stats.diagnostic import het_white
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.stats.stattools import durbin_watson
import joblib
from datetime import datetime, timedelta
# Silence every warning for the rest of the session (library deprecation notices included).
warnings.filterwarnings('ignore')
# Apply the 'seaborn-v0_8' matplotlib style sheet to all subsequent figures.
plt.style.use('seaborn-v0_8')
# Seed NumPy's global random generator so later random draws are repeatable.
np.random.seed(42)

# Load and Initial Data Exploration

In [2]:
# Fetch the daily minimum temperature series and index it by calendar date.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv"
df = pd.read_csv(url)
df.columns = ['Date', 'Temperature']
df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')

# Re-seed here so this cell produces the same synthetic values on every run.
np.random.seed(42)
n_samples = len(df)
df['Temperature_C'] = df['Temperature']

# Synthetic weather covariates. The draw order (humidity, wind, solar, pressure,
# then the noise term further down) fixes the random stream -- do not reorder.
df['Humidity'] = np.clip(np.random.normal(loc=65, scale=15, size=n_samples), 30, 95)
df['Wind_Speed'] = np.clip(np.random.exponential(scale=2, size=n_samples), 0, 15)
df['Solar_Radiation'] = np.clip(np.random.gamma(shape=2, scale=100, size=n_samples), 0, 800)
df['Pressure'] = np.clip(np.random.normal(loc=1013, scale=10, size=n_samples), 990, 1030)

# Simulated consumption = baseline + per-driver contributions + Gaussian noise.
base_consumption = 1000

# Temperature adds load below 10 degrees and above 25 degrees, nothing in between.
is_cold = df['Temperature_C'] < 10
is_hot = df['Temperature_C'] > 25
heating_load = np.where(is_cold, (10 - df['Temperature_C']) * 50, 0)
cooling_load = np.where(is_hot, (df['Temperature_C'] - 25) * 30, 0)
temp_effect = heating_load + cooling_load

humidity_effect = df['Humidity'].sub(50).mul(5)
wind_effect = df['Wind_Speed'].mul(-10)
solar_effect = df['Solar_Radiation'].mul(-0.5)
pressure_effect = df['Pressure'].sub(1013).mul(2)

consumption_noise = np.random.normal(loc=0, scale=50, size=n_samples)
raw_consumption = (
    base_consumption
    + temp_effect
    + humidity_effect
    + wind_effect
    + solar_effect
    + pressure_effect
    + consumption_noise
)
# Bound the simulated target to the 200-3000 range.
df['Energy_Consumption'] = raw_consumption.clip(lower=200, upper=3000)

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"Missing values: {df.isnull().sum().sum()}")

Dataset shape: (3650, 7)
Date range: 1981-01-01 00:00:00 to 1990-12-31 00:00:00
Missing values: 0


# Statistical Summary

In [3]:
# Descriptive statistics for every column of the assembled dataset.
print("=== STATISTICAL SUMMARY ===", df.describe(), sep="\n")

# Storage type of each column.
print("\n=== DATA TYPES ===", df.dtypes, sep="\n")

# Pairwise correlations between all columns, kept in `correlation_matrix`
# and shown rounded to three decimals.
correlation_matrix = df.corr()
print("\n=== CORRELATION MATRIX ===", correlation_matrix.round(3), sep="\n")

=== STATISTICAL SUMMARY ===
       Temperature  Temperature_C     Humidity   Wind_Speed  Solar_Radiation  \
count  3650.000000    3650.000000  3650.000000  3650.000000      3650.000000   
mean     11.177753      11.177753    65.295782     1.948622       199.423054   
std       4.071837       4.071837    14.507403     1.950444       140.094815   
min       0.000000       0.000000    30.000000     0.000454         2.224753   
25%       8.300000       8.300000    55.259763     0.557150        96.937294   
50%      11.000000      11.000000    65.328332     1.325376       166.379532   
75%      14.000000      14.000000    75.200923     2.700529       268.820587   
max      26.300000      26.300000    95.000000    15.000000       800.000000   

          Pressure  Energy_Consumption  
count  3650.000000         3650.000000  
mean   1013.008130         1013.825009  
std       9.557315          148.377453  
min     990.000000          483.715182  
25%    1006.222143          917.731686  
50%  

# Feature Engineering


In [4]:
# Build the modelling feature table on a copy, so `df` is left untouched and
# this cell can be re-run without compounding its own changes.
df_features = df.copy()

# --- Calendar features derived from the DatetimeIndex ---
df_features['Month'] = df_features.index.month
df_features['Day_of_Year'] = df_features.index.dayofyear
df_features['Quarter'] = df_features.index.quarter
df_features['Year'] = df_features.index.year
# isocalendar() yields pandas' nullable UInt32 dtype; cast to a plain integer so
# this column has the same kind of dtype as the other calendar features.
df_features['Week_of_Year'] = df_features.index.isocalendar().week.astype(int)
df_features['Day_of_Week'] = df_features.index.dayofweek
# dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend.
df_features['Is_Weekend'] = (df_features['Day_of_Week'] >= 5).astype(int)

# --- Categorical buckets ---
# NOTE(review): these labels follow the northern-hemisphere convention
# (Dec-Feb = Winter). If the temperature series was recorded in the southern
# hemisphere the names are inverted -- TODO confirm the dataset's provenance.
df_features['Season'] = df_features['Month'].map({12: 'Winter', 1: 'Winter', 2: 'Winter',
                                                 3: 'Spring', 4: 'Spring', 5: 'Spring',
                                                 6: 'Summer', 7: 'Summer', 8: 'Summer',
                                                 9: 'Autumn', 10: 'Autumn', 11: 'Autumn'})

# Right-closed bins: (-inf, 5], (5, 15], (15, 25], (25, inf).
df_features['Temp_Category'] = pd.cut(df_features['Temperature_C'],
                                     bins=[-np.inf, 5, 15, 25, np.inf],
                                     labels=['Cold', 'Cool', 'Moderate', 'Hot'])

# --- Pairwise interaction terms ---
df_features['Temp_Humidity_Interaction'] = df_features['Temperature_C'] * df_features['Humidity']
df_features['Wind_Solar_Interaction'] = df_features['Wind_Speed'] * df_features['Solar_Radiation']
df_features['Pressure_Temp_Interaction'] = df_features['Pressure'] * df_features['Temperature_C']

# --- Non-linear transforms ---
df_features['Temperature_Squared'] = df_features['Temperature_C'] ** 2
df_features['Humidity_Squared'] = df_features['Humidity'] ** 2
df_features['Wind_Speed_Sqrt'] = np.sqrt(df_features['Wind_Speed'])

# --- Rolling means ---
# The temperature windows include the current day, which is fine for an input
# variable.
df_features['Temp_MA_7'] = df_features['Temperature_C'].rolling(window=7, min_periods=1).mean()
df_features['Temp_MA_30'] = df_features['Temperature_C'].rolling(window=30, min_periods=1).mean()
# Energy_Consumption is the prediction target, so its rolling mean must only
# see *previous* days. Shifting by one day before rolling keeps the current
# day's target out of its own feature (target leakage). The first row becomes
# NaN, exactly like Energy_Lag_1 below.
df_features['Energy_MA_7'] = (
    df_features['Energy_Consumption'].shift(1).rolling(window=7, min_periods=1).mean()
)

# --- Lag features (the first 1 / 7 rows are NaN by construction) ---
df_features['Temp_Lag_1'] = df_features['Temperature_C'].shift(1)
df_features['Temp_Lag_7'] = df_features['Temperature_C'].shift(7)
df_features['Energy_Lag_1'] = df_features['Energy_Consumption'].shift(1)

print(f"Feature engineering completed. New shape: {df_features.shape}")

Feature engineering completed. New shape: (3650, 28)
