# Energy Consumption Prediction Analysis

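This notebook predicts energy consumption from a daily minimum temperature dataset. Note that the energy-consumption target and the additional weather features (humidity, wind speed, solar radiation, pressure) are synthetically generated in the data-loading cell rather than measured.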

# Import Libraries and Setup


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance
from scipy import stats
from scipy.stats import jarque_bera, normaltest, shapiro
from statsmodels.stats.diagnostic import het_white
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.stats.stattools import durbin_watson
import joblib
from datetime import datetime, timedelta
# Suppress every warning category for the rest of the session to keep cell
# output compact.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries imported above — consider narrowing the filter if results look odd.
warnings.filterwarnings('ignore')
# Use the 'seaborn-v0_8' matplotlib style sheet for all subsequent figures.
plt.style.use('seaborn-v0_8')
# Seed NumPy's global random generator so later random draws are reproducible.
np.random.seed(42)

# Load and Initial Data Exploration

In [2]:
# --- Load the temperature series -------------------------------------------
# The CSV's two columns are renamed positionally to 'Date' / 'Temperature',
# the date strings are parsed, and the parsed dates become the index.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/daily-min-temperatures.csv"
df = pd.read_csv(url).set_axis(['Date', 'Temperature'], axis=1)
df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')

# --- Simulate additional weather features ----------------------------------
# Re-seeding here makes this cell give the same draws every time it is re-run.
# The draw order (humidity, wind, solar, pressure, then noise further down)
# determines the generated values, so it must not be rearranged.
np.random.seed(42)
n_samples = len(df)
df = df.assign(
    # NOTE(review): exact copy of 'Temperature'; both columns stay in the frame.
    Temperature_C=df['Temperature'],
    Humidity=np.clip(np.random.normal(65, 15, n_samples), 30, 95),
    Wind_Speed=np.clip(np.random.exponential(2, n_samples), 0, 15),
    Solar_Radiation=np.clip(np.random.gamma(2, 100, n_samples), 0, 800),
    Pressure=np.clip(np.random.normal(1013, 10, n_samples), 990, 1030),
)

# --- Build the synthetic energy-consumption target -------------------------
base_consumption = 1000

# Temperature contributes only outside the 10..25 band: +50 per degree below
# 10 and +30 per degree above 25 (presumably heating vs. cooling demand).
temps = df['Temperature_C']
below_band = np.where(temps < 10, (10 - temps) * 50, 0)
above_band = np.where(temps > 25, (temps - 25) * 30, 0)
temp_effect = below_band + above_band

# Linear contributions of the simulated features.
humidity_effect = 5 * (df['Humidity'] - 50)
wind_effect = df['Wind_Speed'] * -10
solar_effect = df['Solar_Radiation'] * -0.5
pressure_effect = 2 * (df['Pressure'] - 1013)

# Gaussian noise (sd 50) is drawn last, then everything is summed in a fixed
# left-to-right order and clipped to the range [200, 3000].
noise = np.random.normal(0, 50, n_samples)
raw_consumption = (base_consumption + temp_effect + humidity_effect
                   + wind_effect + solar_effect + pressure_effect + noise)
df['Energy_Consumption'] = raw_consumption.clip(lower=200, upper=3000)

# --- Quick sanity summary ---------------------------------------------------
print(f"Dataset shape: {df.shape}")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"Missing values: {df.isnull().sum().sum()}")

Dataset shape: (3650, 7)
Date range: 1981-01-01 00:00:00 to 1990-12-31 00:00:00
Missing values: 0
