In [176]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,  PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR




In [124]:
# Read the energy-consumption CSV into the working DataFrame and preview
# the first rows.
csv_path = 'Energy_consumption_dataset.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Month,Hour,DayOfWeek,Holiday,Temperature,Humidity,SquareFootage,Occupancy,HVACUsage,LightingUsage,RenewableEnergy,EnergyConsumption
0,1,0,Saturday,No,25.139433,43.431581,1565.693999,5,On,Off,2.774699,75.364373
1,1,1,Saturday,No,27.731651,54.225919,1411.064918,1,On,On,21.831384,83.401855
2,1,2,Saturday,No,28.704277,58.907658,1755.715009,2,Off,Off,6.764672,78.270888
3,1,3,Saturday,No,20.080469,50.371637,1452.316318,1,Off,On,8.623447,56.51985
4,1,4,Saturday,No,23.097359,51.401421,1094.130359,9,On,Off,3.071969,70.811732


In [125]:
# Remove the 'Holiday' column from the working frame.
# Rebinding `df` (instead of `inplace=True`) together with errors='ignore'
# keeps this cell idempotent: re-running it once the column is already gone
# is a no-op rather than a KeyError.
df = df.drop(columns=['Holiday'], errors='ignore')

In [126]:
# Print the column names, non-null counts and dtypes of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Month              5000 non-null   int64  
 1   Hour               5000 non-null   int64  
 2   DayOfWeek          5000 non-null   object 
 3   Temperature        5000 non-null   float64
 4   Humidity           5000 non-null   float64
 5   SquareFootage      5000 non-null   float64
 6   Occupancy          5000 non-null   int64  
 7   HVACUsage          5000 non-null   object 
 8   LightingUsage      5000 non-null   object 
 9   RenewableEnergy    5000 non-null   float64
 10  EnergyConsumption  5000 non-null   float64
dtypes: float64(5), int64(3), object(3)
memory usage: 429.8+ KB


In [127]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [128]:
# Flag Saturday/Sunday rows with 1 and every other day with 0.
weekend_days = ['Saturday', 'Sunday']
is_weekend = df['DayOfWeek'].isin(weekend_days)
df['Weekend'] = is_weekend.astype(int)


In [129]:
# Drop the raw 'DayOfWeek' label column.
# errors='ignore' makes the cell safe to re-run after the column has already
# been removed (otherwise the second run raises KeyError).
df = df.drop(columns=['DayOfWeek'], errors='ignore')

In [130]:
# Inspect the distinct labels of both On/Off columns before encoding them.
# A notebook cell only displays its last expression, so a bare
# `df['HVACUsage'].unique()` placed above `df` is evaluated and silently
# discarded; the labels are printed explicitly instead.
for flag_col in ['HVACUsage', 'LightingUsage']:
    print(flag_col, df[flag_col].unique())
df



Unnamed: 0,Month,Hour,Temperature,Humidity,SquareFootage,Occupancy,HVACUsage,LightingUsage,RenewableEnergy,EnergyConsumption,Weekend
0,1,0,25.139433,43.431581,1565.693999,5,On,Off,2.774699,75.364373,1
1,1,1,27.731651,54.225919,1411.064918,1,On,On,21.831384,83.401855,1
2,1,2,28.704277,58.907658,1755.715009,2,Off,Off,6.764672,78.270888,1
3,1,3,20.080469,50.371637,1452.316318,1,Off,On,8.623447,56.519850,1
4,1,4,23.097359,51.401421,1094.130359,9,On,Off,3.071969,70.811732,1
...,...,...,...,...,...,...,...,...,...,...,...
4995,12,6,26.338718,52.580000,1563.567259,7,On,On,20.591717,70.270344,1
4996,12,17,20.007565,42.765607,1999.982252,5,Off,On,7.536319,73.943071,0
4997,12,13,26.226253,30.015975,1999.982252,5,Off,On,28.162193,85.784613,0
4998,12,8,24.673206,50.223939,1240.811298,2,On,On,20.918483,63.784001,1


In [131]:
# Encode the two On/Off flag columns as 0/1 integers.
on_off_codes = {'Off': 0, 'On': 1}
for flag_col in ['HVACUsage', 'LightingUsage']:
    # Skip columns that are already numeric: re-running this cell would
    # otherwise push the 0/1 codes through the dict again and turn every
    # value into NaN.
    if pd.api.types.is_numeric_dtype(df[flag_col]):
        continue
    encoded = df[flag_col].map(on_off_codes)
    # Series.map yields NaN for labels missing from the dict; fail loudly here
    # instead of letting NaNs reach the models further down.
    if encoded.isna().any():
        unknown = sorted(df.loc[encoded.isna(), flag_col].astype(str).unique())
        raise ValueError(f"Unexpected labels in {flag_col!r}: {unknown}")
    df[flag_col] = encoded.astype('int64')
df



Unnamed: 0,Month,Hour,Temperature,Humidity,SquareFootage,Occupancy,HVACUsage,LightingUsage,RenewableEnergy,EnergyConsumption,Weekend
0,1,0,25.139433,43.431581,1565.693999,5,1,0,2.774699,75.364373,1
1,1,1,27.731651,54.225919,1411.064918,1,1,1,21.831384,83.401855,1
2,1,2,28.704277,58.907658,1755.715009,2,0,0,6.764672,78.270888,1
3,1,3,20.080469,50.371637,1452.316318,1,0,1,8.623447,56.519850,1
4,1,4,23.097359,51.401421,1094.130359,9,1,0,3.071969,70.811732,1
...,...,...,...,...,...,...,...,...,...,...,...
4995,12,6,26.338718,52.580000,1563.567259,7,1,1,20.591717,70.270344,1
4996,12,17,20.007565,42.765607,1999.982252,5,0,1,7.536319,73.943071,0
4997,12,13,26.226253,30.015975,1999.982252,5,0,1,28.162193,85.784613,0
4998,12,8,24.673206,50.223939,1240.811298,2,1,1,20.918483,63.784001,1


In [132]:
# Re-check column dtypes and non-null counts after the feature engineering
# and encoding steps above.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Month              5000 non-null   int64  
 1   Hour               5000 non-null   int64  
 2   Temperature        5000 non-null   float64
 3   Humidity           5000 non-null   float64
 4   SquareFootage      5000 non-null   float64
 5   Occupancy          5000 non-null   int64  
 6   HVACUsage          5000 non-null   int64  
 7   LightingUsage      5000 non-null   int64  
 8   RenewableEnergy    5000 non-null   float64
 9   EnergyConsumption  5000 non-null   float64
 10  Weekend            5000 non-null   int32  
dtypes: float64(5), int32(1), int64(5)
memory usage: 410.3 KB


# Modelling

## Linear Regression

In [133]:
# Correlation of every column with the target, strongest positive first.
corr_matrix = df.corr()
corr = corr_matrix['EnergyConsumption'].sort_values(ascending=False)
print(corr)


EnergyConsumption    1.000000
Temperature          0.535338
Occupancy            0.139919
HVACUsage            0.063065
RenewableEnergy      0.060490
Weekend              0.011368
SquareFootage        0.009470
LightingUsage        0.005100
Month               -0.007009
Hour                -0.014272
Humidity            -0.057593
Name: EnergyConsumption, dtype: float64


In [134]:
# Separate the prediction target from the feature matrix.
target_col = 'EnergyConsumption'
y = df[target_col]
X = df.drop(columns=[target_col])


In [135]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [136]:
# Baseline: ordinary least squares fitted on the training split
# (no scaling step is applied for this model).
model = LinearRegression()
model.fit(X=X_train, y=y_train)


In [137]:
# Predict the target for the held-out test rows with the fitted linear model.
y_pred = model.predict(X_test)

In [138]:
# Score the linear baseline on the held-out test set.
linear_r2 = r2_score(y_test, y_pred)
linear_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R² Score:", linear_r2)
print("RMSE:", linear_rmse)

R² Score: 0.2678788501873447
RMSE: 7.817055624546971


In [139]:
# Summary statistics of the target; its spread gives a reference scale for
# the RMSE values reported by the models.
df['EnergyConsumption'].describe()


count    5000.000000
mean       76.794919
std         9.231573
min        53.263278
25%        70.419588
50%        76.696267
75%        83.246274
max        99.201120
Name: EnergyConsumption, dtype: float64

## Lasso Regression

In [161]:
# Lasso regression on standardised features (alpha=1.0).
lasso_steps = [
    ('scaler', StandardScaler()),
    ('lasso', Lasso(alpha=1.0)),
]
lasso_model = Pipeline(steps=lasso_steps)

In [162]:
# Fit the Lasso pipeline on the training split, then predict the test rows.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred1 = lasso_model.fit(X_train, y_train).predict(X_test)

In [166]:
# Test-set scores for the Lasso pipeline.
lasso_r2 = r2_score(y_test, y_pred1)
lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred1))
print("R²:", lasso_r2)
print("RMSE:", lasso_rmse)

R²: 0.2603068491140317
RMSE: 7.857375791568226


## Polynomial Linear Regression

In [167]:
# Degree-2 polynomial regression: expand the features with squares and
# pairwise products, standardise them, then fit ordinary least squares.
poly_steps = [
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('linear', LinearRegression()),
]
poly_model = Pipeline(steps=poly_steps)

In [168]:
# Fit the polynomial pipeline on the training split and predict the test rows
# in one chained call (Pipeline.fit returns the fitted pipeline).
y_pred2 = poly_model.fit(X_train, y_train).predict(X_test)

In [169]:
# Test-set scores for the polynomial pipeline, printed label by label.
poly_scores = {
    "R²:": r2_score(y_test, y_pred2),
    "RMSE:": np.sqrt(mean_squared_error(y_test, y_pred2)),
}
for score_label, score_value in poly_scores.items():
    print(score_label, score_value)

R²: 0.25289108859937626
RMSE: 7.896664449801982


## Random Forest

In [170]:
# Random forest configuration.
rf_params = {
    'n_estimators': 200,   # number of trees
    'max_depth': None,     # no depth limit (can be tuned later)
    'random_state': 42,
    'n_jobs': -1,          # use all CPU cores
}
rf_model = RandomForestRegressor(**rf_params)

In [172]:
# Fit the forest on the training split and predict the held-out rows.
y_pred3 = rf_model.fit(X_train, y_train).predict(X_test)

# Report the test-set scores.
rf_r2 = r2_score(y_test, y_pred3)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred3))
print("R²:", rf_r2)
print("RMSE:", rf_rmse)


R²: 0.24088637739874075
RMSE: 7.959854299554896


## Gradient Boosting

In [173]:
# Gradient boosting configuration; subsample=0.8 fits each tree on a random
# 80% of the training rows.
gb_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    random_state=42,
)
gb_model = GradientBoostingRegressor(**gb_params)

# Fit on the training split, then predict the held-out rows.
y_pred4 = gb_model.fit(X_train, y_train).predict(X_test)

# Test-set scores (kept in `r2` / `rmse` as before).
r2 = r2_score(y_test, y_pred4)
rmse = np.sqrt(mean_squared_error(y_test, y_pred4))

for score_label, score_value in (("R² Score:", r2), ("RMSE:", rmse)):
    print(score_label, score_value)


R² Score: 0.2582684276950383
RMSE: 7.8681948884724475


## XGBoost

In [175]:
# XGBoost configuration: row and column subsampling are both set to 0.8.
xgb_params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
}
xgb_model = XGBRegressor(**xgb_params)

# Fit on the training split and predict the held-out rows.
xgb_model.fit(X_train, y_train)
y_pred5 = xgb_model.predict(X_test)

# Test-set scores.
xgb_r2 = r2_score(y_test, y_pred5)
xgb_rmse = np.sqrt(mean_squared_error(y_test, y_pred5))
print("R²:", xgb_r2)
print("RMSE:", xgb_rmse)

R²: 0.2570450804385147
RMSE: 7.874680771897443


## Support Vector Regressor (SVR)

In [178]:
# RBF-kernel support vector regression on standardised features.
svr_model = Pipeline([
    ('scaler', StandardScaler()),
    ('svr', SVR(kernel='rbf', C=100, gamma=0.1))
])

svr_model.fit(X_train, y_train)
# Store the SVR predictions under their own name: writing them to `y_pred5`
# would overwrite the XGBoost predictions computed in the previous section.
y_pred6 = svr_model.predict(X_test)

print("R²:", r2_score(y_test, y_pred6))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred6)))

R²: 0.108068832355534
RMSE: 8.628144221969038
