# Energy Data Regression

## Import libraries

In [1]:
import warnings
from pathlib import Path
from zipfile import ZipFile

import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline
# Silence all library warnings so they do not clutter the cell outputs.
# NOTE(review): this also hides deprecation notices (including Matplotlib's
# notice about the renamed style sheet handled just below).
warnings.filterwarnings('ignore')

# The built-in 'seaborn' style sheet was renamed 'seaborn-v0_8' in Matplotlib 3.6
# and the old name was later removed, so use whichever name this install provides.
# If neither is available the default Matplotlib style is kept.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

## Get and load data

In [2]:
# Download the data from gdrive.
# The download is skipped when a local copy already exists, so that
# Restart & Run All does not re-fetch the ~12 MB file on every run.
# NOTE(review): delete the local file to force a fresh download (e.g. after an
# interrupted transfer).
url = 'https://drive.google.com/uc?id=1Eru_UHVc3WLHVveC9Q8K9QUxlzYeHt18'
output = 'energydata_complete.csv'
if not Path(output).exists():
    gdown.download(url, output, quiet=False)

# Show the local path as the cell result (what gdown.download() displayed before)
output

Downloading...
From: https://drive.google.com/uc?id=1Eru_UHVc3WLHVveC9Q8K9QUxlzYeHt18
To: /home/jovyan/work/Documents/DS-Resources/HAMOYE/Regression/energydata_complete.csv
100%|██████████| 12.0M/12.0M [00:24<00:00, 499kB/s]


'energydata_complete.csv'

In [3]:
# Read the downloaded CSV file into a DataFrame
data = pd.read_csv(output)

In [4]:
# Preview the first rows of the dataset
data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [5]:
# Dataset dimensions as (rows, columns)
data.shape

(19735, 29)

In [6]:
# Column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [8]:
# List the column labels
data.columns

Index(['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3',
       'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8',
       'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')

## Task 1
From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the $R^2$ value to two decimal places?

In [9]:
# Single-predictor regression of T6 on T2; the predictor is reshaped into a
# one-column 2-D array because the estimator expects a feature matrix
X = data['T2'].to_numpy().reshape(-1, 1)
y = data['T6'].to_numpy()

lr = LinearRegression().fit(X, y)

# Score the model on the same data it was fitted to
r_2 = lr.score(X, y)
print(f"R_Squared value: {r_2:.2f}")

R_Squared value: 0.64


## Task 2
Normalize the dataset using the MinMaxScaler after removing the following columns: `["date", "lights"]`. The target variable is `Appliances`. Use a 70-30 train-test split with a random state of 42 (for reproducibility). Run a multiple linear regression using the training set and evaluate your model on the test set. Answer the following questions:

What is the Mean Absolute Error (in two decimal places)?


In [10]:
# Remove columns: ["date", "lights"]
# drop() returns a new frame, so the original `data` is left untouched
lr_data = data.drop(columns=["date", "lights"])

# Fit and transform the entire dataset using the scaler, as the task asks.
# NOTE(review): the scaler therefore sees the test rows before the split; fit it
# on the training rows only if a leakage-free estimate is ever required.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(lr_data)

# Split the scaled array into features (X) and target (y) by column *name*
# instead of by hard-coded position, so the split stays correct even if the
# column order of the CSV changes.
target_idx = lr_data.columns.get_loc("Appliances")
X = np.delete(scaled_data, target_idx, axis=1)  # every column except the target
y = scaled_data[:, [target_idx]]                # kept 2-D: one column per row

# Train Test Split (70 % train / 30 % test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [11]:
# Fit a linear regression on the training split and predict the held-out split
lr2 = LinearRegression().fit(X_train, y_train)
y_hat = lr2.predict(X_test)

# Test-set error metrics
mse = mean_squared_error(y_test, y_hat)
mae = mean_absolute_error(y_test, y_hat)

In [12]:
# The error is measured on the min-max-scaled target, not in the raw units of Appliances
print(f"Mean Absolute Error: {mae:.2f}")

Mean Absolute Error: 0.05


## Task 3
What is the Residual Sum of Squares (in two decimal places)?

In [13]:
# Residual Sum of Squares = number of test rows x mean squared error
n_test = len(y_test)
rss = n_test * mse
print(f"Residual Sum of Squares: {rss:.2f}")

Residual Sum of Squares: 45.35


## Task 4
What is the Root Mean Squared Error (in three decimal places)?

In [14]:
# Root Mean Squared Error: square root of the test-set MSE
rmse = pow(mse, 0.5)
print(f"Root Mean Squared Error: {rmse:.3f}")

Root Mean Squared Error: 0.088


## Task 5
What is the Coefficient of Determination (in two decimal places)?

In [15]:
# Coefficient of determination (R-squared) of the linear model on the test split
r2 = r2_score(y_true=y_test, y_pred=y_hat)
print(f"Coefficient of Determination (R-squared): {r2:.2f}")

Coefficient of Determination (R-squared): 0.15


## Task 6
Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

In [16]:
# Get coefficients and feature names.
# coef_ is 2-D when the model was fitted on a column-vector target and 1-D when
# fitted on a flat target; ravel() yields one weight per feature in both cases,
# whereas coef_[0] would pick out a single scalar in the 1-D case.
coefficients = np.ravel(lr2.coef_)
feature_names = lr_data.drop('Appliances', axis=1).columns

# Guard against weights and names drifting out of alignment
assert len(coefficients) == len(feature_names), "coefficient/feature count mismatch"

# Get Index of Max and Min weights
max_index = np.argmax(coefficients)
min_index = np.argmin(coefficients)

# Get the feature with max and min weights
lowest_weight = feature_names[min_index]
highest_weight = feature_names[max_index]

# Print Results
print(f"Feature with Lowest Weight: {lowest_weight}\nFeature with Highest Weight: {highest_weight}")

Feature with Lowest Weight: RH_2
Feature with Highest Weight: RH_1


## Task 7
Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [17]:
# Instantiate and train Ridge model
ridge = Ridge(alpha=0.4)
ridge.fit(X_train, y_train)

# Evaluate on the test split
y_pred = ridge.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred)
rmse_ridge = mse_ridge ** (1 / 2)
print(f"Root Mean Squared Error: {rmse_ridge:.3f}")

Root Mean Squared Error: 0.088


In [18]:
# Is there any change
# Compares the ridge RMSE with the linear-regression RMSE after rounding both
# to three decimal places; True means the rounded values differ.
round(rmse_ridge, 3) != round(rmse, 3)

False

## Task 8
Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?

In [19]:
## Instantiate and fit lasso model
lasso = Lasso(alpha=0.001).fit(X_train, y_train)

## Get number of non-zero coefficients
# A boolean mask selects them in one vectorised step and flattens the result,
# so it also works if coef_ is 2-D (where looping over rows and testing
# `coef != 0` on a whole row would fail).
non_zero = lasso.coef_[lasso.coef_ != 0]
len(non_zero)

4

## Task 9
What is the new RMSE with the lasso regression? (Answer should be in three (3) decimal places)


In [20]:
## Predict and get RMSE
# Score the lasso model on the held-out split
y_pred = lasso.predict(X_test)
mse_lasso = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse_lasso = mse_lasso ** (1 / 2)
print(f"Root Mean Squared Error: {rmse_lasso:.3f}")

Root Mean Squared Error: 0.094
