In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns

# preprocessing
from sklearn.preprocessing import MinMaxScaler  

# modeling
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.model_selection import train_test_split

# performance evaluation
from sklearn.metrics import mean_absolute_error 

import warnings
warnings.filterwarnings("ignore")



In [2]:
# Load the appliances energy dataset from the CSV file in the working directory
data_path = "energydata_complete.csv"
energy_df = pd.read_csv(data_path)
energy_df

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,2016-05-27 17:40:00,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame
energy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [4]:
# (number of rows, number of columns)
energy_df.shape

(19735, 29)

In [5]:
# Checking for missing values: this counts how many COLUMNS contain at least one
# null (not the number of null cells); 0 means no column has missing data
energy_df.isnull().any().sum()

0

In [6]:
# Checking for duplicates: number of rows that exactly repeat an earlier row
energy_df.duplicated().sum()

0

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the dataset
energy_df.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [8]:
# List the column labels before any columns are removed
energy_df.columns

Index(['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3',
       'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8',
       'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')

# Answers to the Questions from the Quiz

Qtn 12. From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the R^2 value to two decimal places?

In [9]:
# Draw a reproducible 15-row random sample of the T2 / T6 columns for inspection
t2_t6_columns = ['T2', 'T6']
simple_linear_reg_df = energy_df[t2_t6_columns].sample(n=15, random_state=2)
simple_linear_reg_df

Unnamed: 0,T2,T6
1117,17.426667,-4.238889
16275,22.76,14.69
13272,21.23,8.926667
3160,21.1,7.69
19210,21.856667,10.8
8260,17.356667,5.0
12299,19.0,5.3
13505,22.7,17.133333
12913,19.593333,7.545
17788,21.89,9.19


In [10]:
# Simple linear regression with T2 as the single predictor and T6 as the response,
# fitted on every row of the dataset
t2_features = energy_df[['T2']]
t6_target = energy_df['T6']
lin_reg = LinearRegression().fit(t2_features, t6_target)

# R^2 is evaluated on the same rows the model was fitted on (in-sample score)
r_squared = lin_reg.score(t2_features, t6_target)

# Report the R^2 value to two decimal places
print('R^2:', round(r_squared, 2))

R^2: 0.64


Normalize the dataset using the MinMaxScaler after removing the following columns: [“date”, “lights”]. The target variable is “Appliances”. Use a 70-30 train-test set split with a random state of 42 (for reproducibility). Run a multiple linear regression using the training set and evaluate your model on the test set. Answer the following questions:

Qtn 13. What is the Mean Absolute Error (in two decimal places)?

In [11]:
# Remove the columns "date" and "lights".
# errors='ignore' keeps this cell re-runnable: energy_df is reassigned here, so on a
# second execution the two columns are already gone and a plain drop would raise KeyError.
energy_df = energy_df.drop(columns=['date', 'lights'], errors='ignore')

In [12]:
# Preview the first five rows after the column removal
energy_df.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [13]:
# Confirm which columns remain after the drop
energy_df.columns

Index(['Appliances', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4',
       'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9',
       'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
       'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')

In [14]:
# Normalise every remaining column to a common [0, 1] scale with the min-max scaler.
# NOTE(review): the scaler is fitted on all rows (before the train/test split) and on
# the target column as well, as the exercise prescribes; the error metrics computed
# later are therefore in normalised units, not in the original Appliances units.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(energy_df)
normalised_df = pd.DataFrame(scaled_values, columns=energy_df.columns)

# Separate the target ("Appliances") from the predictor columns
appliances_target = normalised_df['Appliances']
features_df = normalised_df.drop(columns=['Appliances'])

In [15]:
# Hold out 30% of the rows for testing; the fixed random state keeps the split reproducible
split_parts = train_test_split(
    features_df,
    appliances_target,
    test_size=0.3,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

In [16]:
 
# Create the multiple linear regression model and fit it to the training split
model = LinearRegression().fit(x_train, y_train)

# Predict the (normalised) target for the held-out test rows
predicted_values = model.predict(x_test)

# Mean absolute error between the held-out targets and the predictions
mae = mean_absolute_error(y_true=y_test, y_pred=predicted_values)

print ("MAE:", round(mae, 2))


MAE: 0.05


Qtn 14. What is the Residual Sum of Squares (in two decimal places)?

In [17]:
# Residual Sum of Squares (RSS): sum of squared differences between the held-out
# targets and the linear model's predictions
residuals = y_test - predicted_values
rss = np.square(residuals).sum()
print("RSS:", round(rss, 2))

RSS: 45.35


Qtn 15. What is the Root Mean Squared Error (in three decimal places)?

In [18]:
# Root Mean Square Error (RMSE): square root of the test-set mean squared error
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predicted_values)
rmse = np.sqrt(mse)
print("RMSE:", round(rmse, 3))

RMSE: 0.088


Qtn 16. What is the Coefficient of Determination (in two decimal places)?

In [19]:
# R-Squared (coefficient of determination) of the linear model on the test set
from sklearn.metrics import r2_score
# Keep the result under its own name: assigning it to `r2_score` would rebind the
# imported function to a float, so any later r2_score(...) call would fail with
# "'float' object is not callable".
r2_test = r2_score(y_test, predicted_values)
print("R-Squared:", round(r2_test, 2))

R-Squared: 0.15


Qtn 17. Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

In [20]:
# Pair each coefficient of the fitted linear model with its feature name
feature_weights = model.coef_
weights_by_feature = pd.Series(feature_weights, index=x_train.columns)

# "Lowest" / "highest" refer to the signed value of the coefficient (most negative
# and most positive), not to its magnitude
lowest_weight_feature = weights_by_feature.idxmin()
highest_weight_feature = weights_by_feature.idxmax()
lowest_weight = weights_by_feature.min()
highest_weight = weights_by_feature.max()

print("Feature with lowest weight:", lowest_weight_feature)
print("Feature with highest weight:", highest_weight_feature)

Feature with lowest weight: RH_2
Feature with highest weight: RH_1


Qtn 18. Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [21]:
# Ridge Regression: regularised linear model with alpha = 0.4, fitted on the same
# training split as the plain linear model
ridge_reg = Ridge(alpha= 0.4 )
ridge_reg.fit(x_train, y_train)

In [22]:
# Predict the held-out targets with the ridge model
y_pred_ridge = ridge_reg.predict(x_test)

# RMSE for Ridge regression: square root of the test-set mean squared error
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(ridge_mse)

print("RMSE with Ridge Regression:", round(rmse_ridge, 3))

RMSE with Ridge Regression: 0.088


Qtn 19. Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?

In [23]:
# Feature Selection and Lasso Regression: regularised linear model with alpha = 0.001,
# fitted on the training split; the weight table shown further down has several
# coefficients at exactly zero for this model
lasso_reg = Lasso(alpha= 0.001)
lasso_reg.fit(x_train, y_train)

In [24]:
#comparing the effects of regularisation
def get_weights_df(model, feat, col_name, decimals=None):
    '''
    Return the weight of every feature of a fitted linear model as a DataFrame.

    Parameters
    ----------
    model : fitted estimator exposing a 1-D ``coef_`` attribute
    feat : DataFrame whose ``columns`` name the features, in the same order as ``coef_``
    col_name : label for the weight column in the result
    decimals : int or None, optional
        When given, the weights are rounded to this many decimal places.
        The default (None) leaves them unrounded, so that small non-zero
        coefficients are not reported as zero.

    Returns
    -------
    DataFrame with the columns ``'Features'`` and ``col_name``, sorted by
    ascending weight.
    '''
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = weights.rename_axis('Features').reset_index(name=col_name)
    if decimals is not None:
        # Series.round returns a new Series, so the result has to be assigned back
        weights_df[col_name] = weights_df[col_name].round(decimals)
    return weights_df

In [25]:
# Build one weight table per fitted model, each with its own weight-column label
linear_model_weights, ridge_weights_df, lasso_weights_df = (
    get_weights_df(fitted_model, x_train, weight_label)
    for fitted_model, weight_label in [
        (model, 'Linear_Model_Weight'),
        (ridge_reg, 'Ridge_Weight'),
        (lasso_reg, 'Lasso_weight'),
    ]
)

In [26]:
# Join the three weight tables on the feature name so each row compares one feature
# across the linear, ridge and lasso models
final_weights = (
    linear_model_weights
    .merge(ridge_weights_df, on='Features')
    .merge(lasso_weights_df, on='Features')
)

In [27]:
# Side-by-side weights of each feature under the linear, ridge and lasso models
final_weights

Unnamed: 0,Features,Linear_Model_Weight,Ridge_Weight,Lasso_weight
0,RH_2,-0.456698,-0.411071,-0.0
1,T_out,-0.32186,-0.262172,0.0
2,T2,-0.236178,-0.201397,0.0
3,T9,-0.189941,-0.188916,-0.0
4,RH_8,-0.157595,-0.15683,-0.00011
5,RH_out,-0.077671,-0.054724,-0.049557
6,RH_7,-0.044614,-0.045977,-0.0
7,RH_9,-0.0398,-0.041367,-0.0
8,T5,-0.015657,-0.019853,-0.0
9,T1,-0.003281,-0.018406,0.0


Qtn 20. What is the new RMSE with the lasso regression? (Answer should be in three (3) decimal places)

In [28]:
# Make predictions on the test set with the lasso model
y_pred_lasso = lasso_reg.predict(x_test)

# Calculate the RMSE under a lasso-specific name (mirroring rmse_ridge) so the
# linear model's `rmse` computed earlier is not overwritten
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))

print("RMSE with Lasso Regression:", round(rmse_lasso, 3))

RMSE with Lasso Regression: 0.094
