In [122]:
# importing required libraries
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

### Loading the Dataset

In [2]:
# reading the csv file and assigning it to a variable
energy_data = pd.read_csv("datasets/energydata_complete.csv")

In [3]:
# loading the dataset
energy_data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [4]:
# getting information about the dataset
energy_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [5]:
# getting a description of the dataset
energy_data.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [6]:
# checking for empty fields in the dataset
energy_data.isnull().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

since there is no empty field in the dataset, we proceed further

In [7]:
# dropping the date and lights columns
new_df = energy_data.drop(["date", "lights"], axis = 1)

In [8]:
new_df

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,45.566667,17.166667,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,60,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,45.992500,17.166667,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,50,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,45.890000,17.166667,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,50,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,45.723333,17.166667,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,60,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,45.530000,17.200000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,100,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,45.590000,23.200000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,90,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,45.590000,23.230000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,270,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,45.730000,23.230000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,420,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,45.790000,23.200000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


In [9]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 27 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Appliances   19735 non-null  int64  
 1   T1           19735 non-null  float64
 2   RH_1         19735 non-null  float64
 3   T2           19735 non-null  float64
 4   RH_2         19735 non-null  float64
 5   T3           19735 non-null  float64
 6   RH_3         19735 non-null  float64
 7   T4           19735 non-null  float64
 8   RH_4         19735 non-null  float64
 9   T5           19735 non-null  float64
 10  RH_5         19735 non-null  float64
 11  T6           19735 non-null  float64
 12  RH_6         19735 non-null  float64
 13  T7           19735 non-null  float64
 14  RH_7         19735 non-null  float64
 15  T8           19735 non-null  float64
 16  RH_8         19735 non-null  float64
 17  T9           19735 non-null  float64
 18  RH_9         19735 non-null  float64
 19  T_ou

In [10]:
# rearranging the order of the new_df dataframe by moving
# the "Appliances" column to the last column
mid = new_df['Appliances']
new_df.drop(labels = ['Appliances'], axis=1, inplace = True)
new_df.insert(26, 'Appliances', mid)
new_df

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,Appliances
0,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,45.566667,17.166667,55.200000,...,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433,60
1,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,45.992500,17.166667,55.200000,...,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195,60
2,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,45.890000,17.166667,55.090000,...,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668,50
3,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,45.723333,17.166667,55.090000,...,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389,50
4,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,45.530000,17.200000,55.090000,...,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,45.590000,23.200000,52.400000,...,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812,100
19731,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,45.590000,23.230000,52.326667,...,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940,90
19732,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,45.730000,23.230000,52.266667,...,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117,270
19733,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,45.790000,23.200000,52.200000,...,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784,420


In [11]:
new_df.count()

T1             19735
RH_1           19735
T2             19735
RH_2           19735
T3             19735
RH_3           19735
T4             19735
RH_4           19735
T5             19735
RH_5           19735
T6             19735
RH_6           19735
T7             19735
RH_7           19735
T8             19735
RH_8           19735
T9             19735
RH_9           19735
T_out          19735
Press_mm_hg    19735
RH_out         19735
Windspeed      19735
Visibility     19735
Tdewpoint      19735
rv1            19735
rv2            19735
Appliances     19735
dtype: int64

### Scaling

In [12]:
# carrying out scaling on the dataset before splitting
# applying the Min Max Scaler technique using Fit Transform.

# retrieving just the numeric input values
data = new_df.values[:, :]

# performing a robust scaler transform of the dataset
trans = MinMaxScaler()
data = trans.fit_transform(data)

# convering the array back to a dataframe
scaled_dataset = pd.DataFrame(data, columns = ["T1", "RH_1", "T2", "RH_2", "T3", "RH_3", "T4", "RH_4", "T5", "RH_5", "T6", "RH_6", "T7", "RH_7", "T8", "RH_8", "T9", "RH_9", "T_out", "Press_mm_hg", "RH_out", "Windspeed", "Visibility", "Tdewpoint", "rv1", "rv2", "Appliances"])

In [13]:
# loading the scaled dataset
scaled_dataset

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,Appliances
0,0.327350,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,0.381691,...,0.677290,0.372990,0.097674,0.894737,0.500000,0.953846,0.538462,0.265449,0.265449,0.046729
1,0.327350,0.541326,0.225345,0.682140,0.215188,0.748871,0.351351,0.782437,0.175506,0.381691,...,0.678532,0.369239,0.100000,0.894737,0.476190,0.894872,0.533937,0.372083,0.372083,0.046729
2,0.327350,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,0.380037,...,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848,0.037383
3,0.327350,0.524080,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,0.380037,...,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261,0.037383
4,0.327350,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,0.380037,...,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611,0.046729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,0.926786,0.537657,0.711655,0.606309,0.830841,0.579374,0.864865,0.765258,0.752031,0.339590,...,0.729443,0.891747,0.602326,0.416667,0.238095,0.348718,0.901961,0.861981,0.861981,0.084112
19731,0.919747,0.536006,0.701769,0.607836,0.825302,0.582178,0.864865,0.765258,0.754897,0.338487,...,0.729443,0.887460,0.602326,0.421053,0.250000,0.361538,0.900452,0.985726,0.985726,0.074766
19732,0.919747,0.538666,0.692651,0.627198,0.818378,0.603988,0.864865,0.771233,0.754897,0.337585,...,0.729443,0.883173,0.602326,0.425439,0.261905,0.374359,0.898944,0.583979,0.583979,0.242991
19733,0.919747,0.549491,0.677054,0.634717,0.805085,0.585294,0.864865,0.773794,0.752031,0.336583,...,0.730581,0.878885,0.602326,0.429825,0.273810,0.387179,0.897436,0.126371,0.126371,0.383178


We can see that the distributions have been adjusted and that the minimum and maximum values for each variable are now a crisp 0.0 and 1.0 respectively.



In [14]:
# summarizing the dataset
scaled_dataset.describe()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,Appliances
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,0.517061,0.364271,0.308303,0.56117,0.421038,0.489601,0.518499,0.485143,0.407272,0.317777,...,0.512655,0.399089,0.609828,0.733558,0.288554,0.574321,0.46881,0.499742,0.499742,0.081958
std,0.169595,0.109512,0.159412,0.114438,0.166676,0.152107,0.184044,0.185289,0.176266,0.135656,...,0.171833,0.170978,0.17208,0.196067,0.175087,0.181457,0.189803,0.289984,0.289984,0.095818
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.419219,0.283735,0.195542,0.490299,0.298272,0.380122,0.399099,0.335894,0.281653,0.234337,...,0.386313,0.278671,0.503101,0.609649,0.142857,0.430769,0.339367,0.249895,0.249895,0.037383
50%,0.50792,0.347675,0.283499,0.563408,0.407112,0.456302,0.501502,0.458387,0.38796,0.289821,...,0.485651,0.383173,0.623256,0.785088,0.261905,0.6,0.453997,0.497934,0.497934,0.046729
75%,0.613516,0.441519,0.392537,0.641016,0.505982,0.60726,0.630631,0.618722,0.50546,0.358586,...,0.627956,0.495445,0.735659,0.890351,0.392857,0.6,0.595777,0.751701,0.751701,0.084112
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Training & Testing Data

In [15]:
# defining the dependent and independent variables
X = scaled_dataset.loc[:, ["T2"]].values
y = scaled_dataset.loc[:, ["T6"]].values

In [16]:
# splitting the scaled_dataset into the Training set and Test set using train_test_split function
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [17]:
# fitting our Multiple Linear Regression to the Training set
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [18]:
# predicting the Test set results
y_test_pred = regressor.predict(X_test)

In [19]:
y_test_pred

array([[0.23928945],
       [0.46794238],
       [0.23108472],
       ...,
       [0.3001772 ],
       [0.4297256 ],
       [0.3217686 ]])

In [20]:
# printing the intercept of the trained model
print('Intercept: \n', regressor.intercept_)

Intercept: 
 [0.13176428]


In [21]:
# printing the coefficients of the trained model
print('Coefficients: \n', regressor.coef_)

Coefficients: 
 [[0.8910771]]


In [22]:
# showing the output of the model prediction in a dataframe
y_test_pred_df = pd.DataFrame(y_test_pred, columns = ["Predicted Appliances"])

In [23]:
# displaying the first 20 field of the dataframe
y_test_pred_df.head(20)

Unnamed: 0,Predicted Appliances
0,0.239289
1,0.467942
2,0.231085
3,0.636355
4,0.155299
5,0.442033
6,0.182936
7,0.423248
8,0.351997
9,0.237346


In [39]:
# R^2 value in two D.P
r2_score(y_test, y_test_pred)

0.6427789637197382

### Building the optimal model using Backward Elimination

In [24]:
# adding the column of ones to our matrix of features X
new_X = np.append(arr = np.ones((19735, 1)).astype(int), values = X, axis = 1)

In [25]:
# loading the new matrix of features
new_X

array([[1.        , 0.22534529],
       [1.        , 0.22534529],
       [1.        , 0.22534529],
       ...,
       [1.        , 0.69265118],
       [1.        , 0.67705355],
       [1.        , 0.66617051]])

In [26]:
# turning the new_X varaible into a dataframe
new_X_df = pd.DataFrame(new_X, columns = ["constant", "T2"])

In [171]:
new_X_df

Unnamed: 0,constant,T2
0,1.0,0.225345
1,1.0,0.225345
2,1.0,0.225345
3,1.0,0.225345
4,1.0,0.225345
...,...,...
19730,1.0,0.711655
19731,1.0,0.701769
19732,1.0,0.692651
19733,1.0,0.677054


In [27]:
# selecting all the independent variables for initial test
# before backward elimination if necessary
X_opt = new_X_df.loc[:, ["constant", "T2"]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.642
Model:,OLS,Adj. R-squared:,0.642
Method:,Least Squares,F-statistic:,35370.0
Date:,"Sun, 29 Aug 2021",Prob (F-statistic):,0.0
Time:,05:13:11,Log-Likelihood:,16273.0
No. Observations:,19735,AIC:,-32540.0
Df Residuals:,19733,BIC:,-32530.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
constant,0.1321,0.002,80.354,0.000,0.129,0.135
T2,0.8910,0.005,188.073,0.000,0.882,0.900

0,1,2,3
Omnibus:,842.252,Durbin-Watson:,0.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,964.072
Skew:,-0.51,Prob(JB):,4.51e-210
Kurtosis:,3.365,Cond. No.,6.88


### Performance Metrics

In [44]:
# calculating the mean absolute error (MAE) in 2 D.P
mae = mean_absolute_error(y_test, y_test_pred)
print("The Mean Absolute Error (MAE) is %.3f" %mae)

The Mean Absolute Error (MAE) is 0.082


In [46]:
# calculating the sum of squared residual in 2 D.P
print(regressor_OLS.ssr)

222.087783828228


In [48]:
# calculating the root mean squared error in 3 D.P
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("The Root Mean Squared Error (RMSE) is %.3f" %rmse)

The Root Mean Squared Error (RMSE) is 0.106


In [51]:
# coefficient of determination in 2 D.P
cod = r2_score(y_test, y_test_pred)
print("The Coefficient of Determination is %.3f" %cod)

The Coefficient of Determination is 0.643


In [92]:
# defining the dependent and independent variables
X = scaled_dataset.iloc[:, :-1 ].values
y = scaled_dataset.loc[:, ["Appliances"] ].values

In [93]:
# splitting the scaled_dataset into the Training set and Test set using train_test_split function
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [94]:
# fitting our Multiple Linear Regression to the Training set
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [95]:
# predicting the Test set results
y_test_pred = regressor.predict(X_test)

In [96]:
y_test_pred

array([[0.03322207],
       [0.24411599],
       [0.03400024],
       ...,
       [0.06844707],
       [0.10032325],
       [0.05722198]])

In [102]:
# printing the intercept of the trained model
print('Intercept: \n', regressor.intercept_)

Intercept: 
 [0.15290296]


In [103]:
# printing the coefficients of the trained model
print('Coefficients: \n', regressor.coef_)

Coefficients: 
 [[-0.00328105  0.5535466  -0.23617792 -0.45669795  0.29062714  0.09604827
   0.028981    0.02638578 -0.01565684  0.01600579  0.23642491  0.03804865
   0.01031878 -0.04461364  0.10199505 -0.15759548 -0.18994077 -0.03980032
  -0.32185967  0.00683933 -0.07767065  0.02918313  0.01230661  0.11775773
   0.0007701   0.0007701 ]]


In [104]:
# showing the output of the model prediction in a dataframe
y_test_pred_df = pd.DataFrame(y_test_pred, columns = ["Predicted Appliances"])

In [107]:
# displaying the first 20 field of the dataframe
y_test_pred_df.head(20)

Unnamed: 0,Predicted Appliances
0,0.033222
1,0.244116
2,0.034
3,0.105377
4,0.02336
5,0.162821
6,0.074288
7,0.163705
8,0.075069
9,0.101717


### Building the optimal model using Backward Elimination

In [108]:
# adding the column of ones to our matrix of features X
new_X = np.append(arr = np.ones((19735, 1)).astype(int), values = X, axis = 1)

In [110]:
# loading the new matrix of features
new_X

array([[1.        , 0.32734952, 0.56618659, ..., 0.53846154, 0.26544891,
        0.26544891],
       [1.        , 0.32734952, 0.54132648, ..., 0.53393665, 0.37208289,
        0.37208289],
       [1.        , 0.32734952, 0.53050179, ..., 0.52941176, 0.57284766,
        0.57284766],
       ...,
       [1.        , 0.91974657, 0.53866618, ..., 0.89894419, 0.58397859,
        0.58397859],
       [1.        , 0.91974657, 0.54949087, ..., 0.8974359 , 0.12637146,
        0.12637146],
       [1.        , 0.91974657, 0.53875791, ..., 0.8959276 , 0.68239057,
        0.68239057]])

In [112]:
# turning the new_X varaible into a dataframe
new_X_df = pd.DataFrame(new_X, columns = ["constant", "T1", "RH_1", "T2", "RH_2", "T3", "RH_3", "T4", "RH_4", "T5", "RH_5", "T6", "RH_6", "T7", "RH_7", "T8", "RH_8", "T9", "RH_9", "T_out", "Press_mm_hg", "RH_out", "Windspeed", "Visibility", "Tdewpoint", "rv1", "rv2"])

In [113]:
new_X_df

Unnamed: 0,constant,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,1.0,0.327350,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.677290,0.372990,0.097674,0.894737,0.500000,0.953846,0.538462,0.265449,0.265449
1,1.0,0.327350,0.541326,0.225345,0.682140,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.226500,0.678532,0.369239,0.100000,0.894737,0.476190,0.894872,0.533937,0.372083,0.372083
2,1.0,0.327350,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,1.0,0.327350,0.524080,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,1.0,0.327350,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,1.0,0.926786,0.537657,0.711655,0.606309,0.830841,0.579374,0.864865,0.765258,0.752031,...,0.864724,0.729443,0.891747,0.602326,0.416667,0.238095,0.348718,0.901961,0.861981,0.861981
19731,1.0,0.919747,0.536006,0.701769,0.607836,0.825302,0.582178,0.864865,0.765258,0.754897,...,0.864724,0.729443,0.887460,0.602326,0.421053,0.250000,0.361538,0.900452,0.985726,0.985726
19732,1.0,0.919747,0.538666,0.692651,0.627198,0.818378,0.603988,0.864865,0.771233,0.754897,...,0.864724,0.729443,0.883173,0.602326,0.425439,0.261905,0.374359,0.898944,0.583979,0.583979
19733,1.0,0.919747,0.549491,0.677054,0.634717,0.805085,0.585294,0.864865,0.773794,0.752031,...,0.864724,0.730581,0.878885,0.602326,0.429825,0.273810,0.387179,0.897436,0.126371,0.126371


In [114]:
# selecting all the independent variables for initial test
# before backward elimination if necessary
X_opt = new_X_df.loc[:, ["constant", "T1", "RH_1", "T2", "RH_2", "T3", "RH_3", "T4", "RH_4", "T5", "RH_5", "T6", "RH_6", "T7", "RH_7", "T8", "RH_8", "T9", "RH_9", "T_out", "Press_mm_hg", "RH_out", "Windspeed", "Visibility", "Tdewpoint", "rv1", "rv2"]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.146
Model:,OLS,Adj. R-squared:,0.145
Method:,Least Squares,F-statistic:,135.3
Date:,"Sun, 29 Aug 2021",Prob (F-statistic):,0.0
Time:,06:42:18,Log-Likelihood:,19845.0
No. Observations:,19735,AIC:,-39640.0
Df Residuals:,19709,BIC:,-39430.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
constant,0.1552,0.020,7.638,0.000,0.115,0.195
T1,0.0035,0.016,0.209,0.834,-0.029,0.036
RH_1,0.5457,0.023,23.775,0.000,0.501,0.591
T2,-0.2611,0.021,-12.333,0.000,-0.303,-0.220
RH_2,-0.4791,0.026,-18.612,0.000,-0.530,-0.429
T3,0.2854,0.012,23.858,0.000,0.262,0.309
RH_3,0.1010,0.014,7.404,0.000,0.074,0.128
T4,0.0309,0.010,3.019,0.003,0.011,0.051
RH_4,0.0427,0.014,3.070,0.002,0.015,0.070

0,1,2,3
Omnibus:,14033.514,Durbin-Watson:,0.608
Prob(Omnibus):,0.0,Jarque-Bera (JB):,213085.4
Skew:,3.344,Prob(JB):,0.0
Kurtosis:,17.642,Cond. No.,1.09e+16


from the summary above, it can be observed that the features with the highest and lowest weights are **RH_2** with a weight of **-0.4791** and **RH_1** with a weight of **0.5457**

In [118]:
# training with a Ridge model with an alpha value of 0.4
ridge_reg = Ridge(alpha = 0.4)
ridge_reg.fit(X_train, y_train)

Ridge(alpha=0.4)

In [120]:
y_test_pred = ridge_reg.predict(X_test)

In [121]:
# calculating the root mean squared error in 3 D.P
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("The Root Mean Squared Error (RMSE) is %.3f" %rmse)

The Root Mean Squared Error (RMSE) is 0.088


It can be observed that there is a change in the root mean squared error (RMSE) when trained with a Ridge Regression Model.

In [123]:
# training with a Lassso model with an alpha value of 0.001
lasso_reg = Lasso(alpha = 0.001)
lasso_reg.fit(X_train, y_train)

Lasso(alpha=0.001)

In [124]:
y_test_pred = lasso_reg.predict(X_test)

In [125]:
y_test_pred

array([0.07370267, 0.08143458, 0.07716072, ..., 0.07792848, 0.09034412,
       0.08359255])

In [131]:
# selecting all the independent variables for initial test
# before backward elimination if necessary
X_opt = new_X_df.loc[:, ["constant", "T1", "RH_1", "T2", "RH_2", "T3", "RH_3", "T4", "RH_4", "T5", "RH_5", "T6", "RH_6", "T7", "RH_7", "T8", "RH_8", "T9", "RH_9", "T_out", "Press_mm_hg", "RH_out", "Windspeed", "Visibility", "Tdewpoint", "rv1", "rv2"]]
lasso_reg_OLS = sm.OLS(endog = y, exog = X_opt).fit()
lasso_reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.146
Model:,OLS,Adj. R-squared:,0.145
Method:,Least Squares,F-statistic:,135.3
Date:,"Sun, 29 Aug 2021",Prob (F-statistic):,0.0
Time:,07:28:24,Log-Likelihood:,19845.0
No. Observations:,19735,AIC:,-39640.0
Df Residuals:,19709,BIC:,-39430.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
constant,0.1552,0.020,7.638,0.000,0.115,0.195
T1,0.0035,0.016,0.209,0.834,-0.029,0.036
RH_1,0.5457,0.023,23.775,0.000,0.501,0.591
T2,-0.2611,0.021,-12.333,0.000,-0.303,-0.220
RH_2,-0.4791,0.026,-18.612,0.000,-0.530,-0.429
T3,0.2854,0.012,23.858,0.000,0.262,0.309
RH_3,0.1010,0.014,7.404,0.000,0.074,0.128
T4,0.0309,0.010,3.019,0.003,0.011,0.051
RH_4,0.0427,0.014,3.070,0.002,0.015,0.070

0,1,2,3
Omnibus:,14033.514,Durbin-Watson:,0.608
Prob(Omnibus):,0.0,Jarque-Bera (JB):,213085.4
Skew:,3.344,Prob(JB):,0.0
Kurtosis:,17.642,Cond. No.,1.09e+16


In [135]:
# calculating the root mean squared error in 3 D.P
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("The Root Mean Squared Error (RMSE) is %.3f" %rmse)

The Root Mean Squared Error (RMSE) is 0.094
