### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

### Loading the data set

In [2]:
# Read the energy dataset from a CSV file located in the working directory.
energy_df = pd.read_csv('energydata_complete.csv')
# Preview the first rows to confirm the file parsed as expected.
energy_df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


### Exploratory Data Analysis

In [3]:
# (number of rows, number of columns) of the loaded frame.
energy_df.shape

(19735, 29)

In [4]:
# Column names, non-null counts and dtypes.
energy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) per column.
energy_df.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [6]:
# Checking for null values.
# Build the null mask once and derive both the count and the percentage from it.
null_mask = energy_df.isnull()
missing_data = null_mask.sum().sort_values(ascending=False)
# The mean of a boolean column is the fraction of True values, i.e. the share
# of missing entries. (The previous expression applied sort_values() only to
# the denominator, which had no effect because the division re-aligns on the
# column labels.)
percentage_missing_data = null_mask.mean() * 100
missing_values = pd.concat([missing_data, percentage_missing_data], axis=1, keys=['missing_data', 'percentage'])
missing_values

Unnamed: 0,missing_data,percentage
rv2,0,0.0
T6,0,0.0
Appliances,0,0.0
lights,0,0.0
T1,0,0.0
RH_1,0,0.0
T2,0,0.0
RH_2,0,0.0
T3,0,0.0
RH_3,0,0.0


### Question 12

The R^2 value, to two decimal places, is 0.65

In [7]:
# Single-feature problem: T2 is the predictor and T6 is the target.
# Double brackets keep both selections as one-column DataFrames.
x, y = energy_df[['T2']], energy_df[['T6']]

In [8]:
# Split the data into training and testing sets (30% held out, fixed seed).
# train_test_split is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

In [9]:
# Fit an ordinary least squares model on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
linear_model = LinearRegression().fit(x_train, y_train)
linear_model

LinearRegression()

In [10]:
# Predict the target for the held-out test rows.
predicted_vals = linear_model.predict(x_test)

In [11]:
# Coefficient of determination on the test split, shown to two decimal places.
r2 = r2_score(y_true=y_test, y_pred=predicted_vals)
round(r2, 2)

0.65

### Question 13

MAE : 0.05

In [12]:
# Remove the "date" and "lights" columns; energy_df itself is left unchanged.
df1 = energy_df.drop(columns=['date', 'lights'])
df1.columns

Index(['Appliances', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4',
       'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9',
       'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility',
       'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')

In [13]:
# Rescale every column with a Min-Max scaler.
# NOTE: the scaler is fitted on the full frame (target column included) before
# the train/test split performed further down.
scaler = MinMaxScaler()
df2 = pd.DataFrame(
    data=scaler.fit_transform(df1),
    columns=df1.columns,
)
df2.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


In [14]:
# "Appliances" is the target; every other scaled column is a feature.
# NOTE: this rebinds `y`, which held the T6 target in the earlier section.
y = df2['Appliances']
X = df2.drop(columns=['Appliances'])

In [15]:
# Hold out 30% of the rows for testing; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [16]:
# Multiple linear regression fitted on the training split.
# NOTE: this rebinds `linear_model`, replacing the single-feature model above.
linear_model = LinearRegression().fit(X_train, y_train)
linear_model


LinearRegression()

In [17]:
# Predict the (scaled) target for the test rows; the metric cells below
# compare these predictions against y_test.
predictedvals = linear_model.predict(X_test)

In [18]:
# Mean Absolute Error on the test split, shown to two decimal places.
mae = mean_absolute_error(y_true=y_test, y_pred=predictedvals)
round(mae, 2)

0.05

### Question 14

The Residual Sum of Squares (in two decimal places) : 45.35

In [19]:
# Residual Sum of Squares: squared test residuals added together.
# (The variable is named `ssr`, but it holds the residual sum of squares.)
ssr = np.square(y_test - predictedvals).sum()
round(ssr, 2)

45.35

### Question 15

Root Mean Squared Error (in three decimal places) : 0.088

In [20]:
# Root Mean Squared Error: square root of the test-set MSE, to three decimals.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictedvals))
round(rmse, 3)

0.088

### Question 16

Coefficient of Determination (in two decimal places) : 0.15

In [21]:
# Finding the Coefficient of Determination.
# The result is stored under its own name: assigning it to `r2_score` would
# replace the imported sklearn function with a number, so re-running this cell
# (or calling r2_score anywhere afterwards) would raise a TypeError.
r2_linear = r2_score(y_test, predictedvals)
round(r2_linear, 2)

0.15

### Question 17

Lowest weights : RH_2 	-0.456698

Highest weights : RH_1 	0.553547

In [22]:
# Coefficients learned by the linear model, one per feature column.
important_values = linear_model.coef_
# Print each coefficient next to its positional index.
for indexed_weight in enumerate(important_values):
    print('Feature: %0d, Score: %.5f' % indexed_weight)

Feature: 0, Score: -0.00328
Feature: 1, Score: 0.55355
Feature: 2, Score: -0.23618
Feature: 3, Score: -0.45670
Feature: 4, Score: 0.29063
Feature: 5, Score: 0.09605
Feature: 6, Score: 0.02898
Feature: 7, Score: 0.02639
Feature: 8, Score: -0.01566
Feature: 9, Score: 0.01601
Feature: 10, Score: 0.23642
Feature: 11, Score: 0.03805
Feature: 12, Score: 0.01032
Feature: 13, Score: -0.04461
Feature: 14, Score: 0.10200
Feature: 15, Score: -0.15760
Feature: 16, Score: -0.18994
Feature: 17, Score: -0.03980
Feature: 18, Score: -0.32186
Feature: 19, Score: 0.00684
Feature: 20, Score: -0.07767
Feature: 21, Score: 0.02918
Feature: 22, Score: 0.01231
Feature: 23, Score: 0.11776
Feature: 24, Score: 0.00077
Feature: 25, Score: 0.00077


In [23]:
#Obtaining the feature weights from your linear model above
def get_weights_df(model, feat, col_name, decimals=None):
    """Return the model's feature weights as a DataFrame sorted ascending.

    Parameters
    ----------
    model : fitted estimator exposing a ``coef_`` attribute.
    feat : DataFrame whose columns name the features, in the order the model
        was fitted on.
    col_name : label for the weight column of the result.
    decimals : optional int; when given, weights are rounded to this many
        decimal places. The default (None) leaves the weights unrounded.

    Returns
    -------
    DataFrame with columns ``['Features', col_name]``, ordered from the
    smallest weight to the largest.
    """
    # Flatten so a (1, n_features) coefficient array is handled like a 1-D one.
    coefficients = np.ravel(model.coef_)
    weights = pd.Series(coefficients, index=feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    # The previous version called .round(3) without storing the result, so it
    # never had any effect; rounding is now opt-in and actually applied.
    if decimals is not None:
        weights_df[col_name] = weights_df[col_name].round(decimals)
    return weights_df

In [25]:
# Weight table for the multiple linear regression, labelled by X_train's columns.
linear_model_weights = get_weights_df(linear_model, X_train, 'Linear_Model_Weight')


In [26]:
# Display the weights, sorted from most negative to most positive.
linear_model_weights

Unnamed: 0,Features,Linear_Model_Weight
0,RH_2,-0.456698
1,T_out,-0.32186
2,T2,-0.236178
3,T9,-0.189941
4,RH_8,-0.157595
5,RH_out,-0.077671
6,RH_7,-0.044614
7,RH_9,-0.0398
8,T5,-0.015657
9,T1,-0.003281


### Question 18

The RMSE = 0.088

In [27]:
# Ridge regression with alpha=0.4, fitted on the training split.
# fit() returns the estimator, so the fitted model is bound directly.
ridge_reg = Ridge(alpha=0.4).fit(X_train, y_train)
ridge_reg

Ridge(alpha=0.4)

In [28]:
# Ridge predictions for the test rows.
predicted_values2 = ridge_reg.predict(X_test)

In [29]:
# RMSE of the ridge model on the test split, to three decimal places.
rmse2 = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predicted_values2))
round(rmse2, 3)

0.088

### Question 19

 #### non-zero feature weights = 4
 
 RH_out 	-0.049557
 
 RH_8 	-0.000110
 
 Windspeed 	0.002912
 
 RH_1 	0.017880

In [30]:
# Lasso regression with alpha=0.001, fitted on the training split.
# fit() returns the estimator, so the fitted model is bound directly.
lasso_reg = Lasso(alpha=0.001).fit(X_train, y_train)
lasso_reg

Lasso(alpha=0.001)

In [31]:
# Feature weights of the lasso model, sorted ascending by get_weights_df.
lasso_weights_df = get_weights_df(
    model=lasso_reg,
    feat=X_train,
    col_name='Lasso_weight',
)
lasso_weights_df

Unnamed: 0,Features,Lasso_weight
0,RH_out,-0.049557
1,RH_8,-0.00011
2,T1,0.0
3,Tdewpoint,0.0
4,Visibility,0.0
5,Press_mm_hg,-0.0
6,T_out,0.0
7,RH_9,-0.0
8,T9,-0.0
9,T8,0.0


### Question 20

The RMSE with Lasso Regression is 0.094

In [32]:
# Lasso predictions for the test rows.
predicted_values3 = lasso_reg.predict(X_test)

In [33]:
# RMSE of the lasso model on the test split, to three decimal places.
rmse3 = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predicted_values3))
round(rmse3, 3)

0.094