# **Problem Definition : The dataset for the remainder of this quiz is the Appliances Energy Prediction data. The data set is recorded at 10-minute intervals over about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Two random variables have been included in the data set for testing the regression models and to filter out non-predictive attributes (parameters).**

# **To answer the questions, you will need to normalize the dataset using the MinMaxScaler after removing the following columns: [“date”, “lights”]. The target variable is “Appliances”. Use a 70-30 train-test set split with a random state of 42 (for reproducibility). Run a multiple linear regression using the training set and evaluate your model on the test set.**

# **Importing necessary libraries**

In [None]:
import pandas as pd
import matplotlib.pyplot  as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# **Loading the dataset**

In [None]:
# Read the raw CSV into a DataFrame; the relative path is resolved against the current working directory
energy_data = pd.read_csv('energydata_complete.csv')

In [None]:
# Preview the first rows to check the column names and value formats
energy_data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


# **Dataset Summary**

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
energy_data.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,19.592106,50.949283,7.910939,54.609083,20.267106,35.3882,22.029107,42.936165,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,1.844623,9.022034,6.090347,31.149806,2.109993,5.114208,1.956162,5.224361,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,15.33,29.815,-6.065,1.0,15.39,23.2,16.306667,29.6,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,18.2775,45.4,3.626667,30.025,18.7,31.5,20.79,39.066667,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,19.39,49.09,7.3,55.29,20.033333,34.863333,22.1,42.375,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,20.619643,53.663333,11.256,83.226667,21.6,39.0,23.39,46.536,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,25.795,96.321667,28.29,99.9,26.0,51.4,27.23,58.78,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


# **Data Wrangling**

In [None]:
# Remove the 'date' and 'lights' columns, which are excluded from the model.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it does not raise KeyError once the columns are gone.
energy_data = energy_data.drop(columns=['date', 'lights'], errors='ignore')

# **Data preprocessing**

In [None]:
# Min-max scale every remaining column and rebuild a DataFrame with the same column labels.
# NOTE(review): the scaler is fitted on the whole frame (target column included) before the
# train/test split, as the quiz instructions require; the recorded answers depend on this order.
scaler = MinMaxScaler()
normal_energy_data = pd.DataFrame(
    data=scaler.fit_transform(energy_data),
    columns=energy_data.columns,
)

In [None]:
# Separate the scaled frame into the response ('Appliances') and the predictors

# Response (dependent variable)
target = normal_energy_data['Appliances']

# Predictors (independent variables): every column except the response
features = normal_energy_data.drop(columns=['Appliances'])

In [None]:
# 70/30 train-test split with a fixed seed so the results are reproducible
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

# **Quiz Questions**

**Question 12: From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the R^2 value in two D.P?**

**Answer = 0.64**

In [None]:
# Single-column slices of the already split, scaled predictors:
# T2 is the model input and T6 is the quantity being predicted.
trainx, testx = x_train[['T2']], x_test[['T2']]
trainy, testy = x_train[['T6']], x_test[['T6']]

# Create the estimator and fit it on the training slice (fit returns the estimator itself)
model = LinearRegression().fit(trainx, trainy)

# Predict T6 for the held-out rows
pred = model.predict(testx)

# R^2 on the held-out rows, rounded to two decimals
r2_value = round(r2_score(y_true=testy, y_pred=pred), 2)
r2_value

0.64

**Question 13: What is the mean absolute error(in two decimal places)?**

**Ans : 0.05**

In [None]:
# Multiple linear regression on all scaled predictors

# Build and fit the model on the training split (fit returns the estimator itself)
reg = LinearRegression().fit(x_train, y_train)

# Predictions for the held-out split, reused by the metric cells that follow
predict = reg.predict(x_test)

In [None]:
# Mean absolute error of the linear model on the test split, to two decimals
mae = round(mean_absolute_error(y_true=y_test, y_pred=predict), 2)
mae

0.05

**Question 14: What is the Residual Sum of Squares(in two decimal places)**

**Ans : 45.35**

In [None]:
# Residual sum of squares: square each test-set residual, then add them up

RSS = np.square(y_test - predict).sum()
round(RSS, 2)

45.35

**Question 15: What is the Root Mean Squared Error(in three decimal places)**

**Ans : 0.088**

In [None]:
# Root mean squared error: square root of the test-set MSE, shown to three decimals

RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predict))
round(RMSE, 3)

0.088

**Question 16: What is the Coefficient of Determination(in two decimal places)**

**Ans : 0.15**

In [None]:
# Coefficient of determination (R^2) of the linear model on the test split, shown to two decimals

R2_score = r2_score(y_true=y_test, y_pred=predict)
round(R2_score, 2)

0.15

**Question 17: Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?**

**Ans : We can see from the Table below that the features with the lowest and highest weights are RH_2, RH_1 respectively.**

In [None]:
def get_weights_df(reg, feat, col_name, decimals=None):
  """Return a fitted model's coefficients as a table sorted by weight.

  Parameters
  ----------
  reg : fitted estimator exposing a one-dimensional ``coef_`` attribute.
  feat : DataFrame whose columns label the coefficients, in the same order.
  col_name : name given to the weight column of the returned table.
  decimals : int or None, optional. When given, the weights are rounded to
    this many decimal places. The default (None) returns them unrounded.

  Returns
  -------
  DataFrame with the columns ['Features', col_name], sorted by ascending weight.
  """
  # Label each coefficient with its feature name, then order from lowest to highest
  weights = pd.Series(reg.coef_, index=feat.columns).sort_values()
  weights_df = pd.DataFrame(weights).reset_index()
  weights_df.columns = ['Features', col_name]
  if decimals is not None:
    # Series.round returns a new Series, so the result has to be assigned back
    weights_df[col_name] = weights_df[col_name].round(decimals)
  return weights_df

# Build the weight table for the fitted linear model, labelled with the training columns
linear_weights_df = get_weights_df(reg=reg, feat=x_train, col_name='Linear_weight')

linear_weights_df

Unnamed: 0,Features,Linear_weight
0,RH_2,-0.456698
1,T_out,-0.32186
2,T2,-0.236178
3,T9,-0.189941
4,RH_8,-0.157595
5,RH_out,-0.077671
6,RH_7,-0.044614
7,RH_9,-0.0398
8,T5,-0.015657
9,T1,-0.003281


**Question 18: Train a Ridge Regression model with an alpha value of 0.4. Is there any change to the root mean squared error(RMSE) when evaluated on the test set?**

**Ans: There is no difference between the two values (both are 0.088), so the answer is No.**

In [None]:
# Ridge model with the library's default alpha

ridge = Ridge()

# Fit on the training split
ridge.fit(x_train, y_train)

# Predict on the held-out split
ridge_pred = ridge.predict(x_test)

# Test-set RMSE, rounded to three decimals
ridge_rmse = round(np.sqrt(mean_squared_error(y_true=y_test, y_pred=ridge_pred)), 3)

ridge_rmse

0.088

In [None]:
# Ridge model with the alpha requested by the question (alpha = 0.4)
ridge1 = Ridge(alpha=0.4)

# Fit on the training split
ridge1.fit(x_train,y_train)

# Predict on the held-out split
ridge_pred1= ridge1.predict(x_test)

# Test-set RMSE, rounded to three decimals
ridge_rmse1 = round(np.sqrt(mean_squared_error(y_test,ridge_pred1)),3)

# Display the score of the alpha=0.4 model (ridge_rmse1), not ridge_rmse from the default-alpha model
ridge_rmse1

0.088

**Question 19: Train a Lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?**

**Ans : From the table below, we have 4 features that have non-zero feature weights**

In [None]:
# Lasso model with alpha = 0.001
lasso = Lasso(alpha=0.001)

# Fit on the training split
lasso.fit(x_train,y_train)

# Predictions on the held-out split
lasso_pred = lasso.predict(x_test)

# Reuse get_weights_df from the earlier linear-weights cell; the identical second
# definition that used to live in this cell has been removed so the helper is defined once.
lasso_weights_df = get_weights_df(lasso, x_train, 'Lasso_weight')

# Keep only the features whose weight is not exactly zero
non_zero = lasso_weights_df[lasso_weights_df['Lasso_weight'] != 0]

non_zero


Unnamed: 0,Features,Lasso_weight
0,RH_out,-0.049557
1,RH_8,-0.00011
24,Windspeed,0.002912
25,RH_1,0.01788


**Question 20: What is the new RMSE with Lasso Regression (in 3 decimal places)**

**Ans : 0.094**

In [None]:
# Test-set RMSE of the Lasso predictions, rounded to three decimals
RMSE_lasso = round(
    np.sqrt(mean_squared_error(y_true=y_test, y_pred=lasso_pred)),
    3,
)
RMSE_lasso

0.094