In [19]:
import pandas as pd
import numpy as np

In [20]:
#Loading our data
data = pd.read_csv("energydata_complete.csv")
data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [21]:
#Describing our data
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Appliances,19735.0,97.694958,102.524891,10.0,50.0,60.0,100.0,1080.0
lights,19735.0,3.801875,7.935988,0.0,0.0,0.0,0.0,70.0
T1,19735.0,21.686571,1.606066,16.79,20.76,21.6,22.6,26.26
RH_1,19735.0,40.259739,3.979299,27.023333,37.333333,39.656667,43.066667,63.36
T2,19735.0,20.341219,2.192974,16.1,18.79,20.0,21.5,29.856667
RH_2,19735.0,40.42042,4.069813,20.463333,37.9,40.5,43.26,56.026667
T3,19735.0,22.267611,2.006111,17.2,20.79,22.1,23.29,29.236
RH_3,19735.0,39.2425,3.254576,28.766667,36.9,38.53,41.76,50.163333
T4,19735.0,20.855335,2.042884,15.1,19.53,20.666667,22.1,26.2
RH_4,19735.0,39.026904,4.341321,27.66,35.53,38.4,42.156667,51.09


In [22]:
#Checking on data types
data.dtypes

date            object
Appliances       int64
lights           int64
T1             float64
RH_1           float64
T2             float64
RH_2           float64
T3             float64
RH_3           float64
T4             float64
RH_4           float64
T5             float64
RH_5           float64
T6             float64
RH_6           float64
T7             float64
RH_7           float64
T8             float64
RH_8           float64
T9             float64
RH_9           float64
T_out          float64
Press_mm_hg    float64
RH_out         float64
Windspeed      float64
Visibility     float64
Tdewpoint      float64
rv1            float64
rv2            float64
dtype: object

In [23]:
#Checking for null values
data.isnull().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

In [24]:
#Checking for duplicated in our dataset
data.duplicated().sum()

0

In [25]:
#Identifying our dependent and independent variables
X = data[['T2']]  # Independent variable (living room temperature)
y = data['T6']    # Dependent variable (outside temperature)

In [26]:
#importing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [27]:
# Remove the specified columns
columns_to_remove = ["date", "lights"]
data = data.drop(columns=columns_to_remove)

In [28]:
# Normalize the dataset using MinMaxScaler
scaler = MinMaxScaler()
normalised_data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
X = normalised_data.drop(columns=['Appliances'])
y = normalised_data['Appliances']

In [29]:
#Now, we split our dataset into the training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [30]:
#Create and fit the linear regression model
linear_model = LinearRegression()
linear_model.fit(X, y)

LinearRegression()

In [31]:
#Fit the model to the training set
linear_model.fit(X_train, y_train)

LinearRegression()

In [32]:
#obtaining predictions
predicted_values = linear_model.predict(X_test)

In [33]:
#Mean Absolute Error
mae = mean_absolute_error(y_test, predicted_values)
round(mae, 2)


0.05

In [34]:
#Finding the R2
r2_score = r2_score(y_test, predicted_values)
round(r2_score, 2)


0.16

In [35]:
#The residual sum of squares
rss = np.sum((y_test - predicted_values) ** 2)
round(rss, 2)

46.1

In [36]:
#The Root Mean Squared Error
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, predicted_values))
round(rmse, 3)


0.088

In [37]:
from sklearn.linear_model import Ridge
ridge_reg =  Ridge(alpha = 0.5)
ridge_reg.fit(X_train, y_train)

Ridge(alpha=0.5)

In [38]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha = 0.001)
lasso_reg.fit(X_train, y_train)

Lasso(alpha=0.001)

In [39]:
#Comparing the effects of regularisation
def get_weights_data(model, feat, col_name):#This function returns the weight of every feature
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_data = pd.DataFrame(weights).reset_index()
    weights_data.columns = ['features', col_name]
    weights_data[col_name].round(3)
    return weights_data

In [40]:
linear_model_weights = get_weights_data(linear_model, X_train, 'Linear_Model_Weight')
ridge_weights_data = get_weights_data(ridge_reg, X_train, 'Ridge_Weight')
lasso_weights_data = get_weights_data(lasso_reg, X_train, 'Lasso_weight')
final_weights = pd.merge(linear_model_weights, ridge_weights_data, on='features')
final_weights = pd.merge(final_weights, lasso_weights_data, on='features')

In [41]:
print(final_weights)


       features  Linear_Model_Weight  Ridge_Weight  Lasso_weight
0          RH_2            -0.458176     -0.402797     -0.000000
1         T_out            -0.330375     -0.257457      0.000000
2            T2            -0.249801     -0.207220      0.000084
3            T9            -0.204718     -0.202972     -0.000000
4          RH_8            -0.156534     -0.155679     -0.000000
5        RH_out            -0.076223     -0.048460     -0.047062
6          RH_7            -0.047614     -0.049049     -0.000000
7          RH_9            -0.036034     -0.038121     -0.000000
8           rv2            -0.000350     -0.000355     -0.000000
9           rv1            -0.000350     -0.000355     -0.000000
10  Press_mm_hg             0.004642      0.004321     -0.000000
11           T1             0.005960     -0.012632      0.000000
12   Visibility             0.010992      0.010747      0.000000
13           T7             0.012583      0.011932     -0.000000
14           T5          

In [None]:
# From the question, RH_2 and RH_1 has the lowest and the highest weights respectively, in the linear model.

In [42]:
#Train a ridge regression model with an alpha value of 0.4. 
#Is there any change to the root mean squared error (RMSE) when evaluated on the test set?
ridge_reg =  Ridge(alpha = 0.4)
ridge_reg.fit(X_train, y_train)
rmse = np.sqrt(mean_squared_error(y_test, predicted_values))
round(rmse, 3)

0.088

In [None]:
# There is no change.