In [1]:
import pandas as pd
import numpy as np

## Dataset

### Description
The dataset for the remainder of this quiz is the Appliances Energy Prediction data. The data set is at 10 min for about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions around 3.3 min. Then, the wireless data was averaged for 10 minutes periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru), and merged together with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and to filter out non predictive attributes (parameters)

### Features
Date, time year-month-day hour:minute:second

Appliances, energy use in Wh

lights, energy use of light fixtures in the house in Wh

T1, Temperature in kitchen area, in Celsius

RH_1, Humidity in kitchen area, in %

T2, Temperature in living room area, in Celsius

RH_2, Humidity in living room area, in %

T3, Temperature in laundry room area

RH_3, Humidity in laundry room area, in %

T4, Temperature in office room, in Celsius

RH_4, Humidity in office room, in %

T5, Temperature in bathroom, in Celsius

RH_5, Humidity in bathroom, in %

T6, Temperature outside the building (north side), in Celsius

RH_6, Humidity outside the building (north side), in %

T7, Temperature in ironing room , in Celsius

RH_7, Humidity in ironing room, in %

T8, Temperature in teenager room 2, in Celsius

RH_8, Humidity in teenager room 2, in %

T9, Temperature in parents room, in Celsius

RH_9, Humidity in parents room, in %

To, Temperature outside (from Chievres weather station), in Celsius

Pressure (from Chievres weather station), in mm Hg

RH_out, Humidity outside (from Chievres weather station), in %

Wind speed (from Chievres weather station), in m/s

Visibility (from Chievres weather station), in km

Tdewpoint (from Chievres weather station), Â°C

rv1, Random variable 1, nondimensional

rv2, Random variable 2, nondimensional

In [2]:
df= pd.read_csv("energydata_complete.csv")
df.head().T

Unnamed: 0,0,1,2,3,4
date,2016-01-11 17:00:00,2016-01-11 17:10:00,2016-01-11 17:20:00,2016-01-11 17:30:00,2016-01-11 17:40:00
Appliances,60,60,50,50,60
lights,30,30,30,40,40
T1,19.89,19.89,19.89,19.89,19.89
RH_1,47.596667,46.693333,46.3,46.066667,46.333333
T2,19.2,19.2,19.2,19.2,19.2
RH_2,44.79,44.7225,44.626667,44.59,44.53
T3,19.79,19.79,19.79,19.79,19.79
RH_3,44.73,44.79,44.933333,45.0,45.0
T4,19.0,19.0,18.926667,18.89,18.89


In [3]:
from sklearn.linear_model import Lasso,Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [4]:
X = df.loc[:,["T2"]]
X.head()

Unnamed: 0,T2
0,19.2
1,19.2
2,19.2
3,19.2
4,19.2


In [5]:
y = df.loc[:, "T6"]
y.head()

0    7.026667
1    6.833333
2    6.560000
3    6.433333
4    6.366667
Name: T6, dtype: float64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15788, 1), (3947, 1), (15788,), (3947,))

In [7]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [8]:
y_preds = model.predict(X_test)
y_preds

array([ 2.15562488, 10.01834433,  1.87348764, ..., 14.12418366,
       13.30216726, 20.95487478])

In [9]:
r2 = r2_score(y_test, y_preds)
r2

0.6385443715749319

In [10]:
round(r2,2)

0.64

In [11]:
from sklearn.preprocessing import MinMaxScaler

In [12]:
df2 = df.drop(["date","lights"],axis=1)
df2.head().T

Unnamed: 0,0,1,2,3,4
Appliances,60.0,60.0,50.0,50.0,60.0
T1,19.89,19.89,19.89,19.89,19.89
RH_1,47.596667,46.693333,46.3,46.066667,46.333333
T2,19.2,19.2,19.2,19.2,19.2
RH_2,44.79,44.7225,44.626667,44.59,44.53
T3,19.79,19.79,19.79,19.79,19.79
RH_3,44.73,44.79,44.933333,45.0,45.0
T4,19.0,19.0,18.926667,18.89,18.89
RH_4,45.566667,45.9925,45.89,45.723333,45.53
T5,17.166667,17.166667,17.166667,17.166667,17.2


In [13]:
scaler = MinMaxScaler()
scaler

MinMaxScaler()

In [14]:
normalized_df = pd.DataFrame(scaler.fit_transform(df2), columns=df2.columns)
normalized_df.head().T

Unnamed: 0,0,1,2,3,4
Appliances,0.046729,0.046729,0.037383,0.037383,0.046729
T1,0.32735,0.32735,0.32735,0.32735,0.32735
RH_1,0.566187,0.541326,0.530502,0.52408,0.531419
T2,0.225345,0.225345,0.225345,0.225345,0.225345
RH_2,0.684038,0.68214,0.679445,0.678414,0.676727
T3,0.215188,0.215188,0.215188,0.215188,0.215188
RH_3,0.746066,0.748871,0.755569,0.758685,0.758685
T4,0.351351,0.351351,0.344745,0.341441,0.341441
RH_4,0.764262,0.782437,0.778062,0.770949,0.762697
T5,0.175506,0.175506,0.175506,0.175506,0.178691


In [15]:
y = normalized_df.pop("Appliances")
y.head()

0    0.046729
1    0.046729
2    0.037383
3    0.037383
4    0.046729
Name: Appliances, dtype: float64

In [16]:
X = normalized_df.loc[:, :]
X.head().T

Unnamed: 0,0,1,2,3,4
T1,0.32735,0.32735,0.32735,0.32735,0.32735
RH_1,0.566187,0.541326,0.530502,0.52408,0.531419
T2,0.225345,0.225345,0.225345,0.225345,0.225345
RH_2,0.684038,0.68214,0.679445,0.678414,0.676727
T3,0.215188,0.215188,0.215188,0.215188,0.215188
RH_3,0.746066,0.748871,0.755569,0.758685,0.758685
T4,0.351351,0.351351,0.344745,0.341441,0.341441
RH_4,0.764262,0.782437,0.778062,0.770949,0.762697
T5,0.175506,0.175506,0.175506,0.175506,0.178691
RH_5,0.381691,0.381691,0.380037,0.380037,0.380037


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13814, 26), (5921, 26), (13814,), (5921,))

In [18]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [19]:
y_preds = model.predict(X_test)
y_preds

array([0.03322282, 0.24411482, 0.03403573, ..., 0.06842656, 0.1003157 ,
       0.0572441 ])

In [20]:
mae = mean_absolute_error(y_test, y_preds)
mae

0.05013340192512841

In [21]:
round(mae,2)

0.05

In [22]:
rss = np.sum(np.square(y_test - y_preds))
round(rss,2)

45.35

In [23]:
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
round(rmse,3)

0.088

In [24]:
r_squared = model.score(X_train, y_train)
r_squared

0.1447194374397689

In [25]:
weights = model.coef_
weights

array([-3.28105119e-03,  5.53543181e-01, -2.36182781e-01, -4.56704738e-01,
        2.90629361e-01,  9.60513654e-02,  2.89690357e-02,  2.63952124e-02,
       -1.56527025e-02,  1.60059493e-02,  2.36426608e-01,  3.80508971e-02,
        1.03187989e-02, -4.46095766e-02,  1.01996341e-01, -1.57601566e-01,
       -1.89931250e-01, -3.98024576e-02, -3.21849866e-01,  6.84024354e-03,
       -7.76631636e-02,  2.91789260e-02,  1.23057649e-02,  1.17747893e-01,
        3.33294104e+10, -3.33294104e+10])

In [26]:
feature_index_with_lowest_weight = weights.argmin()
feature_index_with_lowest_weight

25

In [27]:
lowest_weight = weights[feature_index_with_lowest_weight]
lowest_weight

-33329410353.026226

In [28]:
feature_names = df2.columns.to_list()
feature_names

['Appliances',
 'T1',
 'RH_1',
 'T2',
 'RH_2',
 'T3',
 'RH_3',
 'T4',
 'RH_4',
 'T5',
 'RH_5',
 'T6',
 'RH_6',
 'T7',
 'RH_7',
 'T8',
 'RH_8',
 'T9',
 'RH_9',
 'T_out',
 'Press_mm_hg',
 'RH_out',
 'Windspeed',
 'Visibility',
 'Tdewpoint',
 'rv1',
 'rv2']

In [29]:
feature_name = feature_names[feature_index_with_lowest_weight]
feature_name

'rv1'

In [30]:
feature_index_with_highest_weight = weights.argmax()
feature_index_with_highest_weight

24

In [31]:
highest_weight = weights[feature_index_with_highest_weight]
highest_weight

33329410353.027695

In [32]:
feature_name = feature_names[feature_index_with_highest_weight]
feature_name

'Tdewpoint'

In [33]:
model = Ridge(alpha=.4)
model.fit(X_train, y_train)

Ridge(alpha=0.4)

In [34]:
y_preds = model.predict(X_test)
y_preds

array([0.03321872, 0.24043824, 0.03461337, ..., 0.06872351, 0.10025536,
       0.05851175])

In [35]:
# mae = mean_absolute_error(y_test, y_preds)
# mae

In [36]:
rmse2 = np.sqrt(mean_squared_error(y_test, y_preds))
round(rmse2,3)

0.088

In [37]:
round(rmse,3)

0.088

In [39]:
model = Lasso(alpha=.001)
model.fit(X_train, y_train)

Lasso(alpha=0.001)