In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [3]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the raw dataset; the relative path means the CSV must sit in the working directory.
df = pd.read_csv('energydata_complete.csv')

In [5]:
# Remove the 'lights' and 'date' columns. The result is assigned rather than
# mutating the frame with inplace=True, which hides state changes between cells.
df = df.drop(columns=['lights', 'date'])

In [6]:
# Rescale every remaining column with MinMaxScaler and rebuild a DataFrame
# that keeps the original column names.
# NOTE(review): the scaler is fit on all rows, before any train/test split is
# made, so rows that later land in the test set influence the scaling.
scaler = MinMaxScaler()

normalised_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [7]:
# Independent copy of the scaled frame; the multi-feature models below are built from it.
df_new = normalised_df.copy()

# QUESTION 12

In [8]:
# Single-feature regression: T2 is the predictor, T6 the target.
# reshape(-1, 1) turns the 1-D T2 column into a 2-D array with one column,
# signifying a single feature.
features = normalised_df['T2'].values.reshape(-1, 1)
output = normalised_df['T6']

In [9]:
# Fit a linear regression of T6 on T2 using every row of the scaled data.
linear_model = LinearRegression()
linear_model.fit(features, output)

LinearRegression()

In [10]:
# Score the model on the same data it was fitted on (no held-out set here).
linear_model.score(features, output)

0.6418990830855493

- The R-squared value, to 2 d.p., is **0.64**

In [11]:
# Target is the scaled 'Appliances' column; every other scaled column is a feature.
features_df = df_new.drop('Appliances', axis='columns')
heating_target = df_new['Appliances']

# Hold out 30% of the rows for testing; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features_df, heating_target, test_size=0.3, random_state=42)

In [12]:
def my_model(model, Xtrain, ytrain, Xtest, ytest):
    """Fit ``model`` on the training data and print its test-set metrics.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        ``fit`` is called on the object passed in, so the caller's estimator
        is (re)fitted by this function.
    Xtrain, ytrain : training features and target handed to ``model.fit``.
    Xtest, ytest : evaluation features and target used for every metric.

    Returns
    -------
    None
        RMSE, MAE, r2 and the residual sum of squares are printed.
    """
    model.fit(Xtrain, ytrain)
    pred_val = model.predict(Xtest)
    print('RMSE: ', np.sqrt(mean_squared_error(ytest, pred_val)))
    print('MAE: ', mean_absolute_error(ytest, pred_val))
    print('r2: ', r2_score(ytest, pred_val))
    # Use the ``ytest`` argument, not the notebook-level ``y_test``, so the RSS
    # is computed against the same targets as the other three metrics.
    print('rss: ', np.sum(np.square(ytest - pred_val)))

# QUESTION 13, 14, 15, 16

In [13]:
# Refit the same linear_model object on the full training feature set (this
# replaces the earlier T2 -> T6 fit) and print its metrics on the test split.
my_model(linear_model, X_train, y_train, X_test, y_test)

RMSE:  0.08751308708368538
MAE:  0.05013310576571433
r2:  0.14892896213691764
rss:  45.34621777303779


Mean Absolute Error: **0.05** to 2d.p

Residual Sum of Squares: **45.35** to 2d.p

Root Mean Squared Error: **0.088** to 3d.p

Coefficient of Determination: **0.15** to 2d.p

# QUESTION 17

In [14]:
def get_weights_df(model, feat, col_name):
    """Return a fitted model's coefficients as a two-column table.

    ``model.coef_`` is paired with ``feat.columns`` and sorted ascending by
    coefficient. The result has a fresh 0..n-1 index, a 'Features' column
    holding the feature names, and a column named ``col_name`` holding the
    coefficient values.
    """
    sorted_coefs = pd.Series(data=model.coef_, index=feat.columns).sort_values()
    # Move the feature names out of the index into an ordinary column.
    table = sorted_coefs.to_frame().reset_index()
    table.columns = ['Features', col_name]
    return table

In [15]:
# Coefficients of the multi-feature linear model, sorted from lowest to highest.
get_weights_df(linear_model, X_train, 'Linear_Model_Weight')

Unnamed: 0,Features,Linear_Model_Weight
0,rv2,-63471560000.0
1,RH_2,-0.4566222
2,T_out,-0.3218423
3,T2,-0.2361131
4,T9,-0.1899177
5,RH_8,-0.1575747
6,RH_out,-0.07767868
7,RH_7,-0.04462568
8,RH_9,-0.03980029
9,T5,-0.01566246


Of the options offered in the MCQ, and going by the dataframe above,
**RH_2** has the lowest weight and **RH_1** has the highest weight

# QUESTION 18

In [16]:
# Ridge regression with regularisation strength alpha=0.4.
ridge_reg = Ridge(alpha=0.4)

In [17]:
# Fit the ridge model on the training split and print its test-set metrics.
my_model(ridge_reg, X_train, y_train, X_test, y_test)

RMSE:  0.08753385704628003
MAE:  0.050087445840923825
r2:  0.14852493545092593
rss:  45.36774486216903


RMSE for linear model is **0.08751308708368538**

RMSE for ridge is **0.08753385704628003**

The two values differ (from the fifth decimal place onwards), so the RMSE does change

Answer: **YES**

# QUESTION 19

In [18]:
# Lasso regression with regularisation strength alpha=0.001.
lasso_reg = Lasso(alpha=0.001)

In [19]:
# Fit the lasso model on the training split and print its test-set metrics.
my_model(lasso_reg, X_train, y_train, X_test, y_test)

RMSE:  0.09358170467245137
MAE:  0.055256639821262235
r2:  0.026800880567125818
rss:  51.85336739590869


In [20]:
# NOTE(review): despite the names `ridge` and 'Ridge_Weight', this table holds
# the coefficients of the Lasso model (`lasso_reg`), not the ridge model.
ridge = get_weights_df(lasso_reg, X_train, 'Ridge_Weight')

In [21]:
# Keep only the rows whose weight is not exactly zero.
answer = ridge.loc[ridge['Ridge_Weight'] != 0]
answer

Unnamed: 0,Features,Ridge_Weight
0,RH_out,-0.049557
1,RH_8,-0.00011
24,Windspeed,0.002912
25,RH_1,0.01788


In [22]:
# Count of features with a non-zero weight.
len(answer)

4

The number of non-zero feature weights is **4**

# QUESTION 20

From Question 19 above, the RMSE for Lasso Regression is **0.094** to 3 d.p.