# Stage Two Quiz Session

In [2]:
# import relevant libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing as prep
from sklearn import linear_model as lm
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [3]:
# Read the appliances energy CSV directly from the UCI archive and preview
# the first two rows to confirm the columns loaded as expected.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv'
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=2)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195


In [4]:
# Remove the columns that are excluded from the regression inputs.
# Reassigning instead of mutating in place, and ignoring labels that are
# already gone, keeps this cell safe to re-run (the in-place form raises
# KeyError on a second execution).
df = df.drop(columns=['date', 'lights'], errors='ignore')

In [5]:
# Rescale every remaining column with a min-max scaler, then separate the
# 'Appliances' column (the target) from the rest (the features).
# NOTE(review): the scaler is fitted on the full frame before any train/test
# split, so test rows influence the scaling — confirm this is intended.
scaler = prep.MinMaxScaler()
scaled_values = scaler.fit_transform(df)
normalised_df = pd.DataFrame(scaled_values, columns=df.columns)
target = normalised_df['Appliances']
features = normalised_df.drop(columns=['Appliances'])

### Question 12

From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the $R^2$ value to two decimal places?

In [6]:
# Single-predictor regression of T6 on T2, scored on the same data it was
# fitted to. The predictor is reshaped to a column because the estimator
# expects a 2-D feature array.
x = features['T2'].to_numpy().reshape(-1, 1)
y = features['T6']

lr = lm.LinearRegression()
model = lr.fit(x, y)
y_pred = model.predict(x)

r_squared = metrics.r2_score(y, y_pred)
print(round(r_squared, 2))

0.64


### Question 13

In [7]:
# Fit ordinary least squares on all features using a 70/30 split with a fixed
# seed; the predictions on the held-out 30% feed the metric cells below.
X, Y = features, target
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
lr = lm.LinearRegression()
model = lr.fit(x_train, y_train)
y_pred = model.predict(x_test)

What is the Mean Absolute Error (in two decimal places)?

In [8]:
# Mean absolute error of the held-out predictions, shown to 2 d.p.
mae = metrics.mean_absolute_error(y_test, y_pred)
print(round(mae, 2))

0.05


### Question 14

What is the Residual Sum of Squares (in two decimal places)?

In [9]:
# Residual sum of squares: square each held-out residual and add them up.
residuals = y_test - y_pred
rss = np.sum(np.square(residuals))
print(round(rss, 2))

45.35


### Question 15

What is the Root Mean Squared Error (in three decimal places)?

In [10]:
# Root mean squared error of the held-out predictions, shown to 3 d.p.
# Computed as sqrt(MSE) rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(round(rmse, 3))

0.088


### Question 16

What is the Coefficient of Determination (in two decimal places)? 

In [11]:
# Coefficient of determination (R^2) on the held-out set, shown to 2 d.p.
r2 = metrics.r2_score(y_test, y_pred)
print(round(r2, 2))

0.15


### Question 17

Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

In [12]:
# Pair each feature name with its fitted coefficient and list them from the
# lowest weight to the highest.
# NOTE(review): rv1 and rv2 carry identical values in the previewed rows; if
# they are duplicates throughout, that collinearity is presumably what drives
# the extreme rv1 coefficient — verify before interpreting it.
weight = pd.DataFrame({
    'features': X.columns,
    'weights': model.coef_,
})
weight = weight.sort_values(by='weights')
weight

Unnamed: 0,features,weights
24,rv1,-29473750000.0
3,RH_2,-0.4567267
18,T_out,-0.3218675
2,T2,-0.2362102
16,T9,-0.1899489
15,RH_8,-0.1576046
20,RH_out,-0.07766786
13,RH_7,-0.04460378
17,RH_9,-0.03979767
8,T5,-0.01565445


### Question 18

Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [13]:
# Ridge regression (L2 penalty, alpha=0.4) on the same seeded 70/30 split used
# for the linear model, so the two RMSE values are directly comparable.
X = features
Y = target
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
ridge = lm.Ridge(alpha=0.4)
model = ridge.fit(x_train, y_train)
y_pred = model.predict(x_test)
# RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
print(round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 3))

0.088


### Question 19

Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?

In [14]:
# Lasso regression (L1 penalty, alpha=0.001) on the same seeded 70/30 split,
# then show only the features whose coefficient is not exactly zero.
X, Y = features, target
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
lasso = lm.Lasso(alpha=0.001)
model = lasso.fit(x_train, y_train)
y_pred = model.predict(x_test)

weight = pd.DataFrame({
    'features': X.columns,
    'weights': model.coef_,
})
non_zero = weight['weights'] != 0
weight.loc[non_zero]

Unnamed: 0,features,weights
1,RH_1,0.01788
15,RH_8,-0.00011
20,RH_out,-0.049557
21,Windspeed,0.002912


### Question 20

What is the new RMSE with the Lasso Regression (in 3 decimal places)?

In [15]:
# Root mean squared error of the lasso predictions, shown to 3 d.p.
# Computed as sqrt(MSE) rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print(round(rmse, 3))

0.094
