In [2]:
# Load the dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

energy_data_link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv'
energy_df = pd.read_csv(energy_data_link)

# Drop the date and light columns
energy_df = energy_df.drop(['date', 'lights'], axis=1)

  import pandas.util.testing as tm


In [3]:
energy_df.shape

(19735, 27)

Question 12
*(R2 value for T2 as X and T6 as y)*

In [4]:
# Normalize the dataset
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalized_df = pd.DataFrame(scaler.fit_transform(energy_df), columns=energy_df.columns)

In [39]:
# extract T2 as X
X = normalized_df['T2'].values.reshape(-1,1)

# extract T6 as y
y = normalized_df['T6'].values.reshape(-1,1)

In [40]:
# Data Split for X and y into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [20]:
X_train.shape

(13814, 1)

In [21]:
X.shape

(19735, 1)

In [22]:
X_test.shape

(5921, 1)

In [23]:
y_test.shape

(5921, 1)

In [24]:
# Linear model
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)

In [26]:
# R2 value for linear model
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
r2.round(2)

0.64

Question 13 
*Mean Absolute Error in 2 decimal places*

In [27]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)

mae.round(2)

0.08

Question 14
*Residual sum of squares*

In [28]:
rss = np.sum(np.square(y_test - y_pred))
rss.round(2)

66.12

Question 15
*Root mean squared error*

In [29]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse.round(3)

0.106

Question 16
*Coefficient of determination*

In [30]:
tss = np.sum(np.square(y_test - y_test.mean()))
coef_det = 1 - (rss/tss)
coef_det.round(2)

0.64

Question 17

In [44]:
X = normalized_df.drop('Appliances', axis=1)
y = normalized_df['Appliances']

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [46]:
def get_weights_df(model, features, column_name):
    weights_series = pd.Series(model.coef_, features.columns)
    weights_df = pd.DataFrame(weights_series).reset_index()
    weights_df.columns = ['Features', column_name]
    weights_df[column_name].round(3)
    return weights_df

In [49]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [50]:
linear_model_weights = get_weights_df(linear_model, X_train, 'Linear_Model_Weight')

In [53]:
linear_model_weights

Unnamed: 0,Features,Linear_Model_Weight
0,T1,-0.003281
1,RH_1,0.553547
2,T2,-0.236178
3,RH_2,-0.456698
4,T3,0.290627
5,RH_3,0.096048
6,T4,0.028981
7,RH_4,0.026386
8,T5,-0.015657
9,RH_5,0.016006


Lowest is RH_2 with -0.456 while
Highest is RH_1 ith 0.553

Question 18

In [54]:
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(X_train, y_train)

Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [55]:
y_pred = ridge_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse.round(3)

0.088

Question 19

In [56]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [57]:
lasso_reg_weights = get_weights_df(lasso_reg, X_train, 'Lasso_Reg_Weight')

In [58]:
lasso_reg_weights

Unnamed: 0,Features,Lasso_Reg_Weight
0,T1,0.0
1,RH_1,0.01788
2,T2,0.0
3,RH_2,-0.0
4,T3,0.0
5,RH_3,0.0
6,T4,-0.0
7,RH_4,0.0
8,T5,-0.0
9,RH_5,0.0


4 non zero

Question 20

In [59]:
y_pred = lasso_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse.round(3)

0.094