### Problem 

The task is to predict the energy usage of a house based on Internet of Things (IoT) measurements of temperature, humidity, and weather observations. 

### Relevant Paper

Data driven prediction models of energy use of appliances in a low-energy house. Luis M. Candanedo, Véronique Feldheim, Dominique Deramaix. Energy and Buildings, Volume 140, 1 April 2017, Pages 81-97, ISSN 0378-7788, http://dx.doi.org/10.1016/j.enbuild.2017.01.083.

### Data Source

The authors provide their code and data in a [GitHub repository](https://github.com/LuisM78/Appliances-energy-prediction-data).

The repository contains the complete data (energydata_complete.csv) as well as the training and testing splits used in the paper.

In [None]:
import os
import re
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt 
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the old names; fall back to 'seaborn' on older versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import RFE, RFECV

## Reading Data

In [None]:
# Load the complete dataset plus the train/test splits stored alongside it.
csv_paths = (
    'appliances-energy-prediction-data/energydata_complete.csv',
    'appliances-energy-prediction-data/training.csv',
    'appliances-energy-prediction-data/testing.csv',
)
complete_data, train_data, test_data = map(pd.read_csv, csv_paths)

# Report (rows, columns) of each frame as a quick sanity check.
for label, frame in (
    ("complete_data shape: ", complete_data),
    ("train_data shape: ", train_data),
    ("test_data shape: ", test_data),
):
    print(label, frame.shape)

In [None]:
# Preview the first rows of the training data. A bare last expression gives the
# notebook's rich table rendering, consistent with the later `train_data.head()` cells.
train_data.head()

In [None]:
# Distinct WeekStatus labels present in the training data
week_status_labels = set(train_data['WeekStatus'])
print(week_status_labels)

In [None]:
# Distinct Day_of_week labels present in the training data
day_of_week_labels = set(train_data['Day_of_week'])
print(day_of_week_labels)

In [None]:
# Data preparation for modeling: index every frame by its parsed 'date' column
# so that datetime attributes (e.g. weekday) can be read from the index.
for frame in (complete_data, train_data, test_data):
    frame.index = pd.to_datetime(frame['date'])
train_data.head()

### Converting 'Day_of_week' from 'string' to 'Numbers'

In [None]:
# Overwrite Day_of_week with the weekday number of the datetime index
# (pandas convention: Monday=0 ... Sunday=6).
for split in (train_data, test_data):
    split['Day_of_week'] = split.index.weekday

### Converting WeekStatus from 'string' to 'Numbers' using get_dummies

In [None]:
# Changing 'WeekStatus' into a single 0/1 indicator column.
#
# pd.get_dummies returns one indicator column per category, so assigning its
# whole result to a single column is ambiguous (recent pandas raises on it).
# Pick one indicator column explicitly instead.
#
# The guard makes the cell safe to re-run: once the column is numeric it is
# left untouched rather than being re-encoded (which would flip the flag).
if not pd.api.types.is_numeric_dtype(train_data['WeekStatus']):
    # Take the category levels from the training split so that train and test
    # are encoded against the same reference level. Levels are sorted, which is
    # the column order get_dummies itself produces.
    status_levels = sorted(train_data['WeekStatus'].dropna().unique())
    flagged_level = status_levels[0]  # rows with this level become 1, all others 0
    for frame in (train_data, test_data):
        indicators = pd.get_dummies(frame['WeekStatus']).reindex(
            columns=status_levels, fill_value=0
        )
        frame['WeekStatus'] = indicators[flagged_level].astype(int)
train_data.head()

### Split the data into train data and train label, test data and test label

In [None]:
# Separate the predictors from the target ('Appliances') for both splits.
# The raw 'date' column is dropped as well; the parsed timestamps live on the index.
non_feature_columns = ['Appliances', 'date']
X_train, y_train = train_data.drop(columns=non_feature_columns), train_data['Appliances']
X_test, y_test = test_data.drop(columns=non_feature_columns), test_data['Appliances']

for label, values in (
    ("X_train shape: ", X_train),
    ("y_train shape: ", y_train),
    ("X_test shape: ", X_test),
    ("y_test shape: ", y_test),
):
    print(label, values.shape)

In [None]:
# Baseline: ordinary least squares regression on all available features.
lr_model = linear_model.LinearRegression()
lr_model.fit(X=X_train, y=y_train)

### Evaluation metrics on test data

In [None]:
# Score the fitted baseline on the held-out test split.
y_test_pred = lr_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)  # reused below for the RMSE
print("MSE: ", test_mse)
print("RMSE: ", np.sqrt(test_mse))
print("R^2: ", r2_score(y_test, y_test_pred))
print("MAE: ", mean_absolute_error(y_test, y_test_pred))

### Evaluation metrics on train data

In [None]:
# Score the fitted baseline on the data it was trained on, for comparison with the test scores.
y_train_pred = lr_model.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)  # reused below for the RMSE
print("MSE: ", train_mse)
print("RMSE: ", np.sqrt(train_mse))
print("R^2: ", r2_score(y_train, y_train_pred))
print("MAE: ", mean_absolute_error(y_train, y_train_pred))

### Checking the impact of number of features

In [None]:
# For every subset size, let recursive feature elimination (RFE) keep that many
# features, refit the linear model on them and record the test RMSE.
# RMSE_list[k] is the score obtained with k + 1 selected features.
RMSE_list = []
n_total_features = X_train.shape[1]  # sweep up to the full feature set instead of a fixed bound
for i in range(1, n_total_features + 1):
    model = linear_model.LinearRegression()
    # n_features_to_select must be passed by keyword: scikit-learn >= 1.0
    # rejects it as a positional argument.
    selector = RFE(model, n_features_to_select=i)
    selector.fit(X_train, y_train)
    y_predicted_test = selector.predict(X_test)
    RMSE_list.append(np.sqrt(mean_squared_error(y_test, y_predicted_test)))

In [None]:
# Plot test RMSE against the number of selected features.
# RMSE_list[k] holds the score for k + 1 features (the sweep starts at 1), so the
# frame is indexed from 1; a default 0-based index would shift the curve one step
# to the left of what the x-axis label claims.
RMSE_df = pd.DataFrame(
    {'RMSE': RMSE_list},
    index=pd.RangeIndex(1, len(RMSE_list) + 1),
)
plt.figure(figsize=(20,10))
sns.lineplot(data=RMSE_df)
plt.title('Effect of increasing number of features on RMSE value')
plt.xlabel('No. of features')
plt.ylabel('RMSE');

### Finding most important features

In [None]:
# Use recursive feature elimination (RFE) around a linear model to keep the
# 15 features it ranks highest, then list their column names.
model = linear_model.LinearRegression()
top_15 = 15
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
selector = RFE(model, n_features_to_select=top_15)
selector.fit(X_train, y_train)
y_predicted_test = selector.predict(X_test)
# Boolean mask over X_train's columns: True for the features RFE retained.
support_var = selector.get_support()
print("Top 15 important features: ", X_train.columns[support_var])