# Analysis of Energy Data
## Standard imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline

## Import [data](https://github.com/LuisM78/Appliances-energy-prediction-data)

In [2]:
# Load the measurements, parse the 'date' strings into timestamps, and index
# the rows by that timestamp. drop=False keeps 'date' as a regular column too.
data = (
    pd.read_csv("energydata_complete.csv")
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .set_index("date", drop=False)
)

## Recreate the "number of seconds from midnight" (NSM) column from the date field, with help from [this Stack Overflow answer](https://stackoverflow.com/a/41252517)

In [3]:
def secsFromDT(row):
    """Return the number of whole seconds since midnight for a row.

    Parameters
    ----------
    row : pandas.Series or sequence
        Row whose first element (by position) is a datetime-like object
        exposing ``hour``, ``minute`` and ``second`` attributes.

    Returns
    -------
    int
        ``hour * 3600 + minute * 60 + second``; fractional seconds are ignored.
    """
    # Use explicit positional access on a Series: plain ``row[0]`` on a
    # label-indexed Series is deprecated in recent pandas releases.
    stamp = row.iloc[0] if hasattr(row, "iloc") else row[0]
    # Read the time components directly instead of parsing str(time()), which
    # fails when the timestamp carries microseconds ("HH:MM:SS.ffffff").
    return stamp.hour * 3600 + stamp.minute * 60 + stamp.second
# Compute seconds-from-midnight from the datetime index (vectorised) rather
# than row by row from the 'date' column, so this cell still works when it is
# re-run after 'date' has already been dropped.
stamps = data.index
data['NSM'] = (stamps.hour * 3600 + stamps.minute * 60 + stamps.second).astype('int64')
# The timestamp lives on in the index; errors='ignore' keeps the drop idempotent.
data = data.drop(columns='date', errors='ignore')

## Run linear regression on all fields
Our result is roughly RMSE: 92–95, R^2: 0.17 (the exact figures depend on the train/test split),  
compared to the [paper](http://dx.doi.org/10.1016/j.enbuild.2017.01.083)'s result of RMSE: 94, R^2: 0.18.

In [98]:
# Fixed seed so the train/test split, and therefore the reported metrics,
# are the same on every Restart & Run All.
RANDOM_STATE = 42

# Load all fields but Appliances and test against Appliances.
# Selecting by name avoids relying on 'Appliances' being the first column.
X, y = data.drop(columns="Appliances"), data["Appliances"]

# Save 75% of data for training, 25% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

# Train model and use to predict on our test data
reg = LinearRegression()
model = reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Compute the root-mean-square error and the coefficient of determination (R^2)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2:", r2_score(y_test, y_pred))

RMSE: 95.1658137458819
R^2: 0.16999258490857394
