# Forecasting Energy Demand 

## Exploratory Data Analysis (EDA)
The main question we want to answer is what is the relationship between the independent variables (weather features) and the dependent variable (electricity demand).

In [1]:
import boto3
import io
from sagemaker import get_execution_role

role = get_execution_role()
bucket ='sagemaker-data-energy-demand'

In [6]:
import datetime
import pandas as pd
from scipy import stats
from pandas.io.json import json_normalize
import numpy as np

import scipy
import statsmodels.api as sm 

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import register_matplotlib_converters
%matplotlib inline
register_matplotlib_converters()


import warnings
warnings.filterwarnings('ignore')

In [None]:
CITY = 'LosAngeles'
df_file = 'dataframes/%s_dataset.csv' % CITY

df_location = 's3://{}/{}'.format(bucket, df_file)
df = pd.read_csv(df_location, index_col ='datetime')
df.tail()

In [5]:
# get correlation coefficients for demand
print('DEMAND CORRELATIONS (PEARSON)')
print(df.corr()['demand'].sort_values(ascending=False)[1:])

# get r^2 values per column and print
demand_r = {}
for col in df.columns:
    if col != 'demand':
        if col != 'date':
            slope, intercept, r_value, p_value, std_err = scipy.stats.stats.linregress(df['demand'], df[col])
            demand_r[col] = float(r_value**2)

print(' ')
print('***')
print(' ')
print('DEMAND CORRELATIONS (r^2)')
demand_r_df = pd.DataFrame({'col': list(demand_r.keys()), 'r^2': list(demand_r.values())})
print(demand_r_df.sort_values(by='r^2', ascending=False))

DEMAND CORRELATIONS (PEARSON)
dailycoolingdegreedays       0.555718
hourlyrelativehumidity       0.363718
hourlydewpointtemperature    0.361610
isbusinessday                0.151979
quarter                      0.123807
weekofyear                   0.118949
dayofyear                    0.118331
month                        0.117201
hourlyskyconditions_CLR      0.030972
dayofmonth                   0.026249
hourlydrybulbtemperature     0.008105
hourlyskyconditions_OVC      0.007924
isholiday                    0.003312
hourlyvisibility            -0.004904
hourlyprecipitation         -0.019708
hourlyskyconditions_BKN     -0.030940
hourlyskyconditions_SCT     -0.035543
hourlyskyconditions_FEW     -0.036784
hourlycoolingdegrees        -0.052520
hour                        -0.066183
hourlyheatingdegrees        -0.078769
dayofweek                   -0.102261
year                        -0.164289
dailyheatingdegreedays      -0.210142
hourlywindspeed             -0.226223
hourlystationpressur

NameError: name 'scipy' is not defined

## Feature selection

In [None]:
#importing libraries
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [None]:
columns = df.columns.tolist()
columns = [c for c in df.columns if c not in ['date', 'demand']]
X = df[columns]  
y = df["demand"]          

In [None]:
df.shape

In [None]:
# Number of features
nof_list = np.arange(1,df.shape[1])            
high_score = 0

#Variable to store the optimum features
nof=0           
score_list =[]

for n in range(len(nof_list)):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    model = LinearRegression()
    rfe = RFE(model,nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train,y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe,y_train)
    score = model.score(X_test_rfe,y_test)
    score_list.append(score)
    if(score>high_score):
        high_score = score
        nof = nof_list[n]
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

In [None]:
## colocar em ordem de importância para o modelo e testar com todas features e com as selecionadas
## criar season decompose!!!


cols = list(X.columns)
model = LinearRegression()

#Initializing RFE model
rfe = RFE(model, nof)     

#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y)  

#Fitting the data to model
model.fit(X_rfe,y)              
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = temp[temp==True].index
print(selected_features_rfe)

In [None]:
# save as csv file to continue in another notebook
selected = df[selected_features_rfe].copy()
selected['demand'] = df['demand']

csv_buffer = io.StringIO()
s3_resource = boto3.resource('s3')
key = 'dataframes/%s_selectedfeatures.csv' % CITY

selected.to_csv(csv_buffer, compression=None)
s3_resource.Object(bucket, key).put(Body=csv_buffer.getvalue())