# Forecasting Energy Demand 

## Exploratory Data Analysis (EDA)
The main question we want to answer is: what is the relationship between the independent variables (weather features) and the dependent variable (electricity demand)?

In [None]:
import boto3
from sagemaker import get_execution_role

# Execution role returned by the SageMaker SDK for this notebook session.
role = get_execution_role()

# Name of the S3 bucket used for the s3:// paths built in later cells.
bucket = 'sagemaker-data-energy-demand'

In [None]:
import datetime
import pandas as pd
from scipy import stats
from pandas.io.json import json_normalize
import numpy as np


import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.plotting import register_matplotlib_converters
%matplotlib inline
register_matplotlib_converters()

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# City whose dataset is analysed in this notebook.
CITY = 'LosAngeles'

# Object key of the dataset inside the bucket, and the full S3 URI built from it.
df_file = 'dataframes/%s_dataset.csv' % CITY
df_location = 's3://{}/{}'.format(bucket, df_file)

# Read the CSV from S3, using its 'datetime' column as the index.
df = pd.read_csv(
    df_location,
    index_col='datetime',
)
df.head()

In [None]:
# Line chart of the 'demand' column plotted against the DataFrame index.
demand_trace = go.Scatter(
    x=df.index,
    y=df['demand'],
    mode='lines',
    name='lines',
)

fig = go.Figure()
fig.add_trace(demand_trace)
fig.update_layout(
    title='Electricity data in Los Angeles',
    xaxis_title='Date',
    yaxis_title='Electricity Demand [MWh]',
)
fig.show()

In [None]:
# One scatter plot of demand per calendar column, coloured by the 'hour' column.
# The low opacity keeps dense regions readable.
calendar_features = ('hour', 'dayofweek', 'weekofyear', 'month', 'year')
for var in calendar_features:
    fig = px.scatter(df, x=var, y="demand", color="hour", opacity=0.1)
    fig.show()

In [None]:
# Scatter demand against every other column with an OLS trendline, keeping
# track of which columns were actually plotted.
skipped = ('hourlyskyconditions', 'demand')
cols_printed = []
for col in df.columns:
    if col not in skipped:
        cols_printed.append(col)
        fig = px.scatter(df, x=col, y="demand", trendline="ols")
        fig.show()

In [None]:
# Display the first rows of the DataFrame.
df.head()

In [None]:
# Pearson correlation of every numeric column with demand, strongest first.
# Non-numeric columns are filtered out explicitly because DataFrame.corr()
# raises on them in pandas >= 2.0 instead of dropping them silently, and the
# self-correlation of demand is removed by label rather than by position so
# the result does not depend on 'demand' sorting first.
print('DEMAND CORRELATIONS (PEARSON) FOR %s' %CITY)
demand_corr = df.select_dtypes(include=['number', 'bool']).corr()['demand']
print(demand_corr.drop('demand').sort_values(ascending=False))

## Feature selection

In [None]:
#importing libraries
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [None]:
# Feature columns: everything except the target and, if present, 'date'.
columns = [c for c in df.columns if c not in ('date', 'demand')]

# Feature matrix and target vector used by the feature-selection cells.
X = df[columns]
y = df["demand"]

In [None]:
# (rows, columns) of the full DataFrame, 'demand' column included.
df.shape

In [None]:
# Find how many features give the best held-out R^2 when recursive feature
# elimination (RFE) is wrapped around a linear regression.

# Candidate feature counts: 1 .. number of columns in X.
nof_list = np.arange(1, X.shape[1] + 1)

# Best score so far. Starts at -inf because R^2 can be negative; starting at 0
# would leave `nof` at 0 whenever no candidate scores above zero.
high_score = float('-inf')

# Variable to store the optimum number of features
nof = 0
score_list = []

# The split is seeded, so it is identical for every candidate; compute it once.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

for n_features in nof_list:
    model = LinearRegression()
    # n_features_to_select is keyword-only in scikit-learn >= 1.0.
    rfe = RFE(model, n_features_to_select=int(n_features))
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    X_test_rfe = rfe.transform(X_test)
    # RFE fits clones of the estimator, so fit `model` itself on the reduced set.
    model.fit(X_train_rfe, y_train)
    score = model.score(X_test_rfe, y_test)
    score_list.append(score)
    if score > high_score:
        high_score = score
        nof = int(n_features)

print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

In [None]:
# TODO: rank the features by importance for the model, and test with all
#       features as well as with the selected ones.
# TODO: create a seasonal decomposition.

# Refit RFE on the full data set with the feature count stored in `nof` and
# report which columns were kept.
cols = list(X.columns)
model = LinearRegression()

# Initializing RFE model (n_features_to_select is keyword-only in
# scikit-learn >= 1.0)
rfe = RFE(model, n_features_to_select=int(nof))

# Transforming data using RFE
X_rfe = rfe.fit_transform(X, y)

# Fitting the data to model
model.fit(X_rfe, y)

# rfe.support_ is a boolean mask aligned with `cols`; use it to index directly.
temp = pd.Series(rfe.support_, index=cols)
selected_features_rfe = temp[temp].index
print(selected_features_rfe)

In [None]:
# Save the selected features plus the target to S3 as CSV text so the work
# can continue in another notebook.
import io  # needed for io.StringIO below

selected = df[selected_features_rfe].copy()
selected['demand'] = df['demand']

# Serialise into an in-memory text buffer rather than a local file.
csv_buffer = io.StringIO()
selected.to_csv(csv_buffer, compression=None)

# NOTE(review): the uploaded body is CSV text but the key ends in '.pkl'.
# The key is left unchanged because readers elsewhere may depend on it --
# confirm and rename together with the consumer.
key = 'data/%s_selectedfeatures.pkl' % CITY

s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, key).put(Body=csv_buffer.getvalue())