# Forecasting Energy Demand 

## Modeling
My approach treats the problem as a standard supervised regression task. Given a set of features – the time and weather information – we want to build a model that can predict the continuous target, energy consumption. The model is trained on historical energy consumption, using both the features and the target, and can then be used to make predictions for future dates where only the features are known.

* Train/Test Split
* Scale all features using a min-max scaler
* Fit data with model
* Evaluate model


In [None]:
# Install xgboost into the notebook instance's conda environment.
# `-y` skips the confirmation prompt; `-c conda-forge` pulls the package from the conda-forge channel.
!conda install -y -c conda-forge xgboost

In [None]:
# Import AWS and Sagemaker SDKs and get files access
import boto3
import io
from sagemaker import get_execution_role

# Execution role as returned by the SageMaker SDK (not referenced again in the cells shown here).
role = get_execution_role()
# Name of the S3 bucket that the data-loading cell builds its s3:// paths from.
bucket ='sagemaker-data-energy-demand'

In [None]:
import sys
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
plt.style.use('fivethirtyeight')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error


# XGBoost 
import xgboost as xgb

In [None]:
# Load the pre-split training and validation frames for one city from S3.

CITY = 'LosAngeles'

# Object keys of the two CSV files inside the bucket.
train_key, validation_key = (
    'dataframes/%s_reframed_train.csv' % CITY,
    'dataframes/%s_reframed_validation.csv' % CITY,
)

# Full s3:// URIs, built from the bucket name and each key.
train_location, validation_location = (
    's3://{}/{}'.format(bucket, key) for key in (train_key, validation_key)
)

# Both frames use the 'datetime' column as their index.
df_train, df_validation = (
    pd.read_csv(location, index_col='datetime')
    for location in (train_location, validation_location)
)

In [None]:
# One single-column frame per split, each keeping the split's own index.
plot_train = df_train['demand(t)'].to_frame()
plot_validation = df_validation['demand(t)'].to_frame()

# Outer-join on the index so every timestamp of either split gets a row and
# each split ends up in its own, descriptively named column.
to_plot = (
    plot_validation
    .rename(columns={'demand(t)': 'VALIDATION SET'})
    .join(plot_train.rename(columns={'demand(t)': 'TRAINING SET'}), how='outer')
)

# One line trace per split on a shared time axis.
fig = go.Figure(data=[
    go.Scattergl(
        x=to_plot.index,
        y=to_plot['TRAINING SET'],
        mode='lines',
        name='TRAINING SET',
    ),
    go.Scattergl(
        x=to_plot.index,
        y=to_plot['VALIDATION SET'],
        mode='lines',
        name='VALIDATION SET',
        marker_color='rgb(0, 204, 150)',
    ),
])
fig.update_layout(
    title='Los Angeles - Train/Validation Split',
    xaxis_title='Date',
    yaxis_title='Electricity Demand [MWh]',
)
fig.show()

In [None]:
def data_sppliter(df, label):
    """Split a dataframe into a feature frame and a target series.

    Args:
        df: Dataframe holding the feature columns and the target column.
        label: Name of the target column.

    Returns:
        A ``(X, y)`` tuple where ``X`` contains every column of ``df``
        except ``label`` (original column order kept) and ``y`` is
        ``df[label]``.

    Raises:
        ValueError: If ``label`` is not one of the columns of ``df``.
    """
    feature_names = df.columns.tolist()
    # list.remove raises ValueError when the label is missing.
    feature_names.remove(label)
    return df[feature_names], df[label]

# Separate the feature columns from the target column 'demand(t)' for both splits.
X_train, y_train = data_sppliter(df_train, label='demand(t)')
X_validation, y_validation = data_sppliter(df_validation, label='demand(t)')

## Create XGBoost Model

In [None]:
# XGBoost Training Parameter Reference: 
# https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

# Gradient-boosted tree regressor: 960 estimators, each tree limited to depth 5.
# All other hyper-parameters are left at the library defaults.
regressor = xgb.XGBRegressor(max_depth=5, n_estimators=960)

In [None]:
# Fit on the training split. Both splits are also passed as eval_set so that
# evals_result() later exposes a per-round metric history for each of them;
# the plotting cell reads them back as 'validation_0' (training split) and
# 'validation_1' (validation split), matching the order given here.
regressor.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_validation, y_validation)]) 

In [None]:
# Summary statistics of the training target; a handy scale reference when reading the error values.
df_train['demand(t)'].describe()

In [None]:
# Evaluation history recorded during fit, keyed by eval-set name and then by metric name.
eval_result = regressor.evals_result()
# One x-axis position per recorded round, taken from the length of the first eval set's 'rmse' history.
training_rounds = range(len(eval_result['validation_0']['rmse']))

In [None]:
# Scatter the per-round RMSE of both evaluation sets on one chart.
for history_key, series_label in (('validation_0', 'Training Error'),
                                  ('validation_1', 'Validation Error')):
    plt.scatter(x=training_rounds,
                y=eval_result[history_key]['rmse'],
                label=series_label)

plt.title('Training Vs Validation Error')
plt.xlabel('Iteration')
plt.ylabel('RMSE')
plt.grid(True)
plt.legend()
plt.show()

### Feature importance plot
Feature importance is a great way to get a general idea about which features the model is relying on most to make the prediction. This is a metric that simply sums up how many times each feature is split on.
We can see that the hour was most commonly used to split trees, alongside day of year and day of week, while the weather features have low importance.

In [None]:
# Large (14 x 16) figure; presumably sized so each feature label stays legible.
fig, ax = plt.subplots(figsize=(14, 16))
# Draw XGBoost's built-in importance chart for the fitted regressor on that axes.
xgb.plot_importance(regressor, ax=ax)
plt.show()

## Forecast on Validation Set

In [None]:
# Store the model's predictions next to the actuals in the validation frame.
df_validation['demand_prediction'] = regressor.predict(X_validation)
# Stack the validation rows first, then the training rows. df_train has no
# 'demand_prediction' column, so that column is NaN for the training rows.
df_all = pd.concat([df_validation, df_train], sort=False)

In [None]:
# Overlay the forecast on the observed demand.
# df_all holds the validation rows followed by the training rows, so the
# 'demand(t)' trace is the observed demand of BOTH splits (hence 'ACTUAL',
# not 'TRAINING SET'), while 'demand_prediction' only has values for the
# validation rows.
fig = go.Figure()
fig.add_trace(go.Scattergl(y=df_all['demand(t)'], x=df_all.index,
                    mode='markers',
                    name='ACTUAL'))
fig.add_trace(go.Scattergl(y=df_all['demand_prediction'], x=df_all.index,
                    mode='lines',
                    name='PREDICT SET'))
# The predictions were made on the validation split, so the title says so.
fig.update_layout(title='Los Angeles - Forecast On Validation Set',
                   xaxis_title='Date',
                   yaxis_title='Electricity Demand [MWh]')
fig.show()

### Zooming-in at first month of predictions

In [None]:
# Zoom in on the first 720 rows of df_all (30 days, assuming hourly rows).
# df_all starts with the validation rows, so these are validation-period
# actuals and their predictions; the traces are labelled accordingly rather
# than as 'TRAINING SET' / 'TEST SET'.
first_month = df_all.head(720)
fig = go.Figure()
fig.add_trace(go.Scattergl(y=first_month['demand(t)'], x=first_month.index,
                    mode='lines',
                    name='ACTUAL'))
fig.add_trace(go.Scattergl(y=first_month['demand_prediction'], x=first_month.index,
                    mode='lines',
                    name='PREDICT SET'))
fig.update_layout(title='Forecast vs Actuals - First Month',
                   xaxis_title='Date',
                   yaxis_title='Electricity Demand [MWh]')
fig.show()

In [None]:
# Zoom in further: the first 168 rows of df_all (7 days, assuming hourly
# rows). This is a week, not a month, so the variable and the title are
# named for what is actually shown.
first_week = df_all.head(168)
fig = go.Figure()
fig.add_trace(go.Scattergl(y=first_week['demand(t)'], x=first_week.index,
                    mode='markers',
                    name='ACTUAL'))
fig.add_trace(go.Scattergl(y=first_week['demand_prediction'], x=first_week.index,
                    mode='lines',
                    name='PREDICT SET'))
fig.update_layout(title='Forecast vs Actuals - First Week',
                   xaxis_title='Date',
                   yaxis_title='Electricity Demand [MWh]')
fig.show()

### Error Metrics On Validation Set

In [None]:
# mean_squared_error returns the MSE; take the square root so that `rmse`
# really holds the root-mean-squared error, in the same units as the target.
rmse = np.sqrt(mean_squared_error(y_true=df_validation['demand(t)'],
                                  y_pred=df_validation['demand_prediction']))

# Mean absolute error between the actuals and the predictions.
mae = mean_absolute_error(y_true=df_validation['demand(t)'],
                          y_pred=df_validation['demand_prediction'])

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) in percent.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, aligned with ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100.
        Zeros in ``y_true`` get no special handling; the division is
        performed as-is by NumPy.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean() * 100

# Percentage error of the validation predictions.
mape = mean_absolute_percentage_error(
    df_validation['demand(t)'],
    df_validation['demand_prediction'],
)

# Print rmse, mae and mape, one value per line, in that order.
for metric_value in (rmse, mae, mape):
    print(metric_value)

### Look at Worst and Best Predicted Days

In [None]:
# Signed error (actual minus predicted) and its magnitude for every validation row.
df_validation['error'] = df_validation['demand(t)'] - df_validation['demand_prediction']
df_validation['abs_error'] = df_validation['error'].abs()

# Daily averages. The four columns of interest are selected BEFORE
# aggregating, so the mean is not computed for every other column only to be
# thrown away, and a non-numeric column elsewhere in the frame cannot break
# the aggregation.
error_by_day = (
    df_validation
    .groupby(['year(t)', 'month(t)', 'dayofmonth(t)'])[
        ['demand(t)', 'demand_prediction', 'error', 'abs_error']
    ]
    .mean()
)

In [None]:
# Worst predicted days
# Rank by mean absolute error, largest first, so that both over- and
# under-forecasted days are surfaced. Sorting on the signed error in
# ascending order would list only the days where the model over-predicted.
error_by_day.sort_values('abs_error', ascending=False).head(10)

In [None]:
# Best predicted days
# (smallest mean absolute error first; shows the first 10 rows)
error_by_day.sort_values('abs_error', ascending=True).head(10)

## Create Prophet Model
Prophet expects its input dataframe to use specific column names: `ds` for the timestamp and `y` for the target. We will rename our dataframe columns before feeding it into the model.

In [None]:
# Peek at the training data before it is reshaped for Prophet.
# prophet_train is only created in the following cell, so referencing it here
# raises NameError on a fresh top-to-bottom run; preview the source frame instead.
df_train.head()

In [None]:
# Format data for prophet model using ds and y
# Keep ALL training rows: a trailing .head() here would leave the model with
# only five observations to fit on.
prophet_train = (
    df_train
    .reset_index()
    .rename(columns={'datetime': 'ds', 'demand(t)': 'y'})
)

# The 'datetime' index is read from CSV without date parsing, so 'ds' holds
# plain strings and the .dt accessor is not available on it yet. Parse to
# UTC timestamps first, then drop the timezone to get tz-naive values.
prophet_train['ds'] = pd.to_datetime(prophet_train['ds'], utc=True).dt.tz_convert(None)
prophet_train.dtypes

In [None]:
# Setup and train model
# Prophet is not imported by any earlier cell, so bring it into scope here.
try:
    from prophet import Prophet
except ImportError:
    # Releases before 1.0 were published under the name `fbprophet`.
    from fbprophet import Prophet

model = Prophet()
model.fit(prophet_train)

In [None]:
# Build the frame Prophet predicts on from the validation split (no df_test
# is defined in this notebook; the held-out data lives in df_validation).
prophet__to_test = df_validation.reset_index().rename(columns={'datetime': 'ds'})
# 'ds' comes from a string index: parse it to UTC timestamps before dropping
# the timezone, using the same conversion as for the training frame.
prophet__to_test['ds'] = pd.to_datetime(prophet__to_test['ds'], utc=True).dt.tz_convert(None)
prophet_test = model.predict(df=prophet__to_test)

In [None]:
# Plot the forecast with the actuals
f, ax = plt.subplots(figsize=(15, 5))

# Actual demand of the validation split as red points. The string index is
# converted to tz-naive timestamps so the points share the datetime x-axis
# that the Prophet forecast is drawn on.
actual_times = pd.to_datetime(df_validation.index, utc=True).tz_convert(None)
ax.scatter(actual_times, df_validation['demand(t)'], color='r')

# Prophet draws its forecast on the same axes.
fig = model.plot(prophet_test, ax=ax)