# Forecasting Energy Demand 

## Modeling
My approach treats the problem as a standard supervised regression task. Given a set of features – the time and weather information – we want to build a model that can predict the continuous target, energy consumption. The model is trained on historical data, where both the features and the target are known, and can then be used to make predictions for future dates where only the features are known.

* Train/Test Split
* Scale all features using a min-max scaler
* Fit data with model
* Evaluate model


In [None]:
import numpy as np 
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
plt.style.use('fivethirtyeight')

import warnings
warnings.filterwarnings('ignore')


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# XGBoost
import xgboost as xgb
from xgboost import plot_importance, plot_tree

# Prophet
from fbprophet import Prophet

# LightGBM
import lightgbm as lgbm

In [None]:
# Chronological train/test split of the pre-processed Los Angeles frame.
# Rows whose index is on or before the split date go to training, strictly
# later rows go to testing.
# NOTE(review): the string comparison presumably relies on a datetime index --
# confirm against the pickle's contents.
split_date = '01-Feb-2019'
df = pd.read_pickle('data/LosAngeles_reframed.pkl')

df_train = df[df.index <= split_date].copy()
df_test = df[df.index > split_date].copy()

In [None]:
# One-column frames holding only the target, one per split (the index is
# carried over from the source frames).
plot_train = df_train[['demand(t)']]
plot_test = df_test[['demand(t)']]

# Outer-join the two splits on the index so that each split becomes its own
# column in a single frame.
to_plot = plot_test.rename(columns={'demand(t)': 'TEST SET'}).join(
    plot_train.rename(columns={'demand(t)': 'TRAINING SET'}),
    how='outer',
)

# One line per split, drawn on a shared time axis.
fig = go.Figure()
fig.add_trace(go.Scattergl(
    x=to_plot.index,
    y=to_plot['TRAINING SET'],
    mode='lines',
    name='TRAINING SET',
))
fig.add_trace(go.Scattergl(
    x=to_plot.index,
    y=to_plot['TEST SET'],
    mode='lines',
    name='TEST SET',
    marker_color='rgb(0, 204, 150)',
))
fig.update_layout(
    title='Los Angeles - Train/Test Split',
    xaxis_title='Date',
    yaxis_title='Electricity Demand [MWh]',
)
fig.show()

In [None]:
def data_sppliter(df, label):
    """Split a frame into its feature columns and its target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the target column and all feature columns.
    label : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the ``label`` column (column
        order preserved) and ``y`` is the ``label`` column.

    Raises
    ------
    ValueError
        If ``label`` is not one of the columns of ``df``.
    """
    feature_names = df.columns.tolist()
    # list.remove raises ValueError when the label is absent.
    feature_names.remove(label)
    return df[feature_names], df[label]

# Separate the predictors from the 'demand(t)' target, first for the training
# split and then for the test split.
(X_train, y_train), (X_test, y_test) = (
    data_sppliter(frame, label='demand(t)') for frame in (df_train, df_test)
)

## Create XGBoost Model

In [None]:
# Hold out the most recent 10% of the training rows as a validation set for
# early stopping. Monitoring (X_test, y_test) instead would let the test set
# choose the number of boosting rounds and make the test metrics optimistic.
# NOTE(review): assumes the rows of X_train are in chronological order, so the
# tail is the most recent period -- confirm.
n_valid = max(1, len(X_train) // 10)
X_fit, y_fit = X_train.iloc[:-n_valid], y_train.iloc[:-n_valid]
X_valid, y_valid = X_train.iloc[-n_valid:], y_train.iloc[-n_valid:]

reg = xgb.XGBRegressor(n_estimators=960)
reg.fit(X_fit, y_fit,
        # With several eval sets, XGBoost uses the last one for early stopping.
        eval_set=[(X_fit, y_fit), (X_valid, y_valid)],
        early_stopping_rounds=50,
        verbose=False)

### Feature importance plot
Feature importance is a great way to get a general idea of which features the model relies on most to make its predictions. The metric plotted here simply counts how many times each feature is used to split a tree.
We can see that the hour was most commonly used to split trees, alongside day of year and day of week, while the weather features have low importance.

In [None]:
# Bar chart of how often each feature is used in a split ('weight' is the
# library default, spelled out here for clarity). The returned axes object is
# bound to `_` so the notebook does not echo it.
_ = plot_importance(reg, height=0.8, importance_type='weight')

## Forecast on Test Set

In [None]:
# Store the model's forecast for the test rows next to the actual values,
# then stack the test frame on top of the training frame (test rows first).
df_test['demand_prediction'] = reg.predict(X_test)
df_all = pd.concat((df_test, df_train), sort=False)

In [None]:
# Actual demand for every row of df_all (training and test rows alike) versus
# the model forecast. The first trace is therefore labelled as actual demand
# rather than as the training set.
fig = go.Figure()
fig.add_trace(go.Scattergl(y=df_all['demand(t)'], x=df_all.index,
                    mode='markers',
                    name='ACTUAL'))
fig.add_trace(go.Scattergl(y=df_all['demand_prediction'], x=df_all.index,
                    mode='lines',
                    name='PREDICT SET'))
fig.update_layout(title='Los Angeles - Forecast On Test',
                   xaxis_title='Date',
                   yaxis_title='Electricity Demand [MWh]')
fig.show()

### Zooming in on the first month of predictions

In [None]:
# df_all starts with the test rows, so its first 720 rows are the start of the
# test period: the actuals are test-set values and the second trace is the
# model's prediction for them.
# NOTE(review): 720 rows is 30 days only if the data is hourly -- confirm.
first_month = df_all.head(720)
fig = go.Figure()
fig.add_trace(go.Scattergl(y=first_month['demand(t)'], x=first_month.index,
                    mode='lines',
                    name='TEST SET'))
fig.add_trace(go.Scattergl(y=first_month['demand_prediction'], x=first_month.index,
                    mode='lines',
                    name='PREDICT SET'))
fig.update_layout(title='Forecast vs Actuals - First Month',
                   xaxis_title='Date',
                   yaxis_title='Electricity Demand [MWh]')
fig.show()

In [None]:
# Closer zoom: the first 168 rows of df_all (the start of the test period).
# NOTE(review): 168 rows is 7 days only if the data is hourly -- confirm. The
# window is named and titled as a week on that basis.
first_week = df_all.head(168)
fig = go.Figure()
fig.add_trace(go.Scattergl(y=first_week['demand(t)'], x=first_week.index,
                    mode='markers',
                    name='TEST SET'))
fig.add_trace(go.Scattergl(y=first_week['demand_prediction'], x=first_week.index,
                    mode='lines',
                    name='PREDICT SET'))
fig.update_layout(title='Forecast vs Actuals - First Week',
                   xaxis_title='Date',
                   yaxis_title='Electricity Demand [MWh]')
fig.show()

### Error Metrics On Test Set

In [None]:
# Root mean squared error: mean_squared_error returns the MSE, so take the
# square root to get a value in the same units as the demand.
rmse = np.sqrt(mean_squared_error(y_true=df_test['demand(t)'],
                                  y_pred=df_test['demand_prediction']))

# Mean absolute error of the forecast on the test rows.
mae = mean_absolute_error(y_true=df_test['demand(t)'],
                   y_pred=df_test['demand_prediction'])

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are converted to NumPy arrays first, so values are paired by
    position. Entries of ``y_true`` equal to zero are not treated specially
    and result in a division by zero.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    relative_error = (actual - forecast) / actual
    return 100 * np.mean(np.abs(relative_error))

# Percentage error of the forecast on the test rows.
mape = mean_absolute_percentage_error(df_test['demand(t)'],
                                      df_test['demand_prediction'])

# Print the three metric variables, one per line.
print(rmse, mae, mape, sep='\n')

### Look at Worst and Best Predicted Days

In [None]:
# Signed and absolute forecast error for every test row.
df_test['error'] = df_test['demand(t)'] - df_test['demand_prediction']
df_test['abs_error'] = df_test['error'].abs()

# Daily averages. The four columns are selected before aggregating so that
# the mean is not computed over every other column of df_test (wasted work,
# and an error on non-numeric columns in recent pandas versions).
error_by_day = (
    df_test
    .groupby(['year(t)', 'month(t)', 'dayofmonth(t)'])
    [['demand(t)', 'demand_prediction', 'error', 'abs_error']]
    .mean()
)

In [None]:
# Worst predicted days: largest absolute error first. Sorting on the signed
# 'error' column (actual minus prediction) in ascending order would list only
# the most over-forecast days and miss badly under-forecast ones.
error_by_day.sort_values('abs_error', ascending=False).head(10)

In [None]:
# Best predicted days: the ten rows of error_by_day with the smallest
# 'abs_error'.
error_by_day.sort_values(by='abs_error').head(n=10)

## Create Prophet Model
Prophet expects its input dataframe to use specific column names: `ds` for the timestamp and `y` for the target. We will rename our dataframe columns accordingly before feeding the data into the model.

In [None]:
# Preview the training frame that is about to be reshaped for Prophet.
# `prophet_train` is only created in the next cell, so referencing it here
# raises NameError when the notebook is run top to bottom.
df_train.head()

In [None]:
# Format data for prophet model using ds and y.
# The whole training frame is kept: a trailing .head() here would cut the
# data down to five rows before the model is fitted.
# reset_index() returns a new frame, so df_train itself is left untouched.
prophet_train = (
    df_train
    .reset_index()
    .rename(columns={'datetime': 'ds', 'demand(t)': 'y'})
)

# tz_convert(tz=None) converts the timestamps to UTC and removes the timezone
# information, leaving a timezone-naive 'ds' column.
prophet_train['ds'] = prophet_train['ds'].dt.tz_convert(tz=None)
prophet_train.dtypes

In [None]:
# Create a Prophet model with its default settings and fit it on the
# reshaped training frame.
model = Prophet()
model.fit(df=prophet_train)

In [None]:
# Reshape the test split the same way as the training data (timestamp column
# renamed to 'ds', timezone removed) and run the fitted model on it.
prophet__to_test = df_test.reset_index().rename(columns={'datetime': 'ds'})
prophet__to_test['ds'] = prophet__to_test['ds'].dt.tz_convert(None)
prophet_test = model.predict(prophet__to_test)

In [None]:
# Plot the forecast with the actuals
f, ax = plt.subplots(figsize=(15, 5))
# Red dots are the observed demand ('demand(t)'). The 'demand_prediction'
# column holds the XGBoost forecast, not the actual series, so it is not the
# right column to compare the Prophet forecast against.
ax.scatter(df_test.index, df_test['demand(t)'], color='r')
fig = model.plot(prophet_test, ax=ax)