In [87]:
# Import AWS and Sagemaker SDKs and get files access
import boto3
import io
from sagemaker import get_execution_role

role = get_execution_role()
bucket ='sagemaker-data-energy-demand'

In [88]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from datetime import datetime
import requests
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [136]:
# Import full dataset
CITY = 'LosAngeles'
all_key = 'dataframes/%s/all/data_all.csv' % CITY
all_location = 's3://{}/{}'.format(bucket, all_key)

all_df = pd.read_csv(all_location, index_col='datetime')

xgb_key = 'predict/%s/pred_xgboost.csv' % CITY
xgb_location = 's3://{}/{}'.format(bucket, xgb_key)

xgb_df = pd.read_csv(xgb_location, index_col='datetime')

lgb_key = 'predict/%s/pred_lightgbm.csv' % CITY
lgb_location = 's3://{}/{}'.format(bucket, lgb_key)

lgb_df = pd.read_csv(lgb_location, index_col='datetime')

In [137]:
lgb_df.tail()

Unnamed: 0_level_0,LightGBM_demand_prediction
datetime,Unnamed: 1_level_1
2020-01-11 17:00:00+00:00,2814.571329
2020-01-11 18:00:00+00:00,2807.36513
2020-01-11 19:00:00+00:00,2778.159244
2020-01-11 20:00:00+00:00,2751.57785
2020-01-11 21:00:00+00:00,2734.469753


In [171]:
EIA__API_KEY = '1d48c7c8354cc4408732174250d3e8ff'
REGION_CODE = 'LDWP'
CITY = 'LosAngeles'

def str_to_isodatetime(string):
    year = string[:4]
    month = string[4:6]
    day =  string[6:8]
    time = string[8:11] + ':00:00+0000'
    return year + month + day + time

def eia2dataframe(response):
    '''
    This function unpacks the JSON file from EIA API into a pandas dataframe.
    '''
    data = response['series'][0]['data']
    dates = []
    values = []
    for date, demand in data:
        if demand is None or demand <= 0:
            dates.append(str_to_isodatetime(date))
            values.append(np.nan)      
            continue   
#       dates.append(str_to_isodatetime(date))
        dates.append(date)
        values.append(float(demand))
    df = pd.DataFrame({'datetime': dates, 'EIA_demand_forecast': values})
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['EIA_demand_forecast'] = df['EIA_demand_forecast'].interpolate()
    df.set_index('datetime', inplace=True)
    df.sort_index(ascending=True, inplace=True, kind='mergesort')
    return df

electricity_api_response = requests.get('http://api.eia.gov/series/?api_key=%s&series_id=EBA.%s-ALL.DF.H' % (EIA__API_KEY, REGION_CODE)).json()
electricity_df = eia2dataframe(electricity_api_response)
electricity_df.head()

Unnamed: 0_level_0,EIA_demand_forecast
datetime,Unnamed: 1_level_1
2015-07-01 08:00:00+00:00,3475.0
2015-07-01 09:00:00+00:00,3118.0
2015-07-01 10:00:00+00:00,2938.0
2015-07-01 11:00:00+00:00,2800.0
2015-07-01 12:00:00+00:00,2743.0


In [175]:
electricity_df.tail(150)

Unnamed: 0_level_0,EIA_demand_forecast
datetime,Unnamed: 1_level_1
2020-01-11 03:00:00+00:00,3097.0
2020-01-11 04:00:00+00:00,3039.0
2020-01-11 05:00:00+00:00,2909.0
2020-01-11 06:00:00+00:00,2777.0
2020-01-11 07:00:00+00:00,2659.0
2020-01-11 08:00:00+00:00,2526.0
2020-01-11 09:00:00+00:00,2234.0
2020-01-11 10:00:00+00:00,2184.0
2020-01-11 11:00:00+00:00,2096.0
2020-01-11 12:00:00+00:00,2050.0


In [155]:
print('*** min ***')
print(min(electricity_df.index))
print(min(all_df.index))
print('*** max ***')
print(max(electricity_df.index))
print(max(all_df.index))

*** min ***
2015-07-01 08:00:00+00:00
2015-07-01 08:00:00+00:00
*** max ***
2020-01-17 08:00:00+00:00
2020-01-11 21:00:00+00:00


In [156]:
## Cut dataframes based on date to align sources
electricity_df = electricity_df[:str(all_df.index.max())]
all_df = all_df[str(electricity_df.index.min()):]

print('*** min ***')
print(min(electricity_df.index))
print(min(all_df.index))
print(str(all_df.index.min()) == str(electricity_df.index.min()))
print('*** max ***')
print(max(electricity_df.index))
print(max(all_df.index))
print(str(electricity_df.index.max()) == all_df.index.max())

*** min ***
2015-07-01 08:00:00+00:00
2015-07-01 08:00:00+00:00
True
*** max ***
2020-01-11 21:00:00+00:00
2020-01-11 21:00:00+00:00
True


In [157]:
observed = all_df[['demand(t)']]

# join demand forecast with demand to align dataframes
join = observed.join(electricity_df, how='left')

# delete first entry due to supervised reframe
join = join.iloc[1:]

In [168]:
join.tail()

Unnamed: 0_level_0,demand(t),EIA_demand_forecast
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-11 17:00:00+00:00,2777.0,2398.0
2020-01-11 18:00:00+00:00,2771.0,2468.0
2020-01-11 19:00:00+00:00,2734.0,2500.0
2020-01-11 20:00:00+00:00,2695.0,2502.0
2020-01-11 21:00:00+00:00,2635.0,2488.0


In [159]:
lastrow_date = all_df.tail(1).index.values[0]
               
datetime_lastrow = datetime.strptime(str(lastrow_date)[:-6], '%Y-%m-%d %H:%M:%S')
split_test_date = datetime_lastrow - timedelta(hours=360)
split_test_date_str = str(split_test_date)
#electricity_compare = join[['demand_forecast']].values[split_test_date:, :]

electricity_compare = join[split_test_date_str:]

In [167]:
electricity_compare.tail()

Unnamed: 0_level_0,demand(t),EIA_demand_forecast
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-11 17:00:00+00:00,2777.0,2398.0
2020-01-11 18:00:00+00:00,2771.0,2468.0
2020-01-11 19:00:00+00:00,2734.0,2500.0
2020-01-11 20:00:00+00:00,2695.0,2502.0
2020-01-11 21:00:00+00:00,2635.0,2488.0


In [160]:
# find indices where no value was recorded for demand forecast
nan_inds = np.where(np.isnan(electricity_compare)==True)[0]

In [161]:
nan_percent = len(nan_inds) / float(len(electricity_compare))
print(nan_percent)

0.0


In [162]:
electricity_compare.shape

(361, 2)

In [164]:
lgb_df.tail()

Unnamed: 0_level_0,LightGBM_demand_prediction
datetime,Unnamed: 1_level_1
2020-01-11 17:00:00+00:00,2814.571329
2020-01-11 18:00:00+00:00,2807.36513
2020-01-11 19:00:00+00:00,2778.159244
2020-01-11 20:00:00+00:00,2751.57785
2020-01-11 21:00:00+00:00,2734.469753


In [176]:
electricity_compare = electricity_compare.join(xgb_df, how='left')
electricity_compare = electricity_compare.join(lgb_df, how='left')

In [178]:
def mean_absolute_percentage_error(y_true, y_pred): 
    """Calculates MAPE given y_true and y_pred"""
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def evaluate_model(y_true, y_pred, model_name):
    """Evaluates model given y_true and y_pred"""
    rmse = mean_squared_error(y_true=y_true,
                       y_pred=y_pred)

    mae = mean_absolute_error(y_true=y_true,
                       y_pred=y_pred)
    
    mape = mean_absolute_percentage_error(y_true=y_true,
                   y_pred=y_pred)

    return rmse, mae, mape, model_name

rmses = []
maes = []
mapes = []
names = []

for col in electricity_compare.columns:
    if col=='demand(t)':
        continue
    else:
        rmse_score, mae_score, mape_score, model_name = evaluate_model(electricity_compare['demand(t)'], electricity_compare[col], col)

    rmses.append(rmse_score)
    maes.append(mae_score)
    mapes.append(mape_score)
    names.append(model_name.split('_')[0])

model_results = pd.DataFrame({'Model': names, 'RMSE': rmses, 'MAE': maes, 'MAPE':mapes})
print(model_results.sort_values(by='MAPE', ascending=True))

      Model          RMSE         MAE      MAPE
1   XGBoost   2392.739876   32.643458  1.141291
2  LightGBM   3498.306137   42.559748  1.491249
0       EIA  62602.324100  215.144044  7.759170
