In [1]:
import numpy as np
import pandas as pd

Thanks to Nicholas Jhana's example: https://github.com/nicholasjhana/short-term-energy-demand-forecasting

# Problem:

## Which forecasting model and supervised learning problem formulation give the lowest Mean Absolute Error (MAE) under limited computing power?

## Models Used:

1. SARIMA
2. Prophet
3. Long Short-Term Memory (LSTM) Neural Network

## Analysis Process/Core Tasks:

1. Data Preprocessing/Cleaning
2. Exploratory Data Analysis (EDA)
3. Statistical Modeling
4. Machine Learning (ML)

# Data Processing
## 3 Types of Data: Energy, Weather, Holiday

Looking at samples of each data type.

In [4]:
# Load the raw load records (columns: time, day_forecast, actual_load)
# and display the frame as the cell output.
energy_data = pd.read_csv(
    'data/raw_data/energy_data/load_forecast_2016_2018.csv'
)
energy_data

Unnamed: 0,time,day_forecast,actual_load
0,2016-01-01 00:00:00,23273.0,22431.0
1,2016-01-01 01:00:00,22495.0,21632.0
2,2016-01-01 02:00:00,21272.0,20357.0
3,2016-01-01 03:00:00,20022.0,19152.0
4,2016-01-01 04:00:00,19148.0,18310.0
...,...,...,...
26299,2018-12-31 19:00:00,30619.0,30653.0
26300,2018-12-31 20:00:00,29932.0,29735.0
26301,2018-12-31 21:00:00,27903.0,28071.0
26302,2018-12-31 22:00:00,25450.0,25801.0


In [5]:
# Load the raw weather observations (one row per timestamp and city)
# and display the frame as the cell output.
weather_data = pd.read_csv(
    'data/raw_data/weather_data/weather_2013_2019.csv'
)
weather_data

Unnamed: 0.1,Unnamed: 0,dt,dt_iso,city_id,city_name,temp,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,rain_1h,rain_3h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,0,2013-10-01 02:00:00,2013-10-01 00:00:00 +0000 UTC,2509954,Valencia,299.150000,299.150000,299.150000,1008,61,5,290,0.0,0.0,0.0,20,801,clouds,few clouds,02n
1,1,2013-10-01 03:00:00,2013-10-01 01:00:00 +0000 UTC,2509954,Valencia,298.150000,298.150000,298.150000,1009,65,4,250,0.0,0.0,0.0,20,801,clouds,few clouds,02n
2,2,2013-10-01 04:00:00,2013-10-01 02:00:00 +0000 UTC,2509954,Valencia,296.161000,296.161000,296.161000,1009,71,4,269,0.0,0.0,0.0,10,800,clear,sky is clear,02
3,3,2013-10-01 05:00:00,2013-10-01 03:00:00 +0000 UTC,2509954,Valencia,297.150000,297.150000,297.150000,1008,69,1,250,0.0,0.0,0.0,20,801,clouds,few clouds,02n
4,4,2013-10-01 06:00:00,2013-10-01 04:00:00 +0000 UTC,2509954,Valencia,294.031687,294.031687,294.031687,1009,78,4,288,0.0,0.0,0.0,0,800,clear,sky is clear,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262995,262995,2019-08-25 22:00:00,2019-08-25 20:00:00 +0000 UTC,6361046,Seville,300.490000,299.260000,302.150000,1012,47,3,250,0.0,0.0,0.0,0,800,clear,clear sky,01n
262996,262996,2019-08-25 23:00:00,2019-08-25 21:00:00 +0000 UTC,6361046,Seville,299.750000,298.150000,301.150000,1013,54,3,260,0.0,0.0,0.0,0,800,clear,clear sky,01n
262997,262997,2019-08-26 00:00:00,2019-08-25 22:00:00 +0000 UTC,6361046,Seville,298.830000,297.150000,300.150000,1013,60,2,310,0.0,0.0,0.0,0,800,clear,clear sky,01n
262998,262998,2019-08-26 01:00:00,2019-08-25 23:00:00 +0000 UTC,6361046,Seville,297.890000,296.480000,300.150000,1013,57,2,299,0.0,0.0,0.0,0,800,clear,clear sky,01n


In [7]:
# # Data Creation: Day Types
# 
# This notebook builds a function that adds the type of day for each calendar day.
# 
# Including the type of day as a short-term predictor in energy load forecasts has been shown to be useful [here](https://www.mdpi.com/1996-1073/12/1/164/pdf) and [here](https://www.mdpi.com/1996-1073/11/5/1120/pdf). The types of days (exogenous variables) generated by this function are:
# 
# - named day of the week
# - weekend or weekday
# - holiday or special event
# 

#import relevant libraries
from datetime import date
import pandas as pd
import holidays

def get_holidays(start='1/1/2018', stop='31/12/2018', country='ES'):
    """
    Build a daily calendar of weekday ids and public-holiday flags.

    Takes in a start and stop date and a country.
    start, stop - first and last day of the range; passed unchanged to
                  pd.date_range as start= and end=
    country - country code handed to the holidays package

    Produces a dataframe with a daily date time index and columns:
    weekday_id - numerical day of the week identifier 0 for monday
    holiday_bool - boolean true or false for holiday
    holiday_name - name of the holiday if holiday_bool is true,
                   missing otherwise

    Returns a dataframe
    """

    #generate the range of daily dates
    dates = pd.date_range(start=start, end=stop)

    #create the holiday object; newer releases of the holidays package
    #expose country_holidays() and deprecate CountryHoliday, so prefer the
    #new name and fall back to the old one when it is not available
    make_calendar = getattr(holidays, 'country_holidays', None)
    if make_calendar is None:
        make_calendar = holidays.CountryHoliday
    country_calendar = make_calendar(country)

    #create a dataframe of weekday categories
    days = pd.DataFrame(list(dates.weekday), index=dates, columns=['weekday_id'])

    #one [holiday_bool, holiday_name] row per date; the loop variable is
    #named `day` so it does not shadow the `date` class from datetime
    holiday_rows = [
        [day in country_calendar, country_calendar.get(day)]
        for day in dates
    ]

    #create the holiday dataframe on the same daily index
    holidays_data = pd.DataFrame(holiday_rows, index=dates, columns=['holiday_bool', 'holiday_name'])

    #join the days and the holidays_data dataframes
    return pd.concat([days, holidays_data], axis=1)
# Preview the result for the default arguments (2018 calendar, country 'ES').
get_holidays()

Unnamed: 0,weekday_id,holiday_bool,holiday_name
2018-01-01,0,True,Año nuevo
2018-01-02,1,False,
2018-01-03,2,False,
2018-01-04,3,False,
2018-01-05,4,False,
...,...,...,...
2018-12-27,3,False,
2018-12-28,4,False,
2018-12-29,5,False,
2018-12-30,6,False,
