# Implementation

## Packages

In [1]:
# data elaboration functions
import pandas as pd
from six.moves import collections_abc
import string
import numpy as np

# datetime functions
import datetime as dt

# file management functions
import os
import sys
import opendatasets as od
import pickle
from pathlib import Path

# plot functions
import matplotlib.pyplot as plt
%matplotlib inline

# data science functions
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import joblib
from sklearn.metrics import mean_absolute_error

# configuration file
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from Configuration.config import cfg_path

# custom functions
from Code.Plotting.plots import Plots
from Code.Regressors.regressors import Regressors
from Code.Scoring.scoring import Scoring
from Code.Scoring.train_test import TrainTest
from Code.Scoring.train import Training
from Code.Scoring.forecast import Forecasting
from Code.Scoring.kpi import Kpi
from Code.Scoring.scoring import Scoring
from Code.Utils.utils import Utils


## Setup

In [2]:
# Data provenance: the raw dataset can be fetched once from Kaggle with
# od.download("https://www.kaggle.com/arashnic/building-sites-power-consumption-dataset/download")

# Project root = parent of the directory the notebook is executed from.
root = Path.cwd().parent

## Load Data

In [3]:
# Load the pickled dataset stored in the configured output directory.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
df_final_path = os.path.join(root, cfg_path.data_dir.output_path, 'df_final.pkl')
df_final = pd.read_pickle(df_final_path)

# Name of the date column as detected by the project helper (it also prints what it found).
date_var = Utils.find_date(df_final)

# Preview the first rows.
df_final.head()

find_date, date_col found: ['timestamp']


Unnamed: 0,site_id,timestamp,obs_id,forecast_id,value,holidays,day_off,surface,base_temperature,wd_mon,...,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12,months_days,temperature_asis
0,2,2015-11-02,7390465.0,26.0,1492533.0,0,0,6098.278376,18.0,1,...,0,0,0,0,0,0,1,0,11/02,13.247917
1,3,2015-11-02,,,,0,0,10556.293605,18.0,1,...,0,0,0,0,0,0,1,0,11/02,13.252083
2,5,2015-11-02,,,,0,0,12541.181277,18.0,1,...,0,0,0,0,0,0,1,0,11/02,13.241667
3,6,2015-11-02,1413383.0,129.0,854147.9,0,0,9150.195373,18.0,1,...,0,0,0,0,0,0,1,0,11/02,13.24375
4,7,2015-11-02,,,,0,0,15168.125971,18.0,1,...,0,0,0,0,0,0,1,0,11/02,13.245833


# Define model_01_thermal

## Parameter setup

In [4]:
# Registry of model configurations, keyed by model name.
m = 'model_01_thermal'
dict_models = {
    m: {
        # Column identifying a site, and the columns that together identify one row.
        'id': 'site_id',
        'list_unique_id': ['site_id', 'timestamp'],
        # Target column to forecast.
        'y': 'value',
        # Train/test window. Per the original author's note, leaving these four as ''
        # takes the latest year as test set and the previous year as train set
        # (the dates are resolved later by TrainTest.define_train_test_set_dates).
        'train_start_date': '',
        'train_end_date': '',
        'test_start_date': '',
        'test_end_date': '',
        'test_size': 0.33,
    }
}

## Regressors dictionary

In [5]:
# Have a look at the available regressors: display every column of df_final so the
# names used in the regressor lists of the following cells can be checked against it
df_final.columns

Index(['site_id', 'timestamp', 'obs_id', 'forecast_id', 'value', 'holidays',
       'day_off', 'surface', 'base_temperature', 'wd_mon', 'wd_tue', 'wd_wed',
       'wd_thu', 'wd_fri', 'wd_sat', 'wd_sun', 'month_01', 'month_02',
       'month_03', 'month_04', 'month_05', 'month_06', 'month_07', 'month_08',
       'month_09', 'month_10', 'month_11', 'month_12', 'months_days',
       'temperature_asis'],
      dtype='object')

In [6]:
# Check regressors availability
# Candidate regressors: every column of df_final except the identifiers and the target.
regressors_list = (
    ['holidays', 'day_off', 'surface', 'base_temperature']
    + ['wd_' + day for day in ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')]
    + ['month_%02d' % month for month in range(1, 13)]
    + ['months_days', 'temperature_asis']
)

# The check runs up to the last date present in the data.
forecast_end_date = df_final[date_var].max()
Utils.check_regressors_availability(df_final, date_var, regressors_list, forecast_end_date)

Regressor holidays has all needed values
Regressor day_off has all needed values
Regressor surface has all needed values
Regressor base_temperature has all needed values
Regressor wd_mon has all needed values
Regressor wd_tue has all needed values
Regressor wd_wed has all needed values
Regressor wd_thu has all needed values
Regressor wd_fri has all needed values
Regressor wd_sat has all needed values
Regressor wd_sun has all needed values
Regressor month_01 has all needed values
Regressor month_02 has all needed values
Regressor month_03 has all needed values
Regressor month_04 has all needed values
Regressor month_05 has all needed values
Regressor month_06 has all needed values
Regressor month_07 has all needed values
Regressor month_08 has all needed values
Regressor month_09 has all needed values
Regressor month_10 has all needed values
Regressor month_11 has all needed values
Regressor month_12 has all needed values
Regressor months_days has all needed values
Regressor temperature

In [7]:
# Compile dictionary of regressors, grouped by family.
# NOTE(review): 'wd_wed' and 'month_06' are left out of their groups, presumably as
# reference categories for the dummy variables -- confirm with the model owner.
dict_regressors = {
    'list_temp': ['temperature_asis'],
    'holidays': ['holidays'],
    'wd': ['wd_fri', 'wd_mon', 'wd_tue', 'wd_sat', 'wd_sun', 'wd_thu'],
    'month': ['month_%02d' % month for month in range(1, 13) if month != 6],
    # Filled by the interaction / non linear cells that follow.
    'additional_regressors': [],
}

#### Interaction terms

In [8]:
# Add optional interaction terms.
# Each set names two groups of dict_regressors; one interaction is requested for every
# pair (i, j) taken from the first and the second group.
dict_interactions = {'set_01': {'reg_list_01': 'list_temp', 'reg_list_02': 'month'}}
for r, interaction_set in dict_interactions.items():
    list_element = list(interaction_set.keys())
    if len(list_element) == 2:
        reg_list_01 = interaction_set[list_element[0]]
        reg_list_02 = interaction_set[list_element[1]]
        for i in dict_regressors[reg_list_01]:
            for j in dict_regressors[reg_list_02]:
                # Return value is not used: the helper appears to add the column to df_final in place.
                Regressors.create_interactions(df_final, i, j)
    else:
        # An interaction set must name exactly two regressor groups (fewer is as invalid as more).
        print('Define model: list of elements in interactions is not exactly 2', r, list_element)

# Interaction columns are recognised by the '*' in their name.
list_interactions = list(df_final.filter(like='*').columns)
for e in list_interactions:
    # Skip names already registered so that re-running this cell does not duplicate regressors.
    if e not in dict_regressors['additional_regressors']:
        dict_regressors['additional_regressors'].append(e)

#### Non linear terms

In [9]:
# Add optional non linear terms.
# Each set maps exactly one variable name to the parameter n passed to the helper
# (presumably the power to raise the variable to -- confirm in Regressors.create_non_linear_terms).
dict_non_linear_terms = {'set_01':{'temperature_asis': 2}}
for r, term_set in dict_non_linear_terms.items():
    list_element = list(term_set.keys())
    if len(list_element) == 1:
        var = list_element[0]
        n = term_set[var]
        # Return value is not used: the helper appears to add the column to df_final in place.
        Regressors.create_non_linear_terms(df_final, var, n)
    else:
        # Reported once per invalid set (empty sets included).
        print('Define model: list of elements in non linear terms is not exactly 1', r, list_element)

# Non linear columns are recognised by the '^' in their name.
list_non_linear_terms = list(df_final.filter(like='^').columns)
for e in list_non_linear_terms:
    # Skip names already registered so that re-running this cell does not duplicate regressors.
    if e not in dict_regressors['additional_regressors']:
        dict_regressors['additional_regressors'].append(e)

## Algorithms dictionary

In [10]:
# Define algorithms to test, these algorithms will be used in backtesting
n_jobs = -1  # -1 asks scikit-learn to use all available cores

rf_regressor = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=0, n_jobs=n_jobs)
lr_regressor = LinearRegression(n_jobs=n_jobs)
xgb_regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.5,
    max_depth=5,
    alpha=10,
    n_estimators=50,
)

dict_algorithms = {
    'RF_Regressor': rf_regressor,
    'LR_Regressor': lr_regressor,
    'XGB_Regressor': xgb_regressor,
    # Always add a forecasting algorithm used to produce forecasts out of sample.
    # This is the very same estimator instance as 'LR_Regressor', not a copy.
    'out_of_sample': lr_regressor,
}

## Adding regressors and algorithms to model dictionary

In [11]:
# Attach the regressor groups and the candidate algorithms to the model configuration.
dict_models['model_01_thermal'].update({
    'regressors': dict_regressors,
    'algorithms': dict_algorithms,
})
print('dict_models is the following:', dict_models)

dict_models is the following: {'model_01_thermal': {'id': 'site_id', 'list_unique_id': ['site_id', 'timestamp'], 'y': 'value', 'train_start_date': '', 'train_end_date': '', 'test_start_date': '', 'test_end_date': '', 'test_size': 0.33, 'regressors': {'list_temp': ['temperature_asis'], 'holidays': ['holidays'], 'wd': ['wd_fri', 'wd_mon', 'wd_tue', 'wd_sat', 'wd_sun', 'wd_thu'], 'month': ['month_01', 'month_02', 'month_03', 'month_04', 'month_05', 'month_07', 'month_08', 'month_09', 'month_10', 'month_11', 'month_12'], 'additional_regressors': ['temperature_asis*month_01', 'temperature_asis*month_02', 'temperature_asis*month_03', 'temperature_asis*month_04', 'temperature_asis*month_05', 'temperature_asis*month_07', 'temperature_asis*month_08', 'temperature_asis*month_09', 'temperature_asis*month_10', 'temperature_asis*month_11', 'temperature_asis*month_12', 'temperature_asis^2']}, 'algorithms': {'RF_Regressor': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
       

## Training

### Training parameters

In [12]:
# Date column as detected by the project helper (prints what it found).
date_var = Utils.find_date(df_final)

# Identifier settings of the model being trained.
model_params = dict_models['model_01_thermal']
list_unique_id = model_params['list_unique_id']
# NOTE(review): `id` shadows the Python builtin; the name is kept because later cells read it.
id = model_params['id']

find_date, date_col found: ['timestamp']


### Site stats

In [13]:
# Per-site statistics computed by the project helper.
site_stats = Scoring.stats_per_site(df_final, id, date_var)

# Selecting Sites with most samples: keep the ids of the first five rows.
# NOTE(review): assumes stats_per_site returns rows ordered by descending sample count -- confirm.
top5_sites = site_stats[id].iloc[:5]
print("SitesIds with most samples", top5_sites.to_list())

SitesIds with most samples [42, 19, 22, 50, 49]


### Training dataframe

In [14]:
# Keep only the rows belonging to the selected sites.
top_sites_mask = df_final[id].isin(top5_sites)
df = df_final.loc[top_sites_mask]

print('Actual id list:', list(df[id].unique()))
# Every column that is not part of the unique row identifier (the target is included in this list).
print('Actual regressors available:', list(set(df.columns) - set(list_unique_id)))

Actual id list: [19, 22, 42, 49, 50]
Actual regressors available: ['day_off', 'month_12', 'months_days', 'temperature_asis*month_05', 'month_10', 'wd_wed', 'temperature_asis^2', 'wd_sat', 'wd_tue', 'forecast_id', 'month_07', 'temperature_asis*month_09', 'temperature_asis*month_10', 'obs_id', 'temperature_asis*month_08', 'month_05', 'temperature_asis*month_01', 'temperature_asis*month_02', 'month_09', 'temperature_asis*month_07', 'month_04', 'month_02', 'temperature_asis*month_04', 'temperature_asis*month_12', 'temperature_asis', 'wd_fri', 'wd_mon', 'surface', 'value', 'month_08', 'temperature_asis*month_03', 'holidays', 'month_03', 'wd_thu', 'month_11', 'month_01', 'wd_sun', 'base_temperature', 'temperature_asis*month_11', 'month_06']


# Forecasting

In [15]:
# For every model and every site: resolve the train/test windows, pick the best algorithm
# on the test window, retrain it and store the forecast in dict_results[model][site].
dict_results = {}
for m in dict_models:
    print('Forecasting with model', m)
    dict_results[m] = {}

    # Get dict of algorithms
    dict_algorithms = dict_models[m]['algorithms']

    # Get list of regressors: flatten every regressor group into one list
    list_regressors = []
    for reg in dict_models[m]['regressors']:
        list_regressors.extend(dict_models[m]['regressors'][reg])
    # Drop repeated names (order preserved) so the column selection below cannot
    # produce duplicated columns, e.g. after re-running a regressor cell.
    list_regressors = list(dict.fromkeys(list_regressors))

    print('Actual regressors used in model:', m, list_regressors)

    # Define columns to keep: list unique id, y and list of regressors
    list_unique_id = dict_models[m]['list_unique_id']
    y = dict_models[m]['y']
    # NOTE(review): `id` shadows the Python builtin; kept because later cells read it.
    id = dict_models[m]['id']
    cols_to_keep = list_unique_id + [y] + list_regressors

    for s in list(df[id].unique()):
        print('Forecasting', id, s)
        dict_results[m][s] = {}
        # Work on an independent copy restricted to the current site
        df_sliced = df.loc[df[id] == s, cols_to_keep].copy()

        # Adding train and test set dates
        dict_train_test_set = TrainTest.define_train_test_set_dates(
            df_sliced,
            y,
            dict_models[m]['train_start_date'],
            dict_models[m]['train_end_date'],
            dict_models[m]['test_start_date'],
            dict_models[m]['test_end_date'],
            dict_models[m]['test_size'],
        )

        # Scoring
        try:
            dict_train = TrainTest.def_train(df_sliced, y, list_unique_id, dict_train_test_set['train_start_date'], dict_train_test_set['train_end_date'])
            dict_test = TrainTest.def_test(df_sliced, y, list_unique_id, dict_train_test_set['test_start_date'], dict_train_test_set['test_end_date'])
            best_algorithm = Scoring.find_best_algorithm(y, dict_train, dict_test, dict_algorithms, out_of_sample='out_of_sample')
            trained_model = Training.train(dict_train, dict_algorithms[best_algorithm])
            forecasted_model = Forecasting.forecast(dict_test, trained_model)
            dict_results[m][s] = {
                'best_algorithm': best_algorithm,
                'historical_data': dict_train['historical_data'],
                'forecast': forecasted_model['df_fcst'],
                'train_start_date': dict_train_test_set['train_start_date'],
                'train_end_date': dict_train_test_set['train_end_date'],
                'test_start_date': dict_train_test_set['test_start_date'],
                'test_end_date': dict_train_test_set['test_end_date'],
            }
            print(id, s, 'forecasting completed with model', m)
        except ValueError as e:
            # Stop the run with a readable message, explicitly chained to the original error.
            raise Exception(f"{id} {s} could not be forecasted: {e}") from e

Forecasting with model model_01_thermal
Actual regressors used in model: model_01_thermal ['temperature_asis', 'holidays', 'wd_fri', 'wd_mon', 'wd_tue', 'wd_sat', 'wd_sun', 'wd_thu', 'month_01', 'month_02', 'month_03', 'month_04', 'month_05', 'month_07', 'month_08', 'month_09', 'month_10', 'month_11', 'month_12', 'temperature_asis*month_01', 'temperature_asis*month_02', 'temperature_asis*month_03', 'temperature_asis*month_04', 'temperature_asis*month_05', 'temperature_asis*month_07', 'temperature_asis*month_08', 'temperature_asis*month_09', 'temperature_asis*month_10', 'temperature_asis*month_11', 'temperature_asis*month_12', 'temperature_asis^2']
Forecasting site_id 19
find_date, date_col found: ['timestamp']
find_date, date_col found: ['timestamp']
Train start date is already a date
Train start date is 2015-11-02 00:00:00
Train end date is already a date
Train end date is 2017-03-06 00:00:00
Train shape before removing nan is 491
Min date AFTER removing nan is 2015-11-02 00:00:00
Max

# Finalize

### Create csv as per input format of PowerBI

In [16]:
# Build one long table (history + forecast for every model and site) in the layout
# consumed by PowerBI.
list_df_merged = []
for m in dict_results:
    for s in dict_results[m]:
        print('Preparing PBI for', id, s)
        result = dict_results[m][s]
        df_historical_data = result['historical_data'].reset_index(drop=True)
        df_forecast = result['forecast'].reset_index(drop=True)
        # validate='1:1' makes the merge fail if a date appears more than once on either side
        df_merged = pd.merge(df_historical_data, df_forecast, on=date_var, how='outer', validate='1:1')
        df_merged.loc[:, id] = s
        df_merged.loc[:, 'model'] = m
        df_merged.loc[:, 'best_algorithm'] = result['best_algorithm']
        for date_key in ('train_start_date', 'train_end_date', 'test_start_date', 'test_end_date'):
            df_merged.loc[:, date_key] = result[date_key]
        list_df_merged.append(df_merged)

if list_df_merged:
    # Concatenate once instead of growing the frame inside the loop, then derive the
    # difference column and sort a single time on the complete table.
    df_pbi = pd.concat(list_df_merged, axis=0, ignore_index=True)
    # `y` is the target column name set by the forecasting cell.
    df_pbi['energy_diff'] = df_pbi['fcst'] - df_pbi[y]
    df_pbi = df_pbi.sort_values(by=date_var)
else:
    # No results to export: keep an empty frame, as the loop-free path did before.
    df_pbi = pd.DataFrame({})

df_pbi.head()

Preparing PBI for site_id 19
Preparing PBI for site_id 22
Preparing PBI for site_id 42
Preparing PBI for site_id 49
Preparing PBI for site_id 50


Unnamed: 0,timestamp,value,fcst,site_id,model,best_algorithm,train_start_date,train_end_date,test_start_date,test_end_date,energy_diff
0,2015-11-02,474560.3,,19,model_01_thermal,RF_Regressor,2015-11-02,2017-03-06,2017-03-07,2022-12-31,
1,2015-11-02,637088.0,,42,model_01_thermal,RF_Regressor,2015-11-02,2017-03-26,2017-03-27,2022-12-31,
2,2015-11-02,366825.0,,22,model_01_thermal,LR_Regressor,2015-11-02,2017-03-06,2017-03-07,2022-12-31,
3,2015-11-03,2454051.0,,22,model_01_thermal,LR_Regressor,2015-11-02,2017-03-06,2017-03-07,2022-12-31,
4,2015-11-03,0.0,,19,model_01_thermal,RF_Regressor,2015-11-02,2017-03-06,2017-03-07,2022-12-31,


### Compute KPI

In [17]:
# Run the three row-level error helpers in sequence, each one receiving the frame
# returned by the previous one: error, absolute error, absolute percentage error.
for compute_kpi in (Kpi.compute_error, Kpi.compute_absolute_error, Kpi.compute_absolute_percentage_error):
    df_pbi = compute_kpi(df_pbi, 'fcst', y)

# Aggregate metrics over the whole table (all models and sites together), rounded to 2 decimals.
mae = Kpi.compute_mae(df_pbi, 'fcst', y)
print("MAE:",  round(mae, 2))
mape = Kpi.compute_mape(df_pbi, 'fcst', y)
print("MAPE:", round(mape, 2))


MAE: 347644.38
MAPE: 0.91


### Plotting results

In [18]:
# One chart and one csv export per site.
y = 'value'
for s in list(df_pbi[id].unique()):
    # Rows of the current site
    df_plot = df_pbi.loc[df_pbi[id] == s]
    # Algorithm recorded for this site (first distinct value)
    best_model = df_plot['best_algorithm'].unique()[0]
    chart_title = f"Energy prediction - {id}: {s} - algorithm used: {best_model}"
    saving_name = '_'.join([str(id), str(s), 'energy_prediction'])

    plot = Plots.sliding_fcst_plot(df_plot, y, 'fcst', chart_title, kpi=True)

    # Data goes to the configured output directory, the interactive chart to the plot directory.
    csv_path = os.path.join(root, cfg_path.data_dir.output_path, saving_name + ".csv")
    df_plot.to_csv(csv_path)
    html_path = os.path.join(root, cfg_path.data_dir.plot_path, saving_name + ".html")
    plot.write_html(html_path)
