# Implementation

## Packages

In [1]:
# data elaboration functions
import pandas as pd
from six.moves import collections_abc
import string
import numpy as np

# datetime functions
import datetime as dt

# file management functions
import os
import sys
import opendatasets as od
import pickle
from pathlib import Path

# plot functions
import matplotlib.pyplot as plt
%matplotlib inline

# data science functions
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import joblib
from sklearn.metrics import mean_absolute_error

# configuration file
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from Configuration.config import cfg_path

# custom functions
from Code.Plotting.plots import Plots
from Code.Regressors.regressors import Regressors
from Code.Scoring.scoring import Scoring
from Code.Scoring.train_test import TrainTest
from Code.Scoring.train import Training
from Code.Scoring.forecast import Forecasting
from Code.Scoring.kpi import Kpi
from Code.Scoring.scoring import Scoring
from Code.Utils.utils import Utils


## Setup

In [2]:
# One-off dataset download, left disabled:
#od.download("https://www.kaggle.com/arashnic/building-sites-power-consumption-dataset/download")

# The project root is the parent of the directory the notebook runs from
root = Path.cwd().parent
# Input data directory, resolved from the configuration file
dataset_path = os.path.join(root, cfg_path.data_dir.input_path)

## Load Data

In [3]:
# Read df_final.pkl from the configured output directory.
# NOTE(review): unpickling can execute arbitrary code - only load files produced by this project.
path_df_final = os.path.join(root, cfg_path.data_dir.output_path, 'df_final.pkl')
df_final = pd.read_pickle(path_df_final)
df_final.head()

Unnamed: 0,site_id,timestamp,obs_id,forecast_id,value,holidays,day_off,surface,base_temperature,wd_mon,...,month_07,month_08,month_09,month_10,month_11,month_12,temperature,distance,DDC_temperature,DDH_temperature
1,13,2015-11-02 00:00:00+00:00,3747176.0,415.0,3870603.0,1,0,891.48785,18.0,1,...,0,0,0,0,1,0,17.333333,28.407896,0.0,0.666667
3,16,2015-11-02 00:00:00+00:00,2912040.0,524.0,2593093.0,1,0,1218.738383,18.0,1,...,0,0,0,0,1,0,24.226667,21.793645,6.226667,0.0
11,21,2015-11-02 00:00:00+00:00,4779740.0,649.0,3349616.0,1,0,10985.292634,18.0,1,...,0,0,0,0,1,0,7.495833,11.902777,0.0,10.504167
13,22,2015-11-02 00:00:00+00:00,662180.0,685.0,366825.0,1,0,7392.365415,18.0,1,...,0,0,0,0,1,0,15.583333,23.726983,0.0,2.416667
17,25,2015-11-02 00:00:00+00:00,3488017.0,773.0,2968195.0,1,0,2201.924904,18.0,1,...,0,0,0,0,1,0,7.6375,16.135872,0.0,10.3625


# Define model_01_thermal

## Parameter setup

In [4]:
# Model configuration, keyed by model name
dict_models = {
    'model_01_thermal': {
        # Column used to split the data per site, and the columns kept as row identifiers
        'id': 'site_id',
        'list_unique_id': ['site_id', 'timestamp'],
        # Target column
        'y': 'value',

        # If the following are ='', it will take the latest year as test set and the previous year as train set
        'train_start_date': '',
        'train_end_date': '',
        'test_start_date': '',
        'test_end_date': '',

        # Forecast scope is the length in days of the desired forecast
        'forecast_scope': 730,
    },
}

## Regressors dictionary

In [5]:
# Have a look at the available regressors:
# display every column name of df_final so that regressors can be picked by name
df_final.columns

Index(['site_id', 'timestamp', 'obs_id', 'forecast_id', 'value', 'holidays',
       'day_off', 'surface', 'base_temperature', 'wd_mon', 'wd_tue', 'wd_wed',
       'wd_thu', 'wd_fri', 'wd_sat', 'wd_sun', 'month_01', 'month_02',
       'month_03', 'month_04', 'month_05', 'month_06', 'month_07', 'month_08',
       'month_09', 'month_10', 'month_11', 'month_12', 'temperature',
       'distance', 'DDC_temperature', 'DDH_temperature'],
      dtype='object')

In [6]:
# Compile dictionary of regressors, grouped by kind.
# 'wd' leaves out wd_wed and 'month' leaves out month_06.
# NOTE(review): presumably these are the reference categories for the dummies - confirm.
list_month_dummies = [f'month_{k:02d}' for k in range(1, 13) if k != 6]
dict_regressors = {
    'list_temp': ['temperature'],
    'holidays': ['holidays'],
    'wd': ['wd_fri', 'wd_mon', 'wd_tue', 'wd_sat', 'wd_sun', 'wd_thu'],
    'month': list_month_dummies,
    'additional_regressors': ['distance'],
}

#### Interaction terms

In [7]:
# Add optional interaction terms.
# Each set names two keys of dict_regressors; every pair (i, j) of their columns is
# passed to Regressors.create_interactions together with df_final.
dict_interactions = {'set_01': {'reg_list_01': 'list_temp', 'reg_list_02': 'month'}}
for r, dict_set in dict_interactions.items():
    list_element = list(dict_set.keys())
    if len(list_element) == 2:
        reg_list_01 = dict_set[list_element[0]]
        reg_list_02 = dict_set[list_element[1]]
        for i in dict_regressors[reg_list_01]:
            for j in dict_regressors[reg_list_02]:
                Regressors.create_interactions(df_final, i, j)
    else:
        # The check is "exactly two", so the message must also cover sets with fewer than two entries
        print('Define model: interactions need exactly 2 regressor lists, found', list_element)

# Columns whose name contains '*' are registered as additional regressors
list_interactions = list(df_final.filter(like='*').columns)
for e in list_interactions:
    # Skip names that are already registered: without this guard, re-running the cell
    # appends every interaction again and the model ends up with duplicated regressors
    if e not in dict_regressors['additional_regressors']:
        dict_regressors['additional_regressors'].append(e)

#### Non linear terms

In [8]:
# Add optional non linear terms.
# Each set maps exactly one variable name to the value handed to
# Regressors.create_non_linear_terms as its third argument.
dict_non_linear_terms = {'set_01':{'temperature': 2}}
for r, dict_set in dict_non_linear_terms.items():
    list_element = list(dict_set.keys())
    if len(list_element) > 1:
        # Report an invalid set once, instead of once per element
        print('Define model: list of elements in non linear terms is more than 1', r, list_element)
        continue
    for var, n in dict_set.items():
        Regressors.create_non_linear_terms(df_final, var, n)

# Columns whose name contains '^' are registered as additional regressors
list_non_linear_terms = list(df_final.filter(like='^').columns)
for e in list_non_linear_terms:
    # Skip names that are already registered: without this guard, re-running the cell
    # appends the same term again and the model ends up with duplicated regressors
    if e not in dict_regressors['additional_regressors']:
        dict_regressors['additional_regressors'].append(e)

## Algorithms dictionary

In [9]:
# Define algorithms to test, keyed by the name reported as 'best_algorithm'
n_jobs = -1
dict_algorithms = {
    'RF_Regressor': RandomForestRegressor(n_estimators=200, max_depth=10, random_state=0, n_jobs=n_jobs),
    'LR_Regressor': LinearRegression(n_jobs=n_jobs),
    'XGB_Regressor': xgb.XGBRegressor(
        objective='reg:squarederror',
        colsample_bytree=0.3,
        learning_rate=0.5,
        max_depth=5,
        alpha=10,
        n_estimators=50,
    ),
}

## Adding regressors and algorithms to model dictionary

In [10]:
# Attach the regressors and the candidate algorithms to the model definition
dict_models['model_01_thermal'].update(
    regressors=dict_regressors,
    algorithms=dict_algorithms,
)
print('dict_models is the following:', dict_models)

dict_models is the following: {'model_01_thermal': {'id': 'site_id', 'list_unique_id': ['site_id', 'timestamp'], 'y': 'value', 'train_start_date': '', 'train_end_date': '', 'test_start_date': '', 'test_end_date': '', 'forecast_scope': 730, 'regressors': {'list_temp': ['temperature'], 'holidays': ['holidays'], 'wd': ['wd_fri', 'wd_mon', 'wd_tue', 'wd_sat', 'wd_sun', 'wd_thu'], 'month': ['month_01', 'month_02', 'month_03', 'month_04', 'month_05', 'month_07', 'month_08', 'month_09', 'month_10', 'month_11', 'month_12'], 'additional_regressors': ['distance', 'temperature*month_01', 'temperature*month_02', 'temperature*month_03', 'temperature*month_04', 'temperature*month_05', 'temperature*month_07', 'temperature*month_08', 'temperature*month_09', 'temperature*month_10', 'temperature*month_11', 'temperature*month_12', 'temperature^2']}, 'algorithms': {'RF_Regressor': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto',

## Training

### Training parameters

In [11]:
# Date column(s) of df_final, as detected by the project helper
date_var = Utils.find_date(df_final)

# Identifier settings of the model.
# NOTE(review): `id` shadows the Python builtin; the name is kept because later cells read it.
_model_cfg = dict_models['model_01_thermal']
list_unique_id = _model_cfg['list_unique_id']
id = _model_cfg['id']

find_date, date_col found: ['timestamp']


### Site stats

In [12]:
site_stats = Scoring.stats_per_site(df_final, id, date_var)

# Selecting Sites with most samples: the ids in the first 5 rows of site_stats
# (assumes stats_per_site returns the sites ordered by sample count - TODO confirm)
top5_sites = site_stats[id].head(5)
print("SitesIds with most samples", top5_sites.to_list())

SitesIds with most samples [22, 49, 42, 25, 33]


### Training dataframe

In [13]:
# Keep only the rows that belong to the selected sites
mask_top_sites = df_final[id].isin(top5_sites)
df = df_final.loc[mask_top_sites]
print('Actual id list:', list(df[id].unique()))

# Every column that is not part of the unique id (this list still contains the target column)
set_available_cols = set(df.columns) - set(list_unique_id)
print('Actual regressors available:', list(set_available_cols))

Actual id list: [22, 25, 33, 42, 49]
Actual regressors available: ['forecast_id', 'day_off', 'month_03', 'month_05', 'distance', 'surface', 'temperature*month_02', 'month_12', 'value', 'month_04', 'month_08', 'wd_sun', 'temperature*month_04', 'temperature*month_05', 'wd_wed', 'temperature*month_10', 'obs_id', 'holidays', 'month_07', 'wd_thu', 'month_10', 'month_02', 'month_11', 'temperature*month_12', 'wd_tue', 'wd_fri', 'temperature*month_03', 'month_01', 'month_06', 'temperature^2', 'temperature*month_01', 'DDH_temperature', 'temperature*month_11', 'temperature*month_08', 'month_09', 'base_temperature', 'wd_mon', 'temperature*month_07', 'temperature*month_09', 'wd_sat', 'DDC_temperature', 'temperature']


# Forecasting

In [14]:
# For every model and every site: resolve the train/test dates, pick the best
# algorithm, retrain it and store the forecast together with the historical data.
dict_results = {}
for m in list(dict_models.keys()):
    print('Forecasting with model', m)
    dict_results[m] = {}

    # Get dict of algorithms
    dict_algorithms = dict_models[m]['algorithms']

    # Get list of regressors: all regressor groups flattened into a single list
    list_regressors = []
    for reg in list(dict_models[m]['regressors'].keys()):
        list_regressors = list_regressors + dict_models[m]['regressors'][reg]

    print('Actual regressors used in model:', m, list_regressors)

    # Define columns to keep: list unique id, y and list of regressors
    list_unique_id = dict_models[m]['list_unique_id']
    y = dict_models[m]['y']
    id = dict_models[m]['id']
    cols_to_keep = dict_models[m]['list_unique_id'] + [y] + list_regressors

    # Train and test set dates as configured for the model ('' means not set)
    train_start_date = dict_models[m]['train_start_date']
    train_end_date = dict_models[m]['train_end_date']
    test_start_date = dict_models[m]['test_start_date']
    test_end_date = dict_models[m]['test_end_date']
    forecast_scope = dict_models[m]['forecast_scope']

    for s in list(df[id].unique()):
        print('Forecasting', id, s)
        dict_results[m][s] = {}
        df_sliced = df.loc[df[id]==s, cols_to_keep].copy()

        # Resolve the dates for this site from the model-level configuration.
        # The result is stored under site-specific names: overwriting the model-level
        # variables here would hand the first site's resolved dates to every following
        # site instead of the configured (possibly empty) values.
        dict_train_test_set = TrainTest.define_train_test_set_dates(df_sliced, train_start_date, train_end_date, test_start_date, test_end_date)

        site_test_end_date = dict_train_test_set['test_end_date']
        site_test_start_date = dict_train_test_set['test_start_date']

        site_train_start_date = dict_train_test_set['train_start_date']
        site_train_end_date = dict_train_test_set['train_end_date']

        # Scoring
        try:
            dict_train = TrainTest.def_train(df_sliced, y, list_unique_id, site_train_start_date, site_train_end_date)
            dict_test = TrainTest.def_test(df_sliced, y, list_unique_id, site_test_start_date, site_test_end_date, forecast_scope)
            best_algorithm = Scoring.find_best_algorithm(y, dict_train, dict_test, dict_algorithms)
            trained_model = Training.train(dict_train, dict_algorithms[best_algorithm])
            forecasted_model = Forecasting.forecast(dict_test, trained_model)
            dict_results[m][s] = {'best_algorithm': best_algorithm, 'historical_data': dict_train['historical_data'], 'forecast': forecasted_model['df_fcst'], 'train_start_date': dict_train['train_start_date'], 'train_end_date': dict_train['train_end_date'], 'test_start_date': dict_test['test_start_date'], 'test_end_date': dict_test['test_end_date']}
            print(id, s, 'forecasting completed with model', m)
        except ValueError as e:
            # Same exception type as before; chained explicitly so the original traceback is kept
            raise Exception(id, s, 'could not be forecasted', e) from e

Forecasting with model model_01_thermal
Actual regressors used in model: model_01_thermal ['temperature', 'holidays', 'wd_fri', 'wd_mon', 'wd_tue', 'wd_sat', 'wd_sun', 'wd_thu', 'month_01', 'month_02', 'month_03', 'month_04', 'month_05', 'month_07', 'month_08', 'month_09', 'month_10', 'month_11', 'month_12', 'distance', 'temperature*month_01', 'temperature*month_02', 'temperature*month_03', 'temperature*month_04', 'temperature*month_05', 'temperature*month_07', 'temperature*month_08', 'temperature*month_09', 'temperature*month_10', 'temperature*month_11', 'temperature*month_12', 'temperature^2']
Forecasting site_id 22
find_date, date_col found: ['timestamp']
find_date, date_col found: ['timestamp']
Train start date is 2015-11-02 00:00:00
Train end date is 2016-11-03 00:00:00
Train shape before removing nan is 336
Min date AFTER removing nan is 2015-11-02 00:00:00
Max date AFTER removing nan is 2016-11-03 00:00:00
Shape AFTER removing nan is 336
find_date, date_col found: ['timestamp']


# Finalize

### Create csv as per input format of PowerBI

In [15]:
# Build one long dataframe (historical data + forecast + run metadata) for PowerBI.
# The per-site frames are collected in a list and concatenated once, instead of
# growing df_pbi with pd.concat on every iteration.
list_df_pbi = []
for m in list(dict_results.keys()):
    for s in list(dict_results[m].keys()):
        dict_site = dict_results[m][s]
        if not dict_site:
            # A site without stored results has nothing to export
            print('No results available for', id, s, '- skipped')
            continue
        print('Preparing PBI for', id, s)
        df_historical_data = dict_site['historical_data']
        df_forecast = dict_site['forecast']
        # Outer merge on the date column(s); validate raises if a date appears more than once on either side
        df_merged = pd.merge(df_historical_data, df_forecast, on = date_var, how='outer', validate='1:1')
        df_merged.loc[:, id] = s
        df_merged.loc[:, 'model'] = m
        df_merged.loc[:, 'best_algorithm'] = dict_site['best_algorithm']
        df_merged.loc[:, 'train_start_date'] = dict_site['train_start_date']
        df_merged.loc[:, 'train_end_date'] = dict_site['train_end_date']
        df_merged.loc[:, 'test_start_date'] = dict_site['test_start_date']
        df_merged.loc[:, 'test_end_date'] = dict_site['test_end_date']
        list_df_pbi.append(df_merged)

if list_df_pbi:
    df_pbi = pd.concat(list_df_pbi, axis=0)
    # Forecast minus actual value, computed once over the complete frame
    df_pbi.loc[:, 'energy_diff'] = df_pbi['fcst'] - df_pbi[y]
else:
    df_pbi = pd.DataFrame({})

df_pbi.head()

Preparing PBI for site_id 22
Preparing PBI for site_id 25
Preparing PBI for site_id 33
Preparing PBI for site_id 42
Preparing PBI for site_id 49


Unnamed: 0,timestamp,value,fcst,site_id,model,best_algorithm,train_start_date,train_end_date,test_start_date,test_end_date,energy_diff
0,2015-11-02,366825.0,,22,model_01_thermal,LR_Regressor,2015-11-02,2016-11-03,2016-11-04,2017-11-04,
1,2015-11-03,2454051.0,,22,model_01_thermal,LR_Regressor,2015-11-02,2016-11-03,2016-11-04,2017-11-04,
2,2015-11-04,1673775.0,,22,model_01_thermal,LR_Regressor,2015-11-02,2016-11-03,2016-11-04,2017-11-04,
3,2015-11-05,2509700.0,,22,model_01_thermal,LR_Regressor,2015-11-02,2016-11-03,2016-11-04,2017-11-04,
4,2015-11-06,2899576.0,,22,model_01_thermal,LR_Regressor,2015-11-02,2016-11-03,2016-11-04,2017-11-04,


### Compute KPI

In [16]:
# Row-level errors of the forecast against the actual value
df_pbi.loc[:, 'error'] = df_pbi['fcst'] - df_pbi[y]
df_pbi.loc[:, 'absolute_error'] = df_pbi['error'].abs()

# Percentage error relative to the magnitude of the actual value.
# Zero actuals become NaN so that they are left out of the mean instead of producing inf,
# and the absolute value keeps the ratio non-negative for negative actuals.
actual_magnitude = df_pbi[y].abs().replace(0, np.nan)
df_pbi.loc[:, 'absolute_percentage_error'] = df_pbi['absolute_error'] / actual_magnitude

print("MAE:", round(df_pbi.loc[:, 'absolute_error'].mean(), 0))
print("MAPE:", round(df_pbi.loc[:, 'absolute_percentage_error'].mean(), 2))


MAE: 371692.0
MAPE: 0.82


### Plotting results

In [17]:
chart_title = 'Energy prediction'
y = 'value'

# Target directories are the same for every site, so resolve them once
dir_output = os.path.join(root, cfg_path.data_dir.output_path)
dir_plot = os.path.join(root, cfg_path.data_dir.plot_path)

# One chart (html) and one data export (csv) per site
for s in list(df_pbi[id].unique()):
    df_plot = df_pbi.loc[df_pbi[id] == s]
    saving_name = f'{id}_{s}_energy_prediction'
    plot = Plots.sliding_fcst_plot(df_plot, y, 'fcst', chart_title, kpi=True)
    df_plot.to_csv(os.path.join(dir_output, saving_name + ".csv"))
    plot.write_html(os.path.join(dir_plot, saving_name + ".html"))


# Conversion factors
KGCO2 = 0.2453 kg/kWh

Pounds = $0.1189/kWh