# Implementation

## Packages

In [1]:
# data elaboration functions
import pandas as pd
from six.moves import collections_abc
import string
import numpy as np

# datetime functions
import datetime as dt

# file management functions
import os
import sys
import pickle
from pathlib import Path

# plot functions
import matplotlib.pyplot as plt
%matplotlib inline

# data science functions
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import joblib
from sklearn.metrics import mean_absolute_error

# configuration file
# Put the parent of the current working directory on sys.path (once) so the
# project packages imported below can be resolved.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
from Configuration.config import cfg_path

# custom functions
from Code.Plotting.plots import Plots
from Code.Regressors.regressors import Regressors
from Code.Scoring.scoring import Scoring
from Code.Scoring.train_test import TrainTest
from Code.Scoring.train import Training
from Code.Scoring.forecast import Forecasting
from Code.Scoring.kpi import Kpi
from Code.Scoring.scoring import Scoring
from Code.Utils.utils import Utils


## Setup

In [3]:
# Dataset download step (left disabled):
#od.download("https://www.kaggle.com/arashnic/building-sites-power-consumption-dataset/download")

# Parent of the current working directory; used as the base for project paths.
root = Path.cwd().parent
# Input-data location, taken from the project configuration.
dataset_path = os.path.join(root, cfg_path.data_dir.input_path)

## Load Data

In [4]:
# Load 'df_final.pkl' from the configured output directory under `root`.
# read_pickle unpickles the file, so only point it at files you trust.
df_final = pd.read_pickle(root / cfg_path.data_dir.output_path / 'df_final.pkl')
df_final.head()

Unnamed: 0,site_id,timestamp,obs_id,forecast_id,value,holidays,day_off,surface,base_temperature,wd_mon,...,month_07,month_08,month_09,month_10,month_11,month_12,temperature,distance,DDC_temperature,DDH_temperature
0,12,2015-11-02,,,,0,0,33676.246551,18.0,1,...,0,0,0,0,1,0,,,,
1,13,2015-11-02,3747176.0,415.0,3870603.0,1,0,891.48785,18.0,1,...,0,0,0,0,1,0,17.333333,28.407896,0.0,0.666667
3,16,2015-11-02,2912040.0,524.0,2593093.0,1,0,1218.738383,18.0,1,...,0,0,0,0,1,0,24.226667,21.793645,6.226667,0.0
5,17,2015-11-02,,,0.0,1,0,1625.83752,18.0,1,...,0,0,0,0,1,0,,,,
7,18,2015-11-02,,,0.0,1,0,677.533195,18.0,1,...,0,0,0,0,1,0,,,,


# Define model_01_thermal

In [5]:
from dstoolkit.forcaster import TimeSeriesDataset, Trainer

## Parameter setup

In [9]:
# Wrap the loaded frame in a TimeSeriesDataset; "value" is the second positional
# argument, "site_id" the SKU column and "timestamp" the datetime column.
ds = TimeSeriesDataset(
    df_final,
    "value",
    sku_col="site_id",
    datetime_col="timestamp",
)

# Keep the return values of the dummy helpers; month_dummies is passed to
# add_interactions further down.
month_dummies = ds.add_month_dummies()
weekday_dummies = ds.add_weekdays_dummies()

### Intermittent Profiling

In [None]:
# NOTE(review): threshold, perc, quant, highest and lowest are not assigned anywhere in
# the visible notebook; unless they are defined elsewhere this cell raises NameError on
# a fresh kernel. Define them (e.g. in a configuration cell) before running.
score_mix = ds.enh_idclass5(threshold, perc, quant, highest, lowest)

# Two alternative calls are shown; as written, both execute one after the other.
ds.classify_intermittent(score_mix)
# or
ds.classify_intermittent(score_mix, thres_cv2_constant=0.05)

## Regressors dictionary

#### Interaction terms

In [23]:
# Interaction terms between 'temperature' and the month dummies created earlier.
interaction_cols = ds.add_interactions(
    col1=['temperature'],
    col2=month_dummies,
)

#### Non linear terms

In [24]:
# Add non-linear terms of "temperature" with arguments 2 and 3
# (presumably the polynomial degree — confirm against the dstoolkit API).
# NOTE(review): the saved regressor list further down shows 'temperature^2' but no
# 'temperature^3' — confirm the cubic term is actually registered.
ds.add_non_linear_term("temperature", 2)
ds.add_non_linear_term("temperature", 3)

## Algorithms dictionary

In [25]:
# Define algorithms to test
# First Algorithm in the list is the default algorithm
# n_jobs value shared by the estimators below that accept it.
n_jobs = -1

# Candidate algorithms; the first entry in the list is the default algorithm.
algorithms = [
    RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        random_state=0,
        n_jobs=n_jobs,
    ),
    LinearRegression(n_jobs=n_jobs),
    xgb.XGBRegressor(
        objective='reg:squarederror',
        colsample_bytree=0.3,
        learning_rate=0.5,
        max_depth=5,
        alpha=10,
        n_estimators=50,
    ),
]


## Training

### Training parameters

### Site stats

In [28]:
# Per-site statistics as returned by the dataset helper.
site_stats = ds.stats_per_sku()

# Ids from the first five rows of the statistics table.
# assumes stats_per_sku() orders rows by sample count, largest first — TODO confirm
top5_sites = site_stats.head(5)["id"]
print("SitesIds with most samples", top5_sites.to_list())

SitesIds with most samples [42, 22, 25, 49, 16]


### Training dataframe

In [29]:
# Restrict the working dataset to the sites selected in top5_sites.
ds.df = ds.df.loc[ds.df["site_id"].isin(top5_sites), :]

# Index by the column name. The bare name `id` is Python's builtin function unless
# another cell rebinds it, so `ds.df[id]` fails after Restart & Run All.
print('Actual id list:', list(ds.df["site_id"].unique()))
print('Actual regressors available:', ds.regressors)

Actual id list: [16, 22, 25, 42, 49]
Actual regressors available: ['temperature*month_07', 'DDH_temperature', 'wd_tue', 'surface', 'value', 'temperature^2', 'wd_fri', 'wd_wed', 'wd_sat', 'temperature*month_02', 'forecast_id', 'month_01', 'temperature*month_10', 'temperature*month_03', 'month_04', 'day_off', 'temperature*month_08', 'wd_sun', 'month_12', 'month_03', 'holidays', 'month_06', 'temperature*month_11', 'base_temperature', 'temperature*month_01', 'wd_thu', 'distance', 'DDC_temperature', 'temperature*month_12', 'obs_id', 'month_07', 'month_05', 'month_09', 'month_02', 'temperature*month_09', 'temperature', 'wd_mon', 'temperature*month_05', 'temperature*month_04', 'month_11', 'month_10', 'month_08']


# Forecasting

In [None]:
# Split the dataset into train and test parts with forecast_scope=730.
# NOTE(review): presumably 730 is a horizon in days (about two years) — confirm the unit.
train_ds, test_ds = ds.get_train_test_split(forecast_scope=730)

# Train on the candidate algorithms; train() returns a trained model and the best algorithm.
# NOTE(review): the saved regressor list contains 'value', 'obs_id' and 'forecast_id'.
# 'value' is also the column name passed to TimeSeriesDataset (presumably the target) —
# check that it is not used as a feature (target leakage).
trainer = Trainer(algorithms)
trained_model, best_algorithm = trainer.train(train_ds, test_ds)

# Optionally save model to Blob/AzureML

In [None]:
# Run the trainer's forecast on the test split and keep the result.
# ("forcast" is the method name as spelled in the dstoolkit API used here.)
forcasted_df = trainer.forcast(test_ds)

# Finalize

### Create CSV in the input format expected by Power BI

In [None]:
# Forecast the test split and export it via forcast_to_pbi_csv; the returned object is
# reused by the KPI and plotting cells below.
# NOTE(review): "outputs/energy_pred.csv" is a relative path — presumably resolved against
# the working directory, and the outputs/ folder may need to exist beforehand; confirm.
forcasted_ds = trainer.forcast_to_pbi_csv(test_ds, filename="outputs/energy_pred.csv")

### Compute KPI

In [None]:
# Compute the two KPIs returned by the trainer for the forecast produced above
# (named mae and mape here) and print them.
mae, mape = trainer.compute_model_kpi(forcasted_ds)

print("MAE:", mae)
print("MAPE:", mape)

### Plotting results

In [None]:
# Plot the forecast result with the given chart title.
forcasted_ds.plot(chart_title="Energy prediction")

# Conversion factors
kgCO2 = 0.2453 kg/kWh

Pounds = $0.1189/kWh