# Implementation

## Packages

In [1]:
# data elaboration functions
import pandas as pd
from six.moves import collections_abc
import string
import numpy as np

# datetime functions
import datetime as dt

# file management functions
import os
import sys
import opendatasets as od
import pickle
from pathlib import Path

# plot functions
import matplotlib.pyplot as plt
%matplotlib inline

# data science functions
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, scale
from sklearn.metrics import mean_absolute_error
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb

# statistical functions
from scipy.stats.mstats import winsorize

# Make the parent of the working directory importable; the project imports
# that follow (Code.*, Configuration.*) presumably live there.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    # the guard keeps a re-run of this cell from appending the entry twice
    sys.path.append(module_path)

# custom functions
from Code.Profiling.Intermittent.intermittent import Intermittent
from Code.Utils.utils import Utils
from Code.Scoring.kpi import Kpi
from Code.Scoring.forecast import Forecasting
from Code.Scoring.train import Training
from Code.Scoring.train_test import TrainTest
from Code.Scoring.scoring import Scoring
from Code.Regressors.regressors import Regressors
from Code.Plotting.plots import Plots
from Configuration.config import cfg_path

## Setup

In [2]:
# One-off dataset download from Kaggle, kept disabled (uncomment to fetch):
# od.download("https://www.kaggle.com/arashnic/building-sites-power-consumption-dataset/download")

# Project root: the parent of the current working directory
root = Path.cwd().parent
# Input-data folder, resolved from the project configuration
dataset_path = os.path.join(root, cfg_path.data_dir.input_path)


## Load Data

In [3]:
# Read the pickled dataframe from the configured output folder and preview it.
# NOTE(review): unpickling executes code stored in the file - only load
# pickles produced by this project.
df_final_path = os.path.join(
    root, cfg_path.data_dir.output_path, 'df_final.pkl')
df_final = pd.read_pickle(df_final_path)
df_final.head()


Unnamed: 0,site_id,timestamp,obs_id,forecast_id,value,holidays,day_off,surface,base_temperature,wd_mon,...,month_07,month_08,month_09,month_10,month_11,month_12,temperature,distance,DDC_temperature,DDH_temperature
1,13,2015-11-02 00:00:00+00:00,3747176.0,415.0,3870603.0,1,0,891.48785,18.0,1,...,0,0,0,0,1,0,17.333333,28.407896,0.0,0.666667
3,16,2015-11-02 00:00:00+00:00,2912040.0,524.0,2593093.0,1,0,1218.738383,18.0,1,...,0,0,0,0,1,0,24.226667,21.793645,6.226667,0.0
11,21,2015-11-02 00:00:00+00:00,4779740.0,649.0,3349616.0,1,0,10985.292634,18.0,1,...,0,0,0,0,1,0,7.495833,11.902777,0.0,10.504167
13,22,2015-11-02 00:00:00+00:00,662180.0,685.0,366825.0,1,0,7392.365415,18.0,1,...,0,0,0,0,1,0,15.583333,23.726983,0.0,2.416667
17,25,2015-11-02 00:00:00+00:00,3488017.0,773.0,2968195.0,1,0,2201.924904,18.0,1,...,0,0,0,0,1,0,7.6375,16.135872,0.0,10.3625


## Parameter setup

In [4]:
# Column names shared by the cells below.
# NOTE(review): `id` shadows the Python builtin of the same name; it is kept
# because later cells read this variable.
y = 'value'                          # column holding the observed values
id = 'site_id'                       # column identifying each series
list_unique_id = [id, 'timestamp']   # presumably the key of one observation - TODO confirm
list_temp = ['temp']

# Detect the date column(s) of df_final (the helper prints what it found)
date_var = Utils.find_date(df_final)

find_date, date_col found: ['timestamp']


## Working dataframe

In [5]:
# Working dataframe: an independent copy of df_final limited to the rows
# whose y value is not null
has_y = df_final[y].notnull()
df = df_final.loc[has_y].copy()
print('Id available', list(df[id].unique()))

Id available [13, 16, 21, 22, 25, 26, 29, 32, 33, 39, 40, 42, 27, 51, 47, 48, 49, 44, 50, 57, 20, 46, 17, 18, 52, 12]


# Compute intermittent indicators

In [6]:
# Winsorizing parameters
lowest = 0.05
highest = 0.05

# Identifying intermittent time series parameters
threshold = 250
perc = 0.01
quant = 0.999


def _indicators_by_id(indicator_fn):
    """Apply `indicator_fn` once per id to the [id, y] columns of `df`.

    Every group is handed over as a NumPy array, followed by the threshold,
    perc, quant and winsorizing settings defined in this cell. The id level
    of the resulting index is then moved back into a regular column.
    """
    per_id = df.loc[:, [id, y]].groupby(id)
    raw = per_id.apply(lambda grp: indicator_fn(
        np.array(grp), threshold, perc, quant, highest, lowest))
    return raw.reset_index(level=id)


# Same pipeline, two indicator implementations
score = _indicators_by_id(Intermittent.idclass3)
score_mix = _indicators_by_id(Intermittent.enh_idclass5)


## Profiling intermittent time series

In [7]:
# Rows with a NaN indicator are left out of the classification; the ids that
# disappear because of this are reported
score_no_nan = score_mix.dropna()
list_nan = list(set(score_mix[id]) - set(score_no_nan[id]))
print('List of nan ids', list_nan)

# Settings handed to the classifier
# NOTE(review): `type` shadows the Python builtin; the name is kept unchanged
# in case other code reads it.
type = 'mix_floor_Q_999'
thres_cv2_constant = 0.01
thres_cv2 = 2
thres_adi = 3
thres_sddi = 6.2
min_time_cons = 2

df_profiling = Intermittent.classify_intermittent(
    score_no_nan, type, thres_cv2_constant, thres_cv2, thres_adi, thres_sddi, min_time_cons)

# Profile name -> list of the ids assigned to that profile
list_of_profiles = ['regular', 'constant_zero', 'constant', 'intermittent',
                    'lumpy', 'erratic', 'unforecastable_time', 'unforecastable_quantity']
dict_profiling = {
    name: list(df_profiling.loc[df_profiling.profile == name, id].unique())
    for name in list_of_profiles
}
for name in list_of_profiles:
    print(id, name, dict_profiling[name])


List of nan ids []
classify_intermittent: regular ids 26
classify_intermittent: no constant_zero ids
classify_intermittent: no constant ids
classify_intermittent: no intermittent ids
classify_intermittent: no lumpy ids
classify_intermittent: no erratic ids
classify_intermittent: no unforecastable_time ids
classify_intermittent: no unforecastable_quantity ids
site_id regular [12, 13, 16, 17, 18, 20, 21, 22, 25, 26, 27, 29, 32, 33, 39, 40, 42, 44, 46, 47, 48, 49, 50, 51, 52, 57]
site_id constant_zero []
site_id constant []
site_id intermittent []
site_id lumpy []
site_id erratic []
site_id unforecastable_time []
site_id unforecastable_quantity []


### Plotting profiled series

In [8]:
# Write one HTML chart per id, going through the profiles in order
for profile in list_of_profiles:
    df_to_plot = df.loc[df[id].isin(list(dict_profiling[profile]))]
    ids_in_profile = list(df_to_plot[id].unique())
    n_ids = len(ids_in_profile)
    for count, i in enumerate(ids_in_profile, start=1):
        print('Plotting id:', i, 'as', count, 'of', n_ids)
        plot = Plots.sliding_line_plot(df_to_plot, y, id, i, chart_title="")
        # file name pattern: <id column>_<id value>_profile_<profile>.html
        html_name = id + '_' + str(i) + '_profile_' + profile + ".html"
        plot.write_html(os.path.join(
            root, cfg_path.data_dir.plot_path, html_name))


Plotting id: 13 as 1 of 26
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 13
Plotting id: 16 as 2 of 26
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 16
Plotting id: 21 as 3 of 26
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 21
Plotting id: 22 as 4 of 26
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 22
Plotting id: 25 as 5 of 26
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 25
Plotting id: 26 as 6 of 26
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 26
Plotting id: 29 as 7 of 26
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 29
Plotting id: 32 as 8 of 26
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 32
Plotting id: 33 as 9 of 26
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting 

# Saving

In [9]:
# Persist the profile -> ids dictionary as a binary pickle in the configured
# output folder. The `with` block closes the file even if pickle.dump raises,
# which the previous manual open()/close() pair did not guarantee.
dict_profiling_path = os.path.join(
    root, cfg_path.data_dir.output_path, 'dict_profiling.pkl')
with open(dict_profiling_path, "wb") as f:
    pickle.dump(dict_profiling, f)