# Implementation

## Packages

In [1]:
# data elaboration functions
import pandas as pd
from six.moves import collections_abc
import string
import numpy as np

# datetime functions
import datetime as dt

# file management functions
import os
import sys
import opendatasets as od
import pickle
from pathlib import Path

# plot functions
import matplotlib.pyplot as plt
%matplotlib inline

# data science functions
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, scale
from sklearn.metrics import mean_absolute_error
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb

# statistical functions
from scipy.stats.mstats import winsorize

# configuration file
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# custom functions
from Code.Profiling.Intermittent.intermittent import Intermittent
from Code.Utils.utils import Utils
from Code.Scoring.kpi import Kpi
from Code.Scoring.forecast import Forecasting
from Code.Scoring.train import Training
from Code.Scoring.train_test import TrainTest
from Code.Scoring.scoring import Scoring
from Code.Regressors.regressors import Regressors
from Code.Plotting.plots import Plots
from Configuration.config import cfg_path

## Setup

In [2]:
# od.download("https://www.kaggle.com/arashnic/building-sites-power-consumption-dataset/download")
root = Path(os.getcwd()).parent
dataset_path = os.path.join(root, cfg_path.data_dir.input_path)

## Load Data

In [3]:
df_final = pd.read_pickle(os.path.join(
    root, cfg_path.data_dir.output_path, 'df_final.pkl'))
df_final.head()

Unnamed: 0,site_id,timestamp,obs_id,forecast_id,value,holidays,day_off,surface,base_temperature,wd_mon,...,month_05,month_06,month_07,month_08,month_09,month_10,month_11,month_12,months_days,temperature_asis
0,2,2015-11-02,7390465.0,26.0,1492533.0,0,0,6098.278376,18.0,1,...,0,0,0,0,0,0,1,0,11/02,13.247917
1,3,2015-11-02,,,,0,0,10556.293605,18.0,1,...,0,0,0,0,0,0,1,0,11/02,13.252083
2,5,2015-11-02,,,,0,0,12541.181277,18.0,1,...,0,0,0,0,0,0,1,0,11/02,13.241667
3,6,2015-11-02,1413383.0,129.0,854147.9,0,0,9150.195373,18.0,1,...,0,0,0,0,0,0,1,0,11/02,13.24375
4,7,2015-11-02,,,,0,0,15168.125971,18.0,1,...,0,0,0,0,0,0,1,0,11/02,13.245833


## Parameter setup

In [4]:
id = 'site_id'
list_unique_id = ['site_id', 'timestamp']
list_temp = ['temp']
y = 'value'
date_var = Utils.find_date(df_final)

find_date, date_col found: ['timestamp']


## Working dataframe

In [5]:
# Defining working dataframe by removing potential null values in y variable
df = df_final.loc[~df_final[y].isnull(),].copy()
print('Id available', list(df[id].unique()))

Id available [2, 6, 8, 9, 10, 13, 16, 17, 18, 19, 21, 22, 25, 26, 27, 29, 32, 33, 39, 40, 41, 42, 51, 45, 47, 48, 50, 49, 57, 7, 44, 20, 46, 38, 52, 12]


# Compute intermittent indicators

In [6]:
# Winsorizing parameters
highest = 0.05
lowest = 0.05

# Identifying intermittent time series parameters
threshold = 250
perc = 0.01
quant = 0.999

### Choose indicator function

In [7]:
# Applying indicator function ('compute_indicator_values' or 'enh_compute_indicator_values')
func = 'compute_indicator_values'
df_scoring = df.loc[:, [id, y]].groupby(id).apply(lambda x: Intermittent.call_intermittent_function(func,
    np.array(x), threshold, perc, quant, highest, lowest)).reset_index(level=id)

## Profiling intermittent time series

In [8]:
# Excluding the ids for which the indicators are np.NaN
df_scoring_no_nan = df_scoring.dropna()
list_nan = list(set(df_scoring[id]) - set(df_scoring_no_nan[id]))
print('List of nan ids', list_nan)

# Classify intermittent TS
thres_cv2_constant = 0.01
thres_cv2 = 2
thres_adi = 3
thres_sddi = 6.2
min_time_cons = 2
type = df_scoring_no_nan['type'].unique()[0]

df_profiling = Intermittent.classify_intermittent(df_scoring_no_nan, type, thres_cv2_constant, thres_cv2, thres_adi, thres_sddi, min_time_cons)

# Dictionary
list_of_profiles = ['regular', 'constant_zero', 'constant', 'intermittent',
                    'lumpy', 'erratic', 'unforecastable_time', 'unforecastable_quantity']
dict_profiling = {}
for c in list_of_profiles:
    dict_profiling[c] = list(
        df_profiling.loc[df_profiling.profile == c, id].unique())
    print(id, c, dict_profiling[c])


List of nan ids []
classify_intermittent: regular ids 35
classify_intermittent: no constant_zero ids
classify_intermittent: constant ids 1
classify_intermittent: no intermittent ids
classify_intermittent: no lumpy ids
classify_intermittent: no erratic ids
classify_intermittent: no unforecastable_time ids
classify_intermittent: no unforecastable_quantity ids
site_id regular [2, 6, 7, 8, 9, 10, 12, 13, 16, 17, 18, 19, 20, 21, 22, 25, 27, 29, 32, 33, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 57]
site_id constant_zero []
site_id constant [26]
site_id intermittent []
site_id lumpy []
site_id erratic []
site_id unforecastable_time []
site_id unforecastable_quantity []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


### Plotting profiled series

In [12]:
for profile in list_of_profiles:
    df_to_plot = df.loc[df[id].isin(list(dict_profiling[profile])), ]
    count = 1
    for i in list(df_to_plot[id].unique()):
        print('Plotting id:', i, 'as', count, 'of',
              len(list(df_to_plot[id].unique())))
        plot = Plots.sliding_line_plot(df_to_plot, y, id, i, chart_title="")
        folder_name = os.path.join(root, cfg_path.data_dir.plot_path, profile)
        Utils.create_folder_tree(folder_name)
        plot.write_html(os.path.join(folder_name, id + '_' + str(i) + '_profile_' + profile + ".html"))
        count = count + 1


Plotting id: 2 as 1 of 35
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 2
Successfully created the directory c:\Users\mabellani\OneDrive - Microsoft\Documents\REPOSITORY\dstoolkit-forecasting\Data/Plots\regular
Plotting id: 6 as 2 of 35
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 6
Creation of the directory failed or already present c:\Users\mabellani\OneDrive - Microsoft\Documents\REPOSITORY\dstoolkit-forecasting\Data/Plots\regular
Plotting id: 8 as 3 of 35
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 8
Creation of the directory failed or already present c:\Users\mabellani\OneDrive - Microsoft\Documents\REPOSITORY\dstoolkit-forecasting\Data/Plots\regular
Plotting id: 9 as 4 of 35
find_date, date_col found: ['timestamp']
sliding_line_plot: plotting Value site_id 9
Creation of the directory failed or already present c:\Users\mabellani\OneDrive - Microsoft\Documents\REPOSITOR

# Saving

In [10]:
# create a binary pickle file 
f = open(os.path.join(root, cfg_path.data_dir.output_path, 'dict_profiling.pkl'),"wb")
# write the python object (dict) to pickle file
pickle.dump(dict_profiling,f)
# close file
f.close()