# Implementation

## Packages

In [None]:
# data elaboration functions
import pandas as pd
from six.moves import collections_abc
import string
import numpy as np

# datetime functions
import datetime as dt

# file management functions
import os
import sys
import opendatasets as od
import pickle
from pathlib import Path

# plot functions
import matplotlib.pyplot as plt
%matplotlib inline

# data science functions
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, scale
from sklearn.metrics import mean_absolute_error
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb

# statistical functions
from scipy.stats.mstats import winsorize

# Put the parent of the current working directory on the import path (once),
# so that the project packages imported below can be resolved.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# custom functions
from Code.Profiling.Intermittent.intermittent import Intermittent
from Code.Utils.utils import Utils
from Code.Scoring.kpi import Kpi
from Code.Scoring.forecast import Forecasting
from Code.Scoring.train import Training
from Code.Scoring.train_test import TrainTest
from Code.Scoring.scoring import Scoring
from Code.Regressors.regressors import Regressors
from Code.Plotting.plots import Plots
from Configuration.config import cfg_path

## Setup

In [None]:
# One-off download of the dataset (kept disabled):
# od.download("https://www.kaggle.com/arashnic/building-sites-power-consumption-dataset/download")

# Directory one level above the notebook's working directory
root = Path.cwd().parent
# Input data folder, resolved from the path configuration
dataset_path = os.path.join(
    root,
    cfg_path.data_dir.input_path,
)


## Load Data

In [None]:
# Both inputs are pickles stored in the configured output folder
output_dir = os.path.join(root, cfg_path.data_dir.output_path)

# Profiling dictionary; its 'regular' entry is used below to select the ids to cluster
dict_profiling = pd.read_pickle(os.path.join(output_dir, 'dict_profiling.pkl'))

# Dataset that the clustering input is built from
df_final = pd.read_pickle(os.path.join(output_dir, 'df_final.pkl'))
df_final.head()


## Parameter setup

In [None]:
# Column names used by the cells below
id = 'site_id'                        # identifier column (note: shadows the builtin id())
y = 'value'                           # column holding the values that get clustered
list_temp = ['temp']
list_unique_id = [id, 'timestamp']

# Date column as detected by the project helper; it is used below as the pivot
# index and with the .dt accessor
date_var = Utils.find_date(df_final)

# Winsorizing parameters: fractions passed as limits to winsorize below
lowest = 0.05
highest = 0.05

# Clustering regular time series

In [None]:
# Regular ids, as recorded in the profiling dictionary
list_id_clustering = list(dict_profiling['regular'])
mask = df_final[id].isin(list_id_clustering)
df = df_final.loc[mask, [date_var, id, y]]

# Set seed
sample_seed_kmeans = 789

# Winsorized sum of y for every id.
# Only the y values are winsorized: handing the whole group frame to winsorize
# would mix the id column into the data. scipy orders `limits` as (lower, upper).
df_win_sum = (
    df.groupby(id)[y]
    .apply(lambda values: np.sum(winsorize(values.to_numpy(), limits=(lowest, highest))))
    .reset_index()
)
df_win_sum.columns = [id, "sum_" + y]

# Ids to leave out: those without any row in df and those whose winsorized sum is 0
ids_with_sum = set(df_win_sum[id])
ids_zero_sum = set(df_win_sum.loc[df_win_sum["sum_" + y] == 0, id])
list_id_excluded = [
    i for i in list_id_clustering
    if i not in ids_with_sum or i in ids_zero_sum
]

# NaN never compares equal to anything (so `!= np.nan` is always True);
# notna() is what actually drops the missing values.
mask = df[y].notna()
if list_id_excluded:
    print(id, list_id_excluded, "has/have 0", y, "after winsorizing")
    mask &= ~df[id].isin(list_id_excluded)
    # keep list_id_clustering as "the ids that are clustered" in both branches
    list_id_clustering = [i for i in list_id_clustering if i not in list_id_excluded]
else:
    print("NO", id, "has/have 0", y, "after winsorizing")

# Wide matrix: one row per date, one column per id
df_std = df.loc[mask].pivot(index=date_var, columns=id, values=y).reset_index()
charvec = df_std[date_var].dt.strftime('%Y-%m-%d')
df_std = df_std.set_index(date_var)

## Defining a set of ids to cluster with NO nan

### To perform cluster analysis, one needs a matrix with no NaN values, and the dataframe must be indexed by date_var

In [None]:
# Rows (dates) for which every id has a value
df_std_no_nan = df_std.dropna()

if len(df_std_no_nan) == 0:
    # No date is complete across all ids: fall back to a hand-picked subset of ids.
    # NOTE(review): these ids are hard-coded — confirm they match the dataset in use.
    list_id_cluster = [16, 21, 22, 25, 26, 27, 29, 33, 40, 49]
else:
    # Every id column, in the order in which it appears in df_std.
    # (Building a set from list(date_var) would split the column name into
    # single characters and discard the column order.)
    list_id_cluster = [col for col in df_std.columns if col != date_var]

# Clustering matrix: the selected ids, restricted to dates without missing values
df_cluster = df_std.loc[:, list_id_cluster].dropna()
print('Clustering regular profiles on ids', list_id_cluster)

### Set the number of clusters you want to try

In [None]:
# Total sum of squares: `scale` is applied to each row of the clustering matrix,
# the result is squared and then summed
tot_ss = pd.DataFrame(
    df_cluster.apply(scale, axis=1) ** 2
).sum(axis=0, skipna=True)

# First and last index label of the clustering matrix
start_date, end_date = min(df_cluster.index), max(df_cluster.index)

# Exclusive upper bound for the number of clusters tried by the loops below
try_clusters = 11

# Keyword arguments shared by every KMeans instance created in this notebook
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=42,
)

### Choosing the Appropriate Number of Clusters
In this section, you’ll look at two methods that are commonly used to evaluate the appropriate number of clusters:

- The elbow method
- The silhouette coefficient

These are often used as complementary evaluation techniques.

#### The elbow method

In [None]:
# K-means input: the rows of df_cluster are the observations (matrix not transposed)
X = np.array(df_cluster)

# SSE (K-means inertia) for every candidate number of clusters
sse = []
for k in range(1, try_clusters):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(X)
    sse += [kmeans.inertia_]

# Elbow curve: SSE against the number of clusters
plt.style.use("fivethirtyeight")
plt.plot(
    range(1, try_clusters),
    sse,
)
plt.xticks(range(1, try_clusters))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
# Locate the elbow of the SSE curve. The x values must be the same range the
# SSE list was built from, so they follow try_clusters instead of a literal 11.
kl = KneeLocator(range(1, try_clusters), sse, curve="convex", direction="decreasing")
print("Elbow method: optimal number of clusters is", kl.elbow)

#### The silhouette coefficient
The silhouette coefficient is a measure of cluster cohesion and separation. It quantifies how well a data point fits into its assigned cluster based on two factors:

- How close the data point is to other points in the cluster
- How far away the data point is from points in other clusters

Silhouette coefficient values range between -1 and 1. Larger numbers indicate that samples are closer to their clusters than they are to other clusters.

In [None]:
# Candidate numbers of clusters; the silhouette coefficient needs at least 2 clusters
silhouette_ks = list(range(2, try_clusters))

# Silhouette coefficient for each candidate k
silhouette_coefficients = []
for k in silhouette_ks:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(X)
    score = silhouette_score(X, kmeans.labels_)
    silhouette_coefficients.append(score)

plt.style.use("fivethirtyeight")
plt.plot(silhouette_ks, silhouette_coefficients)
plt.xticks(silhouette_ks)
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()

df_sil_coeff = pd.DataFrame(silhouette_coefficients).reset_index()

# k with the highest coefficient (first one on ties). The position of the best
# score is taken with argmax: indexing the filtered Series with [0] is a label
# lookup and only works when the best score happens to be the first entry.
optimal_silhouette_coefficients = silhouette_ks[int(np.argmax(silhouette_coefficients))]
print("Silhouette coefficients: optimal number of clusters is", optimal_silhouette_coefficients)

## Clustering using the optimal number of clusters chosen

In [None]:
# Number of clusters used for the final K-means fit below. It is set by hand:
# the elbow and silhouette results above are only printed, not applied here.
chosen_clusters = 4

In [None]:
# Final K-means fit; one label is produced for every row of X
kmeans = KMeans(n_clusters=chosen_clusters, **kmeans_kwargs)
identified_clusters = kmeans.fit_predict(X)

# Attach the labels to the clustering matrix (the plotting cell reads this column).
# Note: X must be rebuilt before this column exists — re-running the elbow cell
# afterwards would feed the labels back into K-means.
df_cluster.loc[:, 'cluster'] = identified_clusters

# Updating profiling dictionary
# The labels belong to the rows of df_cluster (its index), so the mapping is keyed
# by that index and stored under its own key; dict_profiling['regular'] is left
# untouched so that it can still be used as the list of regular ids.
# NOTE(review): if one cluster per id was intended, K-means has to be fitted on the
# transposed matrix instead — confirm the intended granularity.
dict_profiling['cluster'] = {
    row_label: int(cluster_label)
    for row_label, cluster_label in zip(df_cluster.index, identified_clusters)
}
print("Rows per cluster:", df_cluster['cluster'].value_counts().sort_index().to_dict())

### Plotting clustered regular series

In [None]:
# Long format: one row per (date, cluster, id) with the value in column y.
# The output column names are given explicitly instead of relying on the name
# carried by the column index of df_cluster.
df_to_plot = pd.melt(
    df_cluster.reset_index(),
    id_vars=[date_var, 'cluster'],
    var_name=id,
    value_name=y,
)

# Ids to plot, computed once
list_id_plot = list(df_to_plot[id].unique())

for cluster in list(df_cluster['cluster'].unique()):
    # Only the rows assigned to this cluster, so that the chart matches its
    # title and file name
    df_cluster_plot = df_to_plot.loc[df_to_plot['cluster'] == cluster]
    for count, i in enumerate(list_id_plot, start=1):
        print('Plotting id:', i, 'as', count, 'of', len(list_id_plot))
        chart_title = id + ' ' + str(i) + " - Profile regular cluster " + str(cluster)
        plot = Plots.sliding_line_plot(df_cluster_plot, y, id, i, chart_title)
        plot.write_html(os.path.join(root, cfg_path.data_dir.plot_path,
                        id + '_' + str(i) + '_profile_regular_cluster_' + str(cluster) + ".html"))


# Saving

In [None]:
# Write the updated profiling dictionary back to its pickle file.
# The context manager closes the file even if pickling raises.
with open(os.path.join(root, cfg_path.data_dir.output_path, 'dict_profiling.pkl'), "wb") as f:
    pickle.dump(dict_profiling, f)
