# Creation of Clusters Classifier

## Load Data

In [1]:
from utils import *

# Location of the pickled hourly consumption table, relative to this notebook.
file_path = '../../QuickStart/Data/Electricity/residential_all_hour_with_date_time.pkl'

# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source.
x_date_time = pd.read_pickle(file_path)

# Column labels of the table; kept separately so they can be filtered in
# lock-step with the data array further down.
ids = x_date_time.columns.to_numpy()

  from .autonotebook import tqdm as notebook_tqdm


**Smooth Data**

In [2]:
# Replace the loaded table with its smoothed version. `gaussian_smoothing` is
# presumably provided by the `utils` star import -- confirm there for the
# kernel and window that are actually used.
# NOTE: the result overwrites `x_date_time`, so running this cell a second time
# without reloading the data smooths the already-smoothed table again.
x_date_time = gaussian_smoothing(x_date_time)

**Keep 1st Year**

In [3]:
# Row window 2009-07-15 .. 2010-07-14, applied after ordering by `date_time`.
first_year = slice("2009-07-15", "2010-07-14")
df = x_date_time.sort_values(by='date_time')[first_year]

# Transpose so that each row of `x` corresponds to one column of the table.
x = np.array(df).transpose()

**Remove Outliers**

In [4]:
# NOTE: this cell overwrites `x` and `ids`. Re-running it without re-running
# the cells above removes a further batch of rows.

# Number of rows flagged by each of the two simple heuristics below.
n_extreme = 10

# maximum and variance based outlier removal: rows of `x` with the largest
# maximum and the largest standard deviation (argsort on the negated statistic
# orders the rows from highest to lowest).
id_remove = np.concatenate((
    np.argsort(-x.max(1))[:n_extreme],
    np.argsort(-x.std(1))[:n_extreme],
))

# detect outliers with Isolation Forest; rows predicted as -1 are removed
clf = IsolationForest(random_state=0, contamination=0.01, n_jobs=-1).fit(x)
pred = clf.predict(x)

# flatnonzero always yields a 1-D index array. The former
# squeeze(argwhere(...)) collapsed to a 0-d array when exactly one row was
# flagged, which np.concatenate rejects.
id_remove = np.unique(np.concatenate((id_remove, np.flatnonzero(pred == -1))))

print(f'Shape before removing outlier {x.shape}.')

# Drop the same row positions from the data and from the matching identifiers.
x = np.delete(x, id_remove, 0)
ids = np.delete(ids, id_remove, 0)

print(f'Shape after removing outlier {x.shape}.')

Shape before removing outlier (3639, 8760).
Shape after removing outlier (3595, 8760).


## 0. Define Parameters

In [11]:
# Settings shared by the three clustering sections below.
n_cluster = 4        # number of clusters requested from every method
n_components = 10    # number of PCA components kept before clustering
do_plot = False      # set to True to call plot_clusters after each method

## 1. KMeans Clustering

### Prepare signals

**Generate KMeans clustering**

In [12]:
print(f'Number of PCA components: {n_components}')
print(f'Number of clusters: {n_cluster}')

t = time.time()

# Project every row of `x` onto `n_components` principal components, then
# cluster in that reduced space.
# random_state is fixed on PCA as well as on KMeans: PCA can select a
# randomized SVD solver, and without a seed the projection (and therefore the
# clusters) may differ between runs.
x_pca = PCA(n_components=n_components, random_state=0).fit_transform(x)
kmeans = KMeans(n_clusters=n_cluster, random_state=0).fit(x_pca)

print(f'Time to cluster the dataset {time.time()-t:.2f} s.')

if do_plot:
    plot_clusters(x, kmeans.predict(x_pca))

Number of PCA components: 10
Number of clusters: 4
Time to cluster the dataset 5.63 s.


**Compute Centroids**

In [13]:
# Cluster assignment of every row. It is the same for all clusters, so it is
# computed once here instead of three times per loop iteration.
kmeans_labels = kmeans.predict(x_pca)

# One dict per cluster: its label, the mean of its member rows of `x`, and the
# identifiers of those members.
clusters = []

for c in range(n_cluster):
    in_cluster = kmeans_labels == c
    print(f'Population of cluster {c+1} for kmean is {in_cluster.sum()}')
    clusters.append({
        'labels': c,
        'centroid': x[in_cluster].mean(0),
        'ids': ids[in_cluster],
    })

Population of cluster 1 for kmean is 970
Population of cluster 2 for kmean is 978
Population of cluster 3 for kmean is 1139
Population of cluster 4 for kmean is 508


## 2. Spectral Clustering

### Prepare signals

**Generate Spectral clustering**

In [17]:
print(f'Number of PCA components: {n_components}')
print(f'Number of clusters: {n_cluster}')

t = time.time()

# Project every row of `x` onto `n_components` principal components, then run
# spectral clustering with a nearest-neighbours affinity in that space.
# random_state is fixed on PCA too, so the projection is reproducible.
x_pca = PCA(n_components=n_components, random_state=0).fit_transform(x)
scluster = SpectralClustering(n_clusters=n_cluster, random_state=0, affinity="nearest_neighbors").fit(x_pca)

print(f'Time to cluster the dataset {time.time()-t:.2f} s.')

if do_plot:
    # Plot the spectral assignment. This previously plotted
    # kmeans.predict(x_pca), i.e. the KMeans clusters, under the spectral section.
    plot_clusters(x, scluster.labels_)

Number of PCA components: 10
Number of clusters: 4
Time to cluster the dataset 5.04 s.


**Compute Centroids**

In [15]:
# One dict per cluster: its label, the mean of its member rows of `x`, and the
# identifiers of those members.
# NOTE: this replaces any `clusters` list built by another clustering section.
clusters = []

for c in range(n_cluster):
    in_cluster = scluster.labels_ == c
    # The message used to say "kmean" although these are the spectral clusters.
    print(f'Population of cluster {c+1} for spectral clustering is {in_cluster.sum()}')
    clusters.append({
        'labels': c,
        'centroid': x[in_cluster].mean(0),
        'ids': ids[in_cluster],
    })

Population of cluster 1 for kmean is 1160
Population of cluster 2 for kmean is 1555
Population of cluster 3 for kmean is 232
Population of cluster 4 for kmean is 648


## 3. Self-Organised Maps

### Prepare signals

**Generate SOM clustering**

In [16]:
print(f'Number of PCA components: {n_components}')
print(f'Number of clusters: {n_cluster}')

t = time.time()

# random_state is fixed on PCA so the projection is reproducible between runs.
x_pca = PCA(n_components=n_components, random_state=0).fit_transform(x)

# A 1 x n_cluster grid: every SOM node plays the role of one cluster.
# The SOM is seeded as well so that repeated runs give the same assignment.
som = susi.SOMClustering(n_rows=1, n_columns=n_cluster, random_state=0)
som.fit(x_pca)

# get_clusters returns one pair of grid coordinates per row of x_pca.
cluster_coord = pd.DataFrame(np.array(som.get_clusters(x_pca)), columns=["dim1", "dim2"])
print(f'Time to cluster the dataset {time.time()-t:.2f} s.')

# Turn each distinct (dim1, dim2) pair into an integer label 0..k-1.
# ngroup() is the public replacement for the deprecated
# `.grouper.group_info[0]` internal and numbers the groups in the same
# (sorted-key) order.
labels_ = cluster_coord.groupby(['dim1', 'dim2']).ngroup().to_numpy()

if do_plot:
    plot_clusters(x, labels_)

Number of PCA components: 10
Number of clusters: 4
Time to cluster the dataset 4.63 s.


**Compute Centroids**

In [18]:
# One dict per cluster: its label, the mean of its member rows of `x`, and the
# identifiers of those members.
# NOTE: this replaces any `clusters` list built by another clustering section.
clusters = []

for c in range(n_cluster):
    in_cluster = labels_ == c
    # The message used to say "kmean" although these are the SOM clusters.
    print(f'Population of cluster {c+1} for SOM is {in_cluster.sum()}')
    clusters.append({
        'labels': c,
        'centroid': x[in_cluster].mean(0),
        'ids': ids[in_cluster],
    })

Population of cluster 1 for kmean is 1223
Population of cluster 2 for kmean is 845
Population of cluster 3 for kmean is 711
Population of cluster 4 for kmean is 816


## X. Training Deep Learning Model for Clustering/Forecasting

### Train Models

**Generate Training Signals**

In [7]:
# Index of the cluster whose centroid is used as the training signal.
c = 0

# NOTE(review): `clusters` is whatever the most recently executed
# "Compute Centroids" cell produced -- make sure it matches the intended method.
# The timestamps are attached as a column first and promoted to the index
# afterwards, one hourly stamp per centroid value.
df = (
    pd.DataFrame()
    .assign(consumption=clusters[c]['centroid'])
    .assign(date_time=pd.date_range(start="2009-07-15", end="2010-07-14 23:00:00", freq="60T", name='date_time'))
    .set_index('date_time')
)

train_serie = darts.TimeSeries.from_dataframe(df)

#train_serie.plot()

In [8]:
# Rows dated 2010-07-14 through 2010-07-21 (after ordering by `date_time`).
# NOTE(review): the window starts on 2010-07-14, which is also the last day of
# the range used to build the training series -- confirm the overlap is intended.
val_window = x_date_time.sort_values(by='date_time')["2010-07-14" : "2010-07-21"]

# Mean over the columns that belong to cluster `c`, one value per timestamp.
conso = val_window[clusters[c]['ids']].values.mean(1)

df = (
    pd.DataFrame()
    .assign(date_time=pd.date_range(start="2010-07-14", end="2010-07-21 23:00:00", freq="60T", name='date_time'))
    .assign(consumption=conso)
    .set_index('date_time')
)

val_series = darts.TimeSeries.from_dataframe(df)

#val_series.plot()

**Create the model**

In [9]:
# Where checkpoints and logs are written, and the per-cluster model name.
# NOTE(review): the directory is named after kmeans, but `clusters` may come
# from another clustering section depending on which cells were run last.
directory = 'cluster/kmeans/'
model_name = f"RNN_test_{c}"

torch_metrics = MeanSquaredError()

# Stop training once the monitored validation metric has not improved by at
# least `min_delta` for `patience` consecutive validation rounds.
# The monitored name presumably has to match the metric logged for
# `torch_metrics` on the validation set -- confirm against the Darts version in use.
my_stopper = EarlyStopping(
    monitor="val_MeanSquaredError",  # "val_loss",
    patience=7,
    min_delta=0.0025,
    mode='min',
)

# Trainer options: run on GPU 0 with the early-stopping callback.
# `devices` replaces the deprecated `gpus` / `auto_select_gpus` arguments,
# which newer PyTorch Lightning releases no longer accept.
kwargs = {"accelerator": "gpu", "devices": [0], "callbacks": [my_stopper]}

# LSTM forecaster: one week (7*24 hourly steps) of input and training length.
model = RNNModel(
    model="LSTM",
    input_chunk_length=7*24,
    training_length=7*24,
    random_state=42,
    n_epochs=20,
    save_checkpoints=True,
    work_dir=directory,
    n_rnn_layers=3,
    hidden_dim=25,
    model_name=model_name,
    log_tensorboard=True,
    force_reset=True,
    torch_metrics=torch_metrics,
    pl_trainer_kwargs=kwargs,
)

**Train the model**

In [10]:
# Fit on the training series; `val_series` supplies the validation data whose
# metric the early-stopping callback (passed through the trainer kwargs) monitors.
model.fit(series=train_serie, val_series=val_series)

  if isinstance(time_idx, pd.Int64Index) and not isinstance(
  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type             | Params
---------------------------------------------------
0 | criterion     | MSELoss          | 0     
1 | train_metrics | MetricCollection | 0     
2 | val_metrics   | MetricCollection | 0     
3 | rnn           | LSTM             | 13.2 K
4 | V             | Linear           | 26    
---------------------------------------------------
13.2 K    Trainable params
0         Non-trainable params
13.2 K    Total params
0.106     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

<darts.models.forecasting.rnn_model.RNNModel at 0x7fa5ff75d8d0>

**Validation and Metrics**

In [11]:
# Forecast as many steps as the validation series contains.
horizon = len(val_series)
pred_serie = model.predict(n=horizon)

# compute_intersect presumably restricts both series to their common time
# span -- confirm in utils. Note that `pred` is reused here and no longer
# holds the Isolation Forest predictions.
pred, true = compute_intersect(pred_serie, val_series)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 269it [00:00, ?it/s]

In [12]:
# Error measures of the forecast against the validation data, in this order:
# RMSE, coefficient of variation, biased error.
metrics = [
    rmse(true, pred),
    coefficient_of_variation(true, pred),
    biased_error(true.values(), pred.values()),
]

# Stored next to the model's logs; np.save appends the ".npy" extension.
metrics_path = os.path.join(directory, model_name, 'logs', 'metrics')
np.save(metrics_path, metrics)