In [13]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


# NOTEBOOK DESCRIPTION
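The goal of this notebook is to train localized models: the global N-BEATS-exog model is fine-tuned on each cluster (subset) of the original set of time series.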

# LIBS

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
import warnings
import random

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import *
from tensorflow.keras.regularizers import *

In [3]:
cwd = os.path.dirname(os.getcwd())

# run tensorflow_helper_func.py
tensorflow_helper_func_path =    '"{}/modules/tensorflow_helper_func.py"'.format(cwd)
%run $tensorflow_helper_func_path

# INPUT DATA

In [16]:
# FOLDERS (all relative to the project root `cwd`)
folder_gen = cwd + "/generated_data/"
folder_mod_global = cwd + "/models/global N-BEATS-exog/"
folder_mod_local = cwd + "/models/localized N-BEATS-exog/"

# TRAIN / VALIDATION DICTIONARIES
# Context managers make sure the file handles are closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# generated by this project.
with open(folder_gen + "train_dict.p", "rb") as pickle_file:
    train_dict = pickle.load(pickle_file)
with open(folder_gen + "val_dict.p", "rb") as pickle_file:
    val_dict = pickle.load(pickle_file)

# LOAD CLUSTERING RESULTS
# One row per ts_id; the column labels are cast to int so that they can be
# looked up with integer keys.
clust_all = pd.read_csv(folder_gen + "cluster_data.csv", index_col="ts_id")
clust_all.columns = clust_all.columns.astype(int)

# OTHER
freq = "30min"
periods = 48  # 48 half-hour steps = one day
# NOTE(review): date_range(closed=...) is kept for the environment this notebook
# was written for; newer pandas releases use `inclusive=` instead -- confirm
# against the installed version before upgrading.
idx = pd.date_range("2009-07-20", "2009-12-07", freq=freq, closed="left")
# One output column per forecast step: H_1 ... H_48.
y_cols = ["H_{}".format(i) for i in range(1, periods+1)]

# Timestamps of the train / validation / test sets.
# Validation and test each span periods*7*12 steps, i.e. 12 weeks.
train_idx = pd.date_range("2009-07-27", "2010-07-12", freq=freq, closed="left", name="timestamp")
val_idx = pd.date_range("2010-07-12", freq=freq, periods=periods*7*12, name="timestamp")
test_idx = pd.date_range("2010-10-04", freq=freq, periods=periods*7*12, name="timestamp")

# 1. TRAINING

## 1.1. Create the dictionary <i>clusters_dict</i>, which holds the cluster assignments

In [21]:
# clusters_dict[k]["clust=<label>"] -> list of ts_ids that received <label>
# when the series were clustered into k groups.
clusters_dict = {}

for n_clusters in range(2, 21):
    # Label column for this number of clusters; looked up once per k.
    labels = clust_all.loc[:, n_clusters]

    n_clusters_dict = {}
    for clust_n in range(n_clusters):
        members = labels[labels == clust_n]
        n_clusters_dict["clust={}".format(clust_n)] = members.index.tolist()

    clusters_dict[n_clusters] = n_clusters_dict

DataFrame <i>clust_all</i> stores the clustering results. Rows correspond to ts_id's and each column holds the cluster labels obtained for one number of clusters. E.g. column 3 holds the results when using 3 cluster centroids (k=3), so the label of each ts_id in that column is 0, 1, or 2.

In [25]:
# Preview the first rows of the clustering table (one row per ts_id).
clust_all.head()

Unnamed: 0_level_0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
ts_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,1,0,1,1,1,3,2,1,8,0,1,1,10,2,2,1,1,6,12
1,1,0,1,1,1,3,2,1,8,0,1,1,10,2,2,1,1,6,12
2,1,0,1,1,1,3,2,1,9,5,8,12,0,2,11,1,1,6,12
3,1,0,1,1,1,3,2,7,9,5,8,12,0,9,11,6,10,17,11
4,0,1,0,2,3,0,1,7,2,3,7,6,8,12,10,15,5,18,14


The dictionary <i>clusters_dict</i> is created from clust_all. Its keys are the numbers of clusters used, as shown below:

In [23]:
# Keys are the numbers of clusters (k).
# NOTE(review): the saved output below lists only 2, 3 and 4, while the
# construction loop iterates over range(2, 21) -- re-run to refresh the output.
clusters_dict.keys()

dict_keys([2, 3, 4])

With 3 clusters (k=3), there are 3 subsets:

In [28]:
# Subset names available for k=3 (one "clust=<label>" key per cluster).
clusters_dict[3].keys()

dict_keys(['clust=0', 'clust=1', 'clust=2'])

For cluster 0, the following ts_id's belong to the subset (only the first three are shown):

In [31]:
# First three ts_ids assigned to cluster 0 when k=3.
clusters_dict[3]['clust=0'][:3]

[0, 1, 2]

## 1.2. Create localized models
- Fine-tune the initial global model on subsets of the original set of time series to create localized models.
- To check whether the models need additional training, explore the results stored in the val_results folder.

In [None]:
# Silence all warnings for the (long) training output below.
warnings.filterwarnings('ignore')

# Seed every random number generator in play: NumPy, Python's `random`
# and TensorFlow.
np.random.seed(0)
random.seed(0)
tf.random.set_seed(0)

# TRAINING PARAMS
lr = 0.0001
batch_size = 256

n_steps_per_epoch = 50

# MODEL PARAMS
# NOTE(review): input_size is presumably [length of the target history,
# number of exogenous features] -- confirm against NBeats_exog.
params_dict = {"input_size": [7*periods, 12+7+48],
               "output_size": periods,
               "block_layers": 3,
               "hidden_units": 512, 
               "n_blocks": 3,
               "block_sharing": False}

# FINE-TUNING SCHEDULE
n_epochs = 100          # epochs per training round
retrain_threshold = 70  # a best epoch later than this triggers another round
max_extra_rounds = 5    # upper bound on the additional rounds per subset

# Make sure every output folder exists before anything is written to it.
for sub_folder in ("weights", "val_results", "y_val_pred", "y_test_pred"):
    os.makedirs(folder_mod_local + sub_folder, exist_ok=True)

# FINE-TUNE ON SUB-SETS OF ORIGINAL SET
# NOTE(review): a new model is built for every subset; if memory grows over the
# run, consider tf.keras.backend.clear_session() between iterations.
for n_clusters in clusters_dict:
    for clust_set in clusters_dict[n_clusters]:
        print("n_clusters: {}, clust_set: {} ------------------- \n".format(n_clusters, clust_set))

        # GET ts ids for a specific cluster (subset)
        ts_ids_list = clusters_dict[n_clusters][clust_set]

        # INDICES for all ts
        train_idx_all, train_ids = create_ts_idx(train_idx, ts_ids_list, train_dict)
        val_idx_all, val_ids = create_ts_idx(val_idx, ts_ids_list, val_dict)
        test_idx_all, test_ids = create_ts_idx(test_idx, ts_ids_list)

        # DATA GENERATORS
        train_generator = TSGenerator(set_type="train",
                                      batch_size=batch_size,
                                      n_steps_per_epoch=n_steps_per_epoch,
                                      ts_ids_list=ts_ids_list)
        val_generator = TSGenerator(set_type="val", ts_ids_list=ts_ids_list)
        test_generator = TSGenerator(set_type="test", ts_ids_list=ts_ids_list)

        # CREATE NN MODEL
        model = NBeats_exog(params_dict)
        # `learning_rate` is the current argument name; `lr` is a deprecated alias.
        optimizer = Adam(learning_rate=lr)
        model.compile(optimizer, loss="mae",
                      metrics=["mae"])

        # CALLBACKS
        csvlogger = CSVLogger(folder_mod_local + 'temp_log.csv')
        weights_path = folder_mod_local + "weights/val_best_weights, n_clusters={}, {}.h5".format(n_clusters, clust_set)
        save_val_weights = ModelCheckpoint(weights_path, monitor="val_mae", save_best_only=True)
        # The same callback instances are reused by every training round below.
        callbacks = [csvlogger, save_val_weights]

        # TRAINING
        # Round 1 starts from the global model weights. While the best validation
        # epoch of a round falls after `retrain_threshold`, another round is
        # started from the checkpointed weights (at most `max_extra_rounds` more).
        # you could also use tf.keras.callbacks.EarlyStopping,
        # but this still worked well.
        results_val_all = []
        start_weights = folder_mod_global + "val_best_weights-global.h5"
        for training_round in range(1 + max_extra_rounds):
            model.load_weights(start_weights)
            history = model.fit(train_generator,
                                validation_data=val_generator,
                                verbose=1,
                                epochs=n_epochs,
                                callbacks=callbacks)

            # Per-epoch metrics of this round, indexed by epoch number (1-based).
            results_val = pd.DataFrame(history.history)
            results_val.index = results_val.index + 1
            results_val_all.append(results_val)
            best_epoch = results_val.val_mae.idxmin()

            if not best_epoch > retrain_threshold:
                break
            # Any further round continues from the checkpointed weights.
            start_weights = weights_path

        # Stack the metrics of all rounds and renumber the rows from 1.
        results_val_all = pd.concat(results_val_all).reset_index(drop=True)
        results_val_all.index = results_val_all.index + 1

        # INFERENCE
        ## load best weights
        model.load_weights(weights_path)

        ## predict on val
        y_val_pred = model.predict(val_generator)
        y_val_pred = pd.DataFrame(y_val_pred, index=val_idx_all, columns=y_cols)
        y_val_pred = pd.concat([val_ids, y_val_pred], axis=1)

        ## predict on test
        y_test_pred = model.predict(test_generator)
        y_test_pred = pd.DataFrame(y_test_pred, index=test_idx_all, columns=y_cols)
        y_test_pred = pd.concat([test_ids, y_test_pred], axis=1)

        # SAVE (predictions are clipped at 0 before being written)
        results_val_all.to_pickle(folder_mod_local + "val_results/val_results, n_clusters={}, {}.p".format(n_clusters, clust_set))
        y_val_pred.clip(0).to_pickle(folder_mod_local + "y_val_pred/y_val_pred, n_clusters={}, {}.p".format(n_clusters, clust_set))
        y_test_pred.clip(0).to_pickle(folder_mod_local + "y_test_pred/y_test_pred, n_clusters={}, {}.p".format(n_clusters, clust_set))