In [1]:
import json, os, uuid, joblib, multiprocessing, sys
import numpy as np
import pandas as pd
import argparse as arp
import tensorflow as tf

from time import time
from config import *
from windside_extract_data import create_dataset, split_dataset

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.linear_model import LinearRegression as LR
from sklearn.linear_model import ElasticNet as ENR
from sklearn.linear_model import SGDRegressor as SGDR
from sklearn.linear_model import BayesianRidge as BRR
from sklearn.svm import SVR
from lightning.regression import LinearSVR as LSVR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import RandomForestRegressor as RFR
from xgboost.sklearn import XGBRFRegressor as XGBRFR
from sklearn.ensemble import GradientBoostingRegressor as GBR
from xgboost.sklearn import XGBRegressor as XGBR
from lightgbm.sklearn import LGBMRegressor as LGBMR

from windside_manual_keras import parse_layers

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


<h3>Data preprocessing</h3>

In [32]:
# Random seed shared by the np.random.seed(...) calls in the training cells and by train_nn.
seed = 42
# Forwarded unchanged to create_dataset as chunk_size.
chunk_size = None

# Windowing parameters and input columns forwarded to create_dataset.
series_step = 1
window_size = 60
feature_cols = ['Current']

# Weighting mode forwarded to create_dataset; 'continuous' is the disabled alternative.
weights = 'discrete'
#weights = 'continuous'

In [33]:
# Input feature table and the filename pattern handed to create_dataset as output_fpath_pattern.
data_fpath = osp.join(DATA_DIR, 'windside', 'features.csv')
dataset_fpath_pattern = osp.join(DATA_DIR, 'windside', '{0}.{1}')

# Build the windowed dataset with create_dataset (imported from windside_extract_data).
# Downstream cells use Y as wind speeds, D as dates and W as Keras sample weights;
# y0_mean is stored by save_data_info and y0_error is reported by eval_alg as "Error 0".
# NOTE(review): the exact meaning of X, T and the y0_* statistics is defined in
# windside_extract_data — confirm there.
X, T, Y, D, W, y0_mean, y0_error = create_dataset(
    data_fpath,
    output_fpath_pattern=dataset_fpath_pattern,
    feature_cols=feature_cols,
    series_step=series_step,
    window_size=window_size,
    window_step=1,
    chunk_size=chunk_size,
    weights=weights
)

Dataset exists: True


In [34]:
# Ordering passed to split_dataset; it is also appended to every saved model's directory name.
#order = 'date'
order = 'windspeed'

# Split fractions forwarded to split_dataset. val_split is additionally passed to
# train_nn, which uses it as the Keras validation_split.
val_split = 0.2
inf_split = 0.1

# Forwarded to split_dataset as shuffle_train.
shuffle_train = False

In [35]:
# Partition all five arrays into training / validation / inference subsets.
(
    X_tr, T_tr, Y_tr, D_tr, W_tr,
    X_val, T_val, Y_val, D_val, W_val,
    X_inf, T_inf, Y_inf, D_inf, W_inf,
) = split_dataset(
    (X, T, Y, D, W),
    order=order,
    val_split=val_split,
    inf_split=inf_split,
    shuffle_train=shuffle_train,
)

# Remove singleton axes from the feature arrays of every subset.
X_tr, X_val, X_inf = np.squeeze(X_tr), np.squeeze(X_val), np.squeeze(X_inf)

In [36]:
# Global statistics of the training subset; save_data_info writes them to data_info.json.
x_mean, x_std = np.mean(X_tr), np.std(X_tr)
y_min, y_max = np.min(Y_tr), np.max(Y_tr)

# The scaler is fitted on the training subset only and then applied to the other two.
# scaler = MinMaxScaler()
scaler = StandardScaler()
X_tr_std = scaler.fit_transform(X_tr)
X_val_std, X_inf_std = scaler.transform(X_val), scaler.transform(X_inf)

# First and last date of each subset.
print('The training set dates:', D_tr[0], D_tr[-1])
print('The validation set dates:', D_val[0], D_val[-1])
if len(D_inf):
    print('The inference set dates:', D_inf[0], D_inf[-1])

# First and last target value of each subset.
print('The training set wind speeds:', Y_tr[0], Y_tr[-1])
print('The validation set wind speeds:', Y_val[0], Y_val[-1])
if len(Y_inf):
    print('The inference set wind speeds:', Y_inf[0], Y_inf[-1])

# Inference rows whose features sum to a positive value; only these are scored by eval_alg.
X_inf_sum = X_inf.sum(axis=1)
idx_inf_1 = (X_inf_sum > 0).nonzero()[0]
print('Number of positive current samples in the inference subset =', len(idx_inf_1))

The training set dates: 21/11/2022 10:59:21 04/03/2023 13:32:37
The validation set dates: 23/11/2022 14:07:15 11/05/2023 12:23:46
The inference set dates: 19/06/2022 16:20:42 06/08/2022 23:04:01
The training set wind speeds: 0.11619233 2.165924
The validation set wind speeds: 2.165924 2.725383
The inference set wind speeds: 2.725383 6.627586
Number of positive current samples in the inference subset = 1701119


<h3>Utils</h3>

In [37]:
def train_sk(alg_name, kwargs, order, X_tr_std, Y_tr):
    """Fit a scikit-learn style regressor and persist it with joblib.

    The estimator class is looked up by name in the notebook globals (for
    example the ``LR`` or ``LGBMR`` import aliases) and instantiated with
    ``kwargs``. The fitted model is written to
    ``<WINDSIDE_MODEL_DIR>/<alg_name.lower()>_<uuid3 of kwargs JSON>_<order>/model.jblb``.

    Args:
        alg_name: Name of an estimator class available in ``globals()``.
        kwargs: JSON-serialisable constructor arguments. Their JSON dump also
            determines the UUID part of the directory name, so key order matters.
        order: Tag appended to the directory name.
        X_tr_std: Training features passed to ``fit``.
        Y_tr: Training targets passed to ``fit``.

    Raises:
        KeyError: If ``alg_name`` is not defined in ``globals()``.
    """
    estimator_cls = globals()[alg_name]
    model = estimator_cls(**kwargs)
    model.fit(X_tr_std, Y_tr)

    model_fname = f'{alg_name.lower()}_{uuid.uuid3(uuid.NAMESPACE_DNS, json.dumps(kwargs))}_{order}'
    model_fpath = osp.join(WINDSIDE_MODEL_DIR, model_fname)
    print('Model fpath:', model_fpath)

    # makedirs(exist_ok=True) also creates a missing WINDSIDE_MODEL_DIR and avoids
    # the check-then-create race of isdir() followed by mkdir().
    os.makedirs(model_fpath, exist_ok=True)

    joblib.dump(model, osp.join(model_fpath, 'model.jblb'))

In [38]:
def train_nn(alg_name, layers, order, X_tr_std, Y_tr, W_tr, output_dim, loss, lr, val_split, epochs, batch_size, patience):
    """Build, train and save a Keras regression network.

    The network is ``Input -> parse_layers(inputs, layers) -> [Flatten] ->
    Dense(output_dim, linear)``. It is compiled with Adam, trained with early
    stopping on ``val_loss`` and saved to
    ``<WINDSIDE_MODEL_DIR>/<alg_name.lower()>_<uuid3 of layers JSON>_<order>``.
    The notebook-level ``seed`` is read to seed NumPy and TensorFlow.

    Args:
        alg_name: Label used (lower-cased) as the directory name prefix.
        layers: JSON-serialisable layer specification handed to ``parse_layers``;
            its JSON dump determines the UUID part of the directory name.
        order: Tag appended to the directory name.
        X_tr_std: Training features; ``shape[1:]`` defines the input shape.
        Y_tr: Training targets.
        W_tr: Per-sample weights, wrapped in a one-column DataFrame for Keras.
        output_dim: Number of units in the output layer.
        loss: Keras loss identifier.
        lr: Adam learning rate.
        val_split: Fraction passed to Keras as ``validation_split``.
        epochs: Maximum number of epochs.
        batch_size: Mini-batch size.
        patience: Early-stopping patience in epochs.
    """
    np.random.seed(seed)
    tf.random.set_seed(seed)
    tf.keras.utils.set_random_seed(seed)

    inputs = tf.keras.layers.Input(shape=X_tr_std.shape[1:])

    hidden = parse_layers(inputs, layers)

    # Collapse any remaining non-batch axes before the dense head.
    if len(hidden.shape) > 2:
        hidden = tf.keras.layers.Flatten()(hidden)

    # Linear regression head. The disabled alternative was a sigmoid head whose
    # output is rescaled to the [min(Y_tr), max(Y_tr)] range.
    outputs = tf.keras.layers.Dense(units=output_dim, activation='linear')(hidden)

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

    metrics = [tf.keras.metrics.MeanSquaredError(), tf.keras.metrics.MeanAbsoluteError()]
    model.compile(loss=loss, optimizer=tf.keras.optimizers.Adam(learning_rate=lr), weighted_metrics=metrics)

    # Collect the summary lines so they can be printed as a single string.
    model_summary = []
    model.summary(print_fn=lambda x: model_summary.append(x))
    summary = "\n".join(model_summary)

    print(summary)

    # Per the Keras documentation, validation_split holds out the LAST fraction of
    # the arrays, selected before shuffling.
    model.fit(
        X_tr_std, Y_tr,
        sample_weight=pd.Series(W_tr).to_frame(),
        validation_split=val_split,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='min', restore_best_weights=True)
        ],
        verbose=True
    )

    model_fname = f'{alg_name.lower()}_{uuid.uuid3(uuid.NAMESPACE_DNS, json.dumps(layers))}_{order}'
    model_fpath = osp.join(WINDSIDE_MODEL_DIR, model_fname)
    print('Model fpath:', model_fpath)

    model.save(model_fpath)

In [39]:
def eval_alg(alg_name, kwargs, order, Y_inf, X_inf_std, idx_inf_1, batch_size=None):
    """Evaluate a saved model on selected inference rows and store the result.

    The model directory is derived exactly as in the training helpers:
    ``<WINDSIDE_MODEL_DIR>/<alg_name.lower()>_<uuid3 of kwargs JSON>_<order>``.
    A Keras model is tried first; if that fails and ``model.jblb`` exists in
    the directory, the joblib artifact is loaded instead. The mean absolute
    error over the rows in ``idx_inf_1`` is printed together with the
    notebook-level ``y0_error`` and both are written to ``eval_stats.json``.

    Args:
        alg_name: Label used (lower-cased) as the directory name prefix.
        kwargs: The same JSON-serialisable object used when the model was
            saved (estimator kwargs or layer specification).
        order: Tag appended to the directory name.
        Y_inf: Inference targets.
        X_inf_std: Scaled inference features.
        idx_inf_1: Row indices to evaluate.
        batch_size: Prediction batch size; only used for Keras models.

    Raises:
        Exception: The original Keras loading error when the directory holds
            neither a loadable Keras model nor a ``model.jblb`` file.
    """
    model_fname = f'{alg_name.lower()}_{uuid.uuid3(uuid.NAMESPACE_DNS, json.dumps(kwargs))}_{order}'
    model_fpath = osp.join(WINDSIDE_MODEL_DIR, model_fname)
    print('Model fpath:', model_fpath)

    jblb_fpath = osp.join(model_fpath, 'model.jblb')
    try:
        model = tf.keras.models.load_model(model_fpath)
        predict_kwargs = {'batch_size': batch_size}
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate. Without a joblib artifact, re-raise the real Keras error
        # rather than masking it with an unrelated joblib failure.
        if not osp.isfile(jblb_fpath):
            raise
        model = joblib.load(jblb_fpath)
        predict_kwargs = {}

    # Flatten targets the same way as predictions so that an (n, 1) target array
    # cannot broadcast against (n,) predictions into an (n, n) matrix.
    Y1 = np.array(Y_inf[idx_inf_1]).flatten()

    P1 = np.array(model.predict(X_inf_std[idx_inf_1, :], **predict_kwargs)).flatten()

    error1 = np.mean(np.abs(Y1 - P1))
    print(f'\nError 0: {y0_error}, error 1: {error1}')

    os.makedirs(model_fpath, exist_ok=True)

    with open(osp.join(model_fpath, 'eval_stats.json'), 'w') as f:
        json.dump({
            'error0': float(y0_error),
            'error1': float(error1),
            'algorithm': alg_name,
            'kwargs': kwargs
        }, f)

In [40]:
def save_data_info(alg_name, kwargs, order):
    """Write ``data_info.json`` into the directory of the given model.

    The directory is derived exactly as in the training helpers:
    ``<WINDSIDE_MODEL_DIR>/<alg_name.lower()>_<uuid3 of kwargs JSON>_<order>``.
    The file records the shapes of the notebook-level ``X_tr`` / ``Y_tr`` and
    the notebook-level statistics ``x_mean``, ``x_std``, ``y_min``, ``y_max``
    and ``y0_mean``.

    Args:
        alg_name: Label used (lower-cased) as the directory name prefix.
        kwargs: The same JSON-serialisable object used when the model was
            saved (estimator kwargs or layer specification).
        order: Tag appended to the directory name.
    """
    model_fname = f'{alg_name.lower()}_{uuid.uuid3(uuid.NAMESPACE_DNS, json.dumps(kwargs))}_{order}'
    model_fpath = osp.join(WINDSIDE_MODEL_DIR, model_fname)
    print('Model fpath:', model_fpath)

    # makedirs(exist_ok=True) also creates a missing WINDSIDE_MODEL_DIR and avoids
    # the check-then-create race of isdir() followed by mkdir().
    os.makedirs(model_fpath, exist_ok=True)

    # A 1-D target array is reported as a single output column.
    output_shape = Y_tr.shape if len(Y_tr.shape) > 1 else (Y_tr.shape[0], 1)

    with open(osp.join(model_fpath, 'data_info.json'), 'w') as f:
        json.dump({
            'input_shape': X_tr.shape,
            'output_shape': output_shape,
            'x_mean': float(x_mean),
            'x_std': float(x_std),
            'y_min': float(y_min),
            'y_max': float(y_max),
            'y0_mean': float(y0_mean),
        }, f)

<h3>Linear regression</h3>

In [41]:
# Re-seed NumPy's global RNG before fitting.
np.random.seed(seed)

# 'LR' is the import alias of sklearn's LinearRegression; no constructor arguments.
# kwargs and alg_name are reused by the save_data_info / eval_alg cells that follow.
kwargs = {}
alg_name = 'LR'

train_sk(alg_name, kwargs, order, X_tr_std, Y_tr)

Model fpath: models/windside/lr_aec4c435-0411-33e9-ab2d-ea1f6e2e72de_windspeed


In [42]:
# Write data_info.json (training shapes and normalisation statistics) into the model directory.
save_data_info(alg_name, kwargs, order)

Model fpath: models/windside/lr_aec4c435-0411-33e9-ab2d-ea1f6e2e72de_windspeed


In [43]:
# Score the saved model on the inference rows selected by idx_inf_1; writes eval_stats.json.
eval_alg(alg_name, kwargs, order, Y_inf, X_inf_std, idx_inf_1)

Model fpath: models/windside/lr_aec4c435-0411-33e9-ab2d-ea1f6e2e72de_windspeed

Error 0: 0.31285613775253296, error 1: 1.2245783805847168


<h3>Gradient boosting trees</h3>

In [44]:
# Re-seed NumPy's global RNG before fitting.
np.random.seed(seed)

# 'LGBMR' is the import alias of lightgbm's LGBMRegressor.
# The JSON dump of kwargs also determines the model directory name.
kwargs = {'random_state': seed, 'n_estimators': 28, 'num_leaves': 190}
alg_name = 'LGBMR'

train_sk(alg_name, kwargs, order, X_tr_std, Y_tr)

Model fpath: models/windside/lgbmr_f9453746-9e18-34b0-b86e-17ab46604b32_windspeed


In [45]:
# Write data_info.json (training shapes and normalisation statistics) into the model directory.
save_data_info(alg_name, kwargs, order)

Model fpath: models/windside/lgbmr_f9453746-9e18-34b0-b86e-17ab46604b32_windspeed


In [46]:
# Score the saved model on the inference rows selected by idx_inf_1; writes eval_stats.json.
eval_alg(alg_name, kwargs, order, Y_inf, X_inf_std, idx_inf_1)

Model fpath: models/windside/lgbmr_f9453746-9e18-34b0-b86e-17ab46604b32_windspeed

Error 0: 0.31285613775253296, error 1: 1.654787244392738


<h3>Neural networks</h3>

In [47]:
# Hyper-parameters shared by all train_nn calls below.
#os.environ["CUDA_VISIBLE_DEVICES"] = '-1'

output_dim = 1                  # units of the final Dense layer
loss = 'mean_squared_error'     # Keras loss identifier
epochs = 100                    # upper bound; early stopping may end training sooner
patience = 10                   # EarlyStopping patience on val_loss
lr = 5e-4                       # Adam learning rate
batch_size = 4096               # used for training and for Keras prediction in eval_alg

<h4>Dense</h4>

In [48]:
# Directory-name prefix and layer specification for the dense model.
# NOTE(review): the "dense_256" string is interpreted by parse_layers (windside_manual_keras) — confirm the format there.
alg_name = 'dnn'
layers = ["dense_256"]

In [49]:
# Train in a separate process (presumably to isolate TensorFlow state from the kernel).
p = multiprocessing.Process(
    target=train_nn,
    args=(alg_name, layers, order, X_tr_std, Y_tr, W_tr, output_dim, loss, lr, val_split, epochs, batch_size, patience)
)

p.start()
p.join()

# An exception in the child does not propagate to the notebook; fail loudly instead
# of letting later cells run against a missing or stale model.
if p.exitcode != 0:
    raise RuntimeError(f'train_nn subprocess for {alg_name!r} exited with code {p.exitcode}')

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 60)]              0         
                                                                 
 dense (Dense)               (None, 256)               15616     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 15,873
Trainable params: 15,873
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epo

In [50]:
# Write data_info.json into the model directory; the layer spec takes the place of kwargs in the directory name.
save_data_info(alg_name, layers, order)

Model fpath: models/windside/dnn_94db7790-69e1-310d-9b9e-13d97d933afc_windspeed


In [51]:
# Evaluate in a separate process (presumably to isolate TensorFlow state from the kernel).
p = multiprocessing.Process(
    target=eval_alg,
    args=(alg_name, layers, order, Y_inf, X_inf_std, idx_inf_1, batch_size)
)

p.start()
p.join()

# An exception in the child does not propagate to the notebook; fail loudly instead
# of silently leaving eval_stats.json missing or stale.
if p.exitcode != 0:
    raise RuntimeError(f'eval_alg subprocess for {alg_name!r} exited with code {p.exitcode}')

Model fpath: models/windside/dnn_94db7790-69e1-310d-9b9e-13d97d933afc_windspeed

Error 0: 0.31285613775253296, error 1: 1.7096868753433228


<h4>Convolutional</h4>

In [52]:
# Directory-name prefix and layer specification for the convolutional model.
# NOTE(review): the "conv_256" string is interpreted by parse_layers (windside_manual_keras) — confirm the format there.
alg_name = 'cnn'
layers = ["conv_256"]

In [53]:
# Train in a separate process (presumably to isolate TensorFlow state from the kernel).
# A trailing axis is appended to the features with np.expand_dims for this model.
p = multiprocessing.Process(
    target=train_nn,
    args=(alg_name, layers, order, np.expand_dims(X_tr_std, -1), Y_tr, W_tr, output_dim, loss, lr, val_split, epochs, batch_size, patience)
)

p.start()
p.join()

# An exception in the child does not propagate to the notebook; fail loudly instead
# of letting later cells run against a missing or stale model.
if p.exitcode != 0:
    raise RuntimeError(f'train_nn subprocess for {alg_name!r} exited with code {p.exitcode}')

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 60, 1)]           0         
                                                                 
 conv1d (Conv1D)             (None, 18, 256)           2048      
                                                                 
 dropout (Dropout)           (None, 18, 256)           0         
                                                                 
 flatten (Flatten)           (None, 4608)              0         
                                                                 
 dense (Dense)               (None, 1)                 4609      
                                                                 
Total params: 6,657
Trainable params: 6,657
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100



INFO:tensorflow:Assets written to: models/windside/cnn_fbedd7a3-94bb-339b-84cd-9fb8bde9eee6_windspeed/assets


INFO:tensorflow:Assets written to: models/windside/cnn_fbedd7a3-94bb-339b-84cd-9fb8bde9eee6_windspeed/assets


In [54]:
# Write data_info.json into the model directory; the layer spec takes the place of kwargs in the directory name.
save_data_info(alg_name, layers, order)

Model fpath: models/windside/cnn_fbedd7a3-94bb-339b-84cd-9fb8bde9eee6_windspeed


In [55]:
# Evaluate in a separate process (presumably to isolate TensorFlow state from the kernel).
# The inference features get the same trailing axis that was added for training.
p = multiprocessing.Process(
    target=eval_alg,
    args=(alg_name, layers, order, Y_inf, np.expand_dims(X_inf_std, -1), idx_inf_1, batch_size)
)

p.start()
p.join()

# An exception in the child does not propagate to the notebook; fail loudly instead
# of silently leaving eval_stats.json missing or stale.
if p.exitcode != 0:
    raise RuntimeError(f'eval_alg subprocess for {alg_name!r} exited with code {p.exitcode}')

Model fpath: models/windside/cnn_fbedd7a3-94bb-339b-84cd-9fb8bde9eee6_windspeed

Error 0: 0.31285613775253296, error 1: 1.7651110887527466


<h4>LSTM</h4>

In [56]:
# Directory-name prefix and layer specification for the recurrent model.
# NOTE(review): the "lstm_64" string is interpreted by parse_layers (windside_manual_keras) — confirm the format there.
alg_name = 'lstm'
layers = ["lstm_64"]

In [57]:
# Train in a separate process (presumably to isolate TensorFlow state from the kernel).
# A trailing axis is appended to the features with np.expand_dims for this model.
p = multiprocessing.Process(
    target=train_nn,
    args=(alg_name, layers, order, np.expand_dims(X_tr_std, -1), Y_tr, W_tr, output_dim, loss, lr, val_split, epochs, batch_size, patience)
)

p.start()
p.join()

# An exception in the child does not propagate to the notebook; fail loudly instead
# of letting later cells run against a missing or stale model.
if p.exitcode != 0:
    raise RuntimeError(f'train_nn subprocess for {alg_name!r} exited with code {p.exitcode}')

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 60, 1)]           0         
                                                                 
 lstm (LSTM)                 (None, 64)                16896     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 16,961
Trainable params: 16,961
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epo

In [58]:
# Write data_info.json into the model directory; the layer spec takes the place of kwargs in the directory name.
save_data_info(alg_name, layers, order)

Model fpath: models/windside/lstm_92e9abcb-b566-3166-a138-3316c30de6e5_windspeed


In [59]:
# Evaluate in a separate process (presumably to isolate TensorFlow state from the kernel).
# The inference features get the same trailing axis that was added for training.
p = multiprocessing.Process(
    target=eval_alg,
    args=(alg_name, layers, order, Y_inf, np.expand_dims(X_inf_std, -1), idx_inf_1, batch_size)
)

p.start()
p.join()

# An exception in the child does not propagate to the notebook; fail loudly instead
# of silently leaving eval_stats.json missing or stale.
if p.exitcode != 0:
    raise RuntimeError(f'eval_alg subprocess for {alg_name!r} exited with code {p.exitcode}')

Model fpath: models/windside/lstm_92e9abcb-b566-3166-a138-3316c30de6e5_windspeed

Error 0: 0.31285613775253296, error 1: 1.7369617223739624
