In [None]:
import pandas as pd

### Y : energy

In [None]:
# Load the raw energy table.
# A forward slash is used as the separator: '\e' is not a valid string escape
# (Python warns about it) and a backslash only works as a path separator on Windows.
df_energy = pd.read_csv('data/energy_dataset.csv')
sorted(df_energy.columns)

In [None]:
# Order the rows by the 'time' column, then fill gaps by linear interpolation
# along the rows (forward direction only).
df_energy.sort_values(by='time', inplace=True)
df_energy.interpolate(axis=0, method='linear', limit_direction='forward', inplace=True)

dropping useless features

In [None]:
# Energy columns that are removed before modelling.
useless_energy_features = [
    'forecast solar day ahead',
    'forecast wind offshore eday ahead',
    'forecast wind onshore day ahead',
    'price day ahead',
    'total load actual',
    'total load forecast',

    # flagged "NaN" by the original author
    'generation geothermal',
    'generation marine',
]
df_energy.drop(columns=useless_energy_features, inplace=True)

merging similar features

In [None]:
# Collapse the individual fossil generation columns into a single row-wise
# total, then remove the originals.
fossil_features = [
    'generation fossil brown coal/lignite',
    'generation fossil coal-derived gas',
    'generation fossil gas',
    'generation fossil hard coal',
    'generation fossil oil',
    'generation fossil oil shale',
    'generation fossil peat',
]
df_energy['generation fossil'] = df_energy.loc[:, fossil_features].sum(axis='columns')
df_energy.drop(columns=fossil_features, inplace=True)

In [None]:
# Collapse the individual hydro columns into a single row-wise total,
# then remove the originals.
hydro_features = [
    'generation hydro pumped storage aggregated',
    'generation hydro pumped storage consumption',
    'generation hydro run-of-river and poundage',
    'generation hydro water reservoir',
]
df_energy['generation hydro'] = df_energy.loc[:, hydro_features].sum(axis='columns')
df_energy.drop(columns=hydro_features, inplace=True)

In [None]:
# Collapse offshore and onshore wind into a single row-wise total,
# then remove the originals.
wind_features = [
    'generation wind offshore',
    'generation wind onshore',
]
df_energy['generation wind'] = df_energy.loc[:, wind_features].sum(axis='columns')
df_energy.drop(columns=wind_features, inplace=True)

In [None]:
# Grouping of the remaining generation columns by source type.
nonrenew_features = [
    'generation fossil',
    'generation other',
    'generation nuclear',
    'generation waste',
]

renewable_features = [
    'generation biomass',
    'generation hydro',
    'generation other renewable',
    'generation solar',
    'generation wind',
]

In [None]:
# cities = ['Barcelona', 'Bilbao', 'Madrid', 'Seville', 'Valencia']
# cities_weights = {'Madrid': 6155116 / 15272043,
#                   'Barcelona': 5179243 / 15272043,
#                   'Valencia': 1645342 / 15272043,
#                   'Seville': 1305342 / 15272043,
#                   'Bilbao': 987000 / 15272043}

In [None]:
# for city, weight in cities_weights.items():    
#     for feature in features_list:
#         df_en[feature + ' ' + city] = df_en[feature]*weight

### X : weather

In [None]:
# Load the raw weather table.
# A forward slash is used as the separator: '\w' is not a valid string escape
# (Python warns about it) and a backslash only works as a path separator on Windows.
df_weather = pd.read_csv('data/weather_features.csv')
sorted(df_weather.columns)

In [None]:
# Keep only the first row for every (timestamp, city) pair.
df_weather.drop_duplicates(
    subset=['dt_iso', 'city_name'],
    keep='first',
    inplace=True,
)

In [None]:
# Weather columns that are removed before modelling.
useless_weather_features = ['weather_id', 'weather_main', 'weather_icon']
df_weather.drop(columns=useless_weather_features, inplace=True)

In [None]:
# Names of the weather columns of interest, one per line.
weather_list = [
    'city_name',
    'temp',
    'temp_min',
    'temp_max',
    'pressure',
    'humidity',
    'wind_speed',
    'wind_deg',
    'rain_1h',
    'rain_3h',
    'snow_3h',
    'clouds_all',
    'weather_description',
]

encoding `weather_description`

In [None]:
# Build the vocabulary: every whitespace-separated word that occurs in any
# weather description.
weather_tokens = set().union(*(text.split() for text in df_weather.weather_description))
# Discard connective words that carry no weather information.
weather_tokens -= {'and', 'with', 'is'}
weather_tokens

# TODO: apply PCA to lighten (reduce) the encoded features

In [None]:
# Encode each vocabulary token as its own 0/1 column, then drop the raw text.
# - Descriptions are split into word sets once, so a token only matches a whole
#   word; a plain substring test would also fire inside longer words.
# - Tokens are visited in sorted order so the new columns appear in the same
#   order on every run (iteration order of a set of strings is not stable
#   across interpreter sessions).
description_words = df_weather.weather_description.str.split().map(set)
for t in sorted(weather_tokens):
    df_weather[t] = description_words.map(lambda words, token=t: int(token in words))
df_weather.drop('weather_description', axis=1, inplace=True)

encoding `cities`

In [None]:
# Split the weather table into one frame per city.
# Cities are sorted so that the column order of the merged frame is the same on
# every run (iterating a set of strings can change order between sessions).
# NOTE(review): city names are used verbatim as dict keys and column suffixes —
# confirm the raw values carry no stray whitespace.
cities = sorted(set(df_weather.city_name))
df_city_weather = {
    city: (
        df_weather[df_weather.city_name == city]
        .drop(columns='city_name')
        .set_index('dt_iso')       # align rows on the timestamp column
        .add_suffix(f'_{city}')    # make every column name unique per city
    )
    for city in cities
}

# Re-merge the per-city frames side by side, keeping only the timestamps that
# are present for every city.
df_final = pd.concat(list(df_city_weather.values()), axis=1, join='inner')

In [None]:
# Restrict the inputs to Valencia only and leave out its encoded token columns.
# Note: this replaces whatever `df_final` held before this cell.
df_final = df_city_weather['Valencia']
valencia_token_columns = {f'{t}_Valencia' for t in weather_tokens}
df_final = df_final[df_final.columns.difference(valencia_token_columns)]

In [None]:
# Index the energy table by its 'time' column.
df_energy.set_index('time', inplace=True)
# Weather columns are the model inputs, energy columns the targets.
inputs = df_final.columns
outputs = df_energy.columns
len(inputs), len(outputs)

In [None]:
# Join inputs and targets side by side on their shared index labels,
# keeping only the labels that are present in both frames.
dataset = pd.concat(
    [df_final, df_energy],
    axis='columns',
    join='inner',
)
dataset

## tensor

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import tensorflow_probability as tfp

In [None]:
# Short aliases for the Keras and TFP distribution namespaces.
tfk = tf.keras
tfd = tfp.distributions
# Make Keras use float64 as its default float type.
tfk.backend.set_floatx("float64")

In [None]:
# Show which GPU devices TensorFlow can see (an empty list means none).
tf.config.list_physical_devices(device_type="GPU")

In [None]:
# Shared preprocessing helpers.
scaler = StandardScaler()
# Outlier detector, seeded for reproducibility.
detector = IsolationForest(
    n_estimators=1000,
    random_state=42,
)
def neg_log_likelihood(x, rv_x):
    """Return the negated log-probability of ``x`` under the distribution ``rv_x``."""
    return -rv_x.log_prob(x)

In [None]:
# Standardise every column of `dataset` with the shared scaler.
tmp = scaler.fit_transform(dataset)

# keep inliers (currently disabled)
# is_inlier = detector.fit_predict(tmp)
# print(sum(is_inlier))
# tmp = tmp[(is_inlier > 0),:]

# Rebuild the frame from the scaled array, carrying over the column names and
# the original row index; without `index=` the index was silently replaced by
# 0..n-1. If the inlier filter above is re-enabled, the index must be filtered
# with the same mask.
dataset = pd.DataFrame(tmp, columns=dataset.columns, index=dataset.index)

In [None]:
# Training hyperparameters.
n_epochs, n_batches, n_samples = 50, 15, dataset.shape[0]
buffer_size = n_samples
# Integer floor division: `Dataset.batch` expects an integer batch size, whereas
# np.floor returns a float. Clamped to 1 so a dataset with fewer than n_batches
# rows never produces a batch size of zero.
batch_size = max(1, n_samples // n_batches)

In [None]:
# Number of rows used for training: 75 % of the dataset, truncated to an integer.
n_train = int(0.75 * len(dataset))
n_train

In [None]:
# Build a tf.data pipeline of (inputs, targets) pairs, one element per row.
data = tf.data.Dataset.from_tensor_slices((dataset[inputs].values, dataset[outputs].values))
# Shuffle once and keep that order fixed. With reshuffle_each_iteration=True
# every pass over `data` draws a fresh permutation, so any take()/skip() split
# applied afterwards selects different rows on each pass and held-out rows leak
# into training.
data = data.shuffle(n_samples, reshuffle_each_iteration=False)

In [None]:
# Carve the stream into a training part and a held-out part.
data_train = (
    data
    .take(n_train)         # first n_train elements
    .batch(batch_size)
    .repeat(n_epochs)      # NOTE(review): fit() also receives epochs=n_epochs — confirm this repetition is intended
)
data_test = (
    data
    .skip(n_train)         # everything after the first n_train elements
    .batch(1)              # one element per batch
)

In [None]:
# Prior used for regularisation: independent normals with zero mean and unit
# scale, one per output column, treated as a single multivariate event.
prior = tfd.Independent(
    distribution=tfd.Normal(
        loc=tf.zeros(len(outputs), dtype=tf.float64),
        scale=1.0,
    ),
    reinterpreted_batch_ndims=1,
)

In [None]:
# Assemble the probabilistic regression network layer by layer.
model = tfk.Sequential(name="model")

# Input: one value per input column.
model.add(tfk.layers.InputLayer(input_shape=(len(inputs),), name="input"))

# Hidden dense layer; its width reuses the value of n_batches.
model.add(tfk.layers.Dense(n_batches, activation="relu", name="dense_1"))

# Linear layer that emits exactly as many values as the distribution layer
# below needs as parameters.
model.add(tfk.layers.Dense(
    tfp.layers.MultivariateNormalTriL.params_size(len(outputs)),
    activation=None,
    name="distribution_weights",
))

# Output: a multivariate normal over the targets. The activity regulariser adds
# a KL-divergence term against `prior`, weighted by 1/n_batches.
model.add(tfp.layers.MultivariateNormalTriL(
    len(outputs),
    activity_regularizer=tfp.layers.KLDivergenceRegularizer(prior, weight=1 / n_batches),
    name="output",
))


In [None]:
# Compile with the Adam optimiser; the loss is the negative log-likelihood of
# the targets under the predicted distribution.
model.compile(
    loss=neg_log_likelihood,
    optimizer="adam",
)

In [None]:
# Train the model, validating against the held-out data.
# Original author's note: "MIN 4" — presumably a reminder that this cell takes
# at least four minutes; TODO confirm.
model.fit(
    data_train,
    validation_data=data_test,
    epochs=n_epochs,
    verbose=True,
)

In [None]:
import pickle

# Persist the trained model to 'model.pkl'.
# The context manager closes (and flushes) the file even if pickling raises;
# the previous bare open() call never closed the handle.
# NOTE(review): pickling Keras/TFP models is not supported by every TensorFlow
# version — confirm this round-trips, or consider model.save_weights() instead.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Print a summary of the model defined above.
model.summary()

To account for uncertainty in the parameter weights, the dense layers have to be replaced with one of the following:

- Flipout layers (``DenseFlipout``)
- Variational layers (``DenseVariational``)

Such a model has more parameters, since every weight is parametrized by a normal distribution with its own (non-shared) mean and standard deviation. \
The weights are resampled for each prediction.

In [None]:
# Example: a Flipout layer with 10 units, named like the hidden dense layer it
# would stand in for.
tfp.layers.DenseFlipout(units=10, activation="relu", name="dense_1")

The default prior distribution over the weights is `tfd.Normal(loc=0., scale=1.)`; it can be changed with the ``kernel_prior_fn`` argument.

In [None]:
# Prediction set-up.
samples = 500       # number of test elements to evaluate
iterations = 10     # repeated predictions per test element
test_iterator = tf.compat.v1.data.make_one_shot_iterator(data_test)

# Pre-allocated buffers: inputs, true targets, and one prediction per repetition.
X_true = np.empty(shape=(samples, len(inputs)))
Y_true = np.empty(shape=(samples, len(outputs)))
Y_pred = np.empty(shape=(samples, len(outputs), iterations))

In [None]:
# For each of the first `samples` test elements, record its inputs and true
# targets, then store `iterations` separate model predictions for it.
for i in range(samples):
    features, labels = test_iterator.get_next()
    X_true[i] = features
    Y_true[i] = labels.numpy()
    for k in range(iterations):
        # Each call fills slot k along the last axis of Y_pred.
        Y_pred[i, :, k] = model.predict(features)

In [None]:
# Mean and standard deviation of the repeated predictions (taken over the last
# axis, i.e. across repetitions) for every test element and output column.
Y_pred_m = Y_pred.mean(axis=-1)
Y_pred_s = Y_pred.std(axis=-1)
Y_pred_m, Y_pred_s