In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import StandardScaler

2024-07-16 13:31:17.530622: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-16 13:31:17.546414: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-16 13:31:17.558313: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-16 13:31:17.561712: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-16 13:31:17.571203: I tensorflow/core/platform/cpu_feature_guar

## **CARGA DE DATOS Y PREPROCESAMIENTO**

Definimos un `DataGenerator` que permite leer secuencialmente los datasets y aplicar el preprocesamiento necesario:

In [2]:
class DataGenerator(tf.keras.utils.Sequence):
    '''
    Data generator for EnergyPlus runs under different simulation conditions.

    `base_dir` is scanned for sub-directories (one per weather), and every
    `.parquet` file found inside them is one dataset. Each item of the
    generator is built from `batch_size` files: every file is standardized,
    one-hot encoded and cut into sliding windows, and the per-file results are
    stacked along a new leading axis.

    Attributes set by the constructor:
        file_list: paths of all the `.parquet` files found.
        scaler: `StandardScaler` fitted on a random sample of the files.
        numeric_cols: names of the columns the scaler was fitted on.
        indices: order in which `file_list` is visited during the epoch.
    '''

    def __init__(self, base_dir, scaler_sample_fraction=0.1, input_length=24, output_length=1, batch_size=32, shuffle=True, **kwargs):
        '''
        Class constructor.

        Args:
            base_dir: directory holding one sub-directory per weather.
            scaler_sample_fraction: fraction, in (0, 1], of the files of each
                weather used to fit the scaler.
            input_length: number of time steps in every input window.
            output_length: number of target values attached to every window.
            batch_size: number of files read per item.
            shuffle: whether the file order is reshuffled on every epoch.
            **kwargs: forwarded to the Keras base class (e.g. `workers`,
                `use_multiprocessing`, `max_queue_size`).

        Raises:
            ValueError: on out-of-range arguments or when `base_dir` holds no
                `.parquet` file.
        '''
        # Keras 3 warns when the base-class constructor is not called.
        super().__init__(**kwargs)

        if not 0 < scaler_sample_fraction <= 1:
            raise ValueError('scaler_sample_fraction must be in the interval (0, 1]')
        if input_length < 1 or output_length < 1 or batch_size < 1:
            raise ValueError('input_length, output_length and batch_size must be >= 1')

        self.base_dir = base_dir
        self.scaler_sample_fraction = scaler_sample_fraction

        self.input_length = input_length
        self.output_length = output_length

        self.batch_size = batch_size
        self.shuffle = shuffle

        self.file_list = self.__get_file_list()
        if not self.file_list:
            raise ValueError(f"No '.parquet' files found under {base_dir!r}")

        # Also sets `self.numeric_cols`, which `__preprocess` relies on.
        self.scaler = self.__fit_scaler_with_sample_files()

        self.on_epoch_end()

    def __list_files_by_weather(self):
        '''
        Returns one list of `.parquet` paths per sub-directory of `base_dir`.

        Directory listings are sorted because `os.listdir` returns entries in
        arbitrary order, which would make seeded sampling irreproducible.
        '''
        files_by_weather = []
        for weather in sorted(os.listdir(self.base_dir)):
            weather_dir = os.path.join(self.base_dir, weather)
            if not os.path.isdir(weather_dir):
                continue
            files_by_weather.append([
                os.path.join(weather_dir, file)
                for file in sorted(os.listdir(weather_dir))
                if file.endswith('.parquet')
            ])
        return files_by_weather

    def __sample_datasets(self):
        '''
        Returns a list of sampled datasets.

        A fraction `scaler_sample_fraction` of the files of every weather is
        drawn without replacement; at least one file is taken from each
        non-empty weather folder so that small folders are still represented.
        '''
        file_list = []
        for weather_files in self.__list_files_by_weather():
            if not weather_files:
                continue
            sample_size = max(
                1, int(len(weather_files) * self.scaler_sample_fraction))
            chosen = np.random.choice(
                len(weather_files), sample_size, replace=False)
            file_list.extend(weather_files[k] for k in chosen)
        return file_list

    def __fit_scaler_with_sample_files(self):
        '''
        Fits sklearn.StandardScaler based on a list of sample datasets.

        The numeric columns are selected on the concatenated sample with the
        same `'number'` selector that is applied at transform time, and are
        stored in `self.numeric_cols` so that fit and transform always agree.
        '''
        sampled_data = pd.concat(
            [pd.read_parquet(file) for file in self.__sample_datasets()],
            ignore_index=True)

        self.numeric_cols = sampled_data.select_dtypes(
            include=['number']).columns.tolist()

        scaler = StandardScaler()
        scaler.fit(sampled_data[self.numeric_cols])

        return scaler

    def __get_file_list(self):
        '''
        Returns the list of data files for each weather folder.
        '''
        return [
            path
            for weather_files in self.__list_files_by_weather()
            for path in weather_files
        ]

    def __preprocess(self, df):
        '''
        Applies standardization and one hot encoding to a given dataset. Splits the dataframe in (X, y).

        The last column is the target and every other column is a feature.
        The target is scaled too when it is numeric, so `y` is expressed in
        standardized units. Both outputs are `float32` arrays; `y` has shape
        `(rows, 1)`.
        '''
        # Work on a copy so the caller's dataframe is left untouched.
        df = df.copy()
        df[self.numeric_cols] = self.scaler.transform(df[self.numeric_cols])

        y = df.iloc[:, -1].to_numpy(dtype=np.float32).reshape(-1, 1)

        # NOTE(review): the encoding is computed per file, so the dummy
        # columns depend on the categories present in that file - confirm
        # that every file contains the same categories.
        # An explicit float dtype is required: recent pandas versions return
        # boolean dummies, and mixing them with float columns would yield an
        # `object` array.
        X = pd.get_dummies(df.iloc[:, :-1], dtype=np.float32)

        return X.to_numpy(dtype=np.float32), y

    def __create_sequences(self, X, y):
        '''
        Split data into sequences of `input_length` X_t features and `output_length` y labels.

        Window `i` covers rows `i .. i + input_length - 1` of `X`; its labels
        are the `output_length` values of the last column of `y` starting at
        the last row of the window.

        Returns:
            `(X_seq, y_seq)` with shapes `(n, input_length, n_features)` and
            `(n, output_length)`. `n` is 0 when the data is too short to hold
            a single window.
        '''
        X = np.asarray(X)
        y = np.asarray(y)

        n_sequences = len(X) - self.input_length - self.output_length + 2
        if n_sequences <= 0:
            return (
                np.empty((0, self.input_length) + X.shape[1:], dtype=X.dtype),
                np.empty((0, self.output_length), dtype=y.dtype),
            )

        # The window axis is appended last: (windows, n_features, input_length).
        windows = np.lib.stride_tricks.sliding_window_view(
            X, self.input_length, axis=0)
        X_seq = np.moveaxis(windows[:n_sequences], -1, 1)

        # Labels start at the last time step of each input window.
        targets = y[self.input_length - 1:, -1]
        y_seq = np.lib.stride_tricks.sliding_window_view(
            targets, self.output_length)

        # The views are read-only and share memory; return independent arrays.
        return X_seq.copy(), y_seq.copy()

    def __generate_data(self, batch_files):
        '''
        Reads multiple files, processes them, and returns a batch of (X, y) sequences.

        Returns:
            `X` with shape `(n_files, n_sequences, input_length, n_features)`
            and `y` with shape `(n_files, n_sequences, output_length)`. Note
            the leading per-file axis: consumers expecting
            `(samples, input_length, n_features)` must merge the first two
            axes.

        Raises:
            ValueError: if the files do not produce arrays of equal shape.
        '''
        X_batch_data = []
        y_batch_data = []

        for file in batch_files:
            X_prep, y_prep = self.__preprocess(pd.read_parquet(file))
            X_seq, y_seq = self.__create_sequences(X_prep, y_prep)

            X_batch_data.append(X_seq)
            y_batch_data.append(y_seq)

        # `np.stack` fails loudly on mismatched shapes instead of building a
        # ragged array.
        return np.stack(X_batch_data), np.stack(y_batch_data)

    def __len__(self):
        '''
        Provides the number of batches to be returned as items.

        Only full batches are counted: trailing files that do not fill a
        batch are not served during the epoch.
        '''
        return len(self.file_list) // self.batch_size

    def __getitem__(self, index):
        '''
        Returns a batch of time series items.

        Raises:
            IndexError: if `index` is outside `[0, len(self))`.
        '''
        if not 0 <= index < len(self):
            raise IndexError(
                f'batch index {index} out of range for {len(self)} batches')

        start = index * self.batch_size
        batch_files = [
            self.file_list[k]
            for k in self.indices[start:start + self.batch_size]
        ]
        return self.__generate_data(batch_files)

    def on_epoch_end(self):
        '''
        Shuffles the list of datasets.

        The visiting order is reset to the natural order and reshuffled only
        when `shuffle` is enabled.
        '''
        self.indices = np.arange(len(self.file_list))
        if self.shuffle:
            np.random.shuffle(self.indices)

Instanciamos el `DataGenerator`:

In [3]:
data_dir = 'reduced_datasheet'
scaler_sample_fraction = 0.1

input_length = 24 # 6h * 4 rec/h = 24
output_length = 1
batch_size = 32

# Seed Python, NumPy and TensorFlow before any random operation: the files
# sampled to fit the scaler, the per-epoch shuffle and the weight
# initialization then repeat across "Restart & Run All" executions.
seed = 42
tf.keras.utils.set_random_seed(seed)

data_gen = DataGenerator(data_dir, scaler_sample_fraction,
                         input_length, output_length, batch_size)

Probamos el `DataGenerator`:

In [4]:
# Fetch the first batch through the indexing protocol.
sample_batch = data_gen[0]

# Keep the first element of the batch for both the features and the targets.
sample_x, sample_y = (part[0] for part in sample_batch)

print('X shape: ', sample_x.shape)
print('y shape: ', sample_y.shape)

X shape:  (20329, 24, 13)
y shape:  (20329, 1)


## **LSTM**

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, InputLayer

Definimos las dimensiones del *input* del modelo:

In [6]:
# `sample_x` is laid out as (sequences, time steps, features), so axis 2
# holds the number of features.
num_features = sample_x.shape[2]

# Shape of a single model input: (time steps, features).
input_shape = (input_length, num_features)
input_shape

(24, 13)

Creamos el modelo LSTM:

In [7]:
# Two stacked LSTM layers followed by a single-unit dense output.
# The first LSTM returns its full output sequence so that the second one
# receives one vector per time step.
model = Sequential([
    InputLayer(batch_size=batch_size, shape=input_shape),
    LSTM(50, return_sequences=True),
    LSTM(50),
    Dense(1),
])

# Mean squared error loss, optimized with Adam.
model.compile(optimizer='adam', loss='mse')

model.summary()

I0000 00:00:1721129483.293052  126213 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1721129483.322967  126213 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1721129483.323100  126213 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1721129483.324614  126213 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

InternalError: {{function_node __wrapped__Sign_device_/job:localhost/replica:0/task:0/device:GPU:0}} 'cuLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, 0, reinterpret_cast<CUstream>(stream), params, nullptr)' failed with 'CUDA_ERROR_INVALID_HANDLE' [Op:Sign] name: 

Entrenamos el modelo:

In [None]:
# Train for 50 epochs, pulling the batches directly from the generator.
# NOTE(review): `data_gen` stacks one array per file, so each `X` batch is
# 4-D (the single element inspected above already has shape
# (20329, 24, 13)), whereas the model was declared with 3-D inputs
# (batch_size, input_length, num_features) - confirm the shapes are
# compatible before training.
model.fit(data_gen, epochs=50)