In [1]:
import tensorflow as tf

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Bidirectional

from tensorflow.python.keras import backend as K

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

from numpy.random import seed


SEED = 123  # single source of randomness: seeds the RNGs below and the train/validation split
DATA_SPLIT_PCT = 0.2  # fraction of the training windows held out for validation

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and batch shuffling start from the same state on every
# Restart & Run All. Without this, SEED only affected train_test_split.
tf.keras.utils.set_random_seed(SEED)

df = pd.read_csv("reanalysis.csv")

2024-05-14 17:15:22.098018: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-14 17:15:22.102452: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-14 17:15:22.174014: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# One-hot encode the categorical 'month' and 'hour' columns.
# dtype=float keeps the indicator columns numeric: with the pandas >= 2.0
# default (bool), mixing them with float columns makes `.values` return an
# object-dtype array, which is slow and memory hungry downstream.
hotencoding1 = pd.get_dummies(df['month'], dtype=float).add_prefix('month')
hotencoding2 = pd.get_dummies(df['hour'], dtype=float).add_prefix('hour')

# Replace the raw categorical columns by their indicator columns.
df = pd.concat(
    [df.drop(['month', 'hour'], axis=1), hotencoding1, hotencoding2],
    axis=1)

# Rename response column name for ease of understanding
df = df.rename(columns={'sfcWind': 'y'})

# Chronological split: rows strictly before SPLIT_TIME are used for training,
# rows at or after it for testing.
# NOTE(review): 'time' is compared against a string literal, which assumes
# ISO-formatted timestamps ('YYYY-MM-DD HH:MM:SS') or datetimes - confirm.
SPLIT_TIME = '2014-01-01 00:00:00'
df_train = df[df['time'] < SPLIT_TIME]
df_test = df[df['time'] >= SPLIT_TIME]

# 'time' was only needed for the split; it is not a model feature.
df_train = df_train.drop(['time'], axis=1)
df_test = df_test.drop(['time'], axis=1)

In [4]:
# Split each frame into a feature matrix (every column except 'y') and the
# target vector 'y', both as numpy arrays.
input_X_train = df_train.drop(columns='y').to_numpy()
input_y_train = df_train['y'].to_numpy()

input_X_test = df_test.drop(columns='y').to_numpy()
input_y_test = df_test['y'].to_numpy()

# Number of feature columns fed to the model.
n_features = input_X_train.shape[1]

In [5]:
def temporalize(X, y, lookback):
    '''
    Build fixed-length sliding windows over a time-ordered feature matrix.

    Window i holds rows i+2 .. i+lookback+1 of X and is paired with the target
    y[i+lookback+1]. Consequently the first two rows of X never appear in any
    window, and the last row of every window has the same index as its target.
    NOTE(review): if X[k] is observed at the same time as y[k], the window is
    not strictly "past" data - confirm this alignment is intended.

    Inputs
    X         A 2D numpy array ordered by time of shape: (n_observations x n_features)
    y         A 1D numpy array with indexes aligned with X, i.e. y[i] should correspond to X[i]. Shape: n_observations.
    lookback  The window size to look back in the past records. Shape: a positive integer scalar.

    Output
    output_X  A 3D numpy array of shape: ((n_observations-lookback-1) x lookback x n_features).
              Always 3D, including when there is a single window, lookback == 1,
              a single feature, or no window at all (first dimension 0).
    output_y  A 1D array of shape: (n_observations-lookback-1), aligned with output_X.

    Raises
    ValueError  If lookback < 1, X is not 2D, or y is shorter than X.
    '''
    if lookback < 1:
        raise ValueError("lookback must be a positive integer")

    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n_observations, n_features)")
    if len(y) < len(X):
        raise ValueError("y must have one entry per row of X")

    # Too-short inputs yield zero windows instead of a negative count.
    n_windows = max(len(X) - lookback - 1, 0)

    # row_idx[i, j] = i + 2 + j, i.e. the rows that make up window i.
    first_rows = np.arange(n_windows)[:, np.newaxis] + 2
    row_idx = first_rows + np.arange(lookback)

    # Integer-array indexing gathers all windows at once and returns a fresh
    # (n_windows, lookback, n_features) array, so no squeeze is needed and
    # length-1 axes are preserved.
    output_X = X[row_idx]
    output_y = y[lookback + 1:lookback + 1 + n_windows].copy()
    return output_X, output_y

def flatten(X):
    '''
    Reduce a 3D array to 2D by keeping only the last timestep of every sample.

    Input
    X            A 3D array for lstm, where the array is sample x timesteps x features.

    Output
    flattened_X  A 2D float array, sample x features, where row i is X[i, -1, :].
    '''
    n_samples = X.shape[0]
    last_step = X.shape[1] - 1
    n_feats = X.shape[2]

    # np.empty defaults to float64; the assignment below casts into it.
    flattened_X = np.empty((n_samples, n_feats))
    if n_samples:
        flattened_X[:] = X[:, last_step, :]
    return flattened_X

def scale(X, scaler):
    '''
    Apply a fitted scaler to every sample of a 3D array, in place.

    Inputs
    X            A 3D array for lstm, where the array is sample x timesteps x features.
                 It is overwritten with the scaled values.
    scaler       An object exposing transform(2D array), e.g., a fitted sklearn.preprocessing.StandardScaler.

    Output
    X            The same array object that was passed in, now holding the scaled values.
    '''
    # Each `sample` is a (timesteps x features) view into X, so writing
    # through it updates X itself one sample at a time.
    for sample in X:
        sample[...] = scaler.transform(sample)

    return X

In [8]:
# Number of consecutive records in each model input window.
lookback = 5

# Turn the 2D train/test matrices into (samples x lookback x features) windows
# with their aligned targets.
X_train, y_train = temporalize(input_X_train, input_y_train, lookback)
X_test, y_test = temporalize(input_X_test, input_y_test, lookback)

In [9]:
# Hold out DATA_SPLIT_PCT of the training windows for validation.
# NOTE(review): train_test_split shuffles by default, so validation windows
# can overlap in time with training windows - confirm this is intended.
split_parts = train_test_split(X_train,
                               y_train,
                               test_size=DATA_SPLIT_PCT,
                               random_state=SEED)
X_train, X_valid, y_train, y_valid = split_parts

TIMESTEPS = X_train.shape[1]   # equal to the lookback
N_FEATURES = X_train.shape[2]  # the number of features

# Fit the scaler on the training windows only (last timestep of each window),
# before any array is scaled.
scaler = StandardScaler()
scaler.fit(flatten(X_train))

# scale() works in place and returns its argument, so each *_scaled name
# refers to the same array object as the corresponding unscaled name.
X_train_scaled = scale(X_train, scaler)
X_valid_scaled = scale(X_valid, scaler)
X_test_scaled = scale(X_test, scaler)

# Model

In [10]:
# Two stacked LSTM layers followed by a single linear unit for regression.
# The first LSTM returns the full sequence so the second one can consume it;
# the second returns only its final state.
model = Sequential([
    Input(shape=(TIMESTEPS, N_FEATURES), name='input'),
    LSTM(units=16,
         activation='tanh',
         recurrent_activation='sigmoid',
         return_sequences=True,
         name='lstm_layer_1'),
    LSTM(units=8,
         activation='tanh',
         recurrent_activation='sigmoid',
         return_sequences=False,
         name='lstm_layer_2'),
    Dense(units=1, activation='linear', name='output'),
])

model.summary()

In [11]:
# Train on mean squared error and additionally report RMSE, which is in the
# same units as the target.
rmse_metric = tf.keras.metrics.RootMeanSquaredError()
model.compile(loss='mse', optimizer='adam', metrics=[rmse_metric])

In [12]:
# Keras is fed float32 copies of the scaled windows.
X_train_f32 = np.asarray(X_train_scaled).astype('float32')
X_valid_f32 = np.asarray(X_valid_scaled).astype('float32')

training_run = model.fit(x=X_train_f32,
                         y=y_train,
                         batch_size=128,
                         epochs=100,
                         validation_data=(X_valid_f32, y_valid),
                         verbose=1)

# Keep only the per-epoch metrics dict, not the History object.
history = training_run.history

Epoch 1/100


2024-05-14 17:23:33.210918: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 88279200 exceeds 10% of free system memory.


[1m767/767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - loss: 5.2826 - root_mean_squared_error: 2.1649 - val_loss: 0.4455 - val_root_mean_squared_error: 0.6675
Epoch 2/100
[1m767/767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.4219 - root_mean_squared_error: 0.6494 - val_loss: 0.3723 - val_root_mean_squared_error: 0.6102
Epoch 3/100
[1m767/767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.3695 - root_mean_squared_error: 0.6078 - val_loss: 0.3525 - val_root_mean_squared_error: 0.5937
Epoch 4/100
[1m767/767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.3508 - root_mean_squared_error: 0.5923 - val_loss: 0.3417 - val_root_mean_squared_error: 0.5846
Epoch 5/100
[1m767/767[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.3378 - root_mean_squared_error: 0.5812 - val_loss: 0.3355 - val_root_mean_squared_error: 0.5792
Epoch 6/100
[1m767/767[0m [32m━━━━━━━━━━━━━━

In [16]:
# Predict on the scaled test windows (cast to float32 for Keras) and write
# the predictions to CSV.
X_test_f32 = np.asarray(X_test_scaled).astype('float32')
p = model.predict(X_test_f32)

res = pd.DataFrame(p)
res.to_csv("lstm_output.csv")

[1m273/273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
