<a href="https://colab.research.google.com/github/pedro-ponte/deep-learning-time-series/blob/master/notebooks/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import pandas as pd
import tqdm

In [2]:
def get_data_df(data_path="../data/raw/jena_climate_2009_2016.csv"):
    """Read the CSV at ``data_path`` and return it indexed by its 'Date Time' column.

    The index values are kept exactly as read from the file (no date parsing
    is requested here).
    """
    raw = pd.read_csv(data_path)
    return raw.set_index('Date Time')

In [3]:
# Load the raw frame from the default CSV path, indexed by 'Date Time'.
df = get_data_df()

In [4]:
# (rows, columns) of the raw frame; 'Date Time' is the index, so it is not counted.
df.shape

(420551, 14)

In [5]:
# Peek at the first five rows to check column names and the sampling interval.
df.head(5)

Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
01.01.2009 00:10:00,996.52,-8.02,265.4,-8.9,93.3,3.33,3.11,0.22,1.94,3.12,1307.75,1.03,1.75,152.3
01.01.2009 00:20:00,996.57,-8.41,265.01,-9.28,93.4,3.23,3.02,0.21,1.89,3.03,1309.8,0.72,1.5,136.1
01.01.2009 00:30:00,996.53,-8.51,264.91,-9.31,93.9,3.21,3.01,0.2,1.88,3.02,1310.24,0.19,0.63,171.6
01.01.2009 00:40:00,996.51,-8.31,265.12,-9.07,94.2,3.26,3.07,0.19,1.92,3.08,1309.19,0.34,0.5,198.0
01.01.2009 00:50:00,996.51,-8.27,265.15,-9.04,94.1,3.27,3.08,0.19,1.92,3.09,1309.0,0.32,0.63,214.3


In [8]:
# Windowing parameters, expressed in rows of the raw frame (rows are 10 minutes apart).
lookback = 1440  # 1440 rows x 10 min = 10 days of history per window
step = 6  # keep every 6th row, i.e. one sample per hour
delay = 144  # 144 rows x 10 min = forecast 24 hours ahead
n_stat_rows = 200000  # leading rows used to estimate the normalisation statistics

# The target is the temperature `delay` rows later. Shifting leaves NaN in the
# last `delay` rows, which are dropped. `assign` returns a new frame, so `df`
# itself is never modified and this cell can be re-run safely.
df_data = (
    df.assign(target=df['T (degC)'].shift(-delay))
      .dropna(how='any', axis=0)
)

X = df_data.drop(columns=['target'])
y = df_data['target']

# Standardise every feature with statistics taken from the first `n_stat_rows`
# rows only. The target is a temperature, so it is scaled with the
# 'T (degC)' statistics to stay on the same scale as that input feature.
mean = X.iloc[:n_stat_rows].mean(axis=0)
std = X.iloc[:n_stat_rows].std(axis=0)
X_norm = (X - mean) / std
y_norm = (y - mean['T (degC)']) / std['T (degC)']

# Optional outlier filter (disabled): keep only rows in which no feature lies
# more than 10 standard deviations from the mean.
# good_indices = (np.abs(X_norm) > 10).sum(axis=1) == 0
# X_norm = X_norm[good_indices]
# y_norm = y_norm[good_indices]



In [9]:
def transform_data_to_rnn_shape(X, y, X_index, lookback=50, step=1):
    """Cut a 2-D feature matrix into overlapping windows for a recurrent model.

    One window is produced for every run of ``lookback`` consecutive rows.
    Inside a window every ``step``-th row is kept, counted backwards from the
    window's last row, so the last row is always included. The window's
    target is the ``y`` value on that last row and its label is the
    ``X_index`` entry on that last row.

    Parameters
    ----------
    X : array-like, shape (n_rows, n_features)
    y : array-like, shape (n_rows,)
    X_index : sequence of length n_rows supporting integer ``[]`` indexing
    lookback : int
        Number of consecutive rows spanned by each window.
    step : int
        Sub-sampling stride inside a window.

    Returns
    -------
    dict
        ``"X_rnn"`` (n_windows, ceil(lookback / step), n_features),
        ``"y_rnn"`` (n_windows,) and ``"X_index_rnn"`` (n_windows,).
        Features and targets share the dtype produced by stacking ``X`` and
        ``y`` side by side.

    Raises
    ------
    ValueError
        If ``lookback`` or ``step`` is smaller than 1.
    """
    if lookback < 1 or step < 1:
        raise ValueError("lookback and step must be positive integers")

    # Accept lists / pandas objects as well as ndarrays.
    X = np.asarray(X)
    y = np.asarray(y)
    sequences = np.hstack((X, y[:, np.newaxis]))
    print(sequences.shape)

    # Iterate over exactly the windows that fit, so the progress bar can
    # actually reach 100% instead of stopping at a `break`.
    n_windows = max(len(sequences) - lookback + 1, 0)
    # Offset of the earliest kept row inside a window; chosen so that striding
    # forward by `step` lands exactly on the window's last row.
    first_offset = (lookback - 1) % step

    X_rnn, y_rnn, X_index_rnn = list(), list(), list()
    for i in tqdm.tqdm(range(n_windows)):
        end_ix = i + lookback
        # A strided slice replaces the per-iteration Python index list.
        X_rnn.append(sequences[i + first_offset:end_ix:step, :-1])
        y_rnn.append(sequences[end_ix - 1, -1])
        X_index_rnn.append(X_index[end_ix - 1])
    print("done loop")

    X_rnn = np.asarray(X_rnn)
    print("done first array")
    y_rnn = np.asarray(y_rnn)
    X_index_rnn = np.asarray(X_index_rnn)
    print("done all array")

    return {"X_rnn": X_rnn, "y_rnn": y_rnn, "X_index_rnn": X_index_rnn}
            

In [95]:
# NOTE(review): the saved output below reports 20 fewer rows than X.shape;
# presumably it was recorded while the commented-out outlier filter was
# active. Re-run after a kernel restart to refresh it.
X_norm.shape

(420387, 14)

In [96]:
# Shape of the un-normalised feature frame, for comparison with X_norm.
X.shape

(420407, 14)

In [139]:
# Count every distinct normalised value across all features and sort by value,
# so the extremes (largest |z-score|) show at the head and tail of the output.
pd.Series(X_norm.values.flatten()).value_counts().sort_index()

-8.877632    1
-8.818673    1
-8.429543    2
-8.323416    1
-8.299833    1
            ..
 8.537405    2
 8.539471    1
 8.551367    1
 8.562196    1
 8.661363    1
Length: 75815, dtype: int64

In [10]:
# Build the sliding windows. The index is taken from X_norm (not X) so the
# labels stay aligned with the values even if rows were filtered out of
# X_norm; with the filter disabled, the two indexes are identical.
data_transf_results = transform_data_to_rnn_shape(
    X_norm.values, y_norm.values, X_norm.index, lookback=lookback, step=step)

  1%|          | 2316/420407 [00:00<00:18, 23156.99it/s]

(420407, 15)


 99%|█████████▉| 417344/420407 [00:20<00:00, 21452.58it/s]

done loop


 99%|█████████▉| 417344/420407 [00:40<00:00, 21452.58it/s]

done first array
done all array


In [34]:
# Unpack the windowed features, targets and per-window index labels.
X_rnn, y_rnn, X_index_rnn = (
    data_transf_results[key] for key in ("X_rnn", "y_rnn", "X_index_rnn")
)

In [35]:
# Index label of the last row of each window.
X_index_rnn

array(['11.01.2009 00:00:00', '11.01.2009 00:10:00',
       '11.01.2009 00:20:00', ..., '30.12.2016 23:40:00',
       '30.12.2016 23:50:00', '31.12.2016 00:00:00'], dtype='<U19')

In [36]:
# Expected layout: (number of windows, kept rows per window, features).
X_rnn.shape

(418968, 240, 14)

In [27]:
# # from keras.models import Sequential
# from keras import layers
# # from keras.optimizers import RMSprop, Adam
# from kerastuner.tuners import RandomSearch, Hyperband
# from kerastuner.engine.hypermodel import HyperModel
# from kerastuner.engine.hyperparameters import HyperParameters
# # from tensorflow import keras
# # from tensorflow.keras import layers
# from kerastuner.tuners import RandomSearch
# import keras
# import keras.optimizers
# import keras.layers

from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch, Hyperband

# def build_gru(hidden_units, dropout, recurrent_dropout, learning_rate):

#     model = Sequential()
#     model.add(layers.GRU(hidden_units, 
#                          dropout=dropout,
#                          recurrent_dropout=recurrent_dropout,
#                          input_shape=(X_rnn.shape[1], X_rnn.shape[-1])))
#     model.add(layers.Dense(1))

#     model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mae', metric=['mae'])
#     return model

#                              hp.Int('hidden_units',
#                                 min_value=10,
#                                 max_value=100,
#                                 step=100), 

def build_gru_hp(hp):
    """Model-building function for Keras Tuner: one GRU layer plus a Dense(1) head.

    Tuned hyperparameters (registered on ``hp``):
      * ``hidden_units``  - GRU width, one of 4, 8, 16, 32, 64
      * ``dropout``       - input dropout of the GRU, between 0 and 0.5
      * ``learning_rate`` - Adam learning rate, sampled on a log scale

    The input shape is read from the global ``X_rnn``. The model is compiled
    with MAE as both loss and metric.
    """
    # Local import: only `keras` and `layers` are imported from tensorflow at
    # the top of this cell.
    import tensorflow

    # Re-seed on every call so each trial starts from the same random state.
    np.random.seed(1)
    tensorflow.random.set_seed(2)

    model = keras.Sequential()
    model.add(layers.GRU(
        hp.Choice('hidden_units', values=[4, 8, 16, 32, 64]),
        dropout=hp.Float('dropout', 0, 0.50),
        input_shape=(X_rnn.shape[1], X_rnn.shape[-1])))
    model.add(layers.Dense(1))

    # Learning rates span orders of magnitude, so sample them on a log scale.
    # With linear sampling over [1e-10, 1e-1] almost every draw was > 1e-3.
    # The lower bound is raised to 1e-4: with log sampling a 1e-10 floor would
    # spend most trials on learning rates that are far too small.
    learning_rate = hp.Float('learning_rate',
                             min_value=1e-4,
                             max_value=1e-1,
                             sampling='log')

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='mae',
                  metrics=['mae'])
    return model


def build_gru(hidden_units, dropout, recurrent_dropout, learning_rate):
    """Build and compile a single-layer GRU regressor with fixed hyperparameters.

    Parameters
    ----------
    hidden_units : int
        Width of the GRU layer.
    dropout : float
        Dropout applied to the GRU inputs.
    recurrent_dropout : float
        Dropout applied to the GRU recurrent state.
    learning_rate : float
        Learning rate handed to the RMSprop optimizer.

    Returns
    -------
    A compiled ``keras.Sequential`` model (loss: MAE) whose input shape is
    read from the global ``X_rnn``.
    """
    # Local import: only `keras` and `layers` are imported from tensorflow at
    # the top of this cell.
    import tensorflow

    # Fixed seeds so repeated builds start from the same random state.
    np.random.seed(1)
    tensorflow.random.set_seed(2)

    model = keras.Sequential()
    model.add(layers.GRU(
        hidden_units,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        input_shape=(X_rnn.shape[1], X_rnn.shape[-1])))
    model.add(layers.Dense(1))

    # The learning_rate argument used to be silently ignored (RMSprop() was
    # created with its default); it is now passed through.
    model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=learning_rate),
                  loss='mae')
    return model

def build_model(hp):
    """Tunable dense classifier: one ReLU hidden layer and a 10-way softmax output.

    Registers ``units`` (32..512 in steps of 32) and ``learning_rate``
    (1e-2, 1e-3 or 1e-4) on ``hp``. The tuner created in this cell is built
    from ``build_gru_hp``, not from this function.
    """
    hidden_width = hp.Int('units', min_value=32, max_value=512, step=32)

    model = keras.Sequential()
    model.add(layers.Dense(units=hidden_width, activation='relu'))
    model.add(layers.Dense(10, activation='softmax'))

    step_size = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=keras.optimizers.Adam(step_size),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


# Random hyperparameter search over the GRU defined by build_gru_hp.
tuner = RandomSearch(
    build_gru_hp,
    objective='val_loss',       # build_gru_hp compiles with loss='mae', so this is validation MAE
    max_trials=100,             # upper bound on the number of hyperparameter combinations tried
    executions_per_trial=1,     # each combination is trained once
    directory='test_dir_11',    # where the tuner stores its results
)

In [28]:
# Small split for quick experiments: first 10,000 windows for training and the
# next 10,000 for validation. The following cell reassigns the same four names.
X_train, X_val = X_rnn[:10000], X_rnn[10000:20000]
y_train, y_val = y_rnn[:10000], y_rnn[10000:20000]

In [29]:
# Chronological split: first 200,000 windows for training, the following
# 100,000 for validation.
X_train, X_val = X_rnn[:200000], X_rnn[200000:300000]
y_train, y_val = y_rnn[:200000], y_rnn[200000:300000]

In [30]:
# Run the hyperparameter search; each trial is fitted with these settings.
# NOTE(review): steps_per_epoch=500 with batch_size=128 presumably limits an
# epoch to 64,000 of the training windows - confirm this is intended.
tuner.search(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=128,
    steps_per_epoch=500,
)

Train on 200000 samples, validate on 100000 samples
Epoch 1/10


KeyboardInterrupt: 

In [37]:
def evaluate_naive_method(X, y_true, feature_index=1):
    """Mean absolute error of a persistence forecast.

    The prediction for each window is the value of one feature at the
    window's last time step.

    Parameters
    ----------
    X : array-like, shape (n_windows, n_steps, n_features)
    y_true : array-like, shape (n_windows,)
    feature_index : int, default 1
        Column of ``X`` used as the prediction. The default matches the
        position of 'T (degC)' among the feature columns shown by
        ``df.head()`` in this notebook.

    Returns
    -------
    float
        ``mean(|X[:, -1, feature_index] - y_true|)``.
    """
    X = np.asarray(X)
    y_true = np.asarray(y_true)
    y_pred = X[:, -1, feature_index]
    return np.mean(np.abs(y_pred - y_true))

In [38]:
# Baseline to beat: MAE (in normalised units) of the persistence forecast on
# the validation windows.
evaluate_naive_method(X_val, y_val)

0.27681210142182566

In [32]:
# %matplotlib inline
# import matplotlib.pyplot as plt

In [23]:
# def build_gru(hidden_units, dropout, recurrent_dropout, learning_rate):
    
#     from numpy.random import seed
#     seed(1)
#     import tensorflow
#     tensorflow.random.set_seed(2)
#     model = keras.Sequential()
#     model.add(layers.GRU(
#         hidden_units,
#         dropout=dropout,
#         recurrent_dropout=recurrent_dropout,
#         input_shape=(X_rnn.shape[1], X_rnn.shape[-1])))
#     model.add(layers.Dense(1))

#     model.compile(optimizer=keras.optimizers.RMSprop(learning_rate), 
#                   loss='mae') # metric=['mae'])
#     return model


In [24]:
# gru = build_gru(32, 0.2, 0.2, 0.000001)

In [39]:
# gru.summary()