# Setup

In [None]:
import numpy as np

import tensorflow as tf
from tensorflow.keras import metrics, losses, layers, models, optimizers, callbacks, activations

# Forecasting a Time Series

In [None]:
def generate_time_series(batch_size, n_steps):
    """Generate a batch of noisy synthetic univariate time series.

    Every series is the sum of two sine waves, each with its own random
    frequency and offset, plus uniform noise in [-0.05, 0.05).

    Args:
        batch_size: Number of independent series to generate.
        n_steps: Number of time steps in each series.

    Returns:
        A float32 array of shape (batch_size, n_steps, 1).
    """
    # One draw per series for each of the four wave parameters. Each unpacked
    # array has shape (batch_size, 1), so it broadcasts against the time axis.
    wave_params = np.random.rand(4, batch_size, 1)
    freq1, freq2, offsets1, offsets2 = wave_params
    time = np.linspace(0, 1, n_steps)

    wave1 = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10))
    wave2 = 0.2 * np.sin((time - offsets2) * (freq2 * 20 + 20))
    noise = 0.1 * (np.random.rand(batch_size, n_steps) - 0.5)

    series = wave1 + wave2 + noise
    # Add a trailing axis of size 1 (one feature per time step).
    return series[..., np.newaxis].astype(np.float32)

In [None]:
# Build the training, validation and test sets (7000 / 2000 / 1000 series)

n_steps = 50
series = generate_time_series(10000, n_steps + 1)

# Inputs are the first n_steps values of each series; the target is the single
# value that follows. Both are cut at the same row boundaries.
x_train, x_valid, x_test = np.split(series[:, :n_steps], [7000, 9000])
y_train, y_valid, y_test = np.split(series[:, -1], [7000, 9000])

## Baseline Metrics

In [None]:
# Naive forecasting baseline: predict that the next value equals the last
# observed value of each input window.
y_pred = x_valid[:, -1, :]
np.mean(losses.mean_squared_error(y_true=y_valid, y_pred=y_pred))

0.02084235

In [None]:
# Use FNN to forecast the next value: the window is flattened and fed to a
# single linear unit.

model = models.Sequential([
    # Tie the input length to n_steps instead of repeating the literal 50.
    layers.Flatten(input_shape=[n_steps, 1]),
    layers.Dense(1)
])

# Compile settings shared with the models defined further down.
# Keras documents `metrics` as a list, so the metric name is wrapped in one.
params = {
    'loss' : 'mse',
    'metrics' : ['mse'],
    'optimizer' : 'adam'
}

model.compile(**params)

# Keras documents `validation_data` as a tuple (x_val, y_val), not a list.
model.fit(x_train, y_train,
          validation_data=(x_valid, y_valid),
          epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fba9f998dd0>

In [None]:
# Score the model on the test set; entry 1 of the result is the compiled metric.
evaluation = model.evaluate(x_test, y_test)
print(f'The MSE of this model is {evaluation[1]}')

The MSE of this model is 0.004371064715087414


## Implementing a Simple RNN

In [None]:
# The simplest RNN: one recurrent layer with a single unit.
# input_shape=[None, 1] leaves the sequence length unspecified.
model = models.Sequential([
    layers.SimpleRNN(1, input_shape=[None, 1])
])

model.compile(**params)

# Keras documents `validation_data` as a tuple (x_val, y_val), not a list.
model.fit(x_train, y_train,
          validation_data=(x_valid, y_valid),
          epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fba9cf4a410>

In [None]:
# Test-set score of the single-unit RNN; entry 1 of the result is the compiled metric.
evaluation = model.evaluate(x_test, y_test)
print(f'The MSE of this model is {evaluation[1]}')

The MSE of this model is 0.014572584070265293


## Deep RNNs

In [None]:
# Deep RNN: three stacked recurrent layers. The first two return the full
# sequence so the next recurrent layer gets one input per time step; the last
# one (a single unit) returns only its final output.
model = models.Sequential([
    layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    layers.SimpleRNN(10, return_sequences=True),
    layers.SimpleRNN(1)
])

model.compile(**params)

# Keras documents `validation_data` as a tuple (x_val, y_val), not a list.
model.fit(x_train, y_train,
          validation_data=(x_valid, y_valid),
          epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fba9cccce90>

In [None]:
# Test-set score of the three-layer RNN; entry 1 of the result is the compiled metric.
evaluation = model.evaluate(x_test, y_test)
print(f'The MSE of this model is {evaluation[1]}')

The MSE of this model is 0.0033664132934063673


In [None]:
# Deep RNN with a Dense output layer: the second recurrent layer returns only
# its final output, which a single linear unit maps to the forecast.
model = models.Sequential([
    layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    layers.SimpleRNN(10),
    layers.Dense(1)
])

model.compile(**params)

# Keras documents `validation_data` as a tuple (x_val, y_val), not a list.
model.fit(x_train, y_train,
          validation_data=(x_valid, y_valid),
          epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fba9a16a550>

In [None]:
# Test-set score of the RNN + Dense model; entry 1 of the result is the compiled metric.
evaluation = model.evaluate(x_test, y_test)
print(f'The MSE of this model is {evaluation[1]}')

The MSE of this model is 0.003151234006509185


## Forecasting Several Time Steps Ahead

In [None]:
# Forecast 10 steps ahead one value at a time, appending each prediction to
# the input so it is used for the next step.

series = generate_time_series(1, n_steps + 10)
x_new = series[:, :n_steps]
y_new = series[:, n_steps:]

x = x_new
for step_ahead in range(10):
    # Drop the oldest `step_ahead` values so the most recent ones,
    # including earlier predictions, form the model input.
    window = x[:, step_ahead:]
    # The prediction gets a time axis so it can be appended along axis 1.
    y_pred_one = np.expand_dims(model.predict(window), axis=1)
    x = np.concatenate((x, y_pred_one), axis=1)

# Everything past the first n_steps entries was produced by the model.
y_pred = x[:, n_steps:]

In [None]:
# Predict the 10 values at once: the targets are the last 10 values of each
# series, the inputs are the first n_steps values.
series = generate_time_series(10000, n_steps + 10)
x_train, y_train = series[:7000, :n_steps], series[:7000, -10:, 0]
x_valid, y_valid = series[7000:9000, :n_steps], series[7000:9000, -10:, 0]
# `-10:` must be a slice here too; the bare index `-10` would keep only one
# value per series instead of the 10 targets the model outputs.
x_test, y_test = series[9000:, :n_steps], series[9000:, -10:, 0]

# Create, compile and train the model
model = models.Sequential([
    layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    layers.SimpleRNN(20),
    layers.Dense(10)  # one output per forecast step
])

model.compile(**params)

# Keras documents `validation_data` as a tuple (x_val, y_val), not a list.
model.fit(x_train, y_train,
          validation_data=(x_valid, y_valid),
          epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fba9f2e7cd0>

In [None]:
# Forecast the values following the series in x_new with a single model call.
y_pred = model.predict(x=x_new)
y_pred

array([[-0.4852456 , -0.365886  , -0.22739744, -0.06450187,  0.11762518,
         0.26355815,  0.36882114,  0.4553286 ,  0.4889334 ,  0.48952588]],
      dtype=float32)

In [None]:
# Convert the previous snippet to sequence-to-sequence: at every time step the
# target is the vector of the next 10 values of the series.
horizon = 10
y = np.empty((10000, n_steps, horizon))
for step_ahead in range(1, horizon + 1):
    # Channel (step_ahead - 1) holds the series shifted by `step_ahead` steps.
    shifted = slice(step_ahead, step_ahead + n_steps)
    y[..., step_ahead - 1] = series[:, shifted, 0]

# Same 7000 / 2000 / 1000 row boundaries as the inputs.
y_train, y_valid, y_test = y[:7000], y[7000:9000], y[9000:]

In [None]:
# Sequence-to-sequence RNN: every recurrent layer returns the full sequence and
# the Dense(10) head is applied at each time step via TimeDistributed.
model = models.Sequential([
    layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),
    layers.SimpleRNN(20, return_sequences=True),
    layers.TimeDistributed(layers.Dense(10))
])

model.compile(**params)

# Keras documents `validation_data` as a tuple (x_val, y_val), not a list.
model.fit(x_train, y_train,
          validation_data=(x_valid, y_valid),
          epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fba9765fa10>

# Handling Long Sequences

## Fighting The Unstable Gradients Problem

In [None]:
# Custom SimpleRNN cell that layer-normalizes its output before the activation
class LNSimpleRNNCell(layers.Layer):
    """Simple RNN cell with layer normalization applied before the activation.

    Args:
        units: Size of the cell's state and of its output.
        activation: Activation applied after normalization (default 'tanh').
        **kwargs: Forwarded to the base ``layers.Layer`` constructor.
    """

    def __init__(self, units, activation='tanh', **kwargs):
        super().__init__(**kwargs)
        # State and output have the same size.
        self.state_size = units
        self.output_size = units
        # The inner cell is built without an activation so that normalization
        # can be applied to its linear output first.
        self.simple_rnn_cell = layers.SimpleRNNCell(units, activation=None)
        self.layer_norm = layers.LayerNormalization()
        self.activation = activations.get(activation)

    def call(self, inputs, states):
        """Run one step; return the output and the new state list."""
        # The state returned by the inner cell is discarded: the normalized,
        # activated output is used as the new state instead.
        pre_activation, _ = self.simple_rnn_cell(inputs, states)
        normalized = self.layer_norm(pre_activation)
        activated = self.activation(normalized)
        return activated, [activated]

In [None]:
# Implementing this custom cell requires using a keras.layers.RNN layer,
# which wraps the cell and runs it over the time steps.
model = models.Sequential([
    layers.RNN(LNSimpleRNNCell(20), return_sequences=True, input_shape=[None, 1]),
    layers.RNN(LNSimpleRNNCell(20), return_sequences=True),
    layers.TimeDistributed(layers.Dense(10))
])

model.compile(**params)

# Keras documents `validation_data` as a tuple (x_val, y_val), not a list.
# verbose=0 silences the per-epoch log.
model.fit(x_train, y_train,
          validation_data=(x_valid, y_valid),
          epochs=20,
          verbose=0)

print(f'The MSE of the model is {model.evaluate(x_test, y_test, verbose=0)[1]}')

The MSE of the model is 0.026506725698709488


## Tackling the Short-Term Memory Problem

### LSTM Cells

In [None]:
# Same sequence-to-sequence layout as before, with LSTM layers in place of
# the simple recurrent ones.
model = models.Sequential([
    layers.LSTM(20, return_sequences=True, input_shape=[None, 1]),
    layers.LSTM(20, return_sequences=True),
    layers.Dense(10)
])

model.compile(**params)
# Keras documents `validation_data` as a tuple (x_val, y_val), not a list.
model.fit(x_train, y_train,
          validation_data=(x_valid, y_valid),
          epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fba961fe710>

In [None]:
# Silent evaluation on the test set; entry 1 of the result is the compiled metric.
evaluation = model.evaluate(x=x_test, y=y_test, verbose=0)
test_mse = evaluation[1]
print(f'The MSE of the model is {test_mse}')

The MSE of the model is 0.02304089069366455


### Using 1D convolutional layers to process sequences

In [None]:
def last_time_step_mse(y_true, y_pred):
    """Mean squared error computed on the last time step only.

    Args:
        y_true: Targets indexed as (batch, time step, ...).
        y_pred: Predictions with the same layout as ``y_true``.

    Returns:
        The result of ``metrics.mean_squared_error`` applied to the
        last-time-step slices of both arguments.
    """
    # Take the last time step (axis 1) from BOTH tensors. `y_true[:-1]` would
    # instead drop the last sample of the batch and keep every time step.
    return metrics.mean_squared_error(y_true[:, -1], y_pred[:, -1])

# Compile settings that report the last-time-step MSE as the metric, with
# 'mse' as the loss and 'adam' as the optimizer.
params = dict(
    loss='mse',
    metrics=[last_time_step_mse],
    optimizer='adam',
)


In [None]:
# A strided 1D convolution in front of two GRU layers. This cell only builds
# the model; it is not compiled or trained here.
model = models.Sequential(layers=[
    # kernel_size=4 with strides=2 and 'valid' padding.
    layers.Conv1D(filters=20, kernel_size=4, strides=2, padding='valid',
                  input_shape=[None, 1]),
    layers.GRU(20, return_sequences=True),
    layers.GRU(20, return_sequences=True),
    layers.Dense(10),
])

### WaveNet

In [None]:
# WaveNet-style stack: causal 1D convolutions whose dilation rate cycles
# through 1, 2, 4, 8 twice, followed by a kernel-size-1 output convolution.
model = models.Sequential()
model.add(layers.InputLayer(input_shape=[None, 1]))

for rate in [1, 2, 4, 8] * 2:
    model.add(
        layers.Conv1D(filters=20, kernel_size=2, padding='causal',
                      activation='relu', dilation_rate=rate)
    )

# 10 filters: one output channel per forecast step.
model.add(layers.Conv1D(filters=10, kernel_size=1))

# Exercises

1. Can you think of a few applications for a sequence-to-sequence RNN? What
about a sequence-to-vector RNN, and a vector-to-sequence RNN?

> One application of a sequence-to-sequence RNN would be to take text describing the audio we want to generate as input and to output a track that reproduces the requested audio. A sequence-to-vector RNN could take a song as input and determine its genre. Finally, a vector-to-sequence RNN could take an image as input and produce a description of that image.

2. How many dimensions must the inputs of an RNN layer have? What does each
dimension represent? What about its outputs?

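> The input of an RNN layer must have 3 dimensions: *batch size*, *time steps*, and *dimensionality*. The dimensionality is the number of features of the input at each time step. The output depends on the task: it may be a vector or a sequence. A layer that returns sequences also outputs 3 dimensions (*batch size*, *time steps*, and the number of units), whereas a layer that returns only its last output produces 2 dimensions (*batch size* and the number of units).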

3. If you want to build a deep sequence-to-sequence RNN, which RNN layers
should have return_sequences=True? What about a sequence-to-vector RNN?

> In a sequence-to-sequence RNN, all the RNN layers should have ```return_sequences=True```. In a sequence-to-vector RNN, all the RNN layers except the last one should have ```return_sequences=True```.

4. Suppose you have a daily univariate time series, and you want to forecast the next seven days. Which RNN architecture should you use?

> To forecast the next seven days, one option is a stack of RNN layers with ```return_sequences=True``` in every layer except the top RNN layer, followed by a ```Dense``` layer with 7 neurons and no activation. This would be a sequence-to-vector model. Alternatively, we could train the model with the sequence of the next 7 days as the target at every time step and set ```return_sequences=True``` in all the RNN layers; this would be a sequence-to-sequence model.

5. What are the main difficulties when training RNNs? How can you handle them?

> The two main difficulties are unstable gradients and the limited memory of the network. The first can be mitigated by using smaller learning rates, saturating activation functions, ```LayerNormalization```, or dropout. The second can be addressed by using other RNN cells, such as the ```LSTM``` (Long Short-Term Memory) and ```GRU``` (Gated Recurrent Unit) layers, which improve the long-term memory of the network.

6. Can you sketch the LSTM cell’s architecture?

>

7. Why would you want to use 1D convolutional layers in an RNN?

> Using 1D convolutional layers allows us to apply filters to the input, reducing its dimensionality (for example, shortening the sequence when a stride greater than 1 is used) and helping the network to have a better memory for long sequences.

8. Which neural network architecture could you use to classify videos?

> The architecture could be a sequence-to-vector model in which each frame is treated as a time step: the frames are processed into an embedding, which is then passed to a ```Dense``` layer with softmax as the activation function. The loss function would be ```cross-entropy```.