# Vanilla RNNs and GRUs

In [64]:
import numpy as np
from numpy import random
from time import perf_counter
import tensorflow as tf

In [65]:
def sigmoid(x):
    """Numerically stable logistic sigmoid, applied element-wise.

    Computes 1 / (1 + exp(-x)). The exponential is only ever evaluated on
    -|x|, so it cannot overflow; the naive form evaluates exp(-x), which
    overflows to inf (and emits a RuntimeWarning) for large negative x.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        The sigmoid of `x`, with the same shape as `x` (a NumPy scalar when
        `x` is a scalar).
    """
    x = np.asarray(x)
    # z = exp(-|x|) always lies in [0, 1], so neither branch below can overflow.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) -- the same value.
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () turns a 0-d result back into a scalar; n-d arrays are unaffected.
    return out[()]

## Part 1: Forward method for vanilla RNNs and GRUs using numpy

In [66]:
# Seed NumPy's global RNG so every draw below is repeatable.
random.seed(10)

emb = 128    # Embedding size
T = 256      # Length of sequence
h_dim = 16   # Hidden state dimension

# Initial hidden state: a column of zeros.
h_0 = np.zeros((h_dim, 1))

# Weight matrices act on the stacked [h, x] column, hence emb + h_dim columns.
# The draws happen in the order w1, w2, w3, b1, b2, b3, X so the seeded values stay the same.
w1, w2, w3 = (random.standard_normal((h_dim, emb + h_dim)) for _ in range(3))
b1, b2, b3 = (random.standard_normal((h_dim, 1)) for _ in range(3))

# One column vector per time step; the trailing axis of size 1 plays the role of the batch.
X = random.standard_normal((T, emb, 1))

# The vanilla RNN uses a single (weight, bias) pair.
weights_vanilla = [w1, b1]
# The GRU takes three weight matrices followed by three biases; w1 and b1 are copied, not shared.
weights_GRU = [w1.copy(), w2, w3, b1.copy(), b2, b3]

### Forward method for vanilla RNNs

\begin{equation}
h^{<t>}=g(W_{h}[h^{<t-1>},x^{<t>}] + b_h)
\label{eq: htRNN}
\end{equation}
    
\begin{equation}
\hat{y}^{<t>}=g(W_{yh}h^{<t>} + b_y)
\label{eq: ytRNN}
\end{equation}

where $[h^{<t-1>},x^{<t>}]$ means that $h^{<t-1>}$ and $x^{<t>}$ are concatenated together. 

In [67]:
def forward_V_RNN(inputs, weights):
    """Forward propagation for a single vanilla RNN cell.

    Args:
        inputs: Pair ``(x, h_prev)`` -- the current input and the previous
            hidden state.
        weights: Pair ``(wh, bh)`` -- the matrix applied to the stacked
            ``[h_prev, x]`` vector and the bias added to the product.

    Returns:
        ``(y, h_t)``. The output layer is left out for clarity, so ``y`` is
        simply the new hidden state (both entries are the same array).
    """
    x, h_prev = inputs
    wh, bh = weights

    # Stack the previous hidden state first, then the current input.
    stacked = np.concatenate([h_prev, x])

    # Affine map followed by the sigmoid non-linearity gives the new hidden state.
    h_next = sigmoid(np.dot(wh, stacked) + bh)

    return h_next, h_next

In [68]:
# Sanity check: run one vanilla RNN step on the input X[1], starting from the
# initial hidden state h_0, and display the output y (element [0] of the returned pair).
forward_V_RNN(inputs = (X[1], h_0), weights=weights_vanilla)[0]

array([[9.77827287e-01],
       [9.99999109e-01],
       [5.19961637e-01],
       [9.99999886e-01],
       [9.99707011e-01],
       [3.02197037e-04],
       [9.58733743e-01],
       [2.10804828e-02],
       [9.77365398e-05],
       [9.99835894e-01],
       [1.63200940e-08],
       [8.51874636e-01],
       [5.21399924e-02],
       [2.15495962e-02],
       [9.99879173e-01],
       [9.99997211e-01]])

### Forward method for GRUs

GRUs have relevance $\Gamma_r$ and update $\Gamma_u$ gates that control how the hidden state $h^{<t>}$ is updated on every time step. With these gates, GRUs are capable of keeping relevant information in the hidden state even for long sequences. The equations needed for the forward method in GRUs are provided below: 

\begin{equation}
\Gamma_r=\sigma{(W_r[h^{<t-1>}, x^{<t>}]+b_r)}
\end{equation}

\begin{equation}
\Gamma_u=\sigma{(W_u[h^{<t-1>}, x^{<t>}]+b_u)}
\end{equation}

\begin{equation}
c^{<t>}=\tanh{(W_h[\Gamma_r*h^{<t-1>},x^{<t>}]+b_h)}
\end{equation}

\begin{equation}
h^{<t>}=\Gamma_u*c^{<t>}+(1-\Gamma_u)*h^{<t-1>}
\end{equation}

In [69]:
def forward_GRU(inputs, weights):
    """Forward propagation for a single GRU cell.

    Args:
        inputs: Pair ``(x, h_prev)`` -- the current input and the previous
            hidden state.
        weights: Sequence ``(wu, wr, wc, bu, br, bc)`` -- weight matrices for
            the update gate, relevance gate and candidate state, followed by
            their biases in the same order.

    Returns:
        ``(y, h_t)``. No separate output layer is applied, so ``y`` is the
        new hidden state (both entries are the same array).
    """
    x, h_prev = inputs
    wu, wr, wc, bu, br, bc = weights

    # Both gates read the same stacked [h_prev, x] vector, so build it once.
    gate_input = np.concatenate([h_prev, x])

    # Update gate: how much of the candidate replaces the old state.
    update_gate = sigmoid(np.dot(wu, gate_input) + bu)

    # Relevance gate: how much of the old state feeds the candidate.
    relevance_gate = sigmoid(np.dot(wr, gate_input) + br)

    # Candidate state computed from the gated previous state and the input.
    candidate_input = np.concatenate([relevance_gate * h_prev, x])
    candidate = np.tanh(np.dot(wc, candidate_input) + bc)

    # Blend candidate and previous state element-wise using the update gate.
    h_next = update_gate * candidate + (1 - update_gate) * h_prev

    return h_next, h_next

In [70]:
# Sanity check: run one GRU step on the input X[1], starting from the initial
# hidden state h_0, and display the output y (element [0] of the returned pair).
forward_GRU([X[1], h_0], weights_GRU)[0]

array([[ 9.77779014e-01],
       [-9.97986240e-01],
       [-5.19958083e-01],
       [-9.99999886e-01],
       [-9.99707004e-01],
       [-3.02197037e-04],
       [-9.58733503e-01],
       [ 2.10804828e-02],
       [ 9.77365398e-05],
       [ 9.99833090e-01],
       [ 1.63200940e-08],
       [ 8.51874303e-01],
       [ 5.21399924e-02],
       [ 2.15495959e-02],
       [ 9.99878828e-01],
       [ 9.77165472e-01]])

## Implementation of the `scan` function

- `fn` : the function to be called recurrently (i.e. `forward_GRU`)
- `elems` : the list of inputs for each time step (`X`)
- `weights` : the parameters needed to compute `fn`
- `h_0` : the initial hidden state

In [71]:
def scan(fn, elems, weights, h_0):
    """Apply a recurrent cell to a sequence, threading the hidden state through.

    Args:
        fn: Cell function called as ``fn([x, h], weights)`` and returning
            ``(y, h_new)``.
        elems: Iterable of per-step inputs; iterated along its first axis.
        weights: Parameters passed unchanged to every call of ``fn``.
        h_0: Initial hidden state.

    Returns:
        ``(ys, h_T)`` -- the list of per-step outputs and the hidden state
        after the last step (``h_0`` itself when ``elems`` is empty).
    """
    outputs = []
    state = h_0

    # Each step consumes one element plus the current state and yields the next state.
    for step_input in elems:
        step_output, state = fn([step_input, state], weights)
        outputs.append(step_output)

    return outputs, state

In [72]:
# Run the vanilla RNN cell over every element of X, starting from h_0.
# ys collects the per-step outputs; h_T is the hidden state after the last step.
ys, h_T = scan(forward_V_RNN, X, weights_vanilla, h_0)

In [73]:
# Inspect what scan returned: the number of per-step outputs, the shape of one
# output, and the shape of the final hidden state.
print(f"Length of ys: {len(ys)}")
print(f"Shape of each y within ys: {ys[0].shape}")
print(f"Shape of h_T: {h_T.shape}")

Length of ys: 256
Shape of each y within ys: (16, 1)
Shape of h_T: (16, 1)


## Comparison between vanilla RNNs and GRUs

In the next two cells, we compute forward propagation for a sequence with 256 time steps (`T`) for an RNN and a GRU with the same hidden state `h_t` size (`h_dim`=16).  

In [74]:
# vanilla RNNs
tic = perf_counter()
ys, h_T = scan(forward_V_RNN, X, weights_vanilla, h_0)
toc = perf_counter()
RNN_time=(toc-tic)*1000

In [75]:
# Report the vanilla RNN timing in milliseconds, rounded to two decimals.
print(f"It took {RNN_time:.2f}ms to run the forward method for the vanilla RNN.")

It took 2.45ms to run the forward method for the vanilla RNN.


Comment: Takes much more time when you use np.vstack()

In [76]:
# GRUs
# Same measurement as for the vanilla RNN, using the GRU cell and its weights.
# perf_counter() differences are in seconds; * 1000 converts to milliseconds.
tic = perf_counter()
ys, h_T = scan(forward_GRU, X, weights_GRU, h_0)
toc = perf_counter()
GRU_time=(toc-tic)*1000

In [77]:
# Report the GRU timing in milliseconds, rounded to two decimals.
print(f"It took {GRU_time:.2f}ms to run the forward method for the GRU.")

It took 4.25ms to run the forward method for the GRU.


Comment: Takes much more time when you use np.vstack()

## Create a GRU model in tensorflow

 - [`Sequential`](https://www.tensorflow.org/guide/keras/sequential_model) A Sequential model is appropriate for a plain stack of layers where each layer has exactly one input tensor and one output tensor.
   - [`Dense`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dense) A regular fully connected layer
   - [`GRU`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/GRU) The GRU (gated recurrent unit) layer. The hidden state dimension should be specified (the syntax is the same as for `Dense`). By default it does not return a sequence, but only the output of the last unit. If you want to stack two consecutive GRU layers, you need the first one to output a sequence, which you can achieve by setting the parameter `return_sequences` to True. If you are further interested in similar layers, you can also check out the [`RNN`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/RNN), [`LSTM`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM) and [`Bidirectional`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Bidirectional). If you want to use a RNN or LSTM instead of GRU in the code below, simply change the layer name, no other change in the syntax is needed.

In [78]:
# Three stacked GRU layers followed by a fully connected output layer.
model_GRU = tf.keras.Sequential([
    # return_sequences=True: output the whole sequence so the next GRU layer receives one
    tf.keras.layers.GRU(256, return_sequences=True, name='GRU_1_returns_seq'),
    # Second GRU layer, again passing a full sequence on to the layer below
    tf.keras.layers.GRU(128, return_sequences=True, name='GRU_2_returns_seq'),
    # return_sequences is left at its default, so only the output of the last unit is returned
    tf.keras.layers.GRU(64, name='GRU_3_returns_last_only'),
    # Regular fully connected layer with 10 units
    tf.keras.layers.Dense(10)
])

In [79]:
# Ask for a summary before the model has been called on any data.
# NOTE(review): whether this raises or prints an incomplete summary appears to
# depend on the installed Keras version -- confirm.
# Any exception is caught and printed so the remaining cells can still run.
try:
    model_GRU.summary()
except Exception as e:
    print(e)

In [80]:
# Remember these three numbers and follow them further through the notebook
batch_size = 60
sequence_length = 50
word_vector_length = 40

input_data = tf.random.normal([batch_size, sequence_length, word_vector_length])

# Pass the data through the network
prediction = model_GRU(input_data)

# Show the summary of the model
model_GRU.summary()

In [81]:
# Define some data with a different length of word vectors
new_word_vector_length = 44  # Before it was 40
# Keep the batch_size = 60 and sequence_length = 50 as originally
input_data_1 = tf.random.normal([batch_size, sequence_length, new_word_vector_length])

# Pass the data through the network. This should Fail (if you ran all the cells above)
try:
    prediction = model_GRU(input_data_1)
except Exception as e:
    print(e)

Exception encountered when calling GRUCell.call().

[1m{{function_node __wrapped__MatMul_device_/job:localhost/replica:0/task:0/device:CPU:0}} Matrix size-incompatible: In[0]: [60,44], In[1]: [40,768] [Op:MatMul] name: [0m

Arguments received by GRUCell.call():
  • inputs=tf.Tensor(shape=(60, 44), dtype=float32)
  • states=('tf.Tensor(shape=(60, 256), dtype=float32)',)
  • training=False


2024-04-12 16:34:52.397242: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: Matrix size-incompatible: In[0]: [60,44], In[1]: [40,768]


Why did this fail? Remember how the layers are constructed: they know what length of vectors to expect, and their weight matrices are defined to accommodate it. If you change the length of the word vector, the input can no longer be multiplied by the (now incompatible) weight matrix.

In [83]:
# Define some data with a different length of the sequence
new_sequence_length = 55  # Before it was 50
# Keep the batch_size = 60 and word_vector_length = 40 as originally
input_data_2 = tf.random.normal([batch_size, new_sequence_length, word_vector_length])

# Pass the data through the network. This should Fail (if you ran all the cells above)
prediction = model_GRU(input_data_2)

model_GRU.summary()

Well, this worked! Why? Because the neural network does not have any specific parameters (weights) associated with the length of the sequence, so it is flexible in this dimension. Look in the summary at what happened to the second dimension of the output of the first two layers. Where there was "50" before, it turned to "None". This tells you that the network now expects any sequence length.


How about `batch_size`? If you guessed it must also be flexible, you are right. You can change the batch size at any time and the model should be fine with it. Let's test it.

In [84]:
# Define some data with a different batch size
new_batch_size = 66  # Before it was 60
# Keep the sequence_length = 50 and word_vector_length = 40 as originally
input_data_3 = tf.random.normal([new_batch_size, sequence_length, word_vector_length])

# Pass the data through the network. This should Fail (if you ran all the cells above)
prediction = model_GRU(input_data_3)

model_GRU.summary()

In [85]:
# Same architecture as model_GRU, created as a separate model object.
model_GRU_2 = tf.keras.Sequential([
    # return_sequences=True: output the whole sequence so the next GRU layer receives one
    tf.keras.layers.GRU(256, return_sequences=True, name='GRU_1_returns_seq'),
    tf.keras.layers.GRU(128, return_sequences=True, name='GRU_2_returns_seq'),
    # return_sequences left at its default: only the output of the last unit is returned
    tf.keras.layers.GRU(64, name='GRU_3_returns_last_only'),
    tf.keras.layers.Dense(10)
])

# Build the model from an input shape instead of calling it on data: batch size and
# sequence length are given as None, and only the word-vector length is fixed.
model_GRU_2.build([None, None, word_vector_length])

# Show the summary of the explicitly built model
model_GRU_2.summary()