<a href="https://colab.research.google.com/github/neuralsrg/SequenceModels/blob/main/manual_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Manual implementation of an RNN in NumPy

In [7]:
import numpy as np 
import scipy

## RNN

### RNN unit
![image](https://i.imgur.com/7PAU2pP.png)

In [39]:
def rnn_unit(x, a_prev, parameters):
  '''
  Implements single RNN unit forward pass

  Computes a = tanh(Waa @ a_prev + Wax @ x + ba) and
  y_pred = softmax(Wya @ a + by), with the softmax taken over axis 0.

  Args:
  x -- input for the current time step (rnn() passes a slice of shape (n_x, m))
  a_prev -- activation (hidden state) from previous RNN unit
  parameters -- python dictionary with keys 'Waa', 'Wax', 'ba', 'Wya', 'by'

  Returns:
  y_pred -- softmax activation outputs
  a -- current RNN unit hidden state
  cache -- tuple containing (a, a_prev, x, parameters)
  '''
  # Import the submodule explicitly: a bare `import scipy` does not guarantee
  # that `scipy.special` is loaded as an attribute on every SciPy version.
  from scipy.special import softmax

  Waa, Wax, ba = parameters['Waa'], parameters['Wax'], parameters['ba']
  Wya, by = parameters['Wya'], parameters['by']

  # New hidden state from the previous state and the current input.
  a = np.tanh(Waa @ a_prev + Wax @ x + ba)

  # Normalize over the first axis so every column sums to 1.
  y_pred = softmax(Wya @ a + by, axis=0)

  # Everything the backward pass needs for this time step.
  cache = (a, a_prev, x, parameters)

  return y_pred, a, cache

### RNN forward pass

In [35]:
def rnn(x, a_init, parameters):
  '''
  Implements RNN forward pass

  Runs rnn_unit once per time step, feeding each step's hidden state into
  the next one.

  Args:
  x -- tensor of shape (n_x, m, T_x), where n_x - num of features, m - batch size,
      T_x - num of time steps
  a_init -- initial rnn hidden state
  parameters -- python dictionary with keys 'Waa', 'Wax', 'ba', 'Wya', 'by'

  Returns:
  Y_pred -- tensor of shape (n_y, m, T_x)
  A -- tensor of hidden states of shape (n_a, m, T_x)
  caches -- (list_of_caches, x), with one rnn_unit cache per time step
  '''
  _, m, T_x = x.shape
  n_y, n_a = parameters['Wya'].shape

  # Pre-allocate the outputs and fill them by index. Growing them with
  # np.append would copy the whole array on every time step.
  Y_pred = np.empty((n_y, m, T_x))
  A = np.empty((n_a, m, T_x))
  caches = []

  a = a_init
  for time_step in range(T_x):
    y_pred, a, cache = rnn_unit(x[:, :, time_step], a, parameters)

    Y_pred[:, :, time_step] = y_pred
    A[:, :, time_step] = a
    caches.append(cache)

  return Y_pred, A, (caches, x)

### RNN backpropagation

![image](https://i.imgur.com/x2jxcSq.png)

$$
\begin{align}
\displaystyle a^{\langle t \rangle} &= \tanh(W_{ax} x^{\langle t \rangle} + W_{aa} a^{\langle t-1 \rangle} + b_{a})\tag{-} \\[8pt]
\displaystyle \frac{\partial \tanh(x)} {\partial x} &= 1 - \tanh^2(x) \tag{-} \\[8pt]
\displaystyle {dtanh} &= da_{next} * ( 1 - \tanh^2(W_{ax}x^{\langle t \rangle}+W_{aa} a^{\langle t-1 \rangle} + b_{a})) \tag{0} \\[8pt]
\displaystyle  {dW_{ax}} &= dtanh \cdot x^{\langle t \rangle T}\tag{1} \\[8pt]
\displaystyle dW_{aa} &= dtanh \cdot a^{\langle t-1 \rangle T}\tag{2} \\[8pt]
\displaystyle db_a &= \sum_{batch} dtanh\tag{3} \\[8pt]
\displaystyle dx^{\langle t \rangle} &= { W_{ax}}^T \cdot dtanh\tag{4} \\[8pt]
\displaystyle da_{prev} &= { W_{aa}}^T \cdot dtanh\tag{5}
\end{align}
$$

In [59]:
def rnn_unit_backward(da, cache):
  '''
  Implements the backward pass for a single rnn unit based on its cache

  Args:
  da -- dJ/da (da_next from the picture above)
  cache -- tuple (a, a_prev, x, parameters) as returned by rnn_unit

  Returns:
  grads -- python dictionary with keys: 'dba', 'dWaa', 'da_prev', 'dWax',
      'dx' (for deep RNNs)
  '''
  a, a_prev, x, parameters = cache
  # Only the recurrent and input weights are needed for these gradients.
  Waa, Wax = parameters['Waa'], parameters['Wax']

  # a = tanh(z), so dtanh/dz = 1 - a^2 (equation 0 above).
  dz = da * (1 - np.square(a))

  grads = {
    # Sum over the last (batch) axis, keeping a column vector.
    'dba': np.sum(dz, axis=-1, keepdims=True),
    # The matrix product is a sum over mini-batch sample gradients.
    'dWaa': dz @ a_prev.T,
    # Gradient flowing back to the previous hidden state, same shape as dz.
    'da_prev': Waa.T @ dz,
    'dWax': dz @ x.T,
    'dx': Wax.T @ dz,
  }

  return grads

In [66]:
def rnn_backward(da, caches):
  '''
  Implements the backward pass for the entire RNN

  Walks the time steps from last to first, adding the gradient that flows
  back through the hidden state to the externally supplied da of each step.

  Args:
  da -- dJ/da of shape (n_a, m, T_x) computed elsewhere
  caches -- output from rnn(): (list_of_caches, x)

  Returns:
  grads -- python dictionary with keys
    'dx' : shape(n_x, m, T_x)
    'da0' : shape(n_a, m), gradient w.r.t. the initial hidden state
    'dWax' : shape(n_a, n_x)
    'dWaa' : shape(n_a, n_a)
    'dba' : shape(n_a, 1)
  '''
  unit_caches, x = caches

  n_a, m, T_x = da.shape
  # Read n_x from the full input tensor rather than from the first cache,
  # so an input with zero time steps does not raise IndexError.
  n_x = x.shape[0]

  # Pre-allocate dx and fill it by index. Prepending with np.append would
  # copy the whole array on every time step.
  dx = np.empty((n_x, m, T_x))
  dWax = np.zeros((n_a, n_x))
  dWaa = np.zeros((n_a, n_a))
  dba = np.zeros((n_a, 1))

  # Gradient arriving from the following time step; nothing follows the last
  # one. After the loop it holds the gradient w.r.t. the initial state.
  da_prev = np.zeros((n_a, m))

  for time_step in reversed(range(T_x)):
    step_grads = rnn_unit_backward(da[:, :, time_step] + da_prev,
                                   unit_caches[time_step])
    da_prev = step_grads['da_prev']

    dx[:, :, time_step] = step_grads['dx']
    # The parameters are shared across time, so their gradients accumulate.
    dWax += step_grads['dWax']
    dWaa += step_grads['dWaa']
    dba += step_grads['dba']

  grads = {"dx": dx, "da0": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}

  return grads