In [1]:
import numpy as np

In [2]:
def softmax(x):
    """
    Column-wise, numerically stable softmax.

    Arguments:
    x -- numpy array of scores. A 1-D array is treated as a single column;
         otherwise normalisation runs along axis 0.

    Returns:
    numpy array of the same shape as the (possibly reshaped) input whose
    entries along axis 0 sum to 1. A 1-D input of length n yields shape (n, 1).
    """
    scores = x.reshape(-1, 1) if x.ndim == 1 else x

    # Subtracting the per-column maximum does not change the result but keeps
    # np.exp from overflowing on large scores.
    exps = np.exp(scores - scores.max(axis=0, keepdims=True))
    return exps / np.sum(exps, axis=0, keepdims=True)

In [3]:
def pad_sequences(sequences, max_len=None, padding_value=0.0):
    """
    Pads variable-length sequences with a fill value so they all share the same length.

    Arguments:
    sequences -- Non-empty list of numpy arrays, where each array has shape (n_x, T_i)
                 (T_i is the length of the sequence).
    max_len -- Integer, length to pad to (default: the longest sequence in `sequences`).
               Must not be shorter than the longest sequence.
    padding_value -- Value used for padding (default: 0.0).

    Returns:
    padded_x -- Padded sequences, numpy array of shape (n_x, m, max_len). Its dtype is the
                promotion of the padding value's dtype and every sequence's dtype, so float
                data is never truncated by an integer padding value.
    seq_lengths -- List of original lengths of the sequences.

    Raises:
    ValueError -- if `sequences` is empty or a sequence is longer than `max_len`.
    """
    if len(sequences) == 0:
        raise ValueError("sequences must contain at least one array")

    # Original sequence lengths, in batch order
    seq_lengths = [seq.shape[1] for seq in sequences]
    longest = max(seq_lengths)

    # Determine the max length for padding
    if max_len is None:
        max_len = longest
    elif longest > max_len:
        raise ValueError(
            "max_len={} is shorter than the longest sequence ({})".format(max_len, longest)
        )

    # Get feature size (n_x) and batch size (m)
    n_x = sequences[0].shape[0]
    m = len(sequences)

    # Promote over dtypes (not values) so the result can hold both the padding
    # value and the sequence data without silent truncation.
    dtype = np.result_type(
        np.asarray(padding_value).dtype, *(seq.dtype for seq in sequences)
    )
    padded_x = np.full((n_x, m, max_len), padding_value, dtype=dtype)

    # Fill the padded array with actual sequence data
    for i, (seq, seq_len) in enumerate(zip(sequences, seq_lengths)):
        padded_x[:, i, :seq_len] = seq

    return padded_x, seq_lengths

# 1. RNN

### 1.1. RNN Forward

You can think of the recurrent neural network as the repeated use of a single cell. First, you'll implement the computations for a single time step. The following figure describes the operations for a single time step of an RNN cell: 

<img src="images/rnnCell_forward.png" style="width:100%;height:auto;">
<caption><center><font color='purple'><b>Figure 2</b>: Basic RNN cell. Takes as input $x^{\langle t \rangle}$ (current input) and $a^{\langle t - 1\rangle}$ (previous hidden state containing information from the past), and outputs $a^{\langle t \rangle}$ which is given to the next RNN cell and also used to predict $\hat{y}^{\langle t \rangle}$ 
</center></caption>

In [7]:
def rnnCell_forward(xt, a_prev, parameters):
    """
    Implements a single forward step of the RNN-cell as described in Figure (2)

    Arguments:
    xt -- your input data at timestep "t", numpy array of shape (n_x, m).
    a_prev -- Hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        ba --  Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    Returns:
    at -- next hidden state at timestep "t", numpy array of shape (n_a, m)
    yt_pred -- prediction at timestep "t", numpy array of shape (n_y, m)
    cache -- tuple of values needed for the backward pass, contains (at, a_prev, xt, parameters)
    """

    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]
    ba = parameters["ba"]
    by = parameters["by"]

    # New hidden state: tanh of the combined input and recurrent contributions
    at = np.tanh(np.dot(Wax, xt) + np.dot(Waa, a_prev) + ba)  # (n_a, m)

    # The prediction is computed from the hidden state just produced (at)
    yt_pred = softmax(np.dot(Wya, at) + by)  # (n_y, m)

    cache = (at, a_prev, xt, parameters)
    return at, yt_pred, cache

- A recurrent neural network (RNN) is a repetition of the RNN cell that you've just built. 
    - If your input sequence of data is 10 time steps long, then you will re-use the RNN cell 10 times 
- Each cell takes two inputs at each time step:
    - $a^{\langle t-1 \rangle}$: The hidden state from the previous cell
    - $x^{\langle t \rangle}$: The current time step's input data
- It has two outputs at each time step:
    - A hidden state ($a^{\langle t \rangle}$)
    - A prediction ($y^{\langle t \rangle}$)
- The weights and biases $(W_{aa}, W_{ax}, b_{a}, W_{ya}, b_{y})$ are re-used each time step 
    - They are maintained between calls to `rnnCell_forward` in the 'parameters' dictionary

<img src="images/rnnSeq_forward.png" style="width:100%;height:auto;">
<caption><center><font color='purple'><b>Figure 3</b>: Basic RNN. The input sequence $x = (x^{\langle 1 \rangle}, x^{\langle 2 \rangle}, ..., x^{\langle T_x \rangle})$  is carried over $T_x$ time steps. The network outputs $y = (y^{\langle 1 \rangle}, y^{\langle 2 \rangle}, ..., y^{\langle T_x \rangle})$. </center></caption>

In [9]:
def rnn_forward(x, a0, parameters, seq_lengths):
    """
    Implements forward propagation for RNNs with variable-length sequences.

    Arguments:
    x -- Input data for every time-step (padded), numpy array of shape (n_x, m, T_max).
    a0 -- Initial hidden state, numpy array of shape (n_a, m). It is not modified.
    parameters -- python dictionary containing:
                        Waa, Wax, Wya, ba, by (same as before)
    seq_lengths -- List or array of integers indicating the actual lengths of sequences in the batch.

    Returns:
    a -- Hidden states for every time-step, numpy array of shape (n_a, m, T_max).
         Entries at padded positions (t >= sequence length) are left at zero.
    y_pred -- Predictions for every time-step, numpy array of shape (n_y, m, T_max).
              Entries at padded positions are left at zero.
    caches -- tuple of values needed for the backward pass, contains
              (list of (cell cache, active mask) per processed time-step, x, seq_lengths)
    """
    step_caches = []

    # Accept plain Python lists (e.g. as returned by pad_sequences): the
    # element-wise comparison below requires an array.
    seq_lengths = np.asarray(seq_lengths)

    n_x, m, T_max = x.shape  # T_max is the maximum sequence length in the batch
    n_y, n_a = parameters["Wya"].shape

    a = np.zeros((n_a, m, T_max))
    y_pred = np.zeros((n_y, m, T_max))

    # Work on a copy: the running state is updated in place below and must not
    # overwrite the caller's a0.
    a_prev = np.array(a0, copy=True)

    for t in range(T_max):
        # Only process the batch samples where t < seq_lengths
        active_batch = seq_lengths > t
        if not np.any(active_batch):
            # No sequence reaches this step, so none reaches a later one either
            break

        xt = x[:, active_batch, t]
        a_prev_active = a_prev[:, active_batch]

        at, yt_pred, cache = rnnCell_forward(xt, a_prev_active, parameters)

        a[:, active_batch, t] = at
        y_pred[:, active_batch, t] = yt_pred
        step_caches.append((cache, active_batch))

        # Finished sequences keep their last hidden state; only active ones advance
        a_prev[:, active_batch] = at

    caches = (step_caches, x, seq_lengths)
    return a, y_pred, caches

### 1.2. RNN Backward

Begin by computing the backward pass for the basic RNN cell. Then, in the following sections, iterate through the cells.

<img src="images/rnnCell_backward_equations.png" alt="RNN Cell Backward Pass" style="width:100%; height:auto;">
<br>
<caption><center><font color='purple'><b>Figure 6</b>: The RNN cell's backward pass. Just like in a fully-connected neural network, the derivative of the cost function $J$ backpropagates through the time steps of the RNN by following the chain rule from calculus. Internal to the cell, the chain rule is also used to calculate $(\frac{\partial J}{\partial W_{ax}},\frac{\partial J}{\partial W_{aa}},\frac{\partial J}{\partial b_a})$ to update the parameters $(W_{ax}, W_{aa}, b_a)$. The operation can utilize the cached results from the forward pass. </center></caption>


<img src="images/rnnCell_backward.png" style="width:800px;height:500px;"> <br>
<caption>
    <center>
        <font color='purple'><b>Figure 7</b>: This implementation of <code>rnnCell_backward</code> doesn't include the output dense layer and softmax which are included in <code>rnnCell_forward</code>.

$da_{next}$ is $\frac{\partial J}{\partial a^{\langle t \rangle}}$; it combines the gradient flowing back from later time steps with the gradient coming from the current time step's output layer. The addition shown in green is performed in `rnn_backward`.  
    </center>
</caption>

In [23]:
def rnnCell_backward(da_next, cache):
    """
    Implements the backward pass for the RNN-cell (single time-step).

    Only the tanh recurrence is differentiated here; the output dense layer and
    softmax are not part of this function, so only Wax and Waa are read from
    the cached parameters.

    Arguments:
    da_next -- Gradient of loss with respect to the next hidden state, of shape (n_a, m)
    cache -- tuple (a_next, a_prev, xt, parameters) as returned by rnnCell_forward()

    Returns:
    gradients -- python dictionary containing:
                        dxt -- Gradients of input data, of shape (n_x, m)
                        da_prev -- Gradients of previous hidden state, of shape (n_a, m)
                        dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
                        dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
                        dba -- Gradients of bias vector, of shape (n_a, 1)
    """

    (a_next, a_prev, xt, parameters) = cache
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]

    # d/dz tanh(z) = 1 - tanh(z)^2, and a_next is tanh(z)
    dtanh = (1 - a_next ** 2) * da_next  # (n_a, m)

    # Parameter gradients, summed over the batch dimension
    dWax = np.dot(dtanh, xt.T)  # (n_a, m) @ (m, n_x)
    dWaa = np.dot(dtanh, a_prev.T)  # (n_a, m) @ (m, n_a)
    dba = np.sum(dtanh, axis=1, keepdims=True)  # (n_a, 1)

    # Gradients flowing to the cell's inputs
    dxt = np.dot(Wax.T, dtanh)  # (n_x, n_a) @ (n_a, m)
    da_prev = np.dot(Waa.T, dtanh)  # (n_a, n_a) @ (n_a, m)

    gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}
    return gradients

* Note that this notebook does not implement the backward pass from the loss $J$ back to the hidden states $a$. 
    * That step would cover the dense layer and softmax, which are part of the forward pass. 
    * It is assumed to be computed elsewhere, with the result passed to `rnn_backward` as `da`. 
    * At each time step, `da` must be added to the gradient flowing back from later time steps before calling `rnnCell_backward` (see Figure 7 above).
* It is further assumed that loss has been adjusted for batch size (m).
    * Therefore, division by the number of examples is not required here.

In [15]:
def rnn_backward(da, caches):
    """
    Implement the backward pass for an RNN over an entire (padded) sequence of input data.

    Arguments:
    da -- Upstream gradients of all hidden states, of shape (n_a, m, T_max).
          Values at padded positions are ignored.
    caches -- tuple containing information from the forward pass (rnn_forward):
              (list of (cell cache, active mask) per processed time-step, x, seq_lengths)

    Returns:
    gradients -- python dictionary containing:
                        dx -- Gradient w.r.t. the input data, numpy-array of shape (n_x, m, T_max)
                        da0 -- Gradient w.r.t the initial hidden state, numpy-array of shape (n_a, m)
                        dWax -- Gradient w.r.t the input's weight matrix, numpy-array of shape (n_a, n_x)
                        dWaa -- Gradient w.r.t the hidden state's weight matrix, numpy-array of shape (n_a, n_a)
                        dba -- Gradient w.r.t the bias, of shape (n_a, 1)
    """

    (step_caches, x, seq_lengths) = caches

    n_a, m, _ = da.shape
    n_x, _, T_max = x.shape

    # Initialize the gradients with the right sizes
    dx = np.zeros((n_x, m, T_max))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    # Gradient w.r.t. the hidden state entering the step currently processed.
    # Columns of sequences not yet reached (walking backwards) stay at zero.
    da_prevt = np.zeros((n_a, m))

    # The forward pass stops as soon as no sequence is active, so there may be
    # fewer cached steps than T_max; iterate over the steps actually run.
    for t in reversed(range(len(step_caches))):
        cache, active_batch = step_caches[t]

        # Upstream gradient at step t plus the gradient carried back from step
        # t+1, restricted to the sequences that were active at step t.
        da_next = da[:, active_batch, t] + da_prevt[:, active_batch]
        step_grads = rnnCell_backward(da_next, cache)

        dx[:, active_batch, t] = step_grads["dxt"]
        da_prevt[:, active_batch] = step_grads["da_prev"]

        # Parameters are shared across time steps, so their gradients accumulate
        dWax += step_grads["dWax"]
        dWaa += step_grads["dWaa"]
        dba += step_grads["dba"]

    # After the first time step has been processed, da_prevt holds the gradient
    # that has been backpropagated through all time-steps.
    da0 = da_prevt

    # Store the gradients in a python dictionary
    gradients = {"dx": dx, "da0": da0, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients