## 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import math

## 2a. Create the RNN Basic Unit

Here's the basic idea of the cell. The RNN is basically a repetition of the cell built over the time steps (e.g. `10` time steps, meaning `10` inputs features for `x_1, ..., x_10`).

<img src="https://datascience-enthusiast.com/figures/rnn_step_forward.png" width="500" height="auto" />
<img src="https://datascience-enthusiast.com/figures/rnn.png" width="800" height="auto" />

(Source: [Fisseha Berhane, n.d.](https://datascience-enthusiast.com/DL/Building_a_Recurrent_Neural_Network-Step_by_Step_v1.html))

Basic idea of backward,
<img src="https://datascience-enthusiast.com/figures/rnn_cell_backprop.png" width="700" height="auto" />

(Source: [Fisseha Berhane, n.d.](https://datascience-enthusiast.com/DL/Building_a_Recurrent_Neural_Network-Step_by_Step_v1.html))

Credits: [Fisseha Berhane](https://datascience-enthusiast.com/DL/Building_a_Recurrent_Neural_Network-Step_by_Step_v1.html) _(course)_, and [@brunoklein99](https://github.com/brunoklein99/deep-learning-notes/blob/master/rnn_utils.py) _(for some code)_.

In [109]:
def softmax(z):
    e_z = np.exp(z - np.max(z))
    return e_z / e_z.sum(axis=0)


class BasicUnit:
    
    def __init__(self, x_n=3, y_n=2, a_n=5):
        # Weight for multiplying current input
        self.ax_weights = np.random.randn(a_n, x_n)
        # Weight for multiplying past input (hidden-state)
        self.aa_weights = np.random.randn(a_n, a_n)
        # Weight for relating the hidden-state to output
        self.ya_weights = np.random.randn(y_n, a_n)
        # Bias for activation function
        self.a_bias = np.random.randn(a_n, 1)
        # Bias for relating the hidden-state to output
        self.y_bias = np.random.randn(y_n, 1)

    def cell_forward(self, x_values, a_prev):
        a_next = np.tanh(np.dot(self.aa_weights, a_prev) + np.dot(self.ax_weights, x_values) + self.a_bias)
        y_pred = softmax(np.dot(self.ya_weights, a_next) + self.y_bias)

        return dict(a_prev=a_prev, a_next=a_next, y_pred=y_pred)

    def cell_backward(self, x_values, hs, a_next_d):
        # Gradient of tanh respect to a_next derivative
        tanh_d = (1 - hs['a_next'] ** 2) * a_next_d

        # Gradient of loss respect to W_ax
        x_values_d = np.dot(self.ax_weights.T, tanh_d)
        ax_weights_d = np.dot(tanh_d, x_values.T)

        # Gradient with respect to W_aa
        a_prev_d = np.dot(self.aa_weights.T, tanh_d)
        aa_weights_d = np.dot(tanh_d, hs['a_prev'].T)

        # Gradient with respect to b_a
        a_bias_d = np.sum(tanh_d, 1, keepdims=True)

        return dict(x_values_d=x_values_d, 
                    a_prev_d=a_prev_d, 
                    ax_weights_d=ax_weights_d, 
                    aa_weights_d=aa_weights_d, 
                    a_bias_d=a_bias_d)

    def forward_pass(self, x_time_values, a_init=None):
        y_n, a_n = self.ya_weights.shape
        _, x_m, features_n = x_time_values.shape

        a = np.zeros((a_n, x_m, features_n))
        y_pred = np.zeros((y_n, x_m, features_n))
        # History cache
        hss = []

        a_next = a_init if a_init is not None else np.zeros((a_n, x_m))

        for t in range(features_n):
            cell = self.cell_forward(x_time_values[:,:,t], a_next)
            a_next = cell['a_next']
            a[:,:,t] = cell['a_next']
            y_pred[:,:,t] = cell['y_pred']
            hss.append(cell)

        return dict(a=a, y_pred=y_pred, hss=hss)

    def backward_pass(self, x_time_values, hss, a_init_d=None):
        _, x_m, features_n = x_time_values.shape
        a_n = self.ya_weights.shape[1]

        x_time_values_d = np.zeros(x_time_values.shape)
        ax_weights_d_cum = np.zeros(self.ax_weights.shape)
        aa_weights_d_cum = np.zeros(self.aa_weights.shape)
        a_bias_d_cum = np.zeros(self.a_bias.shape)

        a_d = a_init_d if a_init_d is not None else np.zeros((a_n, x_m, features_n))
        a_prev_d = np.zeros((a_n, x_m))

        for t in reversed(range(features_n)):
            print(a_prev_d.shape, a_d[:,:,t].shape)
            a_prev_d + a_d[:,:,t]
            cell_gradient = self.cell_backward(x_time_values[:,:,t], hss[t], a_prev_d + a_d[:,:,t])

            x_time_values_d[:,:,t] = cell_gradient['x_values_d']
            a_prev_d = cell_gradient['a_prev_d']

            ax_weights_d_cum += cell_gradient['ax_weights_d']
            aa_weights_d_cum += cell_gradient['aa_weights_d']
            a_bias_d_cum += cell_gradient['a_bias_d']

        return dict(x_values_d=x_time_values_d, 
                    a_prev_d=a_prev_d,
                    ax_weights_d=ax_weights_d_cum, 
                    aa_weights_d=aa_weights_d_cum, 
                    a_bias_d=a_bias_d_cum)


## 2b. Preview pass-through

In [110]:
np.random.seed(1)

no_of_features = 4
x_train = np.random.randn(3, 10, no_of_features)

unit = BasicUnit(x_n=x_train.shape[0], y_n=2, a_n=5)

print('=== Forward pass (peak @ [0][0]) ===')

fp_result = unit.forward_pass(x_train)

for t in range(x_train.shape[2]):
    print(f"""Value at time {t+1}:
    a(t-1) = {fp_result['hss'][t]['a_prev'][0][0]}, 
    a(t) = {fp_result['a'][:,:,t][0][0]}, 
    y_pred = {fp_result['y_pred'][:,:,t][0][0]}
""")
    
print(f"""Values for weights and biases:
    W_ax = {unit.ax_weights[0][0]},
    W_aa = {unit.aa_weights[0][0]},
    W_ya = {unit.ya_weights[0][0]},
    b_a = {unit.a_bias[0][0]},
    b_y = {unit.y_bias[0][0]}
""")

print('=== Backward pass (peak @ [0][0]) ===')

bp = unit.backward_pass(x_train, fp_result['hss'], a_init_d=np.random.randn(*fp_result['a'].shape))

for t in reversed(range(x_train.shape[2])):
    print(f"""Derivative at time {t+1}:
    x(t) = {bp['x_values_d'][:,:,t][0][0]}
""")

print(f"""Derivatives for weights and biases:
    W_ax = {bp['ax_weights_d'][0][0]},
    W_aa = {bp['aa_weights_d'][0][0]},
    b_a = {bp['a_bias_d'][0][0]}""")

=== Forward pass (peak @ [0][0]) ===
Value at time 1:
    a(t-1) = 0.0, 
    a(t) = -0.6758264535817718, 
    y_pred = 0.00042255310469995687

Value at time 2:
    a(t-1) = -0.6758264535817718, 
    a(t) = 0.2430228309281321, 
    y_pred = 0.24597191206471836

Value at time 3:
    a(t-1) = 0.2430228309281321, 
    a(t) = -0.5729623577084761, 
    y_pred = 0.08878426145466899

Value at time 4:
    a(t-1) = -0.5729623577084761, 
    a(t) = -0.9860486609143284, 
    y_pred = 0.9965121118034168

Values for weights and biases:
    W_ax = -0.024616955875778355,
    W_aa = -0.6235307296797916,
    W_ya = -0.5170944579202279,
    b_a = -0.646916688254908,
    b_y = -0.22631424251360518

=== Backward pass (peak @ [0][0]) ===
(5, 10) (5, 10)
(5, 10) (5, 10)
(5, 10) (5, 10)
(5, 10) (5, 10)
Derivative at time 4:
    x(t) = 0.10360058464145615

Derivative at time 3:
    x(t) = 1.5662417561397577

Derivative at time 2:
    x(t) = -0.06048178204429309

Derivative at time 1:
    x(t) = -0.042608395617

## 3a. Create the LSTM Unit

Here's the basic idea of the cell. For illustration sake, we assume our task as reading words in a piece of text.

- **Forget gate**: Keep track of grammatical structures (e.g. if subject is singular or plural). Gets rid of previously stored memory when subject has changed. Values: `0` (forget) - `1` (keep).

- **Update gate**: Find a way to update to reflect that new subject is plural. Values: `0` - `1`.

- **Output gate**: Decide which outputs to use.

<img src="https://datascience-enthusiast.com/figures/LSTM.png" width="800" height="auto" />
<img src="https://datascience-enthusiast.com/figures/LSTM_rnn.png" width="1000" height="auto" />

(Source: [Fisseha Berhane, n.d.](https://datascience-enthusiast.com/DL/Building_a_Recurrent_Neural_Network-Step_by_Step_v1.html))

In [116]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))


class LSTMUnit:
    
    def __init__(self, x_n=3, y_n=2, a_n=5):
        # Weight for forget gate
        self.f_weights = np.random.randn(a_n, a_n + x_n)
        # Weight for update gate
        self.i_weights = np.random.randn(a_n, a_n + x_n)
        # Weight for candidate "tanh"
        self.c_weights = np.random.randn(a_n, a_n + x_n)
        # Weight for output gate
        self.o_weights = np.random.randn(a_n, a_n + x_n)
        # Weight for relating the hidden-state to output
        self.y_weights = np.random.randn(y_n, a_n)
        # Bias for forget gate
        self.f_bias = np.random.randn(a_n, 1)
        # Bias for update gate
        self.i_bias = np.random.randn(a_n, 1)
        # Bias for candidate "tanh"
        self.c_bias = np.random.randn(a_n, 1)
        # Bias for output gate
        self.o_bias = np.random.randn(a_n, 1)
        # Bias for relating the hidden-state to output
        self.y_bias = np.random.randn(y_n, 1)

    def cell_forward(self, x_values, a_prev, c_prev):
        x_n, x_m = x_values.shape
        y_n, a_n = self.y_weights.shape
        
        # Concatenate a_prev with x_values
        concat = np.zeros((a_n + x_n, x_m))
        concat[:a_n,:] = a_prev
        concat[a_n:,:] = x_values
        
        forget_v = sigmoid(np.dot(self.f_weights, concat) + self.f_bias)
        update_v = sigmoid(np.dot(self.i_weights, concat) + self.i_bias)
        tanh_v = np.tanh(np.dot(self.c_weights, concat) + self.c_bias)
        output_v = sigmoid(np.dot(self.o_weights, concat) + self.o_bias)

        c_next = forget_v * c_prev + update_v * tanh_v
        a_next = output_v * np.tanh(c_next)

        y_pred = softmax(np.dot(self.y_weights, a_next) + self.y_bias)

        return dict(a_prev=a_prev, 
                    a_next=a_next, 
                    c_prev=c_prev, 
                    c_next=c_next,
                    forget=forget_v,
                    update=update_v,
                    tanh=tanh_v,
                    output=output_v,
                    y_pred=y_pred)

    def cell_backward(self, x_values, hs, a_next_d, c_next_d):
        a_n = hs['a_next'].shape[0]

        output_v_d = a_next_d * np.tanh(hs['c_next']) * \
                     hs['output'] * (1 - hs['output'])
        
        tanh_v_d = (c_next_d * hs['update'] + hs['output'] * (1 - np.square(np.tanh(hs['c_next']))) * \
                     hs['update'] * a_next_d) * \
                   (1 - np.square(hs['tanh']))
        
        update_v_d = (c_next_d * hs['tanh'] + hs['output'] * (1 - np.square(np.tanh(hs['c_next']))) * \
                       hs['tanh'] * a_next_d) * \
                     hs['update'] * (1 - hs['update'])
        
        forget_v_d = (c_next_d * hs['c_prev'] + hs['output'] * (1 - np.square(np.tanh(hs['c_next']))) * \
                       hs['c_prev'] * a_next_d) * \
                     hs['forget'] * (1 - hs['forget'])

        concat = np.concatenate((hs['a_prev'], x_values), axis=0)

        f_weights_d = np.dot(forget_v_d, concat.T)
        i_weights_d = np.dot(update_v_d, concat.T)
        c_weights_d = np.dot(tanh_v_d, concat.T)
        o_weights_d = np.dot(output_v_d, concat.T)

        f_bias_d = np.sum(forget_v_d, axis=1 ,keepdims=True)
        i_bias_d = np.sum(update_v_d, axis=1, keepdims=True)
        c_bias_d = np.sum(tanh_v_d, axis=1,  keepdims=True)
        o_bias_d = np.sum(output_v_d, axis=1, keepdims=True)

        a_prev_d = np.dot(self.f_weights[:, :a_n].T, forget_v_d) + \
                   np.dot(self.i_weights[:, :a_n].T, update_v_d) + \
                   np.dot(self.c_weights[:, :a_n].T, tanh_v_d) + \
                   np.dot(self.o_weights[:, :a_n].T, output_v_d)

        c_prev_d = c_next_d * hs['forget'] + hs['output'] * \
                   (1 - np.square(np.tanh(hs['c_next']))) * hs['forget'] * a_next_d

        x_values_d = np.dot(self.f_weights[:, a_n:].T, forget_v_d) + \
                     np.dot(self.i_weights[:, a_n:].T, update_v_d) + \
                     np.dot(self.c_weights[:, a_n:].T, tanh_v_d) + \
                     np.dot(self.o_weights[:, a_n:].T, output_v_d)

        return dict(output_d=output_v_d,
                    tanh_d=tanh_v_d,
                    update_d=update_v_d,
                    forget_d=forget_v_d,
                    f_weights_d=f_weights_d,
                    i_weights_d=i_weights_d,
                    c_weights_d=c_weights_d,
                    o_weights_d=o_weights_d,
                    f_bias_d=f_bias_d,
                    i_bias_d=i_bias_d,
                    c_bias_d=c_bias_d,
                    o_bias_d=o_bias_d,
                    a_prev_d=a_prev_d,
                    c_prev_d=c_prev_d,
                    x_values_d=x_values_d)

    def forward_pass(self, x_time_values, a_init=None):
        y_size, a_n = self.y_weights.shape
        _, x_m, features_n = x_time_values.shape

        a = np.zeros((a_n, x_m, features_n))
        c = a
        y_pred = np.zeros((y_size, x_m, features_n))
        hss = []
    
        a_next = a_init if a_init is not None else np.zeros((a_n, x_m))
        c_next = np.zeros(a_next.shape)

        for t in range(features_n):
            uc = self.cell_forward(x_time_values[:,:,t], a_next, c_next)
            a_next = uc['a_next']
            c_next = uc['c_next']
            a[:,:,t] = uc['a_next']
            c[:,:,t] = uc['c_next']
            y_pred[:,:,t] = uc['y_pred']
            hss.append(uc)

        return dict(a=a, c=c, y_pred=y_pred, hss=hss)

    def backward_pass(self, x_time_values, hss, a_init_d=None, c_init_d=None):
        _, x_m, features_n = x_time_values.shape
        a_n = self.y_weights.shape[1]

        x_time_values_d = np.zeros(x_time_values.shape)

        f_weights_d_cum = np.zeros(self.f_weights.shape)
        i_weights_d_cum = np.zeros(self.i_weights.shape)
        c_weights_d_cum = np.zeros(self.c_weights.shape)
        o_weights_d_cum = np.zeros(self.o_weights.shape)

        f_bias_d_cum = np.zeros(self.f_bias.shape)
        i_bias_d_cum = np.zeros(self.i_bias.shape)
        c_bias_d_cum = np.zeros(self.c_bias.shape)
        o_bias_d_cum = np.zeros(self.o_bias.shape)

        a_d = a_init_d if a_init_d is not None else np.zeros((a_n, x_m, features_n))
        c_d = c_init_d if c_init_d is not None else np.zeros(a_d.shape)

        a_prev_d = np.zeros((a_n, x_m))
        c_prev_d = np.zeros(a_prev_d.shape)

        for t in reversed(range(features_n)):
            cell_gradient = self.cell_backward(x_time_values[:,:,t], hss[t], a_prev_d + a_d[:,:,t], c_prev_d + c_d[:,:,t])
            
            x_time_values_d[:,:,t] = cell_gradient['x_values_d']
            f_weights_d_cum += cell_gradient['f_weights_d']
            i_weights_d_cum += cell_gradient['i_weights_d']
            c_weights_d_cum += cell_gradient['c_weights_d']
            o_weights_d_cum += cell_gradient['o_weights_d']
            
            
            f_bias_d_cum += cell_gradient['f_bias_d']
            i_bias_d_cum += cell_gradient['i_bias_d']
            c_bias_d_cum += cell_gradient['c_bias_d']
            o_bias_d_cum += cell_gradient['o_bias_d']

            a_prev_d = cell_gradient['a_prev_d']
            c_prev_d = cell_gradient['c_prev_d']
        
        return dict(f_weights_d=f_weights_d_cum,
                    i_weights_d=i_weights_d_cum,
                    c_weights_d=c_weights_d_cum,
                    o_weights_d=o_weights_d_cum,
                    f_bias_d=f_bias_d_cum,
                    i_bias_d=i_bias_d_cum,
                    c_bias_d=c_bias_d_cum,
                    o_bias_d=o_bias_d_cum,
                    a_prev_d=a_prev_d,
                    c_prev_d=c_prev_d,
                    x_values_d=x_time_values_d)


## 3b. Preview pass-through

In [117]:
np.random.seed(1)

no_of_features = 4
x_train = np.random.randn(3, 10, no_of_features)

unit = LSTMUnit(x_n=x_train.shape[0], y_n=2, a_n=5)

print('=== Forward pass (peak @ [0][0]) ===')

fp_result = unit.forward_pass(x_train)

for t in range(x_train.shape[2]):
    print(f"""Value at time {t+1}:
    a(t-1) = {fp_result['hss'][t]['a_prev'][0][0]}, 
    a(t) = {fp_result['a'][:,:,t][0][0]}, 
    c(t-1) = {fp_result['hss'][t]['c_prev'][0][0]}, 
    c(t) = {fp_result['c'][:,:,t][0][0]},
    y_pred = {fp_result['y_pred'][:,:,t][0][0]},

    forget gate={fp_result['hss'][t]['forget'][0][0]},
    update gate={fp_result['hss'][t]['update'][0][0]},
    tanh={fp_result['hss'][t]['tanh'][0][0]},
    output gate={fp_result['hss'][t]['output'][0][0]}
""")
    
print(f"""Values for weights and biases:
    W_f = {unit.f_weights[0][0]}
    W_o = {unit.o_weights[0][0]}
    W_i = {unit.i_weights[0][0]}
    W_c = {unit.c_weights[0][0]}
    W_y = {unit.y_weights[0][0]}
    b_f = {unit.f_bias[0][0]}
    b_o = {unit.o_bias[0][0]}
    b_i = {unit.i_bias[0][0]}
    b_c = {unit.c_bias[0][0]}
    b_y = {unit.y_bias[0][0]}
""")


print('=== Backward pass (peak @ [0][0]) ===')

bp = unit.backward_pass(x_train, fp_result['hss'], a_init_d=np.random.randn(*fp_result['a'].shape), 
                                                   c_init_d=np.random.randn(*fp_result['c'].shape))

for t in reversed(range(x_train.shape[2])):
    print(f"""Derivative at time {t+1}:
    x(t) = {bp['x_values_d'][:,:,t][0][0]}
""")

print(f"""Derivatives for weights and biases:
    W_f = {bp['f_weights_d'][0][0]},
    W_c = {bp['c_weights_d'][0][0]},
    W_i = {bp['i_weights_d'][0][0]},
    W_o = {bp['o_weights_d'][0][0]},
    b_f = {bp['f_bias_d'][0][0]},
    b_c = {bp['c_bias_d'][0][0]},
    b_i = {bp['o_bias_d'][0][0]},
    b_o = {bp['o_bias_d'][0][0]}""")

=== Forward pass (peak @ [0][0]) ===
Value at time 1:
    a(t-1) = 0.0, 
    a(t) = -0.08343433206127393, 
    c(t-1) = 0.0, 
    c(t) = -0.08343433206127393,
    y_pred = 0.734875861137326,

    forget gate=0.8075218579825069,
    update gate=0.3062247180809734,
    tanh=-0.27246112784145604,
    output gate=0.9369490290591823

Value at time 2:
    a(t-1) = -0.07799282351874157, 
    a(t) = 0.29223704943478285, 
    c(t-1) = -0.08343433206127393, 
    c(t) = 0.29223704943478285,
    y_pred = 0.6521032449346058,

    forget gate=0.18974605150000293,
    update gate=0.3081022533091858,
    tanh=0.9998900728382465,
    output gate=0.7921089450360703

Value at time 3:
    a(t-1) = 0.22511141400895762, 
    a(t) = 0.35817201453850156, 
    c(t-1) = 0.29223704943478285, 
    c(t) = 0.35817201453850156,
    y_pred = 0.6193589787769862,

    forget gate=0.10914937710662423,
    update gate=0.3266801882861681,
    tanh=0.9987582177447643,
    output gate=0.6912641796134577

Value at time 4:
  

## 4a. Create the GRU Unit (Forward only)

Here's the basic idea of the cell.

<img src="http://media5.datahacker.rs/2020/09/69-2048x1292.jpg" width="400" height="auto" />

(Source: [Strahinja Zivkovic, 2020](http://datahacker.rs/005-rnn-tackling-vanishing-gradients-with-gru-and-lstm/) - _Note that diagram omits the reset gate. Refer to [this](https://d2l.ai/chapter_recurrent-modern/gru.html) for better overview_)

Credits: [d2l.ai, n.d.](https://d2l.ai/chapter_recurrent-modern/gru.html)

In [256]:
class GRUUnit:
    
    def __init__(self, x_n=3, x_m=10, y_n=2, a_n=5):
        # Weight for reset gate
        self.r_weights = np.random.randn(a_n, a_n)
        # Weight for update gate
        self.i_weights = np.random.randn(a_n, x_n)
        # Weight for candidate "tanh"
        self.c_weights = np.random.randn(a_n, x_n)
        self.ch_weights = np.random.randn(a_n, 1)
        # Weight for relating the hidden-state to output
        self.y_weights = np.random.randn(y_n, a_n)
        # Bias for reset gate
        self.r_bias = np.random.randn(a_n, 1)
        # Bias for update gate
        self.i_bias = np.random.randn(a_n, 1)
        # Bias for candidate "tanh"
        self.c_bias = np.random.randn(a_n, 1)
        # Bias for relating the hidden-state to output
        self.y_bias = np.random.randn(y_n, 1)

    def cell_forward(self, x_values, a_prev):
        #a_next = np.tanh(np.dot(self.aa_weights, a_prev) + np.dot(self.ax_weights, x_values) + self.a_bias)
        #y_pred = softmax(np.dot(self.ya_weights, a_next) + self.y_bias)

        #return dict(a_prev=a_prev, a_next=a_next, y_pred=y_pred)

        x_n, x_m = x_values.shape
        y_n, a_n = self.y_weights.shape

        reset_value = sigmoid(np.dot(self.r_weights, a_prev) + self.r_bias)
        update_value = sigmoid(np.dot(self.i_weights, x_values) + self.i_bias)
        tanh_value = np.tanh(np.dot(self.c_weights, x_values) + (reset_value * a_prev * self.ch_weights) + self.c_bias)
        
        a_next = update_value * a_prev + (1 - update_value) * tanh_value
        y_pred = softmax(np.dot(self.y_weights, a_prev) + self.y_bias)
        
        return dict(a_prev=a_prev,
                    a_next=a_next,
                    y_pred=y_pred,
                    reset=reset_value,
                    update=update_value,
                    tanh=tanh_value)

    def forward_pass(self, x_time_values, a_init=None):
        y_size, a_n = self.y_weights.shape
        _, x_m, features_n = x_time_values.shape

        a = np.zeros((a_n, x_m, features_n))
        y_pred = np.zeros((y_size, x_m, features_n))
        hss = []
    
        a_next = a_init if a_init is not None else np.zeros((a_n, x_m))

        for t in range(features_n):
            uc = self.cell_forward(x_time_values[:,:,t], a_next)
            a_next = uc['a_next']
            a[:,:,t] = uc['a_next']
            y_pred[:,:,t] = uc['y_pred']
            hss.append(uc)

        return dict(a=a, y_pred=y_pred, hss=hss)

## 4b. Preview pass-through (Forward only)

In [257]:
np.random.seed(1)

no_of_features = 4
x_train = np.random.randn(3, 10, no_of_features)

unit = GRUUnit(x_n=x_train.shape[0], x_m=10, y_n=2, a_n=5)

print('=== Forward pass (peak @ [0][0]) ===')

fp_result = unit.forward_pass(x_train)

for t in range(x_train.shape[2]):
    print(f"""Value at time {t+1}:
    a(t-1) = {fp_result['hss'][t]['a_prev'][0][0]}, 
    a(t) = {fp_result['a'][:,:,t][0][0]}, 
    y_pred = {fp_result['y_pred'][:,:,t][0][0]},

    reset gate={fp_result['hss'][t]['reset'][0][0]},
    update gate={fp_result['hss'][t]['update'][0][0]},
    tanh={fp_result['hss'][t]['tanh'][0][0]}
""")
    
print(f"""Values for weights and biases:
    W_r = {unit.r_weights[0][0]}
    W_i = {unit.i_weights[0][0]}
    W_c = {unit.c_weights[0][0]}
    W_ch = {unit.ch_weights[0][0]}
    W_y = {unit.y_weights[0][0]}
    b_r = {unit.r_bias[0][0]}
    b_i = {unit.i_bias[0][0]}
    b_c = {unit.c_bias[0][0]}
    b_y = {unit.y_bias[0][0]}
""")

=== Forward pass (peak @ [0][0]) ===
Value at time 1:
    a(t-1) = 0.0, 
    a(t) = -0.6624936673915963, 
    y_pred = 0.47253584733537496,

    reset gate=0.7613942293321033,
    update gate=0.17419602049370927,
    tanh=-0.8022408269183566

Value at time 2:
    a(t-1) = -0.6624936673915963, 
    a(t) = 0.5197961377421829, 
    y_pred = 0.5676720695982849,

    reset gate=0.7239697979279207,
    update gate=0.12955500229112404,
    tanh=0.6957653934861379

Value at time 3:
    a(t-1) = 0.5197961377421829, 
    a(t) = 0.5165426982099552, 
    y_pred = 0.4256564095305796,

    reset gate=0.7891613891909318,
    update gate=0.16206445587942556,
    tanh=0.5159134530237035

Value at time 4:
    a(t-1) = 0.5165426982099552, 
    a(t) = -0.6265279462002945, 
    y_pred = 0.47346074188071147,

    reset gate=0.8851338171573095,
    update gate=0.1994219899747511,
    tanh=-0.9112639990714243

Values for weights and biases:
    W_r = -0.024616955875778355
    W_i = 0.13770120999738608
    W_c