# Dependencies

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchinfo import summary

In [2]:
# reproducibility: fix the RNG seed, then turn off cuDNN benchmarking and
# request deterministic cuDNN behaviour
random_state = 42
torch.manual_seed(random_state)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
# default to the CPU and switch to the GPU only when torch reports CUDA support
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
device

'cuda'

# Dataset

## Regular Dataset
   - Regular datasets typically used in CNNs & MLPs are composed of independent data points
   - Each data point is usually represented as a fixed-size vector (or tensor for images)
   - **Notation**:
      - $N$: Number of samples in the dataset.
      - $\mathbf{x}_i$: Input data point $i$, where $i \in \{1, 2, \ldots, N\}$.
      - $\mathbf{y}_i$: Label or target associated with input data $i$.
   - **Formulation**:
      - Dataset: $D=\{(\mathbf{x}_i, \mathbf{y}_i)\mid i = 1, 2, \ldots, N\}$
      - Each $\mathbf{x}_i \in \mathbb{R}^M$, where $M$ is the dimensionality of the input feature vector
   - **Example**: $D = \{ (\mathbf{x}_1, \mathbf{y}_1), (\mathbf{x}_2, \mathbf{y}_2), (\mathbf{x}_3, \mathbf{y}_3) \}$
      - $\mathbf{x}_1 = [1.1, 2.1], \quad \mathbf{y}_1 = 0$
      - $\mathbf{x}_2 = [2.5, 3.5], \quad \mathbf{y}_2 = 1$
      - $\mathbf{x}_3 = [0.5, 1.5], \quad \mathbf{y}_3 = 0$

In [4]:
class RegularDataset(Dataset):
    """Three independent 2-D samples, each paired with an integer class label."""

    def __init__(self):
        samples = [[1.1, 2.1], [2.5, 3.5], [0.5, 1.5]]
        targets = [0, 1, 0]
        self.data = torch.tensor(samples, dtype=torch.float32)
        self.labels = torch.tensor(targets, dtype=torch.int64)

    def __len__(self):
        # one sample per row of the data tensor
        return self.data.shape[0]

    def __getitem__(self, idx):
        # the sample and its label live at the same position in both tensors
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target


# build the dataset and a loader that yields one sample per batch, in order
dataset = RegularDataset()
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False, num_workers=0)

# walk over every batch and show its contents
for batch in dataloader:
    data, label = batch
    print(f"data: {data}, label: {label}")

data: tensor([[1.1000, 2.1000]]), label: tensor([0])
data: tensor([[2.5000, 3.5000]]), label: tensor([1])
data: tensor([[0.5000, 1.5000]]), label: tensor([0])


## Sequential Dataset
   - Sequential datasets used in RNNs are composed of sequences of data points.
   - Each sequence represents a temporal or sequential relationship among the data points
   - **Notation**:
      - $N$: Number of sequences in the dataset.
      - $T$: Length of each sequence.
      - $\mathbf{x}^t_i$: Input data point at time step $t$ in the sequence $i$, where $t \in \{1, 2, \ldots, T\}$ and $i \in \{1, 2, \ldots, N\}$
      - $\mathbf{y}_i$: Label or target associated with sequence $i$.
   - **Formulation**:
      - Dataset: $D = \{ (\mathbf{x}_i^1, \mathbf{x}_i^2, \ldots, \mathbf{x}_i^T, \mathbf{y}_i) \mid i = 1, 2, \ldots, N \}$
      - Each $\mathbf{x}^t_i \in \mathbb{R}^M$, where $M$ is the dimensionality of the input feature vector at each time step.
   - **Example**: $D = \{ (\mathbf{x}_1, \mathbf{x}_2, \mathbf{x}_3, \mathbf{y}_1), (\mathbf{x}_2, \mathbf{x}_3, \mathbf{x}_4, \mathbf{y}_2), (\mathbf{x}_3, \mathbf{x}_4, \mathbf{x}_5, \mathbf{y}_3) \}$
      - $\mathbf{x}_1 = [1.0, 0.0]$
      - $\mathbf{x}_2 = [0.5, 1.5]$
      - $\mathbf{x}_3 = [1.0, 2.0]$
      - $\mathbf{x}_4 = [2.0, 1.0]$
      - $\mathbf{x}_5 = [1.5, 0.5]$
      - $\mathbf{y}_1 = 0$
      - $\mathbf{y}_2 = 1$
      - $\mathbf{y}_3 = 0$

In [5]:
class SequentialDatasetWithoutOverlap(Dataset):
    """Toy sequence dataset that cuts a series into disjoint fixed-length windows.

    Six 2-D points are split into consecutive, non-overlapping windows of
    ``seq_length`` points; every window carries one integer class label.
    """

    def __init__(self):
        # original data points, one row per time step
        self.data = torch.tensor([
            [1.0, 0.0],
            [0.5, 1.5],
            [1.0, 2.0],
            [2.0, 1.0],
            [1.5, 0.5],
            [2.5, 1.5]
        ], dtype=torch.float32)

        # labels for each sequence (one per window)
        self.labels = torch.tensor([0, 1], dtype=torch.int64)

        # sequence length
        self.seq_length = 3

    def __len__(self):
        # number of sequences without overlap; leftover points that do not
        # fill a whole window are dropped by the integer division
        return len(self.data) // self.seq_length

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` for window ``idx``.

        Negative indices count from the end, as with Python sequences.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        num_sequences = len(self)
        if not -num_sequences <= idx < num_sequences:
            raise IndexError(
                f"index {idx} is out of range for {num_sequences} sequences"
            )
        # map negative indices onto their positive counterpart; a negative
        # start offset would otherwise make the slice below come back empty
        idx %= num_sequences

        # calculate the start index of the sequence
        start_idx = idx * self.seq_length

        # create a sequence of length seq_length
        sequence = self.data[start_idx:start_idx + self.seq_length]
        label = self.labels[idx]
        return sequence, label


# build the non-overlapping sequence dataset and an in-order, one-window-per-batch loader
dataset = SequentialDatasetWithoutOverlap()
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=False, num_workers=0)

# show every window together with its label
for batch in dataloader:
    sequence, label = batch
    print(f"sequence:\n{sequence}\nlabel: {label}\n")

sequence:
tensor([[[1.0000, 0.0000],
         [0.5000, 1.5000],
         [1.0000, 2.0000]]])
label: tensor([0])

sequence:
tensor([[[2.0000, 1.0000],
         [1.5000, 0.5000],
         [2.5000, 1.5000]]])
label: tensor([1])



In [6]:
class SequentialDatasetWithOverlap(Dataset):
    """Toy sequence dataset built from sliding windows over a short series.

    Five 2-D points are turned into overlapping windows of ``seq_length``
    points with a stride of one; every window carries one integer class label.
    """

    def __init__(self):
        # original data points, one row per time step
        self.data = torch.tensor([
            [1.0, 0.0],
            [0.5, 1.5],
            [1.0, 2.0],
            [2.0, 1.0],
            [1.5, 0.5]
        ], dtype=torch.float32)

        # labels for each sequence (one per window)
        self.labels = torch.tensor([0, 1, 0], dtype=torch.int64)

        # sequence length
        self.seq_length = 3

    def __len__(self):
        # a window may start at every position that still leaves seq_length points
        return len(self.data) - self.seq_length + 1

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` for the window starting at ``idx``.

        Negative indices count from the end, as with Python sequences.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        num_sequences = len(self)
        if not -num_sequences <= idx < num_sequences:
            raise IndexError(
                f"index {idx} is out of range for {num_sequences} sequences"
            )
        # map negative indices onto their positive counterpart; slicing with a
        # negative start and a positive stop would otherwise return no rows
        idx %= num_sequences

        # create a sequence of length seq_length
        sequence = self.data[idx:idx + self.seq_length]
        label = self.labels[idx]
        return sequence, label


# build the sliding-window dataset; this loader shuffles the window order
dataset = SequentialDatasetWithOverlap()
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)

# show every window together with its label
for batch in dataloader:
    sequence, label = batch
    print(f"sequence:\n{sequence}\nlabel: {label}\n")

sequence:
tensor([[[0.5000, 1.5000],
         [1.0000, 2.0000],
         [2.0000, 1.0000]]])
label: tensor([1])

sequence:
tensor([[[1.0000, 2.0000],
         [2.0000, 1.0000],
         [1.5000, 0.5000]]])
label: tensor([0])

sequence:
tensor([[[1.0000, 0.0000],
         [0.5000, 1.5000],
         [1.0000, 2.0000]]])
label: tensor([0])



# Types of sequence-to-sequence modeling configurations
   - **One-to-One** (Single Input to Single Output):
      - Simplest form of neural network where a single input is mapped to a single output
      - Used in a standard feed-forward neural network (e.g. MLP or CNN based architectures)
      - e.g. Image classification
   - **One-to-Many** (Single Input to Sequence Output):
      - A single input is processed by the RNN, which then produces a sequence of outputs over time.
      - e.g. Image captioning (an image input resulting in a sequence of words).
   - **Many-to-One** (Sequence Input to Single Output):
      - The RNN processes each input in the sequence, and the final hidden state is used to produce the output
      - e.g. Sentiment analysis (a sequence of words leading to a single sentiment label)
   - **Many-to-Many** (Sequence Input to Sequence Output):
      - A sequence of inputs leads to a sequence of outputs. This can be further divided into two subcategories:
         - **Synchronized** Many-to-Many
            - Each input in the sequence has a corresponding output
            - The RNN processes a sequence of inputs, producing a corresponding output at each time step
            - e.g. Video classification (each frame in a video results in a corresponding label)
         - **Asynchronous** Many-to-Many
            - The lengths of the input and output sequences can differ
            - The RNN processes a sequence of inputs and generates a sequence of outputs which may have different lengths
            - e.g. Machine translation (a sequence of words in one language translates to a sequence of words in another language)
         

<figure style="text-align: center;">
    <img src="../resources/images/SVGs/seq-to-seq-modeling.svg" alt="seq-to-seq-modeling.svg" style="width: 100%;">
    <figcaption style="text-align: center;">sequence-to-sequence modeling</figcaption>
</figure>

# Network Structure: Recurrent Neural Networks
   - RNNs are specifically designed to handle sequential data, where the order of elements matters
   - Unlike feedforward neural networks, RNNs possess a "memory" component to process information from previous inputs, influencing the current output
   - Each step in the sequence is processed by the same network (shared weights), with information passed between steps
   - RNNs can suffer from vanishing and exploding gradients, making training difficult for long sequences.

**RNN Variants**:
   - Vanilla RNN
   - Long Short-Term Memory (LSTM)
      - Improves upon the vanilla RNN by introducing gates to control information flow
   - Gated Recurrent Units (GRU)
      - Simplifies the LSTM architecture while maintaining performance

## Simple Vanilla RNN
   - **Notations**:
      - $\mathbf{x}_t$: input at time step $t$.
      - $\mathbf{h}_t$: Hidden state at time step $t$.
      - $\mathbf{y}_t$: Output at time step $t$.
      - $\mathbf{W}_{ih}$: Weight matrix for input to hidden
      - $\mathbf{W}_{hh}$: Weight matrix for hidden to hidden
      - $\mathbf{W}_{ho}$: Weight matrix for hidden to output
      - $\mathbf{b}_{ih}$: Bias for input to hidden
      - $\mathbf{b}_{hh}$: Bias for hidden to hidden
      - $\mathbf{b}_{ho}$: Bias for hidden to output
      - $\mathbf{\sigma}$: Activation function (e.g., Tanh, Sigmoid, ReLU)
      - $\mathbf{g}$: Activation function for output (e.g., Softmax for classification)
   - **Formulations**:
      - Hidden State Calculation:
      $$\mathbf{h}_t = \sigma(\mathbf{W}_{ih} \mathbf{x}_t + \mathbf{b}_{ih} + \mathbf{W}_{hh} \mathbf{h}_{t-1} + \mathbf{b}_{hh}), \quad \mathbf{h}_0 = \mathbf{0}$$
      - Output Calculation:
      $$\mathbf{y}_t = g(\mathbf{W}_{ho} \mathbf{h}_t + \mathbf{b}_{ho})$$

<figure style="text-align: center;">
    <img src="../resources/images/SVGs/recurrent-neural-networks-1.svg" alt="recurrent-neural-networks-1.svg" style="width: 100%;">
    <figcaption style="text-align: center;">Folded Recurrent Neural Networks</figcaption>
</figure>

<figure style="text-align: center;">
    <img src="../resources/images/SVGs/recurrent-neural-networks-2.svg" alt="recurrent-neural-networks-2.svg" style="width: 100%;">
    <figcaption style="text-align: center;">Unfolded Recurrent Neural Networks</figcaption>
</figure>

In [7]:
class VanillaRNN(nn.Module):
    """Single-step vanilla RNN cell with separate input and recurrent weights.

    Computes ``h_t = tanh(W_ih x_t + b_ih + W_hh h_{t-1} + b_hh)`` followed by
    the linear read-out ``y_t = W_ho h_t + b_ho`` (no output activation).

    Args:
        input_dim: size of the feature vector fed in at each time step.
        hidden_dim: size of the hidden state.
        output_dim: size of the output produced at each time step.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim

        # input to hidden connection weights
        self.W_ih = nn.Parameter(torch.randn(hidden_dim, input_dim))
        # input to hidden connection biases
        self.b_ih = nn.Parameter(torch.randn(hidden_dim))

        # hidden to hidden connection weights
        self.W_hh = nn.Parameter(torch.randn(hidden_dim, hidden_dim))
        # hidden to hidden connection biases
        self.b_hh = nn.Parameter(torch.randn(hidden_dim))

        # weights for hidden to output connection
        self.W_ho = nn.Parameter(torch.randn(output_dim, hidden_dim))
        # bias for output layer
        self.b_ho = nn.Parameter(torch.randn(output_dim))

    def forward(self, input: torch.Tensor, hidden: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Advance the cell by one time step.

        Args:
            input: features for the current time step; the last dimension
                must be ``input_dim``.
            hidden: previous hidden state; the last dimension must be
                ``hidden_dim`` and the tensor must broadcast against the
                batch dimension of ``input``.

        Returns:
            ``(output, hidden)``: the read-out for this step and the updated
            hidden state.
        """
        hidden = torch.tanh(input @ self.W_ih.T + self.b_ih + hidden @ self.W_hh.T + self.b_hh)
        output = hidden @ self.W_ho.T + self.b_ho
        return output, hidden

    def init_hidden(self) -> torch.Tensor:
        """Return the zero initial hidden state ``h_0`` of shape ``(hidden_dim,)``.

        The state has no batch dimension and relies on broadcasting in
        ``forward``. It is created with the device and dtype of the
        parameters so it stays usable after ``.to(...)`` / ``.double()``.
        """
        return torch.zeros(self.hidden_dim, device=self.W_hh.device, dtype=self.W_hh.dtype)

In [8]:
# layer sizes used for the first VanillaRNN instance
input_dim, hidden_dim, output_dim = 10, 20, 5

# instantiate the cell with those sizes
rnn_1 = VanillaRNN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

In [9]:
# random example data: x is (num data, sequence length, input dim), y has one value per sequence
num_data, sequence_length = 128, 5
x = torch.randn(num_data, sequence_length, input_dim)
y = torch.randn(num_data)

# wrap the tensors in a dataset and serve them in fixed-order mini-batches
batch_size = 32
dataset = TensorDataset(x, y)
trainsetloader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)

# report the full tensor sizes, then the sizes of the first batch
# (each next(iter(...)) builds a fresh iterator and reads its first batch)
print(f"x.size()               : {x.size()}")
print(f"y.size()               : {y.size()}")
print(f"x.size() [first batch] : {next(iter(trainsetloader))[0].size()}")
print(f"y.size() [first batch] : {next(iter(trainsetloader))[1].size()}")

x.size()               : torch.Size([128, 5, 10])
y.size()               : torch.Size([128])
x.size() [first batch] : torch.Size([32, 5, 10])
y.size() [first batch] : torch.Size([32])


In [10]:
# run every batch through rnn_1, one time step at a time
for c, (x, y_true) in enumerate(trainsetloader):
    # each batch starts again from the zero hidden state
    hidden = rnn_1.init_hidden()

    i = 0
    while i < sequence_length:
        # feed the features of time step i and keep the updated hidden state
        step_input = x[:, i, :]
        y_pred, hidden = rnn_1(step_input, hidden)
        print(f"batch: {c+1}/{len(trainsetloader)} | time step: {i+1} | hidden.size(): {hidden.size()} | output.size(): {y_pred.size()}")
        i += 1

batch: 1/4 | time step: 1 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 1/4 | time step: 2 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 1/4 | time step: 3 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 1/4 | time step: 4 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 1/4 | time step: 5 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 2/4 | time step: 1 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 2/4 | time step: 2 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 2/4 | time step: 3 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 2/4 | time step: 4 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 2/4 | time step: 5 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
... (output truncated)

In [11]:
# summarise rnn_1 for one batch; the hidden-state shape is spelled out
# explicitly instead of being read from a `hidden` variable left behind by
# whichever forward-pass loop ran last
summary(rnn_1, input_size=((batch_size, input_dim), (batch_size, hidden_dim)), device="cpu")

Layer (type:depth-idx)                   Output Shape              Param #
VanillaRNN                               [32, 5]                   745
Total params: 745
Trainable params: 745
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.01

## Combined Weights and Concatenated Input and Hidden
   - Reformulate the Vanilla RNN by:
      - Combining the input-to-hidden and hidden-to-hidden weights into a single weight matrix
      - Concatenating the input and hidden states together
   - **Notations**:
      - $\mathbf{x}_t$: Input at time step $t$.
      - $\mathbf{h}_t$: Hidden state at time step $t$.
      - $\mathbf{y}_t$: Output at time step $t$.
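      - $\mathbf{W}$: Combined weight matrix, $\mathbf{W} = [\mathbf{W}_{ih} \;\; \mathbf{W}_{hh}]$
      - $\mathbf{b}$: Combined bias vector, $\mathbf{b} = \mathbf{b}_{ih} + \mathbf{b}_{hh}$
      - $\mathbf{W}_{ho}$: Weight matrix for hidden to output
      - $\mathbf{b}_{ho}$: Bias for hidden to output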
      - $\mathbf{\sigma}$: Activation function (e.g., Tanh, Sigmoid, ReLU)
      - $\mathbf{g}$: Activation function for output (e.g., Softmax for classification)
   - **Formulations**:
      - Concatenation of Input and Hidden State:
      $$\mathbf{z}_t = [\mathbf{x}_t; \mathbf{h}_{t-1}]$$
      - Hidden State Calculation:
      $$\mathbf{h}_t = \sigma(\mathbf{W} \mathbf{z}_t + \mathbf{b})$$
      - Output Calculation:
      $$\mathbf{y}_t = g(\mathbf{W}_{ho} \mathbf{h}_t + \mathbf{b}_{ho})$$

In [12]:
class VanillaRNN(nn.Module):
    """Single-step vanilla RNN cell using one combined weight matrix.

    The input and the previous hidden state are concatenated into
    ``z_t = [x_t; h_{t-1}]`` and the cell computes ``h_t = tanh(W z_t + b)``
    followed by the linear read-out ``y_t = W_ho h_t + b_ho``.

    Args:
        input_dim: size of the feature vector fed in at each time step.
        hidden_dim: size of the hidden state.
        output_dim: size of the output produced at each time step.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim

        # combined weight matrix for input to hidden and hidden to hidden
        self.W = nn.Parameter(torch.randn(hidden_dim, input_dim + hidden_dim))
        self.b = nn.Parameter(torch.randn(hidden_dim))

        # weights for hidden to output connection
        self.W_ho = nn.Parameter(torch.randn(output_dim, hidden_dim))
        self.b_ho = nn.Parameter(torch.randn(output_dim))

    def forward(self, input: torch.Tensor, hidden: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Advance the cell by one time step.

        Args:
            input: features for the current time step, shape
                ``(batch, input_dim)``.
            hidden: previous hidden state, shape ``(batch, hidden_dim)``; the
                batch size must match ``input`` because both are concatenated
                along dimension 1.

        Returns:
            ``(output, hidden)``: the read-out for this step and the updated
            hidden state.
        """
        combined = torch.cat((input, hidden), dim=1)  # concatenate input and hidden state
        hidden = torch.tanh(combined @ self.W.T + self.b)
        output = hidden @ self.W_ho.T + self.b_ho
        return output, hidden

    def init_hidden(self, batch_size: int) -> torch.Tensor:
        """Return the zero initial hidden state ``h_0`` of shape ``(batch_size, hidden_dim)``.

        The state is created with the device and dtype of the parameters so
        it stays usable after ``.to(...)`` / ``.double()``.
        """
        return torch.zeros(batch_size, self.hidden_dim, device=self.W.device, dtype=self.W.dtype)

In [13]:
# Layer sizes used for the combined-weight VanillaRNN instance
input_dim, hidden_dim, output_dim = 10, 20, 5

# Instantiate the cell with those sizes
rnn_2 = VanillaRNN(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)

In [14]:
# Random example data: x is (num data, sequence length, input dim), y has one value per sequence
num_data, sequence_length = 128, 5
x = torch.randn(num_data, sequence_length, input_dim)
y = torch.randn(num_data)

# Wrap the tensors in a dataset and serve them in fixed-order mini-batches
batch_size = 32
dataset = TensorDataset(x, y)
trainsetloader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)

# Report the full tensor sizes, then the sizes of the first batch
# (each next(iter(...)) builds a fresh iterator and reads its first batch)
print(f"x.size()               : {x.size()}")
print(f"y.size()               : {y.size()}")
print(f"x.size() [first batch] : {next(iter(trainsetloader))[0].size()}")
print(f"y.size() [first batch] : {next(iter(trainsetloader))[1].size()}")

x.size()               : torch.Size([128, 5, 10])
y.size()               : torch.Size([128])
x.size() [first batch] : torch.Size([32, 5, 10])
y.size() [first batch] : torch.Size([32])


In [15]:
# Forward pass through the RNN, one time step at a time
for c, (x, y_true) in enumerate(trainsetloader):
    # Initialize the hidden state for the batch that was actually received:
    # forward() concatenates input and hidden along dim 1, so their batch
    # sizes must match even if the final batch is smaller than batch_size
    hidden = rnn_2.init_hidden(x.size(0))

    for i in range(sequence_length):
        # feed the features of time step i and carry the updated hidden state forward
        y_pred, hidden = rnn_2(x[:, i, :], hidden)
        print(f"batch: {c+1}/{len(trainsetloader)} | time step: {i+1} | hidden.size(): {hidden.size()} | output.size(): {y_pred.size()}")

batch: 1/4 | time step: 1 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 1/4 | time step: 2 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 1/4 | time step: 3 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 1/4 | time step: 4 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 1/4 | time step: 5 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 2/4 | time step: 1 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 2/4 | time step: 2 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 2/4 | time step: 3 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 2/4 | time step: 4 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
batch: 2/4 | time step: 5 | hidden.size(): torch.Size([32, 20]) | output.size(): torch.Size([32, 5])
... (output truncated)

In [16]:
# summarise rnn_2 for one batch; the hidden-state shape is spelled out
# explicitly instead of being read from a `hidden` variable left behind by
# whichever forward-pass loop ran last
summary(rnn_2, input_size=((batch_size, input_dim), (batch_size, hidden_dim)), device="cpu")

Layer (type:depth-idx)                   Output Shape              Param #
VanillaRNN                               [32, 5]                   725
Total params: 725
Trainable params: 725
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.01

## Deep RNN
<figure style="text-align: center;">
    <img src="../resources/images/SVGs/recurrent-neural-networks-3.svg" alt="recurrent-neural-networks-3.svg" style="width: 100%;">
    <figcaption style="text-align: center;">Deep Recurrent Neural Networks</figcaption>
</figure>

In [17]:
class DeepVanillaRNN(nn.Module):
    """Stack of vanilla RNN cells evaluated for a single time step.

    Layer 0 combines the external input with its own previous hidden state;
    every deeper layer combines the new hidden state of the layer below with
    its own previous hidden state. The top layer's new state feeds a linear
    read-out.

    Args:
        input_dim: size of the feature vector fed in at each time step.
        hidden_dim: size of every layer's hidden state.
        output_dim: size of the output produced at each time step.
        num_layers: number of stacked recurrent layers.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # define RNN layers: each one maps [layer input ; own previous hidden] -> new hidden.
        # Only the first layer sees the raw input; deeper layers see a hidden state.
        self.rnn_layers = nn.ModuleList()
        for i in range(num_layers):
            layer_input_dim = input_dim if i == 0 else hidden_dim
            self.rnn_layers.append(nn.Linear(layer_input_dim + hidden_dim, hidden_dim))

        # define the output layer
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, input: torch.Tensor, hidden: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Advance every layer of the stack by one time step.

        Args:
            input: features for the current time step, shape
                ``(batch, input_dim)``.
            hidden: previous hidden states of all layers, shape
                ``(num_layers, batch, hidden_dim)``.

        Returns:
            ``(output, new_hidden)``: the read-out computed from the top
            layer and the stacked updated hidden states, shape
            ``(num_layers, batch, hidden_dim)``.

        Raises:
            ValueError: if ``hidden`` does not hold one state per layer.
        """
        if len(hidden) != self.num_layers:
            raise ValueError(
                f"expected {self.num_layers} hidden states, got {len(hidden)}"
            )

        new_hidden = []
        layer_input = input
        # pair each layer with ITS OWN previous hidden state, so the recurrence
        # of every layer (including the top one) is preserved across time steps
        for rnn_layer, previous_state in zip(self.rnn_layers, hidden):
            combined_input = torch.cat((layer_input, previous_state), dim=1)
            hidden_state = torch.tanh(rnn_layer(combined_input))
            new_hidden.append(hidden_state)
            # the freshly computed state is the input of the next layer up
            layer_input = hidden_state

        # use the last hidden state for output
        final_hidden = new_hidden[-1]
        output = self.output_layer(final_hidden)
        return output, torch.stack(new_hidden)

    def init_hidden(self, batch_size: int) -> torch.Tensor:
        """Return zero initial hidden states of shape ``(num_layers, batch_size, hidden_dim)``.

        The tensor is created with the device and dtype of the module's
        parameters so it stays usable after ``.to(...)`` / ``.double()``.
        """
        reference = self.output_layer.weight
        return torch.zeros(
            self.num_layers, batch_size, self.hidden_dim,
            device=reference.device, dtype=reference.dtype,
        )

# parameters
input_dim = 10
hidden_dim = 20
output_dim = 5
num_layers = 3

# create Deep RNN
deep_rnn = DeepVanillaRNN(input_dim, hidden_dim, output_dim, num_layers)

# example input (num data, sequence length, input dim)
num_data = 128
sequence_length = 5
x = torch.randn(num_data, sequence_length, input_dim)
y = torch.randn(num_data)

# create dataset and dataloader
batch_size = 32
dataset = TensorDataset(x, y)
trainsetloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0)

# forward pass through the deep RNN (no loss or optimiser step is involved)
for c, (x, y_true) in enumerate(trainsetloader):
    # initialize the hidden state for the batch that was actually received:
    # forward() concatenates input and hidden along dim 1, so their batch
    # sizes must match even if the final batch is smaller than batch_size
    hidden = deep_rnn.init_hidden(x.size(0))

    for i in range(sequence_length):
        # feed the features of time step i and carry all layers' hidden states forward
        y_pred, hidden = deep_rnn(x[:, i, :], hidden)
        print(f"Batch: {c+1}/{len(trainsetloader)} | Time step: {i+1} | hidden.size(): {hidden.size()} | output.size(): {y_pred.size()}")


Batch: 1/4 | Time step: 1 | hidden.size(): torch.Size([3, 32, 20]) | output.size(): torch.Size([32, 5])
Batch: 1/4 | Time step: 2 | hidden.size(): torch.Size([3, 32, 20]) | output.size(): torch.Size([32, 5])
Batch: 1/4 | Time step: 3 | hidden.size(): torch.Size([3, 32, 20]) | output.size(): torch.Size([32, 5])
Batch: 1/4 | Time step: 4 | hidden.size(): torch.Size([3, 32, 20]) | output.size(): torch.Size([32, 5])
Batch: 1/4 | Time step: 5 | hidden.size(): torch.Size([3, 32, 20]) | output.size(): torch.Size([32, 5])
Batch: 2/4 | Time step: 1 | hidden.size(): torch.Size([3, 32, 20]) | output.size(): torch.Size([32, 5])
Batch: 2/4 | Time step: 2 | hidden.size(): torch.Size([3, 32, 20]) | output.size(): torch.Size([32, 5])
Batch: 2/4 | Time step: 3 | hidden.size(): torch.Size([3, 32, 20]) | output.size(): torch.Size([32, 5])
Batch: 2/4 | Time step: 4 | hidden.size(): torch.Size([3, 32, 20]) | output.size(): torch.Size([32, 5])
Batch: 2/4 | Time step: 5 | hidden.size(): torch.Size([3, 32, 20]) | output.size(): torch.Size([32, 5])
... (output truncated)

In [18]:
# bare expression: the notebook echoes the module's repr, listing its sub-layers
deep_rnn

DeepVanillaRNN(
  (rnn_layers): ModuleList(
    (0): Linear(in_features=30, out_features=20, bias=True)
    (1-2): 2 x Linear(in_features=40, out_features=20, bias=True)
  )
  (output_layer): Linear(in_features=20, out_features=5, bias=True)
)

In [19]:
# summarise deep_rnn for one batch; the hidden-state shape is spelled out
# explicitly instead of being read from a `hidden` variable left behind by
# the forward-pass loop
summary(deep_rnn, input_size=((batch_size, input_dim), (num_layers, batch_size, hidden_dim)), device='cpu')

Layer (type:depth-idx)                   Output Shape              Param #
DeepVanillaRNN                           [32, 5]                   --
├─ModuleList: 1-1                        --                        --
│    └─Linear: 2-1                       [32, 20]                  620
│    └─Linear: 2-2                       [32, 20]                  820
│    └─Linear: 2-3                       [32, 20]                  820
├─Linear: 1-2                            [32, 5]                   105
Total params: 2,365
Trainable params: 2,365
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.08
Input size (MB): 0.01
Forward/backward pass size (MB): 0.02
Params size (MB): 0.01
Estimated Total Size (MB): 0.04

## Long Short-Term Memory (LSTM)
   - A type of recurrent neural network (RNN) aimed at dealing with the vanishing gradient problem present in traditional RNNs.
   - It aims to provide a short-term memory for RNN that can last thousands of timesteps, thus "long short-term memory".
   - It is based on the [Long Short-term Memory](https://www.researchgate.net/publication/13853244_Long_Short-term_Memory) paper, published in 1997 by [Sepp Hochreiter](https://scholar.google.at/citations?user=tvUH3WMAAAAJ&hl=en) and [Jürgen Schmidhuber](https://scholar.google.com/citations?user=gLnCTgIAAAAJ&hl=en).
<figure style="text-align: center;">
    <img src="../resources/images/SVGs/recurrent-neural-networks-4.svg" alt="recurrent-neural-networks-4.svg" style="width: 100%;">
    <figcaption style="text-align: center;">Long Short-Term Memory (LSTM)</figcaption>
</figure>

## Gated Recurrent Units (GRU)
   - A gating mechanism in recurrent neural networks, introduced in 2014 by [Kyunghyun Cho](https://dblp.uni-trier.de/search/author?author=Kyunghyun%20Cho) et al.
   - It is based on the [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling](https://arxiv.org/abs/1412.3555) paper.
   - Similar to LSTM but lacks a `context vector` or `output gate`, resulting in fewer parameters than LSTM.
<figure style="text-align: center;">
    <img src="../resources/images/SVGs/recurrent-neural-networks-5.svg" alt="recurrent-neural-networks-5.svg" style="width: 100%;">
    <figcaption style="text-align: center;">Gated Recurrent Units (GRU)</figcaption>
</figure>