In [None]:
import torch
import torch.nn as nn
import torch.optim as optim # Optimizer

`nn.RNN` is a class within the PyTorch framework, specifically part of the torch.nn module. It is used to create an instance of a `recurrent neural network (RNN)` layer.

# Key Parameters of `nn.RNN`
`input_size:`
The number of expected features in the input x.

`hidden_size:`
The number of features in the hidden state h.

`num_layers (optional):`
 Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two RNNs together to form a stacked RNN, with the second RNN taking in outputs of the first RNN and computing the final results.

`nonlinearity (optional):`
 The non-linearity to use. Can be either 'tanh' or 'relu'. Default is 'tanh'.

`bias (optional):`
 If False, then the layer does not use bias weights b_ih and b_hh. Default is True.

`batch_first (optional):`
 If True, then the input and output tensors are provided as (batch, seq, feature). Default is False, which expects (seq, batch, feature).

 `dropout (optional):`
 If non-zero, introduces a Dropout layer on the outputs of each RNN layer except the last layer, with dropout probability equal to dropout. Default is 0.

`bidirectional (optional):`
 If True, becomes a bidirectional RNN. Default is False.

In [None]:
# Define the RNN model with Embedding
class RNN1(nn.Module):
    """Embedding -> single-layer RNN -> linear head applied to the last time step.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Size of each embedding vector (also the RNN input size).
        hidden_size: Number of features in the RNN hidden state.
        output_size: Number of features produced by the linear head.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Embedding layer
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)  # RNN layer
        self.fc = nn.Linear(hidden_size, output_size)  # Maps the hidden state to the output

    def forward(self, x):
        """Return one prediction per sequence.

        Args:
            x: Integer index tensor; indexed here as (batch, seq).

        Returns:
            Tensor of shape (batch, output_size) computed from the RNN output
            at the last time step.
        """
        # Embed input indices -> (batch, seq, embedding_dim)
        x = self.embedding(x)
        # Zero initial hidden state, created on the same device and with the
        # same dtype as the embedded input so the model keeps working after
        # .to(device) / .double().
        h0 = torch.zeros(
            1, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype
        )
        # Forward propagate the RNN
        out, _ = self.rnn(x, h0)
        # Only the last time step feeds the fully connected layer
        return self.fc(out[:, -1, :])

# Parameters
vocab_size = 10  # Size of the vocabulary (max integer index + 1)
embedding_dim = 4  # Dimension of the embedding vectors
hidden_size = 10  # Number of features in the hidden state
output_size = 1  # Number of values predicted per sequence
num_epochs = 100  # Number of optimisation steps (single batch per step)

# Create the model
model = RNN1(vocab_size, embedding_dim, hidden_size, output_size)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Sample data: inputs are (batch size, sequence length), targets are (batch size, output size)
inputs = torch.tensor([[1, 2, 3], [2, 3, 4]])
targets = torch.tensor([[4.0], [5.0]])

# Training loop. The mode is set once; nothing inside the loop changes it.
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

    # Report every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Test the model. Gradients are not needed for inference, so skip building
# the autograd graph.
model.eval()
test_input = torch.tensor([[3, 4, 5]])
with torch.no_grad():
    predicted = model(test_input)
print(f'Predicted value: {predicted.numpy()}')


Epoch [10/100], Loss: 10.7145
Epoch [20/100], Loss: 3.3133
Epoch [30/100], Loss: 0.6507
Epoch [40/100], Loss: 0.2449
Epoch [50/100], Loss: 0.3256
Epoch [60/100], Loss: 0.2804
Epoch [70/100], Loss: 0.2404
Epoch [80/100], Loss: 0.2399
Epoch [90/100], Loss: 0.2370
Epoch [100/100], Loss: 0.2316
Predicted value: [[4.2711134]]


In [None]:
# Where we want to get the output at every time step.
class RNN2(nn.Module):
    """Embedding -> single-layer RNN -> linear head applied at every time step.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Size of each embedding vector (also the RNN input size).
        hidden_size: Number of features in the RNN hidden state.
        output_size: Number of features produced per time step.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super().__init__()
        # forward() needs hidden_size to build the initial hidden state.
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        # The RNN is unidirectional, so each time step emits hidden_size
        # features (not 2 * hidden_size).
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one prediction per time step.

        Args:
            x: Integer index tensor; indexed here as (batch, seq).

        Returns:
            Tensor of shape (batch, seq, output_size).
        """
        x = self.embedding(x)
        # Zero initial hidden state matching the embedded input's device/dtype
        h0 = torch.zeros(
            1, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype
        )
        out, _ = self.rnn(x, h0)
        # nn.Linear acts on the last dimension, so this applies the head to
        # every time step at once.
        return self.fc(out)

# Parameters
vocab_size = 10  # Size of the vocabulary
embedding_dim = 4  # Dimension of the embedding vectors
hidden_size = 10  # Number of features in the hidden state
output_size = 1  # Number of values predicted per timestep
num_epochs = 100  # Number of optimisation steps (single batch per step)

# Create the model
model = RNN2(vocab_size, embedding_dim, hidden_size, output_size)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Sample data: inputs are (batch size, sequence length),
# targets are (batch size, sequence length, output size)
inputs = torch.tensor([[1, 2, 3], [2, 3, 4]])
targets = torch.tensor([[[4.0], [5.0], [6.0]], [[5.0], [6.0], [7.0]]])

# Training loop. The mode is set once; nothing inside the loop changes it.
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

    # Report every 10th epoch
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Test the model. Gradients are not needed for inference, so skip building
# the autograd graph.
model.eval()
test_input = torch.tensor([[3, 4, 5]])
with torch.no_grad():
    predicted = model(test_input)
print(f'Predicted values: {predicted.numpy()}')


 The `nn.Embedding` layer maps each integer in the input sequence to a dense, learnable vector of size `embedding_dim`. This layer is particularly useful when dealing with words, where each word is represented as a unique integer index.

`nn.Embedding` layer transforms each integer in the input tensor into an embedding vector. The output shape from the embedding layer becomes (batch_size, sequence_length, embedding_dim).

 If embedding_dim is 4, as in the example, an input of shape (2, 3) (batch_size=2, sequence_length=3) will have the shape (2, 3, 4) after the embedding layer.

`RNN Layer` When this tensor is passed through the nn.RNN layer, the RNN processes each sequence of embedded vectors. The nn.RNN layer outputs two tensors: the output tensor and the hidden state. The output tensor from the RNN has the general shape `(batch_size, sequence_length, num_directions * hidden_size)`, and `the hidden state` has the shape `(num_layers * num_directions, batch_size, hidden_size)`.

`Fully Connected Layer` The model can use the output at the last time step, or at every time step of the sequence, to make a prediction.

If we want the prediction at the last time step, the output is sliced from the RNN output tensor with `out[:, -1, :]`, which reduces its shape to `(batch_size, hidden_size)`, or (2, 10). The sliced output is then passed through a fully connected layer `(nn.Linear)`, which is designed to map the RNN's hidden state to the desired output size.


If we want the prediction at every time step, there is no need to slice the RNN output tensor: simply pass the whole tensor to `(nn.Linear)`, which maps the RNN's hidden state at each time step to the desired output size.


In [None]:
# Dimensions used by this demo
input_size = 3
hidden_size = 4  # Features per direction
num_layers = 1
batch_size = 2
seq_length = 5

# Single-layer RNN that reads each sequence in both directions
rnn = nn.RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
    bidirectional=True,
)

# Random batch laid out as (batch size, sequence length, input size)
input_tensor = torch.randn(batch_size, seq_length, input_size)

# Run the sequence through the RNN
out, hn = rnn(input_tensor)

# out: [batch_size, seq_length, 2 * hidden_size]
print("Output shape (out):", out.shape)
# hn: [2 * num_layers, batch_size, hidden_size]
print("Last hidden state shape (hn):", hn.shape)

# Features of the first sequence at the first timestep; the first
# hidden_size entries come from the forward direction, the rest from the
# backward direction.
first_step = out[0, 0]
forward_part = first_step[:hidden_size]
backward_part = first_step[hidden_size:]
print("Forward pass output (first half):", forward_part)
print("Backward pass output (second half):", backward_part)


Output shape (out): torch.Size([2, 5, 8])
Last hidden state shape (hn): torch.Size([2, 2, 4])
Forward pass output (first half): tensor([-0.4961, -0.0098,  0.1262,  0.6290], grad_fn=<SliceBackward0>)
Backward pass output (second half): tensor([ 0.0261, -0.3007, -0.6989,  0.2977], grad_fn=<SliceBackward0>)


### Bidirectional RNN

In [None]:
# Dimensions used by this demo
input_size = 3
hidden_size = 4
num_layers = 2  # Two stacked RNN layers
batch_size = 1
seq_length = 5
bidirectional = True

# Stacked RNN that reads each sequence in both directions
rnn = nn.RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
    bidirectional=bidirectional,
)

# Random batch laid out as (batch size, sequence length, input size)
input_tensor = torch.randn(batch_size, seq_length, input_size)

# Run the sequence through the RNN
out, hn = rnn(input_tensor)

# out: [batch_size, seq_length, 2 * hidden_size]
print("Output shape (out):", out.shape)
# hn: [num_layers * num_directions, batch_size, hidden_size]
print("Hidden state shape (hn):", hn.shape)

# Split hn along its first dimension. The entries are ordered
# layer 1 forward, layer 1 backward, layer 2 forward, layer 2 backward.
(
    hn_forward_layer1,
    hn_backward_layer1,
    hn_forward_layer2,
    hn_backward_layer2,
) = hn.unbind(0)

# Join the two directions of each layer along the feature dimension
concatenated_layer1 = torch.cat((hn_forward_layer1, hn_backward_layer1), dim=1)
concatenated_layer2 = torch.cat((hn_forward_layer2, hn_backward_layer2), dim=1)

# Shapes for verification
print("Concatenated hidden states, layer 1:", concatenated_layer1.shape)
print("Concatenated hidden states, layer 2:", concatenated_layer2.shape)

### Deep BiRNN

In [None]:
class DeepBiRNN(nn.Module):
    """Embedding -> stacked bidirectional RNN -> linear head.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Size of each embedding vector (also the RNN input size).
        hidden_size: Number of hidden features per direction.
        output_size: Number of features produced by the linear head.
        num_layers: Number of stacked RNN layers.
        output_all_timesteps: If True, ``forward`` returns a prediction for
            every time step; otherwise a single prediction per sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers, output_all_timesteps=False):
        super().__init__()
        self.output_all_timesteps = output_all_timesteps
        # Needed in forward() to split the output into its two directions.
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers=num_layers,
                          batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # *2 for bidirectional

    def forward(self, x):
        """Run the model on a batch of index sequences.

        Args:
            x: Integer index tensor; indexed here as (batch, seq).

        Returns:
            (batch, seq, output_size) when ``output_all_timesteps`` is True,
            otherwise (batch, output_size).
        """
        x = self.embedding(x)
        out, _ = self.rnn(x)  # No need to initialize h0 explicitly; it defaults to zero

        if self.output_all_timesteps:
            # Apply the fully connected layer to all timesteps
            return self.fc(out)

        # Sequence summary. The forward direction has seen the whole sequence
        # at the last timestep, but the backward direction has seen it at the
        # FIRST timestep; at t = -1 it has processed only a single token.
        # Combine each direction's state after its full pass.
        forward_final = out[:, -1, :self.hidden_size]
        backward_final = out[:, 0, self.hidden_size:]
        return self.fc(torch.cat((forward_final, backward_final), dim=1))

# Hyperparameters shared by both demo models
vocab_size = 100
embedding_dim = 50
hidden_size = 20
output_size = 1
num_layers = 2  # More than one layer makes it a deep RNN

# Positional constructor arguments common to the two variants
model_args = (vocab_size, embedding_dim, hidden_size, output_size, num_layers)

# Variant that predicts at every timestep
model_all_timesteps = DeepBiRNN(*model_args, output_all_timesteps=True)

# Variant that predicts once per sequence
model_last_timestep = DeepBiRNN(*model_args, output_all_timesteps=False)

# Example batch of two index sequences, each of length four
input_tensor = torch.tensor([[1, 2, 3, 4], [4, 3, 2, 1]])

# Run the same batch through both variants
output_all_timesteps = model_all_timesteps(input_tensor)
output_last_timestep = model_last_timestep(input_tensor)

# (batch_size, sequence_length, output_size)
print("Output (All Timesteps):", output_all_timesteps.shape)
# (batch_size, output_size)
print("Output (Last Timestep):", output_last_timestep.shape)
