**Udemy 6-8. Paying Attention to Shapes**



Class Notes:

---



Simple Recurrent Unit or Elman Unit

$
h_t = \sigma(W_{xh}^T x_t + W_{hh}^T h_{t-1} + b_h)
$

Prediction will be

$
\hat{y}_t = \sigma(W_o^T h_t + b_o)
$

---
Option 1:

if `batch_first == True`

Input must be N x T x D

Option 2:

if `batch_first == False`

Input must be T x N x D

---

In [None]:
import torch
import torch.nn as nn
import numpy as np

In [None]:
# Things you should automatically know and have memorized
# N: number of samples
# T: length of the sequence
# D: number of input features
# M: number of hidden units
# K: number of output units

In [None]:
# Build a toy batch of random inputs
N, T, D = 1, 10, 3  # samples, sequence length, input features
M, K = 5, 2  # hidden units, output units
# Layout is (N, T, D) because the model uses batch_first=True;
# with batch_first=False it would have to be (T, N, D).
X = np.random.randn(N, T, D)

In [None]:
# PART 1 - RNN using Torch
# Make an RNN
class SimpleRNN(nn.Module):
    """Single-layer tanh RNN followed by a dense layer applied at every time step.

    Args:
        n_inputs: number of input features per time step (D).
        n_hidden: number of hidden units (M).
        n_output: number of output units per time step (K).
    """

    def __init__(self, n_inputs: int, n_hidden: int, n_output: int) -> None:
        super().__init__()
        self.D = n_inputs
        self.M = n_hidden
        self.K = n_output
        self.rnn = nn.RNN(input_size=self.D,
                          hidden_size=self.M,
                          nonlinearity='tanh',
                          batch_first=True,)
        self.fc = nn.Linear(self.M, self.K)

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """Map a batch of sequences X (N x T x D) to per-step outputs (N x T x K)."""
        # Initial hidden state, L x N x M with L = 1 layer.
        # It is created with X's dtype and device so the module keeps working
        # after .double() / .to(device) instead of failing on a mismatch.
        h0 = torch.zeros(1, X.size(0), self.M, dtype=X.dtype, device=X.device)

        # get RNN unit output
        out, _ = self.rnn(X, h0)  # X: N x T x D | out: N x T x M

        # pass every hidden state (h_1 ... h_T) through the dense layer
        out = self.fc(out)  # in: N x T x M | out: N x T x K
        return out

In [None]:
# Build the network: D input features -> M hidden units -> K outputs
model = SimpleRNN(D, M, K)

In [None]:
# Run the toy batch through the model
X_float32 = X.astype(np.float32)  # cast the numpy data to float32 before handing it to torch
inputs = torch.from_numpy(X_float32)
outputs = model(inputs)
print(outputs)
print(outputs.shape)  # expected layout: N x T x K

tensor([[[-0.2375,  0.1605],
         [-0.3742, -0.2028],
         [-0.7712,  0.0903],
         [-0.4133,  0.0852],
         [-0.3892,  0.7124],
         [-0.0583, -0.2509],
         [-0.8777,  0.7747],
         [-0.0092, -0.0253],
         [-0.3508,  0.5767],
         [-0.0639, -0.1212]]], grad_fn=<AddBackward0>)
torch.Size([1, 10, 2])


In [None]:
# Keep the torch predictions as a numpy array for the later comparison
outputs_detached = outputs.detach()  # drop the autograd graph before converting
Yhat_torch = outputs_detached.numpy()

In [None]:
# Get the RNN layer parameters.
# They are read by attribute name (layer 0 of nn.RNN) instead of unpacking
# model.rnn.parameters(), which silently depends on registration order.
W_xh = model.rnn.weight_ih_l0  # input-to-hidden weights
W_hh = model.rnn.weight_hh_l0  # hidden-to-hidden weights
b_xh = model.rnn.bias_ih_l0    # input-to-hidden bias
b_hh = model.rnn.bias_hh_l0    # hidden-to-hidden bias
print("W_xh.shape", W_xh.shape)
print(W_xh)

# Convert to numpy; detach() is the supported replacement for the legacy .data
W_xh = W_xh.detach().numpy()
W_hh = W_hh.detach().numpy()
b_xh = b_xh.detach().numpy()
b_hh = b_hh.detach().numpy()

print(W_xh.shape, b_xh.shape, W_hh.shape, b_hh.shape) # (M, D), (M,), (M, M), (M,)

W_xh.shape torch.Size([5, 3])
Parameter containing:
tensor([[-0.2946, -0.0276, -0.1723],
        [-0.4416, -0.4401, -0.4396],
        [ 0.2607,  0.2543, -0.3633],
        [ 0.3621,  0.3392,  0.1759],
        [-0.2745, -0.3040, -0.4026]], requires_grad=True)
(5, 3) (5,) (5, 5) (5,)


In [None]:
# Get the FC layer parameters by name rather than by position in parameters()
W_o = model.fc.weight  # dense-layer weights
b_o = model.fc.bias    # dense-layer bias

# Convert to numpy; detach() is the supported replacement for the legacy .data
W_o = W_o.detach().numpy()
b_o = b_o.detach().numpy()

print(W_o.shape, b_o.shape) # (K, M), (K,)

(2, 5) (2,)


In [None]:
# PART 2 - the same RNN written out by hand in numpy
# Only the first sample is processed (N is taken to be 1);
# the goal is to reproduce the torch output.
x = X[0]  # the single input sequence
print(x.shape)
h_last = np.zeros(M)  # hidden state before the first time step
yhats = np.zeros((T, K))  # one K-dimensional output per time step

for t in range(T):
    x_t = x[t]
    # recurrent update: input term and previous-hidden term, each with its own bias
    pre_activation = np.dot(x_t, W_xh.T) + b_xh + np.dot(h_last, W_hh.T) + b_hh
    h = np.tanh(pre_activation)
    # dense layer on the current hidden state; stored for every step
    y = np.dot(h, W_o.T) + b_o
    yhats[t] = y

    # carry the hidden state over to the next time step
    h_last = h

print(yhats)

(10, 3)
[[-0.23747324  0.16046794]
 [-0.37419039 -0.20281294]
 [-0.77121085  0.09033243]
 [-0.41333817  0.08524129]
 [-0.38919497  0.71243846]
 [-0.05832134 -0.25088813]
 [-0.87765071  0.7747328 ]
 [-0.0092352  -0.02526761]
 [-0.35084117  0.57665854]
 [-0.06385781 -0.12120582]]


In [None]:
# Check that nn.RNN and the hand-written numpy recurrence give the same outputs
outputs_match = np.allclose(yhats, Yhat_torch)
print(outputs_match)

True


In [None]:
# Exercise: Calculate the output for multiple samples at once (N > 1)
# Response: "Udemy 6-8. Paying Attention to Shapes - Appendix A"