In [1]:
import torch
from torch import nn

## Linear Layers

<img src=/Users/mayankanand/Documents/pytorch/images/linear_layer.png height=300 width=600>

In [10]:
# Toy batch: 16 samples (rows), each described by 8 features (columns).
input_tensor = torch.arange(128, dtype=torch.float32).reshape(16, 8)
input_tensor[:5]  # preview the first five samples

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14., 15.],
        [16., 17., 18., 19., 20., 21., 22., 23.],
        [24., 25., 26., 27., 28., 29., 30., 31.],
        [32., 33., 34., 35., 36., 37., 38., 39.]])

In [11]:
# Predicting two classes requires two output nodes, so the layer maps 8 features -> 2 outputs.
linear_layers = nn.Linear(in_features=8, out_features=2, dtype=torch.float32)
linear_layers

Linear(in_features=8, out_features=2, bias=True)

### Model

In [12]:
class LinearModel(nn.Module):
    """A single fully connected layer wrapped as a model.

    Args:
        in_features: Number of features in each input sample. Defaults to 8,
            the feature size used in this notebook.
        out_features: Number of output nodes. Defaults to 2 (one per class).
    """

    def __init__(self, in_features: int = 8, out_features: int = 2):
        super().__init__()
        # The attribute name is kept so state_dict keys stay
        # 'linear_layer.weight' / 'linear_layer.bias'.
        self.linear_layer = nn.Linear(in_features, out_features, dtype=torch.float)

    def forward(self, x):
        """Apply the linear layer to ``x``.

        Args:
            x: Tensor whose last dimension equals ``in_features``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``out_features``.
        """
        return self.linear_layer(x)

In [13]:
# Instantiate the model and inspect its learnable parameters
# (the weight matrix and bias vector of the linear layer).
linear_model = LinearModel()
linear_model.state_dict()

OrderedDict([('linear_layer.weight',
              tensor([[-0.1418, -0.3059,  0.0574, -0.2477,  0.1296,  0.3349, -0.3096, -0.1380],
                      [ 0.3129, -0.2057,  0.3207,  0.1450, -0.3323,  0.2019,  0.3079,  0.1525]])),
             ('linear_layer.bias', tensor([ 0.0578, -0.1919]))])

`sigmoid` activation function

In [14]:
# Element-wise sigmoid activation, applied to the output of the linear computation.
sigmoid = nn.Sigmoid()

In [15]:
# Forward pass: the linear layer produces raw scores, then sigmoid maps each score into (0, 1).
# The inputs are unscaled (values 0..127), so the scores can be large in magnitude
# and the sigmoid outputs can saturate close to 0 or 1.
Xo = linear_model(input_tensor)
y = sigmoid(Xo)
print(y)

tensor([[1.8136e-01, 9.6352e-01],
        [1.5384e-03, 9.9997e-01],
        [1.0715e-05, 1.0000e+00],
        [7.4525e-08, 1.0000e+00],
        [5.1831e-10, 1.0000e+00],
        [3.6047e-12, 1.0000e+00],
        [2.5070e-14, 1.0000e+00],
        [1.7436e-16, 1.0000e+00],
        [1.2126e-18, 1.0000e+00],
        [8.4337e-21, 1.0000e+00],
        [5.8655e-23, 1.0000e+00],
        [4.0793e-25, 1.0000e+00],
        [2.8371e-27, 1.0000e+00],
        [1.9731e-29, 1.0000e+00],
        [1.3723e-31, 1.0000e+00],
        [9.5440e-34, 1.0000e+00]], grad_fn=<SigmoidBackward0>)


## RNN Layers

<img src=/Users/mayankanand/Documents/pytorch/images/rnn_flow.png height=400 width=600>

In an RNN layer, there are three weight matrices:
1. The first is applied to the input at each time step.
2. The second is applied to the hidden state coming from the previous time step.
3. The third is applied to the hidden state (the result of applying the activation function to the linear computation) to produce the output.

These weights are shared across all time steps (t).

Let's take an example where the input is a sequence of `32 tokens`, each with an `embedding size of 128`. At each time step the layer produces an output of `16 nodes`.
- The output node count can be configured based on the use case. E.g.,
    - For machine translation, the output size could be vocab_size
    - For sentiment analysis, the output could be a single node

In [18]:
# Learnable parameters of the RNN cell
wi = nn.Linear(in_features=128, out_features=64, dtype=torch.float32)  # input embedding -> hidden
wh = nn.Linear(in_features=64, out_features=64, dtype=torch.float32)   # previous hidden -> hidden
wo = nn.Linear(in_features=64, out_features=16, dtype=torch.float32)   # hidden -> output


# Random batch of 8 sequences, each holding 32 tokens with 128-dimensional embeddings
input_seq = torch.rand((8, 32, 128), dtype=torch.float32)
print(input_seq.shape)

torch.Size([8, 32, 128])


In [24]:
# Peek at the first time step only: slice every sequence in the batch at position i,
# show the slice and its shape, then stop.
for i in range(input_seq.size(1)):
    step_input = input_seq[:, i, :]
    print(step_input)
    print(step_input.shape)
    break

tensor([[0.1233, 0.7378, 0.7926,  ..., 0.2149, 0.1500, 0.5144],
        [0.5717, 0.6686, 0.7401,  ..., 0.9193, 0.2378, 0.0434],
        [0.4646, 0.1059, 0.3264,  ..., 0.0427, 0.7981, 0.5636],
        ...,
        [0.5659, 0.6944, 0.1545,  ..., 0.3293, 0.1927, 0.9063],
        [0.5424, 0.6935, 0.6207,  ..., 0.2686, 0.2356, 0.7422],
        [0.3006, 0.9047, 0.2034,  ..., 0.0481, 0.6002, 0.8851]])
torch.Size([8, 128])


In [29]:
# Forward propagation
# Random initial hidden state: one 64-dimensional vector per sequence in the batch of 8
init_hidden_variable = torch.rand((8, 64))

# Input at the first time step (t = 0) for every sequence in the batch
input_at_t0 = input_seq[:, 0]
input_at_t0

tensor([[0.1233, 0.7378, 0.7926,  ..., 0.2149, 0.1500, 0.5144],
        [0.5717, 0.6686, 0.7401,  ..., 0.9193, 0.2378, 0.0434],
        [0.4646, 0.1059, 0.3264,  ..., 0.0427, 0.7981, 0.5636],
        ...,
        [0.5659, 0.6944, 0.1545,  ..., 0.3293, 0.1927, 0.9063],
        [0.5424, 0.6935, 0.6207,  ..., 0.2686, 0.2356, 0.7422],
        [0.3006, 0.9047, 0.2034,  ..., 0.0481, 0.6002, 0.8851]])

In [31]:
# Activation function: element-wise tanh, which maps every value into (-1, 1)
tanh_function = nn.Tanh()

In [32]:
# Project the input at t0 into the hidden space with wi
xit = wi(input_at_t0)
print(xit.shape)

# Project the previous hidden state with wh, then add it to the input projection.
# Adding the raw hidden state directly would leave wh unused, so the
# hidden-to-hidden weights would play no part in the computation.
xh = torch.add(xit, wh(init_hidden_variable))
print(xh.shape)

# Apply the activation function to obtain the new hidden state
xht = tanh_function(xh)
print(xht.shape)

# Output linear computation: map the hidden state to the 16 output nodes
xo = wo(xht)
print(xo.shape)

torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8, 16])


In [35]:
# Broadcasting demo: a 1-D tensor of 8 values is added to every row of a (2, 8) tensor.
test_xit = torch.arange(16).reshape(2, 8)
test_hidden_tensor = torch.rand(8)
print(test_xit, test_hidden_tensor)
print(test_hidden_tensor + test_xit)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7],
        [ 8,  9, 10, 11, 12, 13, 14, 15]]) tensor([0.7922, 0.3760, 0.9716, 0.9059, 0.4850, 0.0643, 0.7914, 0.4331])
tensor([[ 0.7922,  1.3760,  2.9716,  3.9059,  4.4850,  5.0643,  6.7914,  7.4331],
        [ 8.7922,  9.3760, 10.9716, 11.9059, 12.4850, 13.0643, 14.7914, 15.4331]])


In [62]:
# Creating Model
class RNNModel(nn.Module):
    """Minimal Elman-style RNN assembled from three linear layers.

    For every time step ``t`` the model computes::

        h_t = tanh(wi(x_t) + wh(h_{t-1}))
        o_t = wo(h_t)

    where ``h_0`` is ``init_hidden_variable``, a random vector created once at
    construction time and broadcast along the batch dimension.

    Args:
        input_size: Size of each token embedding. Defaults to 128.
        hidden_size: Size of the hidden state. Defaults to 64.
        output_size: Number of output nodes per time step. Defaults to 16.

    Attributes:
        hidden_tensor_list: After a forward pass, a one-element list holding
            the final hidden state of that pass (empty before the first pass
            and after a pass over an empty sequence).
    """

    def __init__(self, input_size: int = 128, hidden_size: int = 64, output_size: int = 16):
        super().__init__()
        self.wi = nn.Linear(input_size, hidden_size)
        self.wh = nn.Linear(hidden_size, hidden_size)
        self.wo = nn.Linear(hidden_size, output_size)
        self.tanh = nn.Tanh()
        # Registered as a non-persistent buffer: it follows the module across
        # .to(device) / dtype casts, but stays out of state_dict so the set of
        # saved keys is unchanged.
        self.register_buffer("init_hidden_variable", torch.rand(hidden_size), persistent=False)
        self.hidden_tensor_list = []

    def forward(self, x):
        """Run the RNN over every time step of ``x``.

        Args:
            x: Tensor of shape ``(batch, time_steps, input_size)``.

        Returns:
            Tensor of shape ``(batch, time_steps, output_size)`` holding the
            output of every time step.
        """
        batch_size, timestamp_count = x.shape[0], x.shape[1]
        if timestamp_count == 0:
            # Nothing to iterate over: return an empty sequence instead of
            # failing when stacking an empty list.
            self.hidden_tensor_list = []
            return x.new_empty((batch_size, 0, self.wo.out_features))

        # The hidden state is local to this call, so repeated forward passes
        # neither share state nor accumulate tensors on the module.
        hidden = self.init_hidden_variable
        output_tensor = []
        for i in range(timestamp_count):
            # Use the module's own input projection (not a notebook-level global).
            xit = self.wi(x[:, i, :])
            # Combine the input projection with the projected previous hidden state.
            hidden = self.tanh(xit + self.wh(hidden))
            output_tensor.append(self.wo(hidden))

        # Expose only the final hidden state of this pass.
        self.hidden_tensor_list = [hidden]
        return torch.stack(output_tensor, dim=1)

In [63]:
# Instantiate the RNN and run the whole batch of sequences through it;
# the printed shape shows one 16-dimensional output per token.
rnn_model = RNNModel()
output_tensor = rnn_model(input_seq)
print(output_tensor.shape)

torch.Size([8, 32, 16])


In [64]:
# Compare input and output shapes: the batch and sequence dimensions are preserved,
# only the last (feature) dimension changes.
print(input_seq.shape, output_tensor.shape)

torch.Size([8, 32, 128]) torch.Size([8, 32, 16])
