In [25]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f9570ae07f0>

In [26]:
# Sizes shared by the scratch cells below.
INPUT_DIM, HIDDEN_DIM, SEQ_LEN = 3, 5, 7

# Single-layer LSTM. batch_first=True makes the module read 3-D inputs as
# (batch, seq, feature) instead of the default (seq, batch, feature).
lstm = nn.LSTM(INPUT_DIM, HIDDEN_DIM, batch_first=True)

In [35]:
# Standard-normal samples laid out as (SEQ_LEN, 1, INPUT_DIM).
inputs = torch.randn(SEQ_LEN, 1, INPUT_DIM)

In [83]:
#inputs = [torch.randn(1, INPUT_DIM) for _ in range(SEQ_LEN)]

In [36]:
inputs

tensor([[[-0.6490, -0.0774, -0.6042]],

        [[-0.5731,  0.9578,  1.5043]],

        [[-1.3950,  0.8008, -0.6619]],

        [[ 1.2563,  0.5000,  0.0402]],

        [[ 0.4647, -0.0312, -0.0939]],

        [[-0.6191, -0.6363, -0.4242]],

        [[-2.0272,  1.3015, -0.6293]]])

In [80]:
# Bulk call. `lstm` was built with batch_first=True, so the (SEQ_LEN, 1, INPUT_DIM)
# `inputs` tensor is read as SEQ_LEN independent sequences of length 1; the initial
# states are viewed as (1, SEQ_LEN, HIDDEN_DIM) so their batch axis matches.
# NOTE(review): `hidden` and `cell` are only assigned in a cell further down the
# notebook, so this cell fails on a fresh top-to-bottom run.
out, (hidden_state, cell_state) = lstm(inputs, (hidden.view(1, SEQ_LEN, HIDDEN_DIM), cell.view(1, SEQ_LEN, HIDDEN_DIM)))

In [81]:
out.size(), (hidden_state.size(), cell_state.size())

(torch.Size([7, 1, 5]), (torch.Size([1, 7, 5]), torch.Size([1, 7, 5])))

In [82]:
out

tensor([[[ 0.0014, -0.3442, -0.3233,  0.1272,  0.0628]],

        [[ 0.1822, -0.1008, -0.1856,  0.2728, -0.1620]],

        [[-0.3205, -0.0206,  0.1476,  0.2962, -0.0751]],

        [[-0.3481, -0.1549,  0.2820, -0.0994,  0.3369]],

        [[-0.0077,  0.0343, -0.2153,  0.0807,  0.1262]],

        [[-0.1799,  0.0330,  0.1802,  0.1951,  0.3317]],

        [[-0.0519,  0.0721,  0.2354, -0.2404,  0.1540]]],
       grad_fn=<TransposeBackward0>)

In [71]:
hidden_state.size(), hidden.size()

(torch.Size([1, 7, 5]), torch.Size([7, 1, 5]))

In [19]:
cell_state

tensor([[[ 0.0123, -0.2199,  0.2875, -0.1356,  0.0766],
         [ 0.3382,  0.2510,  0.6205,  0.4422,  0.0732],
         [-0.0388,  0.0336,  0.0089, -0.1430, -0.0643],
         [-0.0184,  0.0949, -0.0476, -0.0689, -0.1356],
         [ 0.0110,  0.0574,  0.1300,  0.0568,  0.0288],
         [ 0.2608,  0.1801,  0.5237,  0.1864,  0.0636],
         [-0.0150,  0.0053,  0.2304, -0.1897, -0.0862]]],
       grad_fn=<StackBackward>)

In [58]:
# Random initial states, one (1, HIDDEN_DIM) row per position along the first axis.
hidden = torch.randn(SEQ_LEN, 1, HIDDEN_DIM)
cell = torch.randn(SEQ_LEN, 1, HIDDEN_DIM)

In [59]:
inputs[0].unsqueeze(0)

tensor([[[-0.6490, -0.0774, -0.6042]]])

In [61]:
# Run only the first row of `inputs` through the LSTM, paired with the first row of
# `hidden`/`cell`; unsqueeze(0) restores the leading axis so every argument is 3-D.
out1, (hid1, cell1) =  lstm(inputs[0].unsqueeze(0), (hidden[0].unsqueeze(0), cell[0].unsqueeze(0)))
out1, hid1, cell1

(tensor([[[ 0.0014, -0.3442, -0.3233,  0.1272,  0.0628]]],
        grad_fn=<TransposeBackward0>),
 tensor([[[ 0.0014, -0.3442, -0.3233,  0.1272,  0.0628]]],
        grad_fn=<StackBackward>),
 tensor([[[ 0.0031, -0.4773, -0.7585,  0.2320,  0.1027]]],
        grad_fn=<StackBackward>))

In [72]:
inputs[0].unsqueeze(0).size(), (hidden[0].unsqueeze(0).size(), cell[0].unsqueeze(0).size())

(torch.Size([1, 1, 3]), (torch.Size([1, 1, 5]), torch.Size([1, 1, 5])))

In [73]:
out1.size(), (hid1.size(), cell1.size())

(torch.Size([1, 1, 5]), (torch.Size([1, 1, 5]), torch.Size([1, 1, 5])))

# Demystifying `nn.LSTM` module

## What’s the difference between hidden and output in PyTorch LSTM?

According to Pytorch documentation

```py
"""
Outputs: output, (h_n, c_n)
"""
```

There are two ways to use the `nn.LSTM` module:

- **TYPE 1 (BULK MODE):** Feed all the input in a bulk to the `lstm` module
- **TYPE 2 (LOOP MODE):** Feed each element of the input to the `lstm` module in a loop

## How to interpret the BULK MODE?

- `Outputs` comprises all the hidden states in the last layer (“last” `depth-wise`, not time-wise). 
- $(h_n,c_n)$ comprises the hidden states after the last time step, $t=n$, so you could potentially feed them into another LSTM.

![image](https://i.stack.imgur.com/SjnTl.png)

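Here is a simple example showing that the two approaches **may not** generate the same output for an identical problem definition unless the tensor layout matches what the module expects (with `batch_first=True`, the first axis of the input is the batch axis, not the time axis).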

**Example:**

Say I have a sentence: _i love my city kolkata very much_, and we want to feed it to an `nn.LSTM` module using the two approaches above.

We have a sequence length = 7 here.

We need to convert each token `["i", "love", "my", "city", "kolkata", "very", "much"]` to an embedding. For this demo we generate a random embedding of dimension `3` for each token.

In [84]:
# Dimensions shared by every cell of the worked example below.
SEQ_LEN = 7                # number of tokens in the example sentence
INPUT_DIM = EMBED_DIM = 3  # size of each token embedding (was misspelled `IMPUT_DIM`)
HIDDEN_DIM = 5             # size of the LSTM hidden and cell states

Conceptually, the `input dimension` and the `embedding dimension` are the same, because the word embeddings are the input to the LSTM module. We can use either term.

In [96]:
# Re-seed the global RNG so the embeddings shown below are reproducible.
torch.manual_seed(0)
# One random INPUT_DIM-sized embedding per token, laid out as (SEQ_LEN, 1, INPUT_DIM).
inputs = torch.randn(SEQ_LEN, 1, INPUT_DIM)
inputs

tensor([[[-1.1258, -1.1524, -0.2506]],

        [[-0.4339,  0.8487, -1.5551]],

        [[-0.3414,  1.8530,  0.4681]],

        [[-0.1577,  1.4437,  0.2660]],

        [[ 0.1665,  1.5863,  0.9463]],

        [[-0.8437,  0.9318,  1.2590]],

        [[ 2.0050,  0.0537,  0.6181]]])

```py
"i"       = [[-1.1258, -1.1524, -0.2506]],
"love"    = [[-0.4339,  0.8487, -1.5551]],
"my"      = [[-0.3414,  1.8530,  0.4681]],
"city"    = [[-0.1577,  1.4437,  0.2660]],
"kolkata" = [[ 0.1665,  1.5863,  0.9463]],
"very"    = [[-0.8437,  0.9318,  1.2590]],
"much"    = [[ 2.0050,  0.0537,  0.6181]]]
```

Let's declare our `lstm` module

In [95]:
# Single-layer LSTM mapping INPUT_DIM-sized embeddings to HIDDEN_DIM-sized states.
# batch_first=True: 3-D inputs are interpreted as (batch, seq, feature).
# NOTE(review): the weights are drawn from the global RNG when the module is built,
# so the printed values presumably depend on what ran before this cell.
lstm = nn.LSTM(INPUT_DIM, HIDDEN_DIM, batch_first=True)

One interesting fact: `nn.LSTM()` returns a callable module object (an `nn.Module` instance), not a plain function. We assign it to the variable `lstm` and then call it like a function.

Calling `lstm()` expects all of its arguments, `inputs, (hidden, cell)`, to be 3D tensors.

Now we can pass the entire embedding/input matrix `inputs` to the `lstm()` function.  If you are using TYPE 1, then we can call `lstm()` in 2 ways: 

- Without `(hidden, cell)`. Then system initializes the `(hidden,cell)` with 0
- With custom `(hidden, cell)` initialization

Syntax:

```py
out, (hidden, cell) =  lstm(inputs)
```

In many LSTM examples we see this notation, where the whole input is fed to the `lstm()` module at once. The confusion arises when we see examples that use the TYPE 2 approach, where each input is fed in a loop. However, we can show that the TYPE 1 and TYPE 2 approaches are the same if we use the same `(hidden, cell)` initialization in both cases.

But technically there is a slight catch. And that is related to the tensor shape for `(hidden,cell)`. 

In practice, an LSTM is a recurrent network: it takes the embedding of one word together with the corresponding `(hidden,cell)` and returns `out, (hidden, cell)`. In the bulk approach, all the embeddings are sent together.

Let's initialize `(hidden, cell)`

In [97]:
# Random initial hidden and cell states, one (1, HIDDEN_DIM) row per token.
hidden = torch.randn(SEQ_LEN, 1, HIDDEN_DIM)
cell = torch.randn(SEQ_LEN, 1, HIDDEN_DIM)

## TYPE 2: 

In [140]:
# Loop mode, step 0: feed only the first token, started from the first row of the
# random initial states. unsqueeze(0) keeps every argument 3-D.
out_1, (hid_1, cell_1) =  lstm(inputs[0].unsqueeze(0), (hidden[0].unsqueeze(0), cell[0].unsqueeze(0)))
out_1, hid_1, cell_1

(tensor([[[-0.0229,  0.0760,  0.0806,  0.0651,  0.4780]]],
        grad_fn=<TransposeBackward0>),
 tensor([[[-0.0229,  0.0760,  0.0806,  0.0651,  0.4780]]],
        grad_fn=<StackBackward>),
 tensor([[[-0.0422,  0.1172,  0.1255,  0.3234,  0.8920]]],
        grad_fn=<StackBackward>))

In [141]:
# Loop mode, step 1: feed the second token together with the (hid_1, cell_1)
# returned by step 0, i.e. the state is carried forward.
out_2, (hid_2, cell_2) =  lstm(inputs[1].unsqueeze(0), (hid_1, cell_1))

In [142]:
out_2, (hid_2, cell_2)

(tensor([[[-0.0721,  0.0187, -0.0858, -0.0551, -0.0058]]],
        grad_fn=<TransposeBackward0>),
 (tensor([[[-0.0721,  0.0187, -0.0858, -0.0551, -0.0058]]],
         grad_fn=<StackBackward>),
  tensor([[[-0.2038,  0.0295, -0.2052, -0.1197, -0.0175]]],
         grad_fn=<StackBackward>)))

## TYPE 1:

In [116]:
# Bulk call. Because `lstm` has batch_first=True, the (SEQ_LEN, 1, INPUT_DIM) tensor
# is read as a batch of SEQ_LEN sequences of length 1, and after the view to
# (1, SEQ_LEN, HIDDEN_DIM) row k of `hidden`/`cell` is the initial state of sequence k.
out_type_1, (hidden_type_1, cell_type_1) = lstm(inputs, (hidden.view(1, SEQ_LEN, HIDDEN_DIM), cell.view(1, SEQ_LEN, HIDDEN_DIM)))

In [147]:
out_type_1

tensor([[[-0.0229,  0.0760,  0.0806,  0.0651,  0.4780]],

        [[-0.0631, -0.3371, -0.1404, -0.3782,  0.0775]],

        [[ 0.1218,  0.5558,  0.0761, -0.3177, -0.3491]],

        [[ 0.0722,  0.0822,  0.2336, -0.0847, -0.2263]],

        [[ 0.0533,  0.0153,  0.1362, -0.3684, -0.1344]],

        [[ 0.0011, -0.0427, -0.1056,  0.1165,  0.2600]],

        [[-0.0602, -0.1055,  0.0908,  0.1777, -0.0081]]],
       grad_fn=<TransposeBackward0>)

See, the first row of `out_type_1` is identical to `out_1`. But the subsequent rows of `out_type_1` are different from the loop outputs (e.g. `out_2`), because in the loop the returned `(hidden,cell)` are fed back into `lstm()`. 

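This raises a question: we initialized `(hidden, cell)` for all `7` tokens, which seems redundant. But that's not the case, because with `batch_first=True` the bulk call treats each of the 7 rows as a separate length-1 sequence with its own initial state.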

Let's unroll the bulk call and try to regenerate `out_type_1`.

In [143]:
# Unrolled bulk call, row 0: first token started from hidden[0]/cell[0]
# (the same call as step 0 of the loop).
out_1, (hid_1, cell_1) =  lstm(inputs[0].unsqueeze(0), (hidden[0].unsqueeze(0), cell[0].unsqueeze(0)))
out_1, (hid_1, cell_1)

(tensor([[[-0.0229,  0.0760,  0.0806,  0.0651,  0.4780]]],
        grad_fn=<TransposeBackward0>),
 (tensor([[[-0.0229,  0.0760,  0.0806,  0.0651,  0.4780]]],
         grad_fn=<StackBackward>),
  tensor([[[-0.0422,  0.1172,  0.1255,  0.3234,  0.8920]]],
         grad_fn=<StackBackward>)))

The first element is fine. But the twist comes next. See, I am not passing `(hid_1, cell_1)` for the token `inputs[1]`; rather, I am passing a reshaped version of `(hidden[1], cell[1])`, and that creates an exact replica of `out_type_1[1]`.

In [144]:
# Unrolled bulk call, row 1: second token started from hidden[1]/cell[1],
# NOT from the (hid_1, cell_1) returned for the first token.
out_2, (hid_2, cell_2) =  lstm(inputs[1].unsqueeze(0), (hidden[1].unsqueeze(0), cell[1].unsqueeze(0)))
out_2, (hid_2, cell_2)

(tensor([[[-0.0631, -0.3371, -0.1404, -0.3782,  0.0775]]],
        grad_fn=<TransposeBackward0>),
 (tensor([[[-0.0631, -0.3371, -0.1404, -0.3782,  0.0775]]],
         grad_fn=<StackBackward>),
  tensor([[[-0.1009, -0.5193, -0.2378, -1.1299,  0.2654]]],
         grad_fn=<StackBackward>)))

We can go on like this....

**Observation:**

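If you look carefully, in bulk mode (in the unrolled version above) each output is not generated from the previous `(hidden, cell)`, i.e. $(h_{t-1}, c_{t-1})$, even though the `bulk output` and the `unrolled version` of the bulk output match. This is a consequence of the tensor layout: the module was built with `batch_first=True`, so a `(7, 1, 3)` input is read as a batch of 7 independent sequences of length 1 rather than as one sequence of 7 steps. Shaping the input as `(1, 7, 3)` with a `(1, 1, 5)` initial state would make the bulk call apply the usual recurrence.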


## Following the LSTM definition:

Now let's feed the $(h_{t-1}, c_{t-1})$ (as per the original LSTM definition) to generate the next `out`. 

In [148]:
# Recurrent unrolling, step 0: first token started from hidden[0]/cell[0].
out_1, (hid_1, cell_1) =  lstm(inputs[0].unsqueeze(0), (hidden[0].unsqueeze(0), cell[0].unsqueeze(0)))
out_1, (hid_1, cell_1)

(tensor([[[-0.0229,  0.0760,  0.0806,  0.0651,  0.4780]]],
        grad_fn=<TransposeBackward0>),
 (tensor([[[-0.0229,  0.0760,  0.0806,  0.0651,  0.4780]]],
         grad_fn=<StackBackward>),
  tensor([[[-0.0422,  0.1172,  0.1255,  0.3234,  0.8920]]],
         grad_fn=<StackBackward>)))

In [149]:
# Recurrent unrolling, step 1: the (hid_1, cell_1) returned by step 0 is carried
# forward as the initial state for the second token.
out_2, (hid_2, cell_2) =  lstm(inputs[1].unsqueeze(0), (hid_1, cell_1))
out_2, (hid_2, cell_2)

(tensor([[[-0.0721,  0.0187, -0.0858, -0.0551, -0.0058]]],
        grad_fn=<TransposeBackward0>),
 (tensor([[[-0.0721,  0.0187, -0.0858, -0.0551, -0.0058]]],
         grad_fn=<StackBackward>),
  tensor([[[-0.2038,  0.0295, -0.2052, -0.1197, -0.0175]]],
         grad_fn=<StackBackward>)))

**Observation:**

`out_2` is different from `out_type_1[1]` (both denote the second element).

In [150]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



<torch._C.Generator at 0x7f9570ae07f0>

In [156]:
# Fixed seed so the weights, inputs and printed values are reproducible.
torch.manual_seed(0)
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

# Initialize the state as an (h_0, c_0) pair, each of shape (1, 1, 3).
hidden = (torch.randn(1, 1, 3),
          torch.randn(1, 1, 3))
for step_input in inputs:
    # Step through the sequence one element at a time; view(1, 1, -1) turns the
    # (1, 3) element into the 3-D tensor the module expects.
    # After each step, `hidden` holds the (h, c) pair that is fed to the next step.
    out, hidden = lstm(step_input.view(1, 1, -1), hidden)
    print(out)
    print(hidden)
    # The original ended with a bare `print`, a Python 2 leftover that is a no-op
    # expression in Python 3; it is dropped so the recorded output is unchanged.

tensor([[[-0.1815,  0.0289,  0.5399]]], grad_fn=<StackBackward>)
(tensor([[[-0.1815,  0.0289,  0.5399]]], grad_fn=<StackBackward>), tensor([[[-0.3697,  0.0903,  1.3455]]], grad_fn=<StackBackward>))
tensor([[[0.0569, 0.1538, 0.0446]]], grad_fn=<StackBackward>)
(tensor([[[0.0569, 0.1538, 0.0446]]], grad_fn=<StackBackward>), tensor([[[0.1005, 0.3668, 1.2979]]], grad_fn=<StackBackward>))
tensor([[[ 0.0991, -0.0257, -0.0674]]], grad_fn=<StackBackward>)
(tensor([[[ 0.0991, -0.0257, -0.0674]]], grad_fn=<StackBackward>), tensor([[[ 0.1282, -0.0523, -0.1263]]], grad_fn=<StackBackward>))
tensor([[[ 0.1365, -0.2060,  0.0026]]], grad_fn=<StackBackward>)
(tensor([[[ 0.1365, -0.2060,  0.0026]]], grad_fn=<StackBackward>), tensor([[[ 0.2138, -0.3261,  0.0117]]], grad_fn=<StackBackward>))
tensor([[[0.2096, 0.0675, 0.0626]]], grad_fn=<StackBackward>)
(tensor([[[0.2096, 0.0675, 0.0626]]], grad_fn=<StackBackward>), tensor([[[0.3354, 0.2321, 0.4211]]], grad_fn=<StackBackward>))


In [157]:
# Alternative: run the entire sequence through the LSTM in a single call.
# - The first return value, "out", holds the hidden state at every step of
#   the sequence.
# - The second, "hidden", is only the most recent (h, c) pair; its hidden
#   part is the same as the last slice of "out" (compare them below).
# Keeping "hidden" lets you continue the sequence and backpropagate later
# by passing it back into the lstm.
#
# This cell needs `lstm` and the list `inputs` from the previous cell, and it
# rebinds `inputs` to a tensor: re-run the previous cell before re-running it.

# Concatenate the (1, 3) elements and add the extra 2nd dimension.
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
# Start from a fresh random state instead of the one left over from the loop.
hidden = tuple(torch.randn(1, 1, 3) for _ in range(2))
out, hidden = lstm(inputs, hidden)
print(out, hidden, sep="\n")

tensor([[[-0.3341,  0.1119,  0.2212]],

        [[-0.0564,  0.2298,  0.0374]],

        [[-0.0412,  0.0749, -0.0919]],

        [[ 0.0481, -0.1281, -0.0022]],

        [[ 0.1721,  0.0864,  0.0560]]], grad_fn=<StackBackward>)
(tensor([[[0.1721, 0.0864, 0.0560]]], grad_fn=<StackBackward>), tensor([[[0.2710, 0.2993, 0.4017]]], grad_fn=<StackBackward>))
