In [41]:
import torch
import torch.nn as nn

In [74]:
def lstm(inpu_dim, num_layers, bi):
    """Build an ``nn.LSTM`` with hidden size 1 whose parameters are all set to 1.

    Using constant weights makes every model created by this helper identical,
    so outputs of separately constructed models can be compared directly.

    Args:
        inpu_dim: Number of features per time step (``input_size`` of ``nn.LSTM``).
        num_layers: Number of stacked LSTM layers.
        bi: If True, build a bidirectional LSTM.

    Returns:
        The constructed ``nn.LSTM`` module (sequence-first layout, the default).
    """
    hidden_size = 1
    model = nn.LSTM(inpu_dim, hidden_size, num_layers, bidirectional=bi)

    # nn.init.constant_ fills each tensor in place without recording the
    # operation in autograd, which avoids going through the legacy `.data`
    # attribute.
    for param in model.parameters():
        nn.init.constant_(param, 1.0)

    return model
    

def printer(inpt, o, h):
    """Print an LSTM input, its output and its hidden state under labelled headers.

    Shapes as used in this notebook (sequence-first ``nn.LSTM`` layout):
        inpt: [seq len, batch size, input size] (or a single time step of it)
        o:    [seq len, batch size, num_directions * hidden size]
        h:    [num_layers * num_directions, batch size, hidden size]
    """
    sections = (
        ('\nInput: ', inpt),
        ('\nOutput: ', o),
        ('\nHidden: ', h),
    )
    for header, value in sections:
        print(header)
        print(value)

In [83]:
# Fixed two-step input so that every recorded output in this notebook is
# reproducible on a fresh kernel (an unseeded torch.randn draw would change on
# each run). Layout: [seq len = 2, batch size = 1, input size = 1].
inpt = torch.tensor([0.7050, 0.9573]).view(2, 1, 1)
print(inpt)

tensor([[[0.7050]],

        [[0.9573]]])


### Uni-direction with Single layer

In [84]:
model = lstm(1, 1, False)

# Step-by-step pass: feed one time step at a time and carry (h, c) forward.
# Passing None as the state on the first step makes nn.LSTM start from zeros,
# so no `h == None` tensor comparison is needed.
state = None

for t in inpt:
    # t is [batch, input size]; unsqueeze(0) restores a length-1 sequence axis.
    o, (h, c) = model(t.unsqueeze(0), state)
    state = (h, c)
    printer(t, o, h)
    print('---------------------------------------')

print('------------- Full batch --------------')
# Fresh model with the same all-ones weights, run on the whole sequence at once.
model = lstm(1, 1, False)

o, (h, c) = model(inpt)
printer(inpt, o, h)


Input: 
tensor([[0.7050]])

Output: 
tensor([[[0.6844]]], grad_fn=<StackBackward>)

Hidden: 
tensor([[[0.6844]]], grad_fn=<StackBackward>)
---------------------------------------

Input: 
tensor([[0.9573]])

Output: 
tensor([[[0.9300]]], grad_fn=<StackBackward>)

Hidden: 
tensor([[[0.9300]]], grad_fn=<StackBackward>)
---------------------------------------
------------- Full batch --------------

Input: 
tensor([[[0.7050]],

        [[0.9573]]])

Output: 
tensor([[[0.6844]],

        [[0.9300]]], grad_fn=<StackBackward>)

Hidden: 
tensor([[[0.9300]]], grad_fn=<StackBackward>)


---
### Uni-direction with two layers

In [85]:
inp_size, num_layers = 1, 2
# lstm() already sets every parameter to 1, so no extra fill loop is needed here.
model = lstm(inp_size, num_layers, False)

# Step-by-step pass: carry (h, c) across time steps; None on the first step
# makes nn.LSTM start from zero states.
state = None

for t in inpt:
    # t is [batch, input size]; unsqueeze(0) restores a length-1 sequence axis.
    o, (h, c) = model(t.unsqueeze(0), state)
    state = (h, c)
    printer(t, o, h)
    print('---------------------------------------')

print('------------- Full batch --------------')
# Fresh model with the same all-ones weights, run on the whole sequence at once.
model = lstm(inp_size, num_layers, False)

o, (h, c) = model(inpt)
printer(inpt, o, h)


Input: 
tensor([[0.7050]])

Output: 
tensor([[[0.6828]]], grad_fn=<StackBackward>)

Hidden: 
tensor([[[0.6844]],

        [[0.6828]]], grad_fn=<StackBackward>)
---------------------------------------

Input: 
tensor([[0.9573]])

Output: 
tensor([[[0.9290]]], grad_fn=<StackBackward>)

Hidden: 
tensor([[[0.9300]],

        [[0.9290]]], grad_fn=<StackBackward>)
---------------------------------------
------------- Full batch --------------

Input: 
tensor([[[0.7050]],

        [[0.9573]]])

Output: 
tensor([[[0.6828]],

        [[0.9290]]], grad_fn=<StackBackward>)

Hidden: 
tensor([[[0.9300]],

        [[0.9290]]], grad_fn=<StackBackward>)


---
## Analyzing bi-directional LSTM 

### Single layer

In [150]:
inp_size = 1
num_layers = 1

# Emulate a bidirectional LSTM by hand: run one unidirectional model over the
# sequence and over its time-reversed copy. A single model can serve both
# directions because lstm() gives every parameter the same value.
model = lstm(inp_size, num_layers, False)

# Reverse along the time axis only (dim 0). Flipping the batch axis as well
# would pair each sample with a different sample's reversed sequence once
# batch size > 1.
inpx = torch.flip(inpt, [0])

# (h, c) carried across steps for each direction; None means zero initial state.
state = None
state_x = None

for t, tx in zip(inpt, inpx):
    o, (h, c) = model(t.unsqueeze(0), state)
    ox, (hx, cx) = model(tx.unsqueeze(0), state_x)
    state, state_x = (h, c), (hx, cx)
    printer(t, o, h)
    print('\n<<< Reverse')
    printer(tx, ox, hx)
    print('---------------------------------------')

print('------------- Full batch --------------')
# Real bidirectional model on the whole sequence, for comparison with the
# manual forward/reverse passes above.
model = lstm(inp_size, num_layers, True)

# h1 is kept under its own name because later cells index into it.
o, (h1, c) = model(inpt)
printer(inpt, o, h1)


Input: 
tensor([[0.7050]])

Output: 
tensor([[[0.6844]]], grad_fn=<StackBackward>)

Hidden: 
tensor([[[0.6844]]], grad_fn=<StackBackward>)

<<< Reverse

Input: 
tensor([[0.9573]])

Output: 
tensor([[[0.7013]]], grad_fn=<StackBackward>)

Hidden: 
tensor([[[0.7013]]], grad_fn=<StackBackward>)
---------------------------------------

Input: 
tensor([[0.9573]])

Output: 
tensor([[[0.9300]]], grad_fn=<StackBackward>)

Hidden: 
tensor([[[0.9300]]], grad_fn=<StackBackward>)

<<< Reverse

Input: 
tensor([[0.7050]])

Output: 
tensor([[[0.9239]]], grad_fn=<StackBackward>)

Hidden: 
tensor([[[0.9239]]], grad_fn=<StackBackward>)
---------------------------------------
------------- Full batch --------------

Input: 
tensor([[[0.7050]],

        [[0.9573]]])

Output: 
tensor([[[0.6844, 0.9239]],

        [[0.9300, 0.7013]]], grad_fn=<CatBackward>)

Hidden: 
tensor([[[0.9300]],

        [[0.9239]]], grad_fn=<StackBackward>)


### With double layer

In [151]:
# Two stacked bidirectional layers, evaluated on the whole sequence at once.
inp_size, num_layers = 1, 2
model = lstm(inp_size, num_layers, True)

o, final_state = model(inpt)
h2, c = final_state  # h2 is inspected layer by layer in a later cell
printer(inpt, o, h2)


Input: 
tensor([[[0.7050]],

        [[0.9573]]])

Output: 
tensor([[[0.7299, 0.9477]],

        [[0.9480, 0.7306]]], grad_fn=<CatBackward>)

Hidden: 
tensor([[[0.9300]],

        [[0.9239]],

        [[0.9480]],

        [[0.9477]]], grad_fn=<StackBackward>)


### Accessing output

In [154]:
# `o` is [seq len, batch size, num_directions * hidden size]; with hidden size 1
# the first feature column is the forward direction of the last layer.
o[:, :, :1] # Forward LSTM

tensor([[[0.7299]],

        [[0.9480]]], grad_fn=<SliceBackward>)

In [155]:
# The remaining feature column is the backward direction of the last layer;
# its value at time step 0 is the state after the whole sequence has been read in reverse.
o[:, :, 1:] # Backward LSTM

tensor([[[0.9477]],

        [[0.7306]]], grad_fn=<SliceBackward>)

### Accessing hidden layer output from single layered LSTM

In [156]:
# `h1` is [num_layers * num_directions, batch size, hidden size] = [2, 1, 1] here;
# the second-to-last entry is the forward direction's final hidden state.
h1[-2,:,:] # forward RNN hidden state output

tensor([[0.9300]], grad_fn=<SliceBackward>)

In [157]:
# The last entry of `h1` is the backward direction's final hidden state
# (the state after reading the sequence from its end back to its start).
h1[-1,:,:] # backward RNN hidden state output

tensor([[0.9239]], grad_fn=<SliceBackward>)

### Accessing hidden layer output from double layered LSTM

In [186]:
# h2 comes from the two-layer bidirectional model and is laid out as
# [num_layers * num_directions, batch size, hidden size].
num_layers, num_directions = 2, 2
batch, hidden_size = 1, 1

# Split the leading axis into (layer, direction) once, then select each layer.
h2_by_layer = h2.view(num_layers, num_directions, batch, hidden_size)
first_layer_lstm = h2_by_layer[0]
second_layer_lstm = h2_by_layer[1]

# Within a layer, index -2 is the forward direction and -1 the backward one.
print(
    '1st layer forward: \n', first_layer_lstm[-2,:,:],
    '\n1st layer backward: \n', first_layer_lstm[-1,:,:],
)
print(
    '\n2nd layer forward: \n', second_layer_lstm[-2,:,:],
    '\n2nd layer backward: \n', second_layer_lstm[-1,:,:],
)

1st layer forward: 
 tensor([[0.9300]], grad_fn=<SliceBackward>) 
1st layer backward: 
 tensor([[0.9239]], grad_fn=<SliceBackward>)

2nd layer forward: 
 tensor([[0.9480]], grad_fn=<SliceBackward>) 
2nd layer backward: 
 tensor([[0.9477]], grad_fn=<SliceBackward>)


### Analyzing parameters

In [167]:
# Dump every named parameter of a single-layer, unidirectional LSTM.
single_layer_model = lstm(1, 1, False)
for name, param in single_layer_model.named_parameters():
    print(name, param, '-----------', sep='\n')

weight_ih_l0
Parameter containing:
tensor([[1.],
        [1.],
        [1.],
        [1.]], requires_grad=True)
-----------
weight_hh_l0
Parameter containing:
tensor([[1.],
        [1.],
        [1.],
        [1.]], requires_grad=True)
-----------
bias_ih_l0
Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
-----------
bias_hh_l0
Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
-----------


In [168]:
# Dump every named parameter of a two-layer, unidirectional LSTM
# (the second layer adds the `*_l1` entries).
two_layer_model = lstm(1, 2, False)
for name, param in two_layer_model.named_parameters():
    print(name, param, '-----------', sep='\n')

weight_ih_l0
Parameter containing:
tensor([[1.],
        [1.],
        [1.],
        [1.]], requires_grad=True)
-----------
weight_hh_l0
Parameter containing:
tensor([[1.],
        [1.],
        [1.],
        [1.]], requires_grad=True)
-----------
bias_ih_l0
Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
-----------
bias_hh_l0
Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
-----------
weight_ih_l1
Parameter containing:
tensor([[1.],
        [1.],
        [1.],
        [1.]], requires_grad=True)
-----------
weight_hh_l1
Parameter containing:
tensor([[1.],
        [1.],
        [1.],
        [1.]], requires_grad=True)
-----------
bias_ih_l1
Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
-----------
bias_hh_l1
Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
-----------


In [169]:
# Dump every named parameter of a single-layer, bidirectional LSTM
# (the backward direction adds the `*_reverse` entries).
bidirectional_model = lstm(1, 1, True)
for name, param in bidirectional_model.named_parameters():
    print(name, param, '-----------', sep='\n')

weight_ih_l0
Parameter containing:
tensor([[1.],
        [1.],
        [1.],
        [1.]], requires_grad=True)
-----------
weight_hh_l0
Parameter containing:
tensor([[1.],
        [1.],
        [1.],
        [1.]], requires_grad=True)
-----------
bias_ih_l0
Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
-----------
bias_hh_l0
Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
-----------
weight_ih_l0_reverse
Parameter containing:
tensor([[1.],
        [1.],
        [1.],
        [1.]], requires_grad=True)
-----------
weight_hh_l0_reverse
Parameter containing:
tensor([[1.],
        [1.],
        [1.],
        [1.]], requires_grad=True)
-----------
bias_ih_l0_reverse
Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
-----------
bias_hh_l0_reverse
Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)
-----------


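In short, the `output` of an LSTM stacks the last layer's hidden state at every time step.
The returned hidden state, in contrast, holds only the final hidden state of each layer (and direction), i.e. the state after the whole sequence has been processed.
For a bidirectional LSTM, `output` contains both the forward and the backward hidden states, concatenated along the last dimension.
We can concatenate the two directions, sum them, multiply them, or pass them through a linear layer; how they are combined is just another hyperparameter. The combined representation is then passed to a linear layer to make the final prediction.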

In [5]:
# Mock encoder outputs laid out as [src len, batch size, enc hid dim * 2]
import torch

src_len, batch_size, enc_hid_dim = 3, 2, 2
enc = torch.rand(src_len, batch_size, enc_hid_dim * 2)
print(enc.shape, enc, sep='\n')

# Swap the first two axes -> [batch size, src len, enc hid dim * 2]
enc.permute(1, 0, 2)

torch.Size([3, 2, 4])
tensor([[[0.8673, 0.1860, 0.2513, 0.9623],
         [0.4967, 0.1907, 0.2801, 0.8897]],

        [[0.1326, 0.7849, 0.9479, 0.7799],
         [0.3995, 0.8238, 0.8698, 0.9360]],

        [[0.5219, 0.0967, 0.4878, 0.4901],
         [0.7813, 0.6874, 0.4124, 0.4624]]])


tensor([[[0.8673, 0.1860, 0.2513, 0.9623],
         [0.1326, 0.7849, 0.9479, 0.7799],
         [0.5219, 0.0967, 0.4878, 0.4901]],

        [[0.4967, 0.1907, 0.2801, 0.8897],
         [0.3995, 0.8238, 0.8698, 0.9360],
         [0.7813, 0.6874, 0.4124, 0.4624]]])