# LSTM

In [28]:
import torch
from torch.nn import LSTMCell
from torch import sigmoid, tanh

# Fix PyTorch's global random seed so that the random draws made below
# (the torch.randn input and initial states) are reproducible across runs
torch.manual_seed(42)

<torch._C.Generator at 0x116cd9b90>

In [29]:
# Dimensionality of the input vector x (its number of features)
d = 3

# Dimensionality of the hidden state vector h.
# Every other vector inside the LSTM cell (everything except x)
# has this same size.
k = 2

# Positional arguments are (input_size, hidden_size)
lstm_cell = LSTMCell(d, k)

# Names of the learnable tensors held by the cell
param_names = lstm_cell.state_dict().keys()
print(f"model parameters: {param_names}")

model parameters: odict_keys(['weight_ih', 'weight_hh', 'bias_ih', 'bias_hh'])


In [30]:
# Look the parameter dictionary up once, then pull out the
# input-to-hidden (ih) and hidden-to-hidden (hh) weights and biases
params = lstm_cell.state_dict()
W_ih, bias_ih = params["weight_ih"], params["bias_ih"]
W_hh, bias_hh = params["weight_hh"], params["bias_hh"]

print("shape:")
print(f"W_ih: {W_ih.shape}\tbias_ih: {bias_ih.shape}")
print(f"W_hh: {W_hh.shape}\tbias_hh: {bias_hh.shape}")

shape:
W_ih: torch.Size([8, 3])	bias_ih: torch.Size([8])
W_hh: torch.Size([8, 2])	bias_hh: torch.Size([8])


In [31]:
# Cut each stacked parameter into consecutive pieces of k rows
# (k entries for the biases). The four pieces are unpacked, in order,
# into the i, f, g and o parameters.
W_ii, W_if, W_ig, W_io = W_ih.split(k)
bias_ii, bias_if, bias_ig, bias_io = bias_ih.split(k)
W_hi, W_hf, W_hg, W_ho = W_hh.split(k)
bias_hi, bias_hf, bias_hg, bias_ho = bias_hh.split(k)

print("shapes of weights and biases of the input gate:")
print(f"W_ii: {W_ii.shape}\tbias_ii: {bias_ii.shape}")
print(f"W_hi: {W_hi.shape}\tbias_hi: {bias_hi.shape}")

shapes of weights and biases of the input gate:
W_ii: torch.Size([2, 3])	bias_ii: torch.Size([2])
W_hi: torch.Size([2, 2])	bias_hi: torch.Size([2])


The following diagram illustrates the structure of a single LSTM cell.

<img src="../figures/lstm.png" alt="Structure of a single LSTM cell" width="60%">

The four internal activation vectors inside an LSTM cell, $i_t$, $f_t$, $g_t$ and $o_t$, are calculated using the following equations:

$$
\begin{align*}
    i_t &= \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
    f_t &= \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
    g_t &= \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{t-1} + b_{hg}) \\
    o_t &= \sigma(W_{io} x_t + b_{io} + W_{ho} h_{t-1} + b_{ho})
\end{align*}
$$

In [32]:
def _gate_preactivation(W_x, b_x, W_h, b_h, x_t, h_prev):
    """Return W_x @ x_t + b_x + W_h @ h_prev + b_h, the affine input of one gate."""
    return W_x @ x_t + b_x + W_h @ h_prev + b_h


# Random input vector and random initial hidden / cell states
x = torch.randn(d)
h0 = torch.randn(k)
c0 = torch.randn(k)

# Input gate's activation vector
i = sigmoid(_gate_preactivation(W_ii, bias_ii, W_hi, bias_hi, x, h0))

# Forget gate's activation vector
f = sigmoid(_gate_preactivation(W_if, bias_if, W_hf, bias_hf, x, h0))

# Cell input activation vector (tanh instead of sigmoid)
g = tanh(_gate_preactivation(W_ig, bias_ig, W_hg, bias_hg, x, h0))

# Output gate's activation vector
o = sigmoid(_gate_preactivation(W_io, bias_io, W_ho, bias_ho, x, h0))

print(f"i: {i}")
print(f"f: {f}")
print(f"g: {g}")
print(f"o: {o}")

i: tensor([0.5944, 0.2809])
f: tensor([0.4461, 0.7488])
g: tensor([0.2927, 0.4243])
o: tensor([0.7073, 0.5732])


Each LSTM cell outputs two vectors, the cell state $c_t$ and the hidden state $h_t$, which are computed as follows:

$$
\begin{align*}
    c_t &= f_t \odot c_{t-1} + i_t \odot g_t \\
    h_t &= o_t \odot \tanh(c_t) 
\end{align*}
$$

In [33]:
# Cell state vector: the forget gate scales the previous cell state,
# the input gate scales the cell input, and the two parts are summed
c_kept = f * c0
c_new = i * g
c = c_kept + c_new

# Hidden state vector, which is also the
# output vector of the LSTM cell
c_squashed = tanh(c)
h = o * c_squashed

print(f"manual calculations of h and c:")
print(f"h: {h}")
print(f"c: {c}")

manual calculations of h and c:
h: tensor([ 0.3590, -0.3243])
c: tensor([ 0.5594, -0.6413])


In [34]:
# Run PyTorch's own forward pass on the same input and initial states.
# The results are stored under separate names so that the manually
# computed h and c are not overwritten and can be compared against them.
with torch.no_grad():
    h_torch, c_torch = lstm_cell(x, (h0, c0))

print("output from the LSTM cell:")
print(f"h: {h_torch}")
print(f"c: {c_torch}")

# Check the agreement in code rather than by eye. allclose is used
# instead of exact equality to tolerate floating-point rounding
# differences between the manual formula and the library implementation.
assert torch.allclose(h, h_torch, atol=1e-6), "manual h differs from LSTMCell output"
assert torch.allclose(c, c_torch, atol=1e-6), "manual c differs from LSTMCell output"

output from the LSTM cell:
h: tensor([ 0.3590, -0.3243])
c: tensor([ 0.5594, -0.6413])


As we have verified, the manual calculations agree with PyTorch's implementation.