# Recurrent Neural Network
- mathematical experession
$$P(x_t|x_{t-1},..,x_1) \approx P(x_t|h_{t-1})$$
$$ h_t = f(x_t, h_{t-1}) $$

- Neural Network Hidden state
$$ H = \phi(XW_{xh} + b_h)$$

In [1]:
import torch
from d2l import torch as d2l

X, W_xh = torch.normal(0, 1, (3, 1)), torch.normal(0, 1, (1, 4)) # x는 3,1의 벡터고 weight는 1,4의 weight
H, W_hh = torch.normal(0, 1, (3, 4)), torch.normal(0, 1, (4, 4))
torch.matmul(X, W_xh) + torch.matmul(H, W_hh) # output

tensor([[-0.9444, -0.8728,  0.1801, -1.7676],
        [-1.4616, -0.3225, -0.2576, -1.2187],
        [-1.5597, -0.7058,  0.1005, -0.1664]])

위와 같은 상황을 선형대수적으로 표현할 경우 아래처럼 concat을 해서 표현할 수 있을 것이다.

In [2]:
torch.matmul(torch.cat((X, H), 1), torch.cat((W_xh, W_hh), 0)) 

tensor([[-0.9444, -0.8728,  0.1801, -1.7676],
        [-1.4616, -0.3225, -0.2576, -1.2187],
        [-1.5597, -0.7058,  0.1005, -0.1664]])

Implementation of RNN

In [1]:
import math
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps) # load time machine data

In [2]:
# one hot encoding
# match each index to unit vector

F.one_hot(torch.tensor([0, 2]), len(vocab)) # one hot vector for index 0 and 2

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])

In [9]:
X = torch.arange(10).reshape((2, 5)) 
# 2개의 batch로 만든다. 
# 각 batch에 5개의 index가 들어있는 벡터를 만듦
# 각 index에 대해서 28차원의 one_hot encoding

F.one_hot(X.T, 28).shape 
# 5,2,28 차원으로 바뀌었다. 
# 0,1,2,3 ..은 index이고
# index에 따라서

torch.Size([5, 2, 28])

In [20]:
for x in F.one_hot(X.T,28):
    print(x)

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])
tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])
tensor([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])
tensor([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])
tensor([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [15]:
# initialize model parameters

def get_params(vocab_size, num_hiddens, device):
    num_inputs = num_outputs = vocab_size
    
    def normal(shape):
        return torch.randn(size=shape, device=device) * 0.01

    # Hidden Layer parameter, Hidden output 계산
    W_xh = normal((num_inputs, num_hiddens))
    W_hh = normal((num_hiddens, num_hiddens))
    b_h = torch.zeros(num_hiddens, device=device)
    
    # Output Layer parameter, Output 계산
    W_hq = normal((num_hiddens, num_outputs))
    b_q = torch.zeros(num_outputs, device=device)
    
    # Attach gradient. gradient 계산 실행되게끔
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    for param in params:
        param.requires_grad_(True)
    return params


## RNN model

In [48]:
def init_rnn_state(batch_size, num_hidden, device):
    return torch.zeros((batch_size, num_hidden),device=device)

init_state = init_rnn_state(2,28, device='cpu')

In [80]:
def rnn(inputs, state, params):
    '''
    Input shape : (num_steps, batch_size, vocab_size)
    순차적으로(num_steps) 돌아가게 하기 위해서 num_steps가 0쨰 차원에 위치한다. 
    '''
    W_xh, W_hh, b_h, W_hq, b_q = params
    H = state # Hidden state output
    outputs = []
    for X in inputs : # 각 timestep이 돌아가면서 for문 진행
        H = torch.tanh(torch.mm(X, W_xh) + torch.mm(H, W_hh) + b_h)
        Y = torch.mm(H, W_hq) + b_q # 배치 사이즈만큼 Y가 나올 것이다.
        outputs.append(Y) # 
    return torch.cat(outputs, dim=0), (H,)

In [81]:
sample = F.one_hot(X.T, 28)
sample.shape

torch.Size([5, 2, 28])

In [None]:
init_params = get_params(28,28, device='cpu')
rnn(sample, init_state, init_params)[0].shape

In [75]:
class RNNModelScratch:
    
    def __init__(self, vocab_size, num_hiddens, device,
                 get_params, init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn
        
    def __call__(self, X, state): # call을 해줄 때, X랑 state input으로 넣어주면 forwarding 해준다.
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)
    
    def begin_state(self, batch_size, device):
        return self.init_state(batch_size, self.num_hiddens, device)

In [83]:
num_hiddens = 512
net = RNNModelScratch(len(vocab), num_hiddens, 'cpu', get_params,
                      init_rnn_state, rnn)
state = net.begin_state(X.shape[0], device='cpu')
Y, new_state = net(X, state)
Y.shape, len(new_state), new_state[0].shape

(torch.Size([10, 28]), 1, torch.Size([2, 512]))

Prediction

In [88]:
def predict_ch8(prefix, num_preds, net, vocab, device):  #@save
    """Generate new characters following the `prefix`."""
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape((1, 1))
    for y in prefix[1:]:  # Warm-up period
        _, state = net(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # Predict `num_preds` steps
        y, state = net(get_input(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])

In [90]:
predict_ch8('time traveller ', 10, net, vocab, 'cpu')

TypeError: mm(): argument 'input' (position 1) must be Tensor, not tuple