# RNN

1. 주어진 데이터를 RNN에 넣을 수 있는 형태로 만든다.
2. 기본적인 RNN 사용법 및 적용법을 익힌다.
3. PackedSequence 의 필요성에 대해 배우고 적용법을 실습한다.

## 라이브러리

In [12]:
from tqdm import tqdm
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from pprint import pprint
import torch

`pack_padded_sequence` : PackedSequence object 를 만든다. 연산량을 줄이기 위해 torch rnn 에 특화된 패딩 객체를 만든다.  
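`pad_packed_sequence` : PackedSequence 를 다시 일반 padded tensor 로 만든다. 기본값(`batch_first=False`)에서는 (max_len, batch_size, hidden_size), `batch_first=True` 를 주면 (batch_size, max_len, hidden_size) 형태가 된다.  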

rnn 에 주입되는 pack sequence 객체는 output 도 같은 객체로 반환되므로, `pad_packed_sequence` 를 사용해서 다시 원래 형태 tensor 로 만들어준다.

## 데이터 전처리

sample data 를 확인해보자.  
전체 단어 수와 pad token 의 id 도 아래와 같다.  

In [2]:
vocab_size = 100  # number of distinct token ids (valid ids: 0 .. vocab_size - 1)
pad_id = 0        # id reserved for the padding token (zero padding)

# Ten variable-length sequences of token ids, used as the sample batch below.
data = [
    [85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13],
    [62, 76, 79, 66, 32],
    [93, 77, 16, 67, 46, 74, 24, 70],
    [19, 83, 88, 22, 57, 40, 75, 82, 4, 46],
    [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57],
    [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5],
    [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1],
    [94, 21, 79, 24, 3, 86],
    [80, 80, 33, 63, 34, 63],
    [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80],
]

Padding 처리를 해주면서 padding 전 길이도 저장한다.

In [9]:
# Length of the longest sequence; every shorter one is right-padded up to it.
max_len = max(map(len, data))
print(f"Maximum sequence length: {max_len}")

# Record each sequence's true (pre-padding) length while padding `data` in place.
valid_lens = []
for row, tokens in enumerate(tqdm(data)):
    n_tokens = len(tokens)
    valid_lens.append(n_tokens)
    shortfall = max_len - n_tokens
    if shortfall > 0:
        data[row] = tokens + [pad_id] * shortfall

100%|██████████| 10/10 [00:00<00:00, 119156.36it/s]

Maximum sequence length: 20





In [13]:
# Show the padded sequences, then the pre-padding length of each one.
pprint(data, width=100)
print('\n',valid_lens) 

[[85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13],
 [62, 76, 79, 66, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [93, 77, 16, 67, 46, 74, 24, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [19, 83, 88, 22, 57, 40, 75, 82, 4, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57, 0, 0, 0, 0, 0],
 [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5, 0, 0],
 [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1, 0, 0, 0],
 [94, 21, 79, 24, 3, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [80, 80, 33, 63, 34, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80, 0, 0]]

 [20, 5, 8, 10, 15, 18, 17, 6, 6, 18]


위 데이터를 하나의 batch 로 만들어 실습에 이용하겠습니다.

In [14]:
# B : batch_size, L : maximum sequence length
# torch.tensor(..., dtype=torch.long) is the documented way to build a tensor from
# existing Python data; torch.LongTensor(...) is the legacy type-specific constructor.
batch = torch.tensor(data, dtype=torch.long)  # (B, L) padded token ids
batch_lens = torch.tensor(valid_lens, dtype=torch.long)  # (B) pre-padding lengths

In [16]:
# Display the padded id tensor together with the per-sequence valid lengths.
batch, batch_lens

(tensor([[85, 14, 80, 34, 99, 20, 31, 65, 53, 86,  3, 58, 30,  4, 11,  6, 50, 71,
          74, 13],
         [62, 76, 79, 66, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0],
         [93, 77, 16, 67, 46, 74, 24, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0],
         [19, 83, 88, 22, 57, 40, 75, 82,  4, 46,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0],
         [70, 28, 30, 24, 76, 84, 92, 76, 77, 51,  7, 20, 82, 94, 57,  0,  0,  0,
           0,  0],
         [58, 13, 40, 61, 88, 18, 92, 89,  8, 14, 61, 67, 49, 59, 45, 12, 47,  5,
           0,  0],
         [22,  5, 21, 84, 39,  6,  9, 84, 36, 59, 32, 30, 69, 70, 82, 56,  1,  0,
           0,  0],
         [94, 21, 79, 24,  3, 86,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0],
         [80, 80, 33, 63, 34, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
           0,  0],
         [87, 32, 79, 65,  2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80,
           0,  0]]),
 tensor([20,  5,  8, 10, 15, 18, 17,  6,  6, 18]))

## RNN

RNN 에 넣기 전 word embedding 을 위한 embedding layer 를 만듭니다.

In [17]:
embedding_size = 256
# Lookup table with one trainable embedding_size-dim vector per token id.
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)

# d_w : embedding size
batch_emb = embedding(batch)  # (B, L) -> (B, L, d_w)

In [18]:
# Display the embedding layer's repr (num_embeddings, embedding_dim).
embedding

Embedding(100, 256)

아래와 같이 RNN 모델 및 초기 hidden state 를 정의합니다.

In [19]:
hidden_size = 512  # d_h: size of the RNN hidden state
num_layers = 1     # number of stacked RNN layers
num_dirs = 1       # 1: unidirectional RNN, 2: bidirectional RNN

rnn = nn.RNN(
    input_size=embedding_size,   # 256
    hidden_size=hidden_size,     # 512
    num_layers=num_layers,       # a single layer
    bidirectional=num_dirs > 1,  # the comparison already yields a bool
)

# Initial hidden state: all zeros, one slice per (layer, direction) pair.
h_0 = torch.zeros((num_layers * num_dirs, batch.shape[0], hidden_size))  # (num_layers * num_dirs, B, d_h)

In [20]:
# Display the RNN module's repr (input_size, hidden_size).
rnn

RNN(256, 512)

In [22]:
h_0.size() # shape of the initial hidden state

torch.Size([1, 10, 512])

RNN에 batch data 를 넣으면 아래와 같이 2가지 output을 얻습니다. (Keras 의 `return_sequences` 옵션으로 고르던 두 출력을 PyTorch 는 항상 함께 반환합니다.)    
* `hidden_states` : 각 time step에 해당하는 hidden state 들의 묶음
* `h_n` : 모든 sequence 를 거치고 나온 마지막 hidden state

In [25]:
# The RNN above is built without batch_first, so it takes time-major input
# (length, batch_size, word_embedding) rather than (batch_size, length, word_embedding).
# Passing batch_first=True when constructing the RNN would make the swap unnecessary.
hidden_states, h_n = rnn(
    batch_emb.transpose(0, 1),  # (B, L, d_w) -> (L, B, d_w)
    h_0,
)

# d_h : hidden size, num_layers : number of layers, num_dirs : number of directions
print(hidden_states.shape)  # (L, B, d_h)
print(h_n.shape)  # (num_layers * num_dirs, B, d_h) = (1, B, d_h)

torch.Size([20, 10, 512])
torch.Size([1, 10, 512])


## RNN 활용법

마지막 hidden state 를 이용하여 text classification task 에 적용할 수 있다.

In [26]:
num_classes = 2
classification_layer = nn.Linear(hidden_size, num_classes)

# h_n is (num_layers * num_dirs, B, d_h). Indexing the last slice always gives
# (B, d_h); squeeze(0) only drops the leading axis when its size is 1, so with
# stacked layers it would silently leave a 3-D tensor.
# NOTE(review): for a bidirectional RNN the last slice covers one direction only;
# using both directions would also need a wider Linear layer - confirm if needed.
# C : number of classes
output = classification_layer(h_n[-1])  # (B, d_h) => (B, C)
print(output.shape)

torch.Size([10, 2])


각 time step 에 대한 hidden state 를 이용하여 token-level 의 task를 수행할 수도 있다.

In [28]:
num_classes = 5
entity_layer = nn.Linear(in_features=hidden_size, out_features=num_classes)

# C : number of classes
# The linear layer is applied along the last axis, so each time step of each
# sequence gets its own C-way score vector.
output = entity_layer(hidden_states)  # (L, B, d_h) => (L, B, C)
print(output.shape)  # the output keeps one entry per time step

torch.Size([20, 10, 5])


# PackedSequence

## PackedSequence 사용법

앞서 padding 처리했던 데이터를 확인해보자

In [31]:
import pandas as pd

In [30]:
# Display the sequences after in-place padding (every row now has max_len entries).
data

[[85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13],
 [62, 76, 79, 66, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [93, 77, 16, 67, 46, 74, 24, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [19, 83, 88, 22, 57, 40, 75, 82, 4, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57, 0, 0, 0, 0, 0],
 [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5, 0, 0],
 [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1, 0, 0, 0],
 [94, 21, 79, 24, 3, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [80, 80, 33, 63, 34, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80, 0, 0]]

In [51]:
# Rendered as a table, the padded batch shows how many cells are pad tokens
# that an RNN pass over the padded batch would still compute over.
pd.DataFrame(data).style.background_gradient(cmap='RdBu', vmin=0, vmax=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,85,14,80,34,99,20,31,65,53,86,3,58,30,4,11,6,50,71,74,13
1,62,76,79,66,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,93,77,16,67,46,74,24,70,0,0,0,0,0,0,0,0,0,0,0,0
3,19,83,88,22,57,40,75,82,4,46,0,0,0,0,0,0,0,0,0,0
4,70,28,30,24,76,84,92,76,77,51,7,20,82,94,57,0,0,0,0,0
5,58,13,40,61,88,18,92,89,8,14,61,67,49,59,45,12,47,5,0,0
6,22,5,21,84,39,6,9,84,36,59,32,30,69,70,82,56,1,0,0,0
7,94,21,79,24,3,86,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,80,80,33,63,34,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,87,32,79,65,2,96,43,80,85,20,41,52,95,50,35,96,24,80,0,0


데이터를 padding 전 원래 길이 기준으로 정렬한다.

In [52]:
# Order the batch from the longest to the shortest pre-padding length;
# sorted_idx holds the original row index of each sorted row.
sorted_lens, sorted_idx = batch_lens.sort(descending=True)
sorted_batch = batch[sorted_idx]  # (B, L), rows reordered to match sorted_lens

print(sorted_batch, sorted_lens, sep='\n')

tensor([[85, 14, 80, 34, 99, 20, 31, 65, 53, 86,  3, 58, 30,  4, 11,  6, 50, 71,
         74, 13],
        [58, 13, 40, 61, 88, 18, 92, 89,  8, 14, 61, 67, 49, 59, 45, 12, 47,  5,
          0,  0],
        [87, 32, 79, 65,  2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80,
          0,  0],
        [22,  5, 21, 84, 39,  6,  9, 84, 36, 59, 32, 30, 69, 70, 82, 56,  1,  0,
          0,  0],
        [70, 28, 30, 24, 76, 84, 92, 76, 77, 51,  7, 20, 82, 94, 57,  0,  0,  0,
          0,  0],
        [19, 83, 88, 22, 57, 40, 75, 82,  4, 46,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [93, 77, 16, 67, 46, 74, 24, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [94, 21, 79, 24,  3, 86,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [80, 80, 33, 63, 34, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [62, 76, 79, 66, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])

아래와 같은 padding 무시 효과를 얻을 수 있다.  
RNN 에 input 을 packed sequence 라는 객체로 만들어 넣어주면, 아래에서 0 에 해당하는 영역은 자동으로 계산하지 않게 된다.  

In [54]:
# Same table for the length-sorted batch; the zero cells are the pad positions.
pd.DataFrame(sorted_batch.tolist()).style.background_gradient(cmap='RdBu', vmin=0, vmax=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,85,14,80,34,99,20,31,65,53,86,3,58,30,4,11,6,50,71,74,13
1,58,13,40,61,88,18,92,89,8,14,61,67,49,59,45,12,47,5,0,0
2,87,32,79,65,2,96,43,80,85,20,41,52,95,50,35,96,24,80,0,0
3,22,5,21,84,39,6,9,84,36,59,32,30,69,70,82,56,1,0,0,0
4,70,28,30,24,76,84,92,76,77,51,7,20,82,94,57,0,0,0,0,0
5,19,83,88,22,57,40,75,82,4,46,0,0,0,0,0,0,0,0,0,0
6,93,77,16,67,46,74,24,70,0,0,0,0,0,0,0,0,0,0,0,0
7,94,21,79,24,3,86,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,80,80,33,63,34,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,62,76,79,66,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


`pack_padded_sequence` 를 이용하여 PackedSequence object 를 생성한다.

In [55]:
# Embed the length-sorted batch: (B, L) -> (B, L, d_w)
sorted_batch_emb = embedding(sorted_batch)
# Build the PackedSequence object from time-major (L, B, d_w) input and the
# descending pre-padding lengths.
packed_batch = pack_padded_sequence(
    sorted_batch_emb.transpose(0, 1),
    lengths=sorted_lens,
)

print(packed_batch)
print(packed_batch.data.shape)  # field 0 of the PackedSequence: the packed embeddings

PackedSequence(data=tensor([[-0.6163,  0.9768,  0.2852,  ...,  0.1052,  1.3940, -1.2653],
        [ 1.7534, -0.9487, -0.0567,  ..., -0.6138, -0.9746,  2.8041],
        [-0.0942,  2.2334,  0.4825,  ..., -0.2707,  1.2072, -0.7752],
        ...,
        [-0.2854,  0.4955,  0.2762,  ..., -1.4020, -1.2778, -1.0994],
        [ 0.7942, -0.2181, -0.6577,  ..., -0.6027,  1.7972, -0.3280],
        [-0.4974, -1.5743, -1.5898,  ...,  0.5070, -0.4136, -0.9620]],
       grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 256])


In [56]:
# Feeding a PackedSequence to the RNN returns the hidden states as a
# PackedSequence as well.
packed_outputs, h_n = rnn(packed_batch, h_0)

print(packed_outputs)
print(packed_outputs.data.shape)  # field 0 of the PackedSequence: the packed hidden states
print(h_n.shape)

PackedSequence(data=tensor([[ 0.1980, -0.6953, -0.2958,  ..., -0.2694, -0.4174,  0.5128],
        [-0.3357, -0.3658, -0.2823,  ..., -0.1293, -0.4792, -0.4755],
        [-0.4251, -0.3122,  0.6325,  ...,  0.3370, -0.2857,  0.6781],
        ...,
        [-0.2350,  0.8144, -0.3755,  ..., -0.0812, -0.5510, -0.1808],
        [ 0.6373,  0.6065, -0.0921,  ...,  0.2328,  0.1297, -0.6605],
        [ 0.0611,  0.3073,  0.4836,  ...,  0.1780, -0.6562, -0.3389]],
       grad_fn=<CatBackward>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 512])
torch.Size([1, 10, 512])


`packed_outputs` 는 PackedSequence 이므로 원래 output 형태와 다르다.  
이를 다시 원래 형태로 바꿔주기 위해 `pad_packed_sequence` 를 이용한다.

In [59]:
# Unpack the PackedSequence into a padded tensor plus the length of each sequence.
# The batch axis keeps the length-sorted order that was used when packing.
outputs, outputs_lens = pad_packed_sequence(packed_outputs, batch_first=False)

print(outputs.shape)  # (L, B, d_h)
print(outputs_lens)

torch.Size([20, 10, 512])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])
