# LSTM, GRU, Language Modeling Task

1. 기존 RNN 과의 차이점
2. 다양한 적용법

## 라이브러리

In [1]:
from tqdm import tqdm
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torch

## 데이터 전처리


In [2]:
# Toy setup: every token id is below vocab_size, and pad_id is the filler id
# appended later to bring all sequences to a common length.
vocab_size = 100
pad_id = 0

# Ten unpadded sequences of token ids; lengths range from 5 to 20.
data = [
    [85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13],
    [62, 76, 79, 66, 32],
    [93, 77, 16, 67, 46, 74, 24, 70],
    [19, 83, 88, 22, 57, 40, 75, 82, 4, 46],
    [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57],
    [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5],
    [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1],
    [94, 21, 79, 24, 3, 86],
    [80, 80, 33, 63, 34, 63],
    [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80],
]

In [3]:
# Record each sequence's true (unpadded) length before any padding is added.
valid_lens = [len(seq) for seq in data]
max_len = max(valid_lens)
print(f"Maximum sequence length : {max_len}")

# Right-pad every shorter sequence with pad_id so all rows share max_len.
for i, seq in enumerate(tqdm(data)):
    if valid_lens[i] == max_len:
        continue  # already full length, nothing to append
    data[i] = seq + [pad_id] * (max_len - valid_lens[i])

100%|██████████| 10/10 [00:00<00:00, 126334.46it/s]

Maximum sequence length : 20





In [4]:
# Shape legend -- B: batch size, L: maximum (padded) sequence length.
batch = torch.tensor(data, dtype=torch.long)  # (B, L)
batch_lens = torch.tensor(valid_lens, dtype=torch.long)  # (B,)

# Reorder rows longest-first; pack_padded_sequence expects that order by default
# (enforce_sorted=True).
batch_lens, sorted_idx = torch.sort(batch_lens, descending=True)
batch = batch.index_select(0, sorted_idx)

print(batch, batch_lens, sep="\n")

tensor([[85, 14, 80, 34, 99, 20, 31, 65, 53, 86,  3, 58, 30,  4, 11,  6, 50, 71,
         74, 13],
        [58, 13, 40, 61, 88, 18, 92, 89,  8, 14, 61, 67, 49, 59, 45, 12, 47,  5,
          0,  0],
        [87, 32, 79, 65,  2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80,
          0,  0],
        [22,  5, 21, 84, 39,  6,  9, 84, 36, 59, 32, 30, 69, 70, 82, 56,  1,  0,
          0,  0],
        [70, 28, 30, 24, 76, 84, 92, 76, 77, 51,  7, 20, 82, 94, 57,  0,  0,  0,
          0,  0],
        [19, 83, 88, 22, 57, 40, 75, 82,  4, 46,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [93, 77, 16, 67, 46, 74, 24, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [94, 21, 79, 24,  3, 86,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [80, 80, 33, 63, 34, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [62, 76, 79, 66, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])

## LSTM 사용

LSTM 에서는 hidden state 외에 cell state 가 추가됩니다.  
Cell state 의 shape 는 hidden state 의 shape 와 동일합니다.

In [5]:
# Model hyper-parameters for the single-layer, unidirectional LSTM demo.
embedding_size = 256
hidden_size = 512
num_layers = 1
num_dirs = 1

# Token-id -> dense vector lookup, followed by the recurrent layer itself.
embedding = nn.Embedding(vocab_size, embedding_size)
lstm = nn.LSTM(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    bidirectional=num_dirs > 1,
)

# Initial hidden state: (num_layers * num_dirs, B, d_h), all zeros.
h_0 = torch.zeros(num_layers * num_dirs, batch.shape[0], hidden_size)

# Initial cell state: same shape as the hidden state, also all zeros.
c_0 = torch.zeros_like(h_0)

In [12]:
# Shape legend -- B: batch size, L: max sequence length, d_w: word embedding size.
batch_emb = embedding(batch)  # (B, L, d_w)

# Pack the length-sorted padded batch into a PackedSequence.
# batch_first=True lets the packer take the (B, L, d_w) layout directly
# instead of transposing to (L, B, d_w) by hand.
packed_batch = pack_padded_sequence(batch_emb, batch_lens, batch_first=True)

# The LSTM returns the packed per-step outputs plus the final (hidden, cell) states.
packed_outputs, (h_n, c_n) = lstm(packed_batch, (h_0, c_0))
print(packed_outputs)
print(packed_outputs.data.shape)
print(h_n.shape)
print(c_n.shape)

PackedSequence(data=tensor([[-0.1417, -0.1051, -0.0269,  ..., -0.0809, -0.0321, -0.0338],
        [ 0.0904, -0.1547, -0.0107,  ...,  0.0723,  0.0023,  0.1199],
        [-0.1141, -0.0416, -0.0819,  ...,  0.0893,  0.0574, -0.0411],
        ...,
        [ 0.0612,  0.0333,  0.2206,  ...,  0.0219,  0.1055, -0.0301],
        [-0.0178, -0.0548, -0.0342,  ...,  0.0118, -0.1112,  0.0629],
        [-0.1250,  0.0660, -0.0872,  ...,  0.0182, -0.1434,  0.0069]],
       grad_fn=<CatBackward>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 512])
torch.Size([1, 10, 512])
torch.Size([1, 10, 512])


**torch.nn.utils.rnn.pack_padded_sequence**
```py
torch.nn.utils.rnn.pack_padded_sequence(input, lengths, batch_first=False, enforce_sorted=True)
```

`input (Tensor)` – padded batch of variable length sequences.

`lengths (Tensor)` – list of sequence lengths of each batch element.

`batch_first (bool, optional)` – if True, the input is expected in B x T x * format.

`enforce_sorted (bool, optional)` – if True, the input is expected to contain sequences sorted by length in a decreasing order. If False, the input will get sorted unconditionally. Default: True.

**torch.nn.utils.rnn.pad_packed_sequence**

```py
torch.nn.utils.rnn.pad_packed_sequence(sequence, batch_first=False, padding_value=0.0, total_length=None)
```

`sequence (PackedSequence)` – batch to pad

`batch_first (bool, optional)` – if True, the output will be in B x T x * format.

`padding_value (float, optional)` – values for padded elements.

`total_length (int, optional)` – if not None, the output will be padded to have length total_length. This method will throw ValueError if total_length is less than the max sequence length in sequence.


In [13]:
# Convert the packed LSTM outputs back into a padded, time-major tensor
# together with the length of each sequence.
outputs, output_lens = pad_packed_sequence(packed_outputs, batch_first=False)
print(outputs.shape, output_lens, sep="\n")

torch.Size([20, 10, 512])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])


## GRU 사용

GRU 는 cell state 가 없어 RNN 과 동일하게 사용할 수 있습니다.   
GRU 를 이용하여 LM task 를 수행해 봅니다.  

In [16]:
# GRU built from the same size/depth/direction settings used for the LSTM.
gru = nn.GRU(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    bidirectional=num_dirs > 1,
)

In [17]:
# Projects a GRU hidden vector (d_h) onto one score per vocabulary entry (V).
output_layer = nn.Linear(in_features=hidden_size, out_features=vocab_size)

In [18]:
# Start generation from the first token of every sequence in the batch.
input_id = batch[:, 0]  # (B,)
# Zero initial hidden state for the GRU.
hidden = torch.zeros(num_layers * num_dirs, batch.shape[0], hidden_size)  # (1, B, d_h)

**Teacher forcing**  

학습 초반에는 모델의 예측이 대부분 틀리기 때문에, 모델이 출력한 결과를 다음 step 의 input 으로 다시 사용하지 않고 실제 정답(ground truth) 데이터를 input 으로 넣어주는 기법입니다.  

아래 코드에서는 Teacher forcing 을 사용하지 않고, 이전 step 에서 얻은 결과를 다음 step 의 input 으로 이용합니다.

In [19]:
# For language modeling the sequence is fed one time step at a time with an
# explicit loop, so each step's prediction can become the next step's input.
for t in range(max_len):
    # Embed the current token ids and add a leading time axis of size 1.
    input_emb = embedding(input_id)[None]  # (1, B, d_w)
    output, hidden = gru(input_emb, hidden)  # output: (1, B, d_h), hidden: (1, B, d_h)

    # V: vocab size. Map the hidden vectors to one score per vocabulary entry.
    output = output_layer(output)  # (1, B, V)
    # Greedy choice. NOTE: no softmax is applied, so `probs` holds the raw
    # maximum scores (logits), not normalized probabilities.
    probs, top_id = output.max(dim=-1)  # probs: (1, B), top_id: (1, B)

    print(
        '*' * 50,
        f"Time step: {t}",
        output.shape,
        probs.shape,
        top_id.shape,
        sep="\n",
    )

    # Drop the time axis; the predicted ids are the input of the next step.
    input_id = top_id[0]  # (B,)

**************************************************
Time step: 0
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 1
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 2
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 3
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 4
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 5
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 6
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 7
torch.Size([1, 10, 100])
torch.Si

`max_len` 만큼의 for 문을 돌면서 모든 결과물의 모양을 확인했지만 만약 종료 조건(예를 들어 문장의 끝을 나타내는 end token 등) 이 되면  
중간에 생성을 그만둘 수도 있습니다.

## 양방향 및 여러 layer 사용

이번엔 양방향 + 2개 이상의 layer 를 쓸 때 얻을 수 있는 결과에 대해 알아봅니다.  

In [22]:
# Switch to a deeper, bidirectional configuration with dropout.
num_layers, num_dirs, dropout = 2, 2, 0.1

# Rebuild the GRU with the new depth, direction and dropout settings.
gru = nn.GRU(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
    bidirectional=num_dirs > 1,
)

Bidirectional 이 되었고 layer 의 개수가 2로 늘었기 때문에 hidden state 의 shape 도 `(4, B, d_h)` 가 됩니다.

In [23]:
# Shape legend -- d_w: word embedding size, num_layers: number of stacked
# layers, num_dirs: number of directions.
batch_emb = embedding(batch)  # (B, L, d_w)
# One initial hidden state per (layer, direction) pair:
# (num_layers * num_dirs, B, d_h) = (4, B, d_h).
h_0 = torch.zeros(num_layers * num_dirs, batch.shape[0], hidden_size)

# batch_first=True lets the packer take the (B, L, d_w) layout directly
# instead of transposing to (L, B, d_w) by hand.
packed_batch = pack_padded_sequence(batch_emb, batch_lens, batch_first=True)

packed_outputs, h_n = gru(packed_batch, h_0)
print(packed_outputs)
print(packed_outputs.data.shape)
print(h_n.shape)

PackedSequence(data=tensor([[-0.0268, -0.0156, -0.0764,  ..., -0.0013, -0.0648,  0.0530],
        [-0.0207,  0.0174, -0.0403,  ...,  0.2451, -0.0301,  0.2327],
        [-0.0364, -0.0895, -0.0029,  ...,  0.2567, -0.0595,  0.0529],
        ...,
        [ 0.1644, -0.0686,  0.0315,  ..., -0.0315,  0.0845,  0.0524],
        [ 0.0934,  0.0764,  0.1168,  ..., -0.1100, -0.1745, -0.0697],
        [-0.0163,  0.1236,  0.1755,  ...,  0.0770,  0.0490,  0.0528]],
       grad_fn=<CatBackward>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 1024])
torch.Size([4, 10, 512])


In [24]:
# Unpack the GRU outputs into a padded, time-major tensor plus sequence lengths.
outputs, output_lens = pad_packed_sequence(packed_outputs, batch_first=False)

# outputs has shape (L, B, num_dirs * d_h).
print(outputs.shape, output_lens, sep="\n")

torch.Size([20, 10, 1024])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])


각각의 결과물의 shape 는 다음과 같습니다.

`outputs : (max_len, batch_size, num_dirs * hidden_size)`  
`h_n : (num_layers * num_dirs, batch_size, hidden_size)`  

In [25]:
batch_size = h_n.size(1)
# Split the fused leading axis of h_n into separate layer and direction axes:
# (num_layers * num_dirs, B, d_h) -> (num_layers, num_dirs, B, d_h).
h_n_split = h_n.view(num_layers, num_dirs, batch_size, hidden_size)
print(h_n_split)
print(h_n_split.shape)

tensor([[[[-0.2623, -0.1769, -0.1118,  ..., -0.0034, -0.1100, -0.0783],
          [ 0.0996,  0.2707, -0.1935,  ...,  0.2685,  0.1765, -0.0115],
          [ 0.3811, -0.2315,  0.1433,  ..., -0.0525,  0.0680, -0.2746],
          ...,
          [-0.0486,  0.0248,  0.0622,  ..., -0.0385,  0.3415, -0.3320],
          [-0.2905,  0.0195, -0.0787,  ..., -0.0494, -0.2308,  0.2295],
          [ 0.0376,  0.1778, -0.1356,  ...,  0.2565, -0.0275, -0.3093]],

         [[-0.3307,  0.1377, -0.1100,  ..., -0.1522, -0.1077,  0.0790],
          [-0.1431, -0.2494,  0.2846,  ...,  0.2209,  0.2795, -0.0268],
          [-0.4334, -0.4231, -0.0604,  ..., -0.2446,  0.2793,  0.0018],
          ...,
          [-0.0895,  0.2337, -0.0922,  ...,  0.1417, -0.1363, -0.0085],
          [-0.1658,  0.1323, -0.4728,  ..., -0.2972,  0.0161,  0.1540],
          [ 0.1088, -0.1732,  0.0303,  ...,  0.0151, -0.3359,  0.1943]]],


        [[[-0.0163,  0.1236,  0.1755,  ..., -0.1768,  0.0082,  0.0333],
          [-0.2539, -0.0231,