# RNN과 LSTM을 이용한 Language Modeling 실습


RNN, LSTM 같은 Recurrent Model을 활용한 단어 단위의 Language Modeling을 직접 실습해본다.

※ 실행 환경: colab

## 1. 데이터 전처리 및 데이터셋 구성



### 1.1. torchdata 설치

In [None]:
!pip install folium==0.2.1

In [None]:
!pip install torchdata==0.4.0

### 1.2. WikiText-2 데이터 불러오기

In [None]:
from torchtext.datasets import WikiText2

In [None]:
# Load the three WikiText-2 splits.
# NOTE: the name `train` is rebound to the training function defined in section 3.1;
# once that cell has run, `train` no longer refers to this dataset split.
train, valid, test = WikiText2()

### 1.3. Tokenization & Build Vocab

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [None]:
# Tokenize with torchtext's 'basic_english' tokenizer and build the vocabulary from the
# training split only. min_freq=3 sets the minimum token frequency for inclusion, and
# '<unk>' is registered as a special token.
en_tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(en_tokenizer, train), min_freq=3, specials=['<unk>'])

In [None]:
# Show the id assigned to '<unk>' and the number of entries in the vocabulary.
vocab['<unk>'], len(vocab.get_stoi())

(0, 28782)

In [None]:
# Lookup tables used by encode()/decode(): token2id is queried with .get(token, ...),
# id2token is indexed by token id.
token2id = vocab.get_stoi()
id2token = vocab.get_itos()

### 1.4. Encoding, Decoding

In [None]:
import torch

In [None]:
def encode(data, token2id, tokenizer):
    """Convert an iterable of raw text lines into one flat tensor of token ids.

    Args:
        data: iterable of strings (one entry per line of text).
        token2id: mapping from token string to integer id; must contain '<unk>'.
        tokenizer: callable that splits one string into an iterable of tokens.

    Returns:
        A 1-D ``torch.long`` tensor holding the ids of every token of every
        line, in order. Tokens missing from ``token2id`` are mapped to the
        '<unk>' id. The tensor is empty when ``data`` yields no tokens.
    """
    unk_id = token2id['<unk>']

    # Collect all ids first and build a single tensor: this avoids allocating
    # one small tensor per line and also works when `data` is empty
    # (torch.cat on an empty list raises).
    ids = []
    for line in data:
        ids.extend(token2id.get(token, unk_id) for token in tokenizer(line))

    return torch.tensor(ids, dtype=torch.long)

def decode(id_sequence, id2token):
    """Turn a sequence of token ids back into text.

    Args:
        id_sequence: iterable of token ids.
        id2token: sequence indexed by token id that yields the token string.

    Returns:
        The tokens joined by single spaces (an empty string for no ids).
    """
    tokens = (id2token[token_id] for token_id in id_sequence)
    return " ".join(tokens)

In [None]:
# Encode every split with the vocabulary built from the training data;
# encode() maps tokens that are not in token2id to the '<unk>' id.
train_seq = encode(train, token2id, en_tokenizer)
valid_seq = encode(valid, token2id, en_tokenizer)
test_seq = encode(test, token2id, en_tokenizer)

In [None]:
# Peek at the first ten token ids of each encoded split.
print(train_seq[:10])
print(valid_seq[:10])
print(test_seq[:10])

tensor([    9,  3849,  3869,   881,     9, 20000,    83,  3849,    88,     0])
tensor([    9,  9606, 25610,     9,  9606, 25610,     2,   123,    14,     1])
tensor([  9, 632,   0,   9, 632,   0,  23,  30, 332, 103])


In [None]:
# Total number of token ids in each encoded split.
train_seq.size(), valid_seq.size(), test_seq.size()

(torch.Size([2049990]), torch.Size([214417]), torch.Size([241859]))

### 1.5. Batch 구성

In [None]:
def batchfy(data, batch_size, seq_len):
    """Reshape a flat token-id tensor into consecutive mini-batches.

    The stream is cut into ``batch_size`` contiguous rows, and each row is
    sliced into chunks of ``seq_len`` tokens. Row ``b`` of batch ``i + 1``
    therefore continues the text of row ``b`` of batch ``i``, which is what
    lets the training loop carry the hidden state from one batch to the next.

    Args:
        data: 1-D tensor of token ids.
        batch_size: number of parallel rows per batch.
        seq_len: number of tokens per row in each batch.

    Returns:
        Tensor of shape ``(num_batches, batch_size, seq_len)``. Trailing
        tokens that do not fill a whole batch are dropped; if ``data`` is
        shorter than one batch the result has ``num_batches == 0``.
    """
    num_batches = data.size(0) // (batch_size * seq_len)
    usable = data[:num_batches * batch_size * seq_len]
    # Pass the batch count explicitly instead of -1: view() cannot infer a -1
    # dimension for a 0-element tensor, so short inputs used to raise here.
    return usable.view(batch_size, num_batches, seq_len).transpose(0, 1)

In [None]:
# Batching hyperparameters passed to batchfy(): rows per batch and tokens per row.
batch_size = 128
seq_len = 64

In [None]:
# Cut each encoded split into batches; leftover tokens that do not fill a
# whole batch are dropped by batchfy().
train_batch = batchfy(train_seq, batch_size, seq_len)
valid_batch = batchfy(valid_seq, batch_size, seq_len)
test_batch = batchfy(test_seq, batch_size, seq_len)

In [None]:
# Each shape is (num_batches, batch_size, seq_len).
train_batch.size(), valid_batch.size(), test_batch.size()

(torch.Size([250, 128, 64]),
 torch.Size([26, 128, 64]),
 torch.Size([29, 128, 64]))

## 2. 모델 구현하기

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

### 2.1. RNN 모델

* 하이퍼 파라미터
    * vocab_size: vocab 크기
    * emb_dim: embedding_dimension
    * hidden_dim: hidden-state vector dimension
    * num_layers: RNN 은닉층의 수
    * dropout: dropout 비율

In [None]:
class MyRNN(nn.Module):
    """Word-level language model: embedding -> stacked ReLU RNN -> linear -> log-softmax.

    Args:
        vocab_size: number of entries in the vocabulary (also the output size).
        emb_dim: dimension of the token embeddings.
        hidden_dim: dimension of the RNN hidden state.
        num_layers: number of stacked RNN layers.
        dropout: dropout probability used between stacked RNN layers.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers=2, dropout=0.5):
        super().__init__()
        # Read by train()/evaluate() to decide how the hidden state is handled.
        self.model_type = 'RNN'

        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        # NOTE: this module is registered but forward() does not apply it;
        # only the inter-layer dropout inside nn.RNN is active.
        self.dropout = nn.Dropout(dropout)
        self.rnn = nn.RNN(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            nonlinearity='relu',
            batch_first=True,
            # nn.RNN applies dropout only between stacked layers and warns
            # when a non-zero value is given for a single-layer network.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.output = nn.Linear(hidden_dim, vocab_size)

        self.init_weights()

    def forward(self, x, h):
        '''
        Compute next-token log-probabilities for every position.

        x: (batch_size, seq_len) token ids
        h: (num_layers, batch_size, hidden_dim) initial hidden state

        Returns (log_probs, h_n) with
        log_probs: (batch_size, seq_len, vocab_size)
        h_n: (num_layers, batch_size, hidden_dim)
        '''

        x = self.embedding(x) # (batch_size, seq_len, emb_dim)
        out, h_n = self.rnn(x, h) # (batch_size, seq_len, hidden_dim), (num_layers, batch_size, hidden_dim)
        out = F.log_softmax(self.output(out), dim=-1) # (batch_size, seq_len, vocab_size)
        return out, h_n

    def init_weights(self):
        """Re-initialize every parameter from U(-1/sqrt(hidden_dim), 1/sqrt(hidden_dim))."""
        bound = self.hidden_dim ** -0.5
        for param in self.parameters():
            # nn.init functions run under no_grad, so no `.data` access is needed.
            nn.init.uniform_(param, -bound, bound)

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape (num_layers, batch_size, hidden_dim).

        The tensor is created with the dtype and on the device of the model
        parameters, so it can be fed to forward() without an extra transfer.
        """
        ref = next(self.parameters())
        return torch.zeros(
            (self.num_layers, batch_size, self.hidden_dim),
            dtype=ref.dtype,
            device=ref.device,
        )

### 2.2. LSTM 모델

* 하이퍼 파라미터
    * vocab_size: vocab 크기
    * emb_dim: embedding_dimension
    * hidden_dim: hidden-state vector dimension
    * num_layers: LSTM 은닉층의 수
    * dropout: dropout 비율

In [None]:
class MyLSTM(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear -> log-softmax.

    Args:
        vocab_size: number of entries in the vocabulary (also the output size).
        emb_dim: dimension of the token embeddings.
        hidden_dim: dimension of the LSTM hidden and cell states.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability used between stacked LSTM layers.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers=2, dropout=0.5):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        # Read by train()/evaluate() to decide how the hidden state is handled.
        self.model_type = 'LSTM'

        self.embedding = nn.Embedding(vocab_size, emb_dim)

        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM applies dropout only between stacked layers and warns
            # when a non-zero value is given for a single-layer network.
            dropout=dropout if num_layers > 1 else 0.0,
        )

        # NOTE: this module is registered but forward() does not apply it;
        # only the inter-layer dropout inside nn.LSTM is active.
        self.dropout = nn.Dropout(dropout)
        self.output = nn.Linear(hidden_dim, vocab_size)

        self.init_param()

    def init_param(self):
        """Re-initialize every parameter from U(-1/sqrt(hidden_dim), 1/sqrt(hidden_dim))."""
        bound = self.hidden_dim ** -0.5
        for param in self.parameters():
            # nn.init functions run under no_grad, so no `.data` access is needed.
            nn.init.uniform_(param, -bound, bound)

    def init_hidden(self, batch_size):
        """Return all-zero (h_0, c_0), each of shape (num_layers, batch_size, hidden_dim).

        Both tensors are created with the dtype and on the device of the model
        parameters, so they can be fed to forward() without an extra transfer.
        """
        ref = next(self.parameters())
        shape = (self.num_layers, batch_size, self.hidden_dim)
        h_0 = torch.zeros(shape, dtype=ref.dtype, device=ref.device)
        c_0 = torch.zeros(shape, dtype=ref.dtype, device=ref.device)
        return h_0, c_0

    def forward(self, x, hidden):
        """Compute next-token log-probabilities for every position.

        Args:
            x: (batch_size, seq_len) token ids.
            hidden: tuple (h_0, c_0), each (num_layers, batch_size, hidden_dim).

        Returns:
            (log_prob, (next_h, next_c)) where log_prob has shape
            (batch_size, seq_len, vocab_size) and the states keep the shape
            of the inputs.
        """
        x = self.embedding(x)

        out, (next_h, next_c) = self.lstm(x, hidden)
        out = self.output(out)
        log_prob = F.log_softmax(out, dim=-1)

        return log_prob, (next_h, next_c)

## 3. 모델 학습

### 3.1. train

* 학습 함수

In [None]:
from tqdm.notebook import tqdm

In [None]:
def train(model, optimizer, data, device):
    """Run one training epoch over pre-batched token ids.

    Args:
        model: language model exposing `model_type`, `init_hidden(batch_size)`
            and `forward(x, hidden) -> (log_probs, next_hidden)`.
        optimizer: optimizer over the model parameters.
        data: tensor of shape (num_batches, batch_size, seq_len).
        device: device on which the model and the batches are placed.

    The hidden state is carried from one batch to the next, and the running
    average loss is shown on the progress bar. Nothing is returned.
    """
    model.train()
    model.to(device)

    is_lstm = model.model_type == 'LSTM'
    running_loss = 0.
    progress = tqdm(data, desc='train')

    state = model.init_hidden(data.size()[1])

    for step, batch in enumerate(progress, start=1):
        batch = batch.to(device)

        # LSTM state is an (h, c) tuple; the plain RNN state is a single tensor.
        state = tuple(part.to(device) for part in state) if is_lstm else state.to(device)

        log_probs, new_state = model(batch, state)  # log_probs: (batch_size, seq_len, vocab_size)

        # Detach the carried state so the graph does not grow across batches;
        # without it backward() raises a RuntimeError about going through the
        # graph a second time.
        state = tuple(part.detach() for part in new_state) if is_lstm else new_state.detach()

        # Next-word prediction: drop the output of the last time step and
        # compare against the input shifted by one position.
        # nll_loss expects input (batch_size, num_class, d1, ...) and target
        # (batch_size, d1, ...), hence the transpose.
        predictions = log_probs[:, :-1, :].transpose(1, 2)
        targets = batch[:, 1:]
        cost = F.nll_loss(predictions, targets)

        running_loss += cost.item()

        optimizer.zero_grad()
        cost.backward()
        optimizer.step()

        progress.set_description(f"Train-loss:{running_loss / step:.4f}")

### 3.2. evaluate 함수

* 검증∙평가 함수

In [None]:
def evaluate(model, data, device, mode='valid'):
    """Compute the average loss of `model` over pre-batched token ids.

    Args:
        model: language model exposing `model_type`, `init_hidden(batch_size)`
            and `forward(x, hidden) -> (log_probs, next_hidden)`.
        data: tensor of shape (num_batches, batch_size, seq_len).
        device: device on which the model and the batches are placed.
        mode: label used for the progress bar ('valid', 'test', ...).

    Returns:
        The mean of the per-batch losses, or 0.0 if `data` has no batches.
    """
    model.eval()
    model.to(device)

    is_lstm = model.model_type == 'LSTM'
    progress = tqdm(data, desc=mode)
    state = model.init_hidden(data.size()[1])

    loss_sum = 0.
    mean_loss = 0.

    for step, batch in enumerate(progress, start=1):
        with torch.no_grad():
            batch = batch.to(device)

            # LSTM state is an (h, c) tuple; the plain RNN state is a single tensor.
            if is_lstm:
                state = tuple(part.to(device) for part in state)
            else:
                state = state.to(device)

            log_probs, new_state = model(batch, state)  # log_probs: (batch_size, seq_len, vocab_size)

            # Carry the state over to the next batch, mirroring train().
            if is_lstm:
                state = tuple(part.detach() for part in new_state)
            else:
                state = new_state.detach()

            # Next-word prediction: drop the output of the last time step and
            # compare against the input shifted by one position.
            # nll_loss expects input (batch_size, num_class, d1, ...) and
            # target (batch_size, d1, ...), hence the transpose.
            loss = F.nll_loss(log_probs[:, :-1, :].transpose(1, 2), batch[:, 1:])

        loss_sum += loss.item()
        mean_loss = loss_sum / step
        progress.set_description(f"{mode}-loss:{mean_loss:.4f}")

    return mean_loss

### 3.3. 학습

In [None]:
# Model hyperparameters; the vocabulary size is the length of the id -> token table.
vocab_size = len(id2token)
emb_size = 256
hidden_dim = 256
num_epoch = 2

# train() and evaluate() branch on model.model_type, so they accept either model class.
model = MyLSTM(vocab_size, emb_size, hidden_dim)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = optim.Adam(model.parameters(),lr=1e-3)

# One pass over the training batches per epoch, followed by a validation pass.
for epoch in range(num_epoch):
    train(model, optimizer, train_batch, device)
    val_loss = evaluate(model, valid_batch, device, 'valid')
    print("="*60)
    print(f"END OF EPOCH:{epoch+1} | VALID LOSS: {val_loss:.4f}")
    print("="*60)

train:   0%|          | 0/250 [00:00<?, ?it/s]

valid:   0%|          | 0/26 [00:00<?, ?it/s]

END OF EPOCH:1 | VALID LOSS: 6.7639


train:   0%|          | 0/250 [00:00<?, ?it/s]

valid:   0%|          | 0/26 [00:00<?, ?it/s]

END OF EPOCH:2 | VALID LOSS: 6.7501


### 3.4. TEST

In [None]:
# Final evaluation on the held-out test batches with the trained model.
print("="*60)
test_loss = evaluate(model, test_batch, device, 'test')
print(f"END OF TEST | TEST LOSS: {test_loss:.4f}")
print("="*60)



test:   0%|          | 0/29 [00:00<?, ?it/s]

END OF TEST | TEST LOSS: 5.0599


## 4. 문장 생성

지금까지 학습한 모델을 바탕으로 1000개의 단어를 생성하는 작업을 해볼 것이다.

첫 시작 단어는 random sampling한다.

```
생성된 문장 예시

it was partially filmed for his prestigious diplomatic kingdoms , began with christie ' s performances ' exploration of piedras lopez , which herders was stopped behind jerkins .

he wrote at walking from the great new flight of constance informer and jan francis von general henry i
```

In [None]:
# Sample 1000 tokens from the trained model, starting from a random token,
# and write the decoded text to output.txt.

# Sampling must run in eval mode (dropout off), even if this cell is executed
# without a preceding evaluate() call.
model.eval()
model.to(device)

# We start from a single token, so the batch size is 1.
hidden = model.init_hidden(1)
# LSTM state is an (h, c) tuple; the plain RNN state is a single tensor.
if model.model_type == 'LSTM':
    hidden = tuple(state.to(device) for state in hidden)
else:
    hidden = hidden.to(device)

# Random first token, shape (batch_size, seq_len) = (1, 1); low=1 skips id 0 ('<unk>').
# Named `token` rather than `input` so the builtin input() is not shadowed.
token = torch.randint(low=1, high=len(id2token), size=(1, 1)).to(device)

id_sequence = [token.item()]
with torch.no_grad():
    for _ in tqdm(range(1000), desc='sentence generating'):
        out, hidden = model(token, hidden)
        # The model returns log-probabilities, so exp() recovers the probabilities.
        weights = out.squeeze().exp()
        # Sample the next token id according to the predicted distribution.
        token_id = torch.multinomial(weights, 1)
        token = token_id.unsqueeze(0)  # back to shape (1, 1) for the next step
        id_sequence.append(token_id.item())

# Explicit encoding: the vocabulary may contain non-ASCII tokens and the
# platform default encoding is not guaranteed to handle them.
with open('output.txt', "w", encoding="utf-8") as f:
    f.write(decode(id_sequence, id2token))

sentence generating:   0%|          | 0/1000 [00:00<?, ?it/s]