In [1]:
import math
import re
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

## BERT config

In [2]:
# BERT Parameters
# 버트 사용 파라미터
maxlen = 30 # 토큰의 최대길이
batch_size = 6 # 배치 사이즈
max_pred = 5 # max tokens of prediction # 예측 시, 최대 토큰 수
n_layers = 6 # 버트 레이어 숫자
n_heads = 12 # 인코더 내 멀티 헤드의 숫자
d_model = 768 # 모델의 길이??
d_ff = 768*4 # 4*d_model, FeedForward dimension # feed forward 시 4배 확장
d_k = d_v = 64  # dimension of K(=Q), V # Q, K, V에서 k와 v의 차원
n_segments = 2 # 문장 세그멘트 수?

## Bert 학습 데이터

In [3]:
text = (
    'Hello, how are you? I am Romeo.\n'
    'Hello, Romeo My name is Juliet. Nice to meet you.\n'
    'Nice meet you too. How are you today?\n'
    'Great. My baseball team won the competition.\n'
    'Oh Congratulations, Juliet\n'
    'Thanks you Romeo'
)

## 데이터 전처리
'.', ',', '?', '!' 값들 ''로 변경.
text 데이터들은 \n값으로 구분

In [4]:
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n') # filter '.', ',', '?', '!'
print(sentences)

['hello how are you i am romeo', 'hello romeo my name is juliet nice to meet you', 'nice meet you too how are you today', 'great my baseball team won the competition', 'oh congratulations juliet', 'thanks you romeo']


## Vocab 생성

In [5]:
word_list = list(set(" ".join(sentences).split()))
word_dict = {'[PAD]' : 0, '[CLS]' : 1, '[SEP]' : 2, '[MASK]' : 3}
for i, w in enumerate(word_list):
    word_dict[w] = i + 4
number_dict = {i: w for i, w in enumerate(word_dict)}
vocab_size = len(word_dict)

## Sentece 단위 토큰화

In [15]:
token_list = list()
for sentence in sentences:
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)
    print(str(sentence)+' -> '+str(arr))

hello how are you i am romeo -> [7, 17, 13, 6, 4, 9, 25]
hello romeo my name is juliet nice to meet you -> [7, 25, 20, 21, 5, 28, 22, 10, 27, 6]
nice meet you too how are you today -> [22, 27, 6, 26, 17, 13, 6, 12]
great my baseball team won the competition -> [15, 20, 24, 8, 14, 16, 23]
oh congratulations juliet -> [11, 19, 28]
thanks you romeo -> [18, 6, 25]


## batch 데이터 생성

In [18]:
# sample IsNext and NotNext to be same in small batch size
def make_batch():
    batch = []
    positive = negative = 0
    while positive != batch_size/2 or negative != batch_size/2:
        # 1. 학습 데이터들 내에서 랜덤으로 index 추출.
        # NSP(Next Sentence Prediction)에 사용하기 위해
        tokens_a_index, tokens_b_index= randrange(len(sentences)), randrange(len(sentences)) # sample random index in sentences
        
        # 2. 위에서 생성된 랜덤 인덱스들에 대해 A, B로 문장을 나눈다.
        tokens_a, tokens_b= token_list[tokens_a_index], token_list[tokens_b_index]
        
        # 3. [CLS] Sentence A [SEP] Sentence B [SEP] 로 구성된 토큰생성
        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]
        
        # 4.  [CLS] Sentence A [SEP] Sentence B [SEP]에 대해 첫번째 문장인지, 두번째 문장인지 token id를  0과 1로 구분
        # 첫째문장의 경우 0, 두번째 분장의 경우 1
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        # MASK LM(Masked Language Model)
        
        # M LM 태스트 에서 마스킹 대상 토큰
        # min(최대 출력 값과, 현재 입력  * 0.15)
        n_pred =  min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15 % of tokens in one sentence
        
        # 토큰화된 입력에서 [CLS]와 [SEP]을 제외한 후보들 추출
        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
        # 후보 토큰들 섞음
        shuffle(cand_maked_pos)
        
        
        masked_tokens, masked_pos = [], []
        # 후보 토큰 들중 n_pred개 Masking
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            # 전체 문장의 15%의 토큰 중 80%는 정상 마스킹
            if random() < 0.8:  # 80%
                input_ids[pos] = word_dict['[MASK]'] # make mask
            # 15%의 10%는 랜덤 단어 마스킹
            elif random() < 0.5:  # 10%
                index = randint(0, vocab_size - 1) # random index in vocabulary
                input_ids[pos] = word_dict[number_dict[index]] # replace

        # Zero Paddings
        # 현재 설정된 입력 토큰 갯수(maxlen)= 30
        # input_ids 이후 나머지들은 [0]으로 zero-padding
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

        # Zero Padding (100% - 15%) tokens
        # 예측 및 출력시 최대 가능한 토큰은 max_pred
        # n_pred가 출력시 최대 토큰 수 보다 작다면 [0]으로 패딩 
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)
            
        # tokenA 인덱스 +1이 tokenB 인덱스 값과 같다면 NSP에서 다음 문장으로 NSP 라벨 True
        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
            positive += 1
            
        # tokenA 인덱스 +1이 tokenB 인덱스 값과 같지 않다면 NSP 값 False
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
            negative += 1
    # batch 값
    # (배치 사이즈, input_ids, segment_ids, masked_tokens, masked_pos, IsNext)
    return batch

## Attention Pad Mask

In [25]:
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k


## Gelu

In [26]:
def gelu(x):
    "Implementation of the gelu activation function by Hugging Face"
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

## BERT Input Embedding

In [27]:
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = nn.Embedding(maxlen, d_model)  # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)
        
    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (seq_len,) -> (batch_size, seq_len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)


## Scaled Dot Product Attention

In [28]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return context, attn


## Multi Head Attention

In [29]:

class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)
        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # q_s: [batch_size x n_heads x len_q x d_k]
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  # k_s: [batch_size x n_heads x len_k x d_k]
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)  # v_s: [batch_size x n_heads x len_k x d_v]

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]


## Position Wise Feed Forward Network

In [30]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)
        return self.fc2(gelu(self.fc1(x)))

## Encoder 조립
앞서 작성한 scaledDotProductAttention, MultiHeadAttention, PoswiseFeedForwardNet을 이용해서 하나의 Encoder Block을 조립한다.

In [40]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.multiHeadAttention = MultiHeadAttention()
        self.poswiseFeedForwardNet = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.multiHeadAttention(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.poswiseFeedForwardNet(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn


## Pre-training을 위한 BERT 모델

In [36]:
class BERTforPreTraining(nn.Module):
    def __init__(self):
        super(BERTforPreTraining, self).__init__()
        
        # BERT의 입력으로 사용할 Embedding
        self.embedding = Embedding() 
        # n_layers의 수만큼 Encoder블록을 생성
        self.layers = nn.ModuleList([Encoder() for _ in range(n_layers)])
        
        # NSP의 IsNext,NoNext을 위한 Classification을 위한 Layer
        self.fc = nn.Linear(d_model, d_model)
        self.activ1 = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.activ2 = gelu
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        
        # decoder is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)

        # embedding weight 값을 decoder의 weight 값에 설정
        self.decoder.weight = embed_weight 
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]
        # it will be decided by first token(CLS)
        h_pooled = self.activ1(self.fc(output[:, 0])) # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled) # [batch_size, 2]

        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
        # get masked position from final output of transformer.
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked = self.norm(self.activ2(self.linear(h_masked)))
        
        # Masked Language Model의 Logit 
        logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]

        return logits_lm, logits_clsf

# BERT Pre-training 학습
앞서 작성한 모델들을 이용해 BERT pre-training 과정 학습

In [42]:
model = BERTforPreTraining()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 데이터 생성
batch = make_batch()

# 데이터 torch 텐서로 변경
input_ids, segment_ids, masked_tokens, masked_pos, isNext = zip(*batch)
input_ids, segment_ids, masked_tokens, masked_pos, isNext = \
    torch.LongTensor(input_ids),  torch.LongTensor(segment_ids), torch.LongTensor(masked_tokens), \
    torch.LongTensor(masked_pos), torch.LongTensor(isNext)

In [44]:
print('BERT pre-training start.')
for epoch in range(100):
    optimizer.zero_grad()
    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
    loss_lm = (loss_lm.float()).mean()
    loss_clsf = criterion(logits_clsf, isNext) # for sentence classification
    
    #Masked LM과 NSP의 loss를 합쳐서 계산
    loss = loss_lm + loss_clsf
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

Epoch: 0010 cost = 1.777021
Epoch: 0020 cost = 1.644690
Epoch: 0030 cost = 1.693570
Epoch: 0040 cost = 1.715063
Epoch: 0050 cost = 1.736495
Epoch: 0060 cost = 1.656106
Epoch: 0070 cost = 1.664046
Epoch: 0080 cost = 1.653094
Epoch: 0090 cost = 1.697689
Epoch: 0100 cost = 1.676444


## Predict mask token and isNext

In [51]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = batch[3]
#print(text)
print([number_dict[w] for w in input_ids if number_dict[w] != '[PAD]'])

logits_lm, logits_clsf = model(torch.LongTensor([input_ids]), \
                               torch.LongTensor([segment_ids]), torch.LongTensor([masked_pos]))
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('masked tokens list : ',[pos for pos in masked_tokens if pos != 0])
print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])

logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
print('isNext : ', True if isNext else False)
print('predict isNext : ',True if logits_clsf else False)

['[CLS]', 'hello', 'romeo', 'my', 'name', 'is', 'juliet', 'nice', 'to', 'meet', 'you', '[SEP]', 'nice', 'meet', 'you', 'too', '[MASK]', 'are', '[MASK]', 'today', '[SEP]']
masked tokens list :  [17, 26, 6]
predict masked tokens list :  [25, 25, 25]
isNext :  True
predict isNext :  True
