In [1]:
import torch
import random
import numpy as np

from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertForNextSentencePrediction

- `bert-base-uncased` : Tokenize를 할 때, 대소문자를 구분하려면 `cased`, 모두 소문자로 변환하려면 `uncased` 사용합니다.
- `add_pooling_layer` : BERT 위에 추가적으로 pooling layer를 쌓을 것인지 여부를 결정합니다. pooling layer는 `[CLS]` token의 embedding만을 뽑아 linear 연산과 activation 연산을 수행하는 layer로, BERT를 이용해 classification task를 수행할 때 주로 사용합니다.
- `output_hidden_states` : BERT의 각 layer의 hidden state들을 출력할 것인지 여부를 결정합니다.(즉, 12개의 encoder layer가 출력한 hidden_state를 모두 반환.) False인 경우 마지막 encoder_layer의 hidden_state만 반환.
- `ouput_attentions` : BERT의 각 layer의 attention weight들을 출력할 것인지 여부를 결정합니다.(12개의 encoder layer가 출력한 attention_weight를 모두 반환.) 여기서 attention_weight는 Attention distribution을 말함.

# 모델 구조

In [2]:
model = BertModel.from_pretrained("bert-base-uncased",
                                  add_pooling_layer=False,
                                  output_hidden_states=True,
                                  output_attentions=True)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
print(model.num_parameters())
print(model)

108891648
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inp

# Bert Embeddings

``` python
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
```

- word_embeddings : 시퀀스 데이터의 각 토큰들 즉, 단어들을 768차원의 벡터로 맵핑하는 임베딩. vocab의 크기는 30522.
- position_embeddings : 시퀀스 내 각 토큰들에 대한 위치정보를 반영. ```문장의 최대 길이는 512```. ```따라서 512개의 토큰들은 각각 768차원의 위치 임베딩 벡터로 맵핑된다.```
- token_type_embeddings(segment embedding) : 각 토큰이 어떤 문장에 속하는지를 나타내는 임베딩 벡터. 한 번에 두 문장을 입력 받았을 때 문장 A에 속하는 토큰에는 0, B에 속하는 토큰에는 1을 부여한다.

In [5]:
print(model.embeddings.word_embeddings)
print(model.embeddings.position_embeddings)
print(model.embeddings.token_type_embeddings)

Embedding(30522, 768, padding_idx=0)
Embedding(512, 768)
Embedding(2, 768)


In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') ## BERT 토크나이저 불러오기

tokens = tokenizer("I love NLP!", "I love ai") ## 예제 문장 토큰화.
print(tokens)

print("input ids : ", tokens['input_ids']) ## 각 토큰이 가진 vocab내 idx
print("token_type_ids: ", tokens['token_type_ids']) ## segment embedding
print("input_tokens : ", tokenizer.convert_ids_to_tokens(tokens['input_ids'])) ## 토크나이저가 special tokens를 알아서 집어넣어준다.
print("lenght of input_tokens : ", len(tokenizer.convert_ids_to_tokens(tokens['input_ids'])))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

{'input_ids': [101, 1045, 2293, 17953, 2361, 999, 102, 1045, 2293, 9932, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
input ids :  [101, 1045, 2293, 17953, 2361, 999, 102, 1045, 2293, 9932, 102]
token_type_ids:  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
input_tokens :  ['[CLS]', 'i', 'love', 'nl', '##p', '!', '[SEP]', 'i', 'love', 'ai', '[SEP]']
lenght of input_tokens :  11




# Bert Encoder Layer

학습을 통해 양방향 문맥 정보가 반영된 임베딩을 만들어내는 층.

첫번째 encoder layer는 embeddings 모듈의 출력값을 입력으로 받으며 총 12개의 encoder lauyer를 통과해 encoder_output이라는 출력값을 반환한다.

In [11]:
# bert encoder layer의 출력값 확인
model_input = tokenizer("I love NLP!" , "I love ai", return_tensors="pt") ## 문장 토큰화 수행.
output = model(**model_input) ## 모델에 입력
print(len(output))

print("BERT의 총 hidden_states 개수: ", len(output.hidden_states)) ## BERT Embedding의 hidden state 1개 + BERT Encoder의 hidden state 12개
print("Encoder layer의 출력 형태: ", output.last_hidden_state.shape) ## 예시로 사용한 문장은 토크나이저에 의해 11개의 토큰으로 분할되었고 마지막 encoder의 출력이므로 (1, 11, 768)

## Pretraining, Finetuning을 수행할 때는 last_hidden_state를 가지고 한다.

3
BERT의 총 hidden_states 개수:  13
Encoder layer의 출력 형태:  torch.Size([1, 11, 768])


# Pretraining

## Masked Language Model

In [12]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
text = """
Squid Game was released worldwide on September 17, 2021, to critical acclaim and
international attention. It is Netflix's most-watched series, becoming
the top-viewed program in 94 countries and attracting more than 142 million
member households and amassing 1.65 billion viewing hours during its first four
weeks from launch, surpassing Bridgerton for the title of most watched show.
The series has also received numerous accolades, including the Golden Globe
Award for Best Supporting Actor – Series, Miniseries or Television Film for
O Yeong-su and the Screen Actors Guild Award for Outstanding Performance by a
Male Actor in a Drama Series and Outstanding Performance by a Female Actor in a
Drama Series for Lee Jung-jae and HoYeon Jung, respectively, with all three making
history as the first Korean actors to win in those categories. A second season is in development.
"""

## Attention Mask는 attention 계산시 유의미한 단어는 1, PAD 같이 무의미한 단어는 0으로 하는 mask를 의미한다.
inputs = tokenizer(text, return_tensors='pt')
print(inputs)
print("토큰화 된 결과: ", tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))

{'input_ids': tensor([[  101, 26852,  2208,  2001,  2207,  4969,  2006,  2244,  2459,  1010,
         25682,  1010,  2000,  4187, 10761,  1998,  2248,  3086,  1012,  2009,
          2003, 20907,  1005,  1055,  2087,  1011,  3427,  2186,  1010,  3352,
          1996,  2327,  1011,  7021,  2565,  1999,  6365,  3032,  1998, 15411,
          2062,  2084, 16087,  2454,  2266,  3911,  1998, 25933, 18965,  1015,
          1012,  3515,  4551, 10523,  2847,  2076,  2049,  2034,  2176,  3134,
          2013,  4888,  1010, 27097,  2958, 11715,  2005,  1996,  2516,  1997,
          2087,  3427,  2265,  1012,  1996,  2186,  2038,  2036,  2363,  3365,
         27447,  1010,  2164,  1996,  3585,  7595,  2400,  2005,  2190,  4637,
          3364,  1516,  2186,  1010, 13612,  2030,  2547,  2143,  2005,  1051,
          6300,  5063,  1011, 10514,  1998,  1996,  3898,  5889,  9054,  2400,
          2005,  5151,  2836,  2011,  1037,  3287,  3364,  1999,  1037,  3689,
          2186,  1998,  5151,  2836,  

MLM은 MASK 토큰에 있어야할 단어를 예측하는 것이므로 원래 단어가 곧 label에 해당. 따라서 기존 토큰을 그대로 복사하여 labels로 취급.

In [None]:
inputs['labels'] = inputs.input_ids.detach().clone()
print(inputs)

In [16]:
rand = torch.rand(inputs.input_ids.shape)

# CLS : 101, SEP : 102번 토큰 제외하고 15% 위치 선별
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102)

# 전체 토큰 중 15%를 대체할 토큰으로 선정
selection = torch.flatten((mask_arr[0]).nonzero())

# 선별된 위치의 인덱스 번호
## 토큰의 정수값이 아니라 시퀀스 내에서 몇번째 idx의 토큰이 masking될 것인지를 말함.
print(selection)

# selection의 위치마다 0~1 값 부여
selection_val = np.random.random(len(selection))

# 80% : Mask 토큰 대체
mask_selection = selection[np.where(selection_val >= 0.2)[0]]

# 10% : 랜덤 토큰 대체
random_selection = selection[np.where(selection_val < 0.1)[0]]

print("[MASK]로 교체될 토큰의 index: ", mask_selection)
print("다른 토큰으로 교체될 토큰의 index: ", random_selection)

inputs.input_ids[0, mask_selection] = 103 ## Mask로 처리되는 위치의 토큰은 103번으로 맵핑
inputs.input_ids[0, random_selection] = torch.randint(0, 30522, size = random_selection.shape) ## 임의의 단어로 바뀌는 위치의 토큰은 BERT의 단어 집합에서 랜덤으로 단어 추출

print(inputs['input_ids'])

tensor([  5,   7,  16,  23,  33,  57,  74,  75,  82,  85,  89,  95, 104, 109,
        112, 113, 114, 116, 131, 132, 137, 142, 155, 166])
[MASK]로 교체될 토큰의 index:  tensor([  5,   7,  16,  57,  74,  82,  85,  89, 104, 109, 112, 114, 116, 131,
        132, 137, 142, 166])
다른 토큰으로 교체될 토큰의 index:  tensor([ 75, 155])
tensor([[  101, 26852,  2208,  2001,  2207,   103,  2006,   103,  2459,  1010,
         25682,  1010,  2000,  4187, 10761,  1998,   103,  3086,  1012,  2009,
          2003, 20907,  1005,  1055,  2087,  1011,  3427,  2186,  1010,  3352,
          1996,  2327,  1011,  7021,  2565,  1999,  6365,  3032,  1998, 15411,
          2062,  2084, 16087,  2454,  2266,  3911,  1998, 25933, 18965,  1015,
          1012,  3515,  4551, 10523,  2847,  2076,  2049,   103,  2176,  3134,
          2013,  4888,  1010, 27097,  2958, 11715,  2005,  1996,  2516,  1997,
          2087,  3427,  2265,  1012,   103, 21041,  2038,  2036,  2363,  3365,
         27447,  1010,   103,  1996,  3585,   103,  2400,

In [17]:
# Trainer에 들어갈 데이터 셋 클래스
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)
    

# 위에서 설정한 inputs를 데이터 셋 클래스로 지정
dataset = Dataset(inputs)

In [None]:
# Traning을 위한 argument 설정
args = TrainingArguments(
    output_dir='./', # 결과 정보를 받아볼 디렉토리
    per_device_train_batch_size=16, # gpu 당 batch size 수
    num_train_epochs=10 # epoch 수
)

trainer = Trainer(
    model=model, # 훈련을 수행할 모델
    args=args, # 지정한 argument
    train_dataset=dataset # 학습 데이터셋
)

# GPU 학습을 위해 모델을 선택된 device에 이동
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

model.train() # 학습 모드 설정
trainer.train()

## Next Sentence Prediction

In [None]:
# Wikitext 데이터셋 불러오기
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

In [None]:
# 학습 예시를 위한 일부 데이터 선정
data = dataset['test']['text'][:10]
data

['',
 ' = Robert Boulter = \n',
 '',
 ' Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . \n',
 ' In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill 

In [None]:
# 모델 입력을 위해 데이터 전처리 수행
text = []

for line in data:
  line = line.strip()
  if line:
    if line[0].isalpha():
      print("추출된 문장: ", line)
      text.append(line)

추출된 문장:  Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall .
추출된 문장:  In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill . He appeared on a 2006 episode

In [None]:
text

['Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall .',
 'In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill . He appeared on a 2006 episode of the tel

In [None]:
bag = [item for sentence in text for item in sentence.split('.') if item != '']
bag_size = len(bag)
print(bag_size)

14


In [None]:
bag

['Robert Boulter is an English film , television and theatre actor ',
 ' He had a guest @-@ starring role on the television series The Bill in 2000 ',
 ' This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre ',
 ' He had a guest role in the television series Judge John Deed in 2002 ',
 ' In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi ',
 ' He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London ',
 ' He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall ',
 'In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill ',
 ' He appeared o

In [None]:
sentence_a = []
sentence_b = []
label = []

for paragraph in text:
    sentences = [sentence for sentence in paragraph.split('.') if sentence != '']
    
    num_sentences = len(sentences)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50:50의 비율로 순서가 올바르거나 올바르지 않은 문장을 선정
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(sentences[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentences[start])
            sentence_b.append(bag[index])
            label.append(1)

In [None]:
for i in range(len(label)):
    print(label[i])
    print(sentence_a[i] + '\n---')
    print(sentence_b[i] + '\n')

1
 He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London 
---
 In May 2008 , Boulter made a guest appearance on a two @-@ part episode arc of the television series Waking the Dead , followed by an appearance on the television series Survivors in November 2008 

1
 How to Curse was performed at Bush Theatre in the London Borough of Hammersmith and Fulham 
---
 This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre 



In [None]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [None]:
inputs['labels'] = torch.LongTensor([label]).T
inputs.labels

tensor([[1],
        [1]])

In [None]:
# NLP 작업을 위한 모델 불러오기
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

In [None]:
# GPU 학습을 위해 모델을 선택된 device에 이동
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

model.train()  # 학습 모드 설정

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
dataset = Dataset(inputs)

In [None]:
# Traning을 위한 argument 설정
args = TrainingArguments(
    output_dir='./', # 결과 정보를 받아볼 디렉토리
    per_device_train_batch_size=16, # gpu 당 batch size 수
    num_train_epochs=10 # epoch 수
)

trainer = Trainer(
    model=model, # 훈련을 수행할 모델
    args=args, # 지정한 argument
    train_dataset=dataset # 학습 데이터셋
)

In [None]:
trainer.train()

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss


TrainOutput(global_step=10, training_loss=0.9699393272399902, metrics={'train_runtime': 2.2196, 'train_samples_per_second': 9.011, 'train_steps_per_second': 4.505, 'total_flos': 5262221107200.0, 'train_loss': 0.9699393272399902, 'epoch': 10.0})