In [23]:
# !pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-ufl1cyfb
Collecting onnxruntime==1.8.0
  Downloading onnxruntime-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 18.6 MB/s eta 0:00:01
Collecting transformers>=4.8.1
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 40.8 MB/s eta 0:00:01
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 19.9 MB/s eta 0:00:01
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 16.9 MB/s eta 0:00:01
Building wheels for collected packages: kobert
  Building wheel for kobert (setup.py)

In [45]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm

from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [46]:
# Select the device used for training: the second GPU when the host has one,
# otherwise the first GPU, otherwise the CPU. The fallback keeps the notebook
# runnable on machines that do not expose a "cuda:1" device.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

In [47]:
# Load the pretrained KoBERT model and its vocabulary (downloaded files are cached under ".cache")
bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")

using cached model. /home/sks/dsba/lge/.cache/kobert_v1.zip
using cached model. /home/sks/dsba/lge/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [48]:
# Download and load the dataset (NSMC ratings files)
# !wget -O .cache/ratings_train.txt http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_train.txt
# !wget -O .cache/ratings_test.txt http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_test.txt

def _read_ratings(path):
    """Read one ratings TSV, keeping fields 1 and 2 and discarding the first sample."""
    return nlp.data.TSVDataset(path, field_indices=[1, 2], num_discard_samples=1)

dataset_train = _read_ratings(".cache/ratings_train.txt")
dataset_test = _read_ratings(".cache/ratings_test.txt")

In [49]:
# Preview the first ten rows of the training set
dataset_train[:10]

[['아 더빙.. 진짜 짜증나네요 목소리', '0'],
 ['흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1'],
 ['너무재밓었다그래서보는것을추천한다', '0'],
 ['교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '0'],
 ['사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', '1'],
 ['막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.', '0'],
 ['원작의 긴장감을 제대로 살려내지못했다.', '0'],
 ['별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지..정말 발로해도 그것보단 낫겟다 납치.감금만반복반복..이드라마는 가족도없다 연기못하는사람만모엿네',
  '0'],
 ['액션이 없는데도 재미 있는 몇안되는 영화', '1'],
 ['왜케 평점이 낮은건데? 꽤 볼만한데.. 헐리우드식 화려함에만 너무 길들여져 있나?', '1']]

In [5]:
# Load the BERT tokenizer: wrap the KoBERT tokenizer resource and vocab in a
# gluonnlp BERTSPTokenizer (lower=False, i.e. lower-casing is not requested)
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /home/sks/dsba/lge/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [6]:
# Dataset wrapper that converts raw rows into BERT inputs
class BERTDataset(Dataset):
    """Pre-encodes every row of ``dataset`` for BERT and stores its label.

    Arguments
        dataset: iterable of rows; each row is indexable by ``sent_idx`` / ``label_idx``
        sent_idx: position of the sentence inside a row
        label_idx: position of the label inside a row
        bert_tokenizer: tokenizer handed to ``nlp.data.BERTSentenceTransform``
        max_len: forwarded as ``max_seq_length`` to the transform
        pad: forwarded to the transform (pad up to ``max_len``)
        pair: forwarded to the transform (sentence-pair mode)
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            pad=pad,
            pair=pair,
        )

        # First pass: encode the sentence of every row.
        encoded = []
        for row in dataset:
            encoded.append(to_bert_input([row[sent_idx]]))
        self.sentences = encoded

        # Second pass: convert every label to a 32-bit integer.
        targets = []
        for row in dataset:
            targets.append(np.int32(row[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the encoded sentence tuple with the label appended as last element."""
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        """Number of samples (one per stored label)."""
        return len(self.labels)

In [7]:
# Hyperparameters
max_len = 64  # max_seq_length handed to the sentence transform
batch_size = 64  # samples per DataLoader batch
warmup_ratio = 0.1  # fraction of all training steps used for learning-rate warmup
num_epochs = 5  # passes over the training set
max_grad_norm = 1  # gradient-norm clipping threshold
log_interval = 200  # print training progress every this many batches
learning_rate = 5e-5  # learning rate given to the optimizer

# Build the train / test datasets; both share the same encoding options
_bert_dataset_options = dict(
    sent_idx=0,
    label_idx=1,
    bert_tokenizer=tok,
    max_len=max_len,
    pad=True,
    pair=False,
)

data_train = BERTDataset(dataset=dataset_train, **_bert_dataset_options)
data_test = BERTDataset(dataset=dataset_test, **_bert_dataset_options)

In [8]:
# Inspect one prepared sample: the transform's output tuple with the label appended as last element
data_train[100]

(array([   2, 3010, 7495, 7096, 1907, 7533, 7628, 7095, 3930, 7941, 6983,
         517,   46, 2149, 7063, 6983, 4928, 7495, 5655, 5330, 3342, 3942,
        7848, 7303, 6553, 1698, 5808, 6280, 7096, 6946,  517,   54,    3,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1], dtype=int32),
 array(33, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32),
 1)

In [9]:
# Training batches are reshuffled every epoch so that consecutive updates do
# not follow the fixed row order of the ratings file.
train_dataloader = torch.utils.data.DataLoader(
    data_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8
)

# Evaluation keeps the dataset order; shuffling has no effect on accuracy.
test_dataloader = torch.utils.data.DataLoader(
    data_test,
    batch_size=batch_size,
    num_workers=8
)

In [19]:
class BERTClassifier(nn.Module):
    '''
    Sentence classifier: a linear layer on top of the encoder's pooled output.

    Arguments
        bert: BERT encoder (KoBERT); called with input_ids, token_type_ids and attention_mask
        hidden_size: dimensionality of the pooled vector returned by the encoder
        num_classes: number of output classes (2 for positive/negative sentiment)
        dr_rate: dropout probability applied to the pooled vector; None or 0 disables dropout
    '''
    def __init__(
        self,
        bert,
        hidden_size=768,
        num_classes=2,
        dr_rate=None
    ):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)  # classification head
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        '''
        Build a float mask shaped like token_ids: 1.0 for the first
        valid_length[i] positions of row i and 0.0 for the rest.

        valid_length may live on another device than token_ids (the training
        loop leaves it on the CPU); it is moved next to token_ids here.
        '''
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        # Broadcast (1, seq_len) against (batch, 1) instead of looping over rows.
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        '''Return the class logits for a batch of encoded sentences.'''
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        outputs = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask
        )
        # Index positionally instead of tuple-unpacking: unpacking a dict-like
        # model output iterates over its keys, whereas [1] yields the pooled
        # vector for both plain tuples and such output objects.
        pooler = outputs[1]

        if self.dr_rate:
            pooler = self.dropout(pooler)

        return self.classifier(pooler)

In [20]:
# Build the classifier (dropout 0.5 on the pooled vector), then move it to the selected device
model = BERTClassifier(bertmodel, dr_rate=0.5)
model = model.to(device)

In [21]:
# Optimizer setup: parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight decay
no_decay = ['bias', 'LayerNorm.weight']


def _skips_decay(param_name):
    """True when the parameter name matches one of the no_decay markers."""
    return any(marker in param_name for marker in no_decay)


_decayed, _undecayed = [], []
for _param_name, _param in model.named_parameters():
    if _skips_decay(_param_name):
        _undecayed.append(_param)
    else:
        _decayed.append(_param)

optimizer_grouped_parameters = [
    {'params': _decayed, 'weight_decay': 0.01},
    {'params': _undecayed, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Loss: cross-entropy for training the classifier
loss_fn = nn.CrossEntropyLoss()

In [22]:
# Number of warmup steps derived from warmup_ratio;
# after warmup the learning rate follows the scheduler's regular curve.
_steps_per_epoch = len(train_dataloader)
t_total = _steps_per_epoch * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [23]:
# Learning-rate scheduler: cosine schedule with warmup (warmup_step warmup steps out of t_total training steps)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_step,
    num_training_steps=t_total
)

In [24]:
# Accuracy helper
def calc_accuracy(X,Y):
    """Fraction of rows of X whose highest-scoring column index equals the label in Y.

    X: 2-D tensor of per-class scores, one row per sample.
    Y: 1-D tensor of integer class labels.
    Returns a NumPy floating-point scalar.
    """
    predicted = torch.argmax(X, dim=1)
    n_correct = (predicted == Y).sum().detach().cpu().numpy()
    return n_correct / predicted.shape[0]

In [25]:
# Run training: one optimisation pass over the training set per epoch,
# followed by an evaluation pass over the test set.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()

        # valid_length is passed through as-is; the model builds the attention mask from it.
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)

        out = model(token_ids, valid_length, segment_ids)  # class logits
        loss = loss_fn(out, label)
        loss.backward()

        # Clip the gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        scheduler.step()  # advance the learning-rate schedule once per batch

        train_acc += calc_accuracy(out, label)

        if batch_id % log_interval == 0:
            print(f"Epoch [{e+1}] Batch [{batch_id}/{len(train_dataloader)}] Loss {loss.item():.4f} Train Acc. {train_acc / (batch_id + 1):.4f}")

    print(f"Epoch [{e+1}] Train Acc. {train_acc / (batch_id + 1):.4f}")

    # Evaluation: no gradients are needed, so autograd is switched off to
    # avoid building a graph for every test batch.
    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)

    print(f"Epoch [{e+1}] Test Acc. {test_acc / (batch_id+1):.4f}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

Epoch [1] Batch [0/2344] Loss 0.6741 Train Acc. 0.6094
Epoch [1] Batch [200/2344] Loss 0.5045 Train Acc. 0.5614
Epoch [1] Batch [400/2344] Loss 0.4633 Train Acc. 0.6627
Epoch [1] Batch [600/2344] Loss 0.4351 Train Acc. 0.7173
Epoch [1] Batch [800/2344] Loss 0.4152 Train Acc. 0.7485
Epoch [1] Batch [1000/2344] Loss 0.3261 Train Acc. 0.7685
Epoch [1] Batch [1200/2344] Loss 0.3099 Train Acc. 0.7827
Epoch [1] Batch [1400/2344] Loss 0.3590 Train Acc. 0.7932
Epoch [1] Batch [1600/2344] Loss 0.3129 Train Acc. 0.8018
Epoch [1] Batch [1800/2344] Loss 0.2536 Train Acc. 0.8090
Epoch [1] Batch [2000/2344] Loss 0.3018 Train Acc. 0.8156
Epoch [1] Batch [2200/2344] Loss 0.2683 Train Acc. 0.8212

Epoch [1] Train Acc. 0.8251


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))




KeyboardInterrupt: 

In [83]:
# 학습된 모델의 가중치 저장하기
# torch.save(model.state_dict(), 'kobert_sentiment.pt')

### 학습된 모델 가중치 사용하기

In [50]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm

from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [51]:
# Build the model, step 1 (base): load the pretrained KoBERT model and its vocabulary
bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")

using cached model. /home/sks/dsba/lge/.cache/kobert_v1.zip
using cached model. /home/sks/dsba/lge/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [52]:
# Build the model, step 2 (classifier)
class BERTClassifier(nn.Module):
    '''
    Sentence classifier: a linear layer on top of the encoder's pooled output.

    Arguments
        bert: BERT encoder (KoBERT); called with input_ids, token_type_ids and attention_mask
        hidden_size: dimensionality of the pooled vector returned by the encoder
        num_classes: number of output classes (2 for positive/negative sentiment)
        dr_rate: dropout probability applied to the pooled vector; None or 0 disables dropout
    '''
    def __init__(
        self,
        bert,
        hidden_size=768,
        num_classes=2,
        dr_rate=None
    ):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)  # classification head
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        '''
        Build a float mask shaped like token_ids: 1.0 for the first
        valid_length[i] positions of row i and 0.0 for the rest.

        valid_length may live on another device than token_ids; it is moved
        next to token_ids here.
        '''
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        # Broadcast (1, seq_len) against (batch, 1) instead of looping over rows.
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        '''Return the class logits for a batch of encoded sentences.'''
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        outputs = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask
        )
        # Index positionally instead of tuple-unpacking: unpacking a dict-like
        # model output iterates over its keys, whereas [1] yields the pooled
        # vector for both plain tuples and such output objects.
        out = outputs[1]

        if self.dr_rate:
            out = self.dropout(out)

        return self.classifier(out)

In [53]:
# Instantiate the classifier; dr_rate keeps its default (None) so no dropout layer is created,
# and the model is not moved to another device in this cell.
model = BERTClassifier(
    bertmodel
)

In [54]:
# Load the previously saved trained weights, mapped onto the CPU
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
saved_weights = torch.load('kobert_sentiment.pt', map_location='cpu')

In [56]:
# The checkpoint may hold a whole pickled module (what this cell originally
# assumed) or a plain state dict (what `torch.save(model.state_dict(), ...)`
# writes); a plain dict has no .state_dict(), so accept both forms.
_state_dict = saved_weights.state_dict() if hasattr(saved_weights, "state_dict") else saved_weights
model.load_state_dict(_state_dict)
# Switch to evaluation mode (disables dropout); the returned module is displayed as the cell output.
model.eval()

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

### 추론하기

In [57]:
# Load the BERT tokenizer: wrap the KoBERT tokenizer resource and vocab in a
# gluonnlp BERTSPTokenizer (lower=False, i.e. lower-casing is not requested)
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /home/sks/dsba/lge/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [58]:
# Sentence transform used at inference time: max sequence length 64, padding on, single-sentence mode
text_transformer = nlp.data.BERTSentenceTransform(
            tok,
            max_seq_length=64,
            pad=True,
            pair=False
        )

In [61]:
text = '이런 영화를 볼 수 있다는게 참 불행입니다'
text_infer = (text, '')

# Encode the sentence and turn each returned array into a tensor.
_encoded = text_transformer(text_infer)
token_ids, valid_length, segment_ids = [torch.tensor(part) for part in _encoded]

# Add a leading batch dimension of size 1 to every input.
token_ids = token_ids.long().unsqueeze(0)
valid_length = valid_length.unsqueeze(0)
segment_ids = segment_ids.long().unsqueeze(0)

In [62]:
# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    logits = model(token_ids, valid_length, segment_ids).squeeze(0)

# The predicted class is the index of the largest logit (softmax is monotonic,
# so applying it first would not change the argmax).
pred = torch.argmax(logits, dim=-1).item()

print("Positive" if pred == 1 else "Negative")

Negative
