In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

Collecting git+https://****@github.com/SKTBrain/KoBERT.git@master
  Cloning https://****@github.com/SKTBrain/KoBERT.git (to revision master) to /tmp/pip-req-build-1we33ft5
  Running command git clone -q 'https://****@github.com/SKTBrain/KoBERT.git' /tmp/pip-req-build-1we33ft5
Collecting boto3
  Downloading boto3-1.20.33-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 7.3 MB/s 
[?25hCollecting gluonnlp>=0.6.0
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[K     |████████████████████████████████| 344 kB 58.4 MB/s 
[?25hCollecting mxnet>=1.4.0
  Downloading mxnet-1.9.0-py3-none-manylinux2014_x86_64.whl (47.3 MB)
[K     |████████████████████████████████| 47.3 MB 54.4 MB/s 
[?25hCollecting onnxruntime==0.3.0
  Downloading onnxruntime-0.3.0-cp37-cp37m-manylinux1_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 51.8 MB/s 
[?25hCollecting sentencepiece>=0.1.6
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014

In [3]:
cd /content/drive/MyDrive/AIBootCamp/

/content/drive/MyDrive/AIBootCamp


# import Library

In [4]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [5]:
from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

In [6]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [7]:
## GPU 사용
device = torch.device("cuda:0")

In [8]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [9]:
bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")

using cached model. /content/drive/My Drive/AIBootCamp/.cache/kobert_v1.zip
using cached model. /content/drive/My Drive/AIBootCamp/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


### 데이터 맞춰주기

In [10]:
dataset_train = nlp.data.TSVDataset('tsv_train.tsv', num_discard_samples=1)
dataset_test = nlp.data.TSVDataset('tsv_test.tsv', num_discard_samples=1)

## Tokenizer

In [11]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /content/drive/My Drive/AIBootCamp/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


##  KoBERT 입력 데이터로 만들기

In [12]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))


## Setting parameters

In [13]:
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 10
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [14]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [15]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

  cpuset_checked))


In [16]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=3,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [17]:
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [18]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [19]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [20]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [21]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [22]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [23]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  cpuset_checked))


  0%|          | 0/16 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.0529940128326416 train acc 0.484375
epoch 1 train acc 0.6794621394230769


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 1 test acc 0.8594460227272727


  0%|          | 0/16 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.5980338454246521 train acc 0.84375
epoch 2 train acc 0.8357121394230769


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 2 test acc 0.8594460227272727


  0%|          | 0/16 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.5614994168281555 train acc 0.84375
epoch 3 train acc 0.8357121394230769


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 3 test acc 0.8594460227272727


  0%|          | 0/16 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.549100399017334 train acc 0.84375
epoch 4 train acc 0.8357121394230769


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 4 test acc 0.8594460227272727


  0%|          | 0/16 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.559780478477478 train acc 0.84375
epoch 5 train acc 0.8357121394230769


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 5 test acc 0.8594460227272727


  0%|          | 0/16 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.5625834465026855 train acc 0.84375
epoch 6 train acc 0.8357121394230769


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 6 test acc 0.8594460227272727


  0%|          | 0/16 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.500923752784729 train acc 0.84375
epoch 7 train acc 0.8473557692307693


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 7 test acc 0.8718039772727273


  0%|          | 0/16 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 0.4424988031387329 train acc 0.875
epoch 8 train acc 0.8635066105769231


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 8 test acc 0.8191051136363636


  0%|          | 0/16 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 0.42359915375709534 train acc 0.890625
epoch 9 train acc 0.8854417067307693


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 9 test acc 0.8347301136363636


  0%|          | 0/16 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 0.41929370164871216 train acc 0.890625
epoch 10 train acc 0.8898737980769231


  0%|          | 0/4 [00:00<?, ?it/s]

epoch 10 test acc 0.8269176136363636


In [24]:
PATH = '/content/drive/MyDrive/AIBootCamp/'

torch.save(model, PATH + 'model.pt')  # 전체 모델 저장
torch.save(model.state_dict(), PATH + 'model_state_dict.pt')  # 모델 객체의 state_dict 저장
torch.save({
    'model': model.state_dict(),
    'optimizer': optimizer.state_dict()
}, PATH + 'all.tar')  # 여러 가지 값 저장, 학습 중 진행 상황 저장을 위해 epoch, loss 값 등 일반 scalar값 저장 가능

In [71]:
comm = pd.read_csv('comment_test.csv')
comm.drop('Unnamed: 0', axis=1, inplace=True)
comm.dropna(inplace=True)
comm.reset_index(drop=True,inplace=True)
comm

Unnamed: 0,0
0,우리가족표 다 몰아드릴게요 꼭 정권교체 해주세요
1,방역패스가 자영업자 큰 피해를 막아주었다는 황당한 말을 듣고 있자니 울화통이 터지네...
2,진짜 그만좀해라 어차피 코로나 종식? 안된다 또다른 변이 또나오고 또나온다. 그냥 ...
3,문죄인이 중동여행 간다던데?설마 아니겠지?
4,진짜 마트는 에바 아닌가?? 혼자 사는 사람이나 임산부는 급할때 구멍가게에서 분유 ...
...,...
32944,그렇게 뉴스 나왔는데.미접종자들. 왜갔누. 생각좀하고살자
32945,아니 바이러스가18세미만은 피해가나? 말뎌안되는 방역패스사망자가60대부터80대가95...
32946,접종률 100퍼 달성하면 화이자에서 포상금이라도 주는거냐? 그게 목표냐?
32947,뉴스들좀 보세요.. 알고갓음 편할것을


In [72]:
comm['0'].apply(lambda x: print(x) if type(x) != str else x)

0                               우리가족표 다 몰아드릴게요 꼭 정권교체 해주세요
1        방역패스가 자영업자 큰 피해를 막아주었다는 황당한 말을 듣고 있자니 울화통이 터지네...
2        진짜 그만좀해라 어차피 코로나 종식? 안된다 또다른 변이 또나오고 또나온다. 그냥 ...
3                                  문죄인이 중동여행 간다던데?설마 아니겠지?
4        진짜 마트는 에바 아닌가?? 혼자 사는 사람이나 임산부는 급할때 구멍가게에서 분유 ...
                               ...                        
32944                      그렇게 뉴스 나왔는데.미접종자들. 왜갔누. 생각좀하고살자
32945    아니 바이러스가18세미만은 피해가나? 말뎌안되는 방역패스사망자가60대부터80대가95...
32946             접종률 100퍼 달성하면 화이자에서 포상금이라도 주는거냐? 그게 목표냐?
32947                                 뉴스들좀 보세요.. 알고갓음 편할것을
32948                                         백신 계약서 공개하라~
Name: 0, Length: 32949, dtype: object

In [62]:
def getSentimentValue(comment, tok, max_len, batch_size, device):
  commnetslist = [] # 텍스트 데이터를 담을 리스트
  emo_list = [] # 감성 값을 담을 리스트
  for c in comment: # 모든 댓글
    commnetslist.append( [c, 5] ) # [댓글, 임의의 양의 정수값] 설정
    
  pdData_df = pd.DataFrame( commnetslist, columns = [['댓글', '감성']] )
  pdData = pdData_df.values
  test_set = BERTDataset(pdData, 0, 1, tok, max_len, True, False) 
  test_input = torch.utils.data.DataLoader(test_set, batch_size=batch_size, num_workers=5)
  
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_input):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length

    # 이때, out이 예측 결과 리스트
    out = model(token_ids, valid_length, segment_ids)
	
    # e는 3가지 실수 값으로 구성된 리스트
    # 0번 인덱스가 더 크면 부정, 긍정은 반대
    test_eval=[] # print를 위한
    for r in out:
      r = r.cpu().detach().numpy()
      if np.argmax(r) == 0: # nutral
        # test_eval.append('중립적인 감정이')
        value = 0
      elif np.argmax(r) == 1: # positive
        # test_eval.append('긍정적인 감정이')
        value = 1
      else: #부정
        # test_eval.append('부정적인 감정이')
        value = 2
      # print(">> 입력하신 내용에서 " + test_eval[0] + " 느껴집니다.")
      emo_list.append(value)

  
  return emo_list, pdData_df.to_numpy() # 텍스트 데이터에 1대1 매칭되는 감성값 리스트 반환

In [73]:
emo_list, df = getSentimentValue(comm['0'], tok, max_len, batch_size, device)

  cpuset_checked))


In [74]:
df = pd.DataFrame(df,columns=['댓글', '감성'])
df['감성'] = emo_list

In [76]:
df.to_csv('predict.csv', encoding='utf-8-sig')