In [5]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [6]:
##GPU 사용 시
device = torch.device("cuda:0")
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /home/mglee/VSCODE/git_folder/comments_sentiment_label/code/.cache/kobert_v1.zip
using cached model. /home/mglee/VSCODE/git_folder/comments_sentiment_label/code/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [7]:
import pandas as pd
comments_unseen = pd.read_csv('../data/comments_unseen.tsv', sep = "\t")

In [8]:
comments_unseen

Unnamed: 0,comments,sentiment,replaced text
0,공산당이야 뭐야 중국처럼되고싶어?,0,공산당이야 뭐야 중국처럼되고싶어?
1,어쩐지 좀 이상하다 했다 문 정권에 잘 보였나 생각했었다 카카오,0,어쩐지 좀 이상하다 했다 문 정권에 잘 보였나 생각했었다 타겟
2,카카오 사용자만 매타버스 카카오 시스템 으로 연결해도 독보적 일듯 이인원을 어떤...,1,타겟 사용자만 매타버스 타겟 시스템 으로 연결해도 독보적 일듯 이인원을 어떤 기...
3,저렇게 소독하면 지폐 사이사이는 더럽겠네 겉만 소독하능거잖아 ㅋ,0,저렇게 소독하면 지폐 사이사이는 더럽겠네 겉만 소독하능거잖아 ㅋ
4,삼성전자가 카카오가 필요한 순간이 곧 오겠지,1,삼성전자가 타겟가 필요한 순간이 곧 오겠지
...,...,...,...
1662,편하긴 하겠네 ㅎㅎ거기다가 카톡으로 받고 바로 결제...,1,편하긴 하겠네 ㅎㅎ거기다가 카톡으로 받고 바로 결제...
1663,카카오 그만 써야겠다,0,타겟 그만 써야겠다
1664,카카오페이 가즈아~!,1,타겟 가즈아~!
1665,카카오뱅크 응원한다 그런데 대기업도 풀어 _으면 좋겠다 소니가 금융을 하고 GE가 ...,1,타겟 응원한다 그런데 대기업도 풀어 _으면 좋겠다 소니가 금융을 하고 GE가 전기전...


In [9]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
 
    def __len__(self):
        return (len(self.labels))


In [10]:
## Setting parameters
max_len = 64
batch_size = 32
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 500
learning_rate =  5e-5

In [11]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [13]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [15]:
dataset_train = nlp.data.TSVDataset("../data/ratings_train.txt?dl=1", field_indices=[1,2], num_discard_samples=1)
dataset_test = nlp.data.TSVDataset("../data/ratings_test.txt?dl=1", field_indices=[1,2], num_discard_samples=1)
dataset_unseen = nlp.data.TSVDataset("../data/comments_unseen.tsv", field_indices=[0,1], num_discard_samples=1)
print(dataset_train[3], '\n', dataset_test[3], '\n', dataset_unseen[2])

['교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '0'] 
 ['지루하지는 않은데 완전 막장임... 돈주고 보기에는....', '0'] 
 ['카카오 사용자만 매타버스 카카오 시스템 으로 연결해도  독보적 일듯  이인원을 어떤 기업이 가져가나 지금~!  익숙한게 가장 무서운것이다', '1']


In [16]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /home/mglee/VSCODE/git_folder/comments_sentiment_label/code/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [17]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)
data_unseen = BERTDataset(dataset_unseen, 0, 1, tok, max_len, True, False)

train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)
unseen_dataloader = torch.utils.data.DataLoader(data_unseen, batch_size=batch_size, num_workers=5)

for data in train_dataloader:
  print("Data: ", data)
  print("Waveform: {}\nSample rate: {}\nLabels: {}".format(data[0], data[1], data[2]))
  break

Data:  [tensor([[   2, 3093, 1698,  ...,    1,    1,    1],
        [   2,  517, 7989,  ...,    1,    1,    1],
        [   2, 1458, 7191,  ...,    1,    1,    1],
        ...,
        [   2,  517, 6592,  ...,    1,    1,    1],
        [   2, 3255, 7728,  ...,    1,    1,    1],
        [   2,  517, 7265,  ...,    1,    1,    1]], dtype=torch.int32), tensor([15, 36, 14, 26, 47, 41, 16, 64, 13, 38, 17, 35, 29, 34, 34, 13, 45, 23,
        35, 27, 27, 10, 64, 17, 11, 42, 19,  9,  6, 36, 33, 25],
       dtype=torch.int32), tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32), tensor([0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
        0, 0, 1, 0, 1, 1, 1, 0], dtype=torch.int32)]
Waveform: tensor([[   2, 3093, 1698,  ...,    1,    1,    1],
        [   2,  517, 7989,  ...,    1,   

In [18]:
model_naver = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [19]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model_naver.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model_naver.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [20]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model_naver.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model_naver(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_naver.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model_naver.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model_naver(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/4688 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 0.702319860458374 train acc 0.625
epoch 1 batch id 501 loss 0.4313753843307495 train acc 0.6326097804391217
epoch 1 batch id 1001 loss 0.34515905380249023 train acc 0.7251810689310689
epoch 1 batch id 1501 loss 0.4603748917579651 train acc 0.7654480346435709
epoch 1 batch id 2001 loss 0.27208149433135986 train acc 0.7860132433783108
epoch 1 batch id 2501 loss 0.28715118765830994 train acc 0.7998050779688125
epoch 1 batch id 3001 loss 0.29279664158821106 train acc 0.8086991836054649
epoch 1 batch id 3501 loss 0.30540773272514343 train acc 0.8160971865181377
epoch 1 batch id 4001 loss 0.24426881968975067 train acc 0.8226146588352912
epoch 1 batch id 4501 loss 0.23216594755649567 train acc 0.8281076427460564
epoch 1 train acc 0.8300181313993175


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch 1 test acc 0.8775191938579654


  0%|          | 0/4688 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.4606610834598541 train acc 0.78125
epoch 2 batch id 501 loss 0.21892902255058289 train acc 0.8781811377245509
epoch 2 batch id 1001 loss 0.39979350566864014 train acc 0.8818993506493507
epoch 2 batch id 1501 loss 0.16275081038475037 train acc 0.8856387408394404
epoch 2 batch id 2001 loss 0.2931627631187439 train acc 0.8884620189905047
epoch 2 batch id 2501 loss 0.23587913811206818 train acc 0.8905687724910036
epoch 2 batch id 3001 loss 0.19419696927070618 train acc 0.8929835888037321
epoch 2 batch id 3501 loss 0.1633012294769287 train acc 0.8954405884033133
epoch 2 batch id 4001 loss 0.20195621252059937 train acc 0.8976349662584354
epoch 2 batch id 4501 loss 0.06842808425426483 train acc 0.8996264718951344
epoch 2 train acc 0.9008305780716723


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch 2 test acc 0.8867962252079334


  0%|          | 0/4688 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 0.3473733961582184 train acc 0.875
epoch 3 batch id 501 loss 0.17097227275371552 train acc 0.9195359281437125
epoch 3 batch id 1001 loss 0.1799362748861313 train acc 0.923233016983017
epoch 3 batch id 1501 loss 0.16419649124145508 train acc 0.9255704530313125
epoch 3 batch id 2001 loss 0.2521779239177704 train acc 0.9280047476261869
epoch 3 batch id 2501 loss 0.06768294423818588 train acc 0.9306527389044382
epoch 3 batch id 3001 loss 0.15877625346183777 train acc 0.932584971676108
epoch 3 batch id 3501 loss 0.06458266079425812 train acc 0.9350810482719223
epoch 3 batch id 4001 loss 0.018122706562280655 train acc 0.9367658085478631
epoch 3 batch id 4501 loss 0.025364721193909645 train acc 0.9381526327482782
epoch 3 train acc 0.9390265038395904


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch 3 test acc 0.8933541266794626


  0%|          | 0/4688 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.38753294944763184 train acc 0.875
epoch 4 batch id 501 loss 0.07821313291788101 train acc 0.9532809381237525
epoch 4 batch id 1001 loss 0.11439358443021774 train acc 0.9548888611388612
epoch 4 batch id 1501 loss 0.07711664587259293 train acc 0.9572160226515656
epoch 4 batch id 2001 loss 0.12486609071493149 train acc 0.9588174662668666
epoch 4 batch id 2501 loss 0.009812938049435616 train acc 0.9605032986805278
epoch 4 batch id 3001 loss 0.04543418064713478 train acc 0.961669026991003
epoch 4 batch id 3501 loss 0.10969195514917374 train acc 0.9632247929163096
epoch 4 batch id 4001 loss 0.006193788722157478 train acc 0.9641261559610097
epoch 4 batch id 4501 loss 0.0061893402598798275 train acc 0.9650563763608087
epoch 4 train acc 0.9655370093856656


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch 4 test acc 0.894533749200256


  0%|          | 0/4688 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.3397355079650879 train acc 0.90625
epoch 5 batch id 501 loss 0.0067449091002345085 train acc 0.9738023952095808
epoch 5 batch id 1001 loss 0.0951283872127533 train acc 0.9739323176823177
epoch 5 batch id 1501 loss 0.010719815269112587 train acc 0.9748292804796802
epoch 5 batch id 2001 loss 0.0037104443181306124 train acc 0.9753873063468266
epoch 5 batch id 2501 loss 0.007264475803822279 train acc 0.976047081167533
epoch 5 batch id 3001 loss 0.003999769687652588 train acc 0.976455764745085
epoch 5 batch id 3501 loss 0.010608158074319363 train acc 0.977336832333619
epoch 5 batch id 4001 loss 0.010225954465568066 train acc 0.97787271932017
epoch 5 batch id 4501 loss 0.0032477148342877626 train acc 0.9780604310153299
epoch 5 train acc 0.9783423101535836


  0%|          | 0/1563 [00:00<?, ?it/s]

epoch 5 test acc 0.8949336212412028


In [21]:
torch.save(model_naver.state_dict(), '../result/model/naver_e5_0304.pt')  # 전체 모델 저장

naver_model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
naver_model.load_state_dict(torch.load('../result/model/naver_e5_0304.pt'))
naver_model.eval()

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

### TEST with Unseen data

In [22]:
out_lst = []

for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(unseen_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    label = label.long().to(device)
    out = naver_model(token_ids, valid_length, segment_ids)
    out_lst.append(out.data.cpu())
    max_vals, max_indices = torch.max(out, 1)
    
pred = []
for batch in out_lst:
    for item in batch:
        pred.append(np.argmax(item.numpy()))
        
comments_unseen['pred'] = pred
print(comments_unseen.groupby('pred').size())
print((comments_unseen['sentiment'] == comments_unseen['pred']).mean())

100%|██████████| 53/53 [00:02<00:00, 25.12it/s]

pred
0    1150
1     517
dtype: int64
0.7738452309538092



