In [1]:
# !pip install ipywidgets  # for vscode
# !pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [4]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
import pandas as pd
import json

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import BertModel
from transformers import AutoTokenizer, AutoModel

from sklearn.utils.class_weight import compute_class_weight

In [5]:
tokenizer = AutoTokenizer.from_pretrained("monologg/kobert")
model = AutoModel.from_pretrained("monologg/kobert")

In [6]:
class sentencEmojiDataset(Dataset):
    def __init__(self, directory, tokenizer):
        
        data = pd.read_csv(directory, encoding='UTF-8')
        
        self.tokenizer = tokenizer
        self.sentences = list(data.iloc[:,0])
       
        emojis = list(data.iloc[:,1])
        emojis_unique = list(set(emojis))
        
        self.labels = [emojis_unique.index(i) for i in emojis]
 
        self.labels_dict = {'key': range(len(emojis_unique)), 'value': emojis_unique}
        
    def __getitem__(self, i): #collate 이전 미리 tokenize를 시켜주자
        tokenized = self.tokenizer(str(self.sentences[i]), return_tensors='pt')
        #아래 세 개는 tokenizer가 기본적으로 반환하는 정보. BERT의 input이기도 함
        input_ids = tokenized['input_ids']
        token_type_ids = tokenized['token_type_ids']
        attention_mask = tokenized['attention_mask']
        
        return {'input_ids': input_ids, 'token_type_ids': token_type_ids, 
                'attention_mask': attention_mask, 'label': self.labels[i]}
         
    def __len__(self): #data loader가 필요로 하여 필수적으로 있어야 하는 함수
        return len(self.sentences)

In [7]:
class collate_fn:
    def __init__(self, labels_dict):
        self.num_labels = len(labels_dict)
        
    def __call__(self, batch): #batch는 dataset.getitem의 return 값의 List. eg. [{}, {}. ...]
        #batch내 최대 문장 길이(토큰 개수)를 먼저 구해서 padding할 수 있도록 하기
        batchlen = [sample['input_ids'].size(1) for sample in batch] #tensor값을 반환하기 때문에 1번째 차원의 길이를 구함
        maxlen = max(batchlen)
        input_ids = []
        token_type_ids = []
        attention_mask = []
        #padding: [5, 6] [0, 0,  ...]을 concatenate 하는 방식으로 패딩
        for sample in batch:
            pad_len = maxlen - sample['input_ids'].size(1)
            pad = torch.zeros((1, pad_len), dtype=torch.int)            
            input_ids.append(torch.cat([sample['input_ids'], pad], dim=1))
            token_type_ids.append(torch.cat([sample['token_type_ids'], pad], dim=1))
            attention_mask.append(torch.cat([sample['attention_mask'], pad], dim=1))
        #batch 구성
        input_ids = torch.cat(input_ids, dim=0)
        token_type_ids = torch.cat(token_type_ids, dim=0)
        attention_mask = torch.cat(attention_mask, dim=0)
        
        #one-hot encoding
        #batch 내 라벨을 tensor로 변환
        tensor_label = torch.tensor([sample['label'] for sample in batch])
        
        return input_ids, token_type_ids, attention_mask, tensor_label

In [8]:
df = pd.read_csv('data/twitter_clean.csv', encoding="UTF-8")

df['y'].value_counts(sort = True).head(10)

N    981
❤     34
💙     31
📸     26
❣     22
✨     21
💜     20
🎉     18
😝     18
😆     17
Name: y, dtype: int64

In [9]:
df['split'] = np.random.randn(df.shape[0], 1)
msk = np.random.rand(len(df)) <= 0.7

train = df[msk]
test = df[~msk]

train.to_csv('data/train.csv', index=False)
test.to_csv('data/test.csv', index=False)

In [10]:
train = sentencEmojiDataset('data/train.csv', tokenizer)
test = sentencEmojiDataset('data/test.csv', tokenizer)

train_collate_fn = collate_fn(train.labels_dict)
test_collate_fn = collate_fn(test.labels_dict)

train_collate_fn

<__main__.collate_fn at 0x24e04d78b70>

In [11]:
# Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5  
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [12]:
train_dataloader = DataLoader(train, batch_size=batch_size, collate_fn=train_collate_fn, shuffle = True, drop_last = True)
test_dataloader = DataLoader(test, batch_size=batch_size, collate_fn=test_collate_fn, shuffle = False, drop_last = False)
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x24e04d5d4e0>

In [13]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def forward(self, input_ids, token_type_ids, attention_mask):
        pooler = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask).pooler_output
        return self.classifier(self.dropout(pooler))

In [24]:
label = list(set(list(df.iloc[:,1])))

In [25]:
model = BERTClassifier(model,  dr_rate=0.5, num_classes = len(label))

In [26]:
#optimizer와 schedule 설정
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [27]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

In [33]:
#Class Imbalance 문제 해결을 위한 weighted cross entropy 
class_weights = compute_class_weight(class_weight = 'balanced', classes = np.unique(train.labels), y = train.labels)
class_weights = torch.tensor(class_weights, dtype=torch.float)
print(class_weights) #([1.0000, 1.0000, 4.0000, 1.0000, 0.5714])

loss_fn = nn.CrossEntropyLoss(weight = class_weights, reduction = 'mean') 

ImportError: DLL load failed: 지정된 프로시저를 찾을 수 없습니다.

In [15]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [16]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [17]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [18]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (input_ids, token_type_ids, attention_mask, tensor_label) in enumerate(train_dataloader):
        optimizer.zero_grad()
        
        out = model(input_ids, token_type_ids, attention_mask)
        loss = loss_fn(out, tensor_label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        batch_acc = calc_accuracy(out, tensor_label)
        train_acc += batch_acc
        #f batch_id % log_interval == 0:
        print("epoch {} batch id {}/{} loss {} train acc {}".format(e+1, batch_id+1, len(train_dataloader), loss.data.cpu().numpy(), batch_acc))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    with torch.no_grad():
        for batch_id, (input_ids, token_type_ids, attention_mask, tensor_label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
            out = model(input_ids, token_type_ids, attention_mask)
            test_acc += calc_accuracy(out, tensor_label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

epoch 1 batch id 1/24 loss 5.525818347930908 train acc 0.0
epoch 1 batch id 2/24 loss 5.492289066314697 train acc 0.0
epoch 1 batch id 3/24 loss 5.473079681396484 train acc 0.0
epoch 1 batch id 4/24 loss 5.422167778015137 train acc 0.0
epoch 1 batch id 5/24 loss 5.393969535827637 train acc 0.0
epoch 1 batch id 6/24 loss 5.4018025398254395 train acc 0.0
epoch 1 batch id 7/24 loss 5.288366317749023 train acc 0.015625
epoch 1 batch id 8/24 loss 5.238229274749756 train acc 0.046875
epoch 1 batch id 9/24 loss 5.1960649490356445 train acc 0.140625
epoch 1 batch id 10/24 loss 5.046657562255859 train acc 0.1875
epoch 1 batch id 11/24 loss 5.101888179779053 train acc 0.28125
epoch 1 batch id 12/24 loss 5.13629674911499 train acc 0.28125
epoch 1 batch id 13/24 loss 5.077856540679932 train acc 0.328125
epoch 1 batch id 14/24 loss 4.797379493713379 train acc 0.46875
epoch 1 batch id 15/24 loss 4.785677909851074 train acc 0.453125
epoch 1 batch id 16/24 loss 4.888695240020752 train acc 0.328125
epo

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:23<00:00,  2.15s/it]


epoch 1 test acc 0.0
epoch 2 batch id 1/24 loss 4.134457588195801 train acc 0.46875
epoch 2 batch id 2/24 loss 4.06512975692749 train acc 0.453125
epoch 2 batch id 3/24 loss 4.005620002746582 train acc 0.46875
epoch 2 batch id 4/24 loss 4.129027843475342 train acc 0.421875
epoch 2 batch id 5/24 loss 3.547823429107666 train acc 0.578125
epoch 2 batch id 6/24 loss 3.913343667984009 train acc 0.453125
epoch 2 batch id 7/24 loss 3.7231194972991943 train acc 0.5
epoch 2 batch id 8/24 loss 4.211643695831299 train acc 0.359375
epoch 2 batch id 9/24 loss 3.6887378692626953 train acc 0.484375
epoch 2 batch id 10/24 loss 3.648498773574829 train acc 0.484375
epoch 2 batch id 11/24 loss 3.290537118911743 train acc 0.5625
epoch 2 batch id 12/24 loss 3.6407580375671387 train acc 0.46875
epoch 2 batch id 13/24 loss 3.5656893253326416 train acc 0.484375
epoch 2 batch id 14/24 loss 4.340575218200684 train acc 0.296875
epoch 2 batch id 15/24 loss 3.858834981918335 train acc 0.40625
epoch 2 batch id 16/2

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:26<00:00,  2.44s/it]


epoch 2 test acc 0.0
epoch 3 batch id 1/24 loss 2.9581985473632812 train acc 0.578125
epoch 3 batch id 2/24 loss 3.660240411758423 train acc 0.4375
epoch 3 batch id 3/24 loss 4.138099670410156 train acc 0.34375
epoch 3 batch id 4/24 loss 4.162060737609863 train acc 0.34375
epoch 3 batch id 5/24 loss 3.2673463821411133 train acc 0.5
epoch 3 batch id 6/24 loss 3.4956696033477783 train acc 0.46875
epoch 3 batch id 7/24 loss 3.5983002185821533 train acc 0.453125
epoch 3 batch id 8/24 loss 3.0491890907287598 train acc 0.5625
epoch 3 batch id 9/24 loss 3.7450473308563232 train acc 0.421875
epoch 3 batch id 10/24 loss 3.2963404655456543 train acc 0.5
epoch 3 batch id 11/24 loss 3.877441883087158 train acc 0.40625
epoch 3 batch id 12/24 loss 3.4673523902893066 train acc 0.46875
epoch 3 batch id 13/24 loss 3.777430534362793 train acc 0.40625
epoch 3 batch id 14/24 loss 3.158539056777954 train acc 0.53125
epoch 3 batch id 15/24 loss 3.243966817855835 train acc 0.5
epoch 3 batch id 16/24 loss 3.3

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:24<00:00,  2.23s/it]


epoch 3 test acc 0.0
epoch 4 batch id 1/24 loss 3.59659481048584 train acc 0.4375
epoch 4 batch id 2/24 loss 3.4427571296691895 train acc 0.46875
epoch 4 batch id 3/24 loss 3.2663912773132324 train acc 0.5
epoch 4 batch id 4/24 loss 3.478074073791504 train acc 0.46875
epoch 4 batch id 5/24 loss 3.6340036392211914 train acc 0.4375
epoch 4 batch id 6/24 loss 3.6822848320007324 train acc 0.421875
epoch 4 batch id 7/24 loss 3.614180326461792 train acc 0.453125
epoch 4 batch id 8/24 loss 3.1475412845611572 train acc 0.53125
epoch 4 batch id 9/24 loss 3.5636825561523438 train acc 0.453125
epoch 4 batch id 10/24 loss 3.5066943168640137 train acc 0.46875
epoch 4 batch id 11/24 loss 3.6407666206359863 train acc 0.421875
epoch 4 batch id 12/24 loss 3.8691985607147217 train acc 0.390625
epoch 4 batch id 13/24 loss 3.4275593757629395 train acc 0.46875
epoch 4 batch id 14/24 loss 3.8421318531036377 train acc 0.40625
epoch 4 batch id 15/24 loss 3.366194248199463 train acc 0.484375
epoch 4 batch id 1

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:24<00:00,  2.26s/it]


epoch 4 test acc 0.0
epoch 5 batch id 1/24 loss 2.9343271255493164 train acc 0.578125
epoch 5 batch id 2/24 loss 3.0540690422058105 train acc 0.546875
epoch 5 batch id 3/24 loss 3.8572373390197754 train acc 0.375
epoch 5 batch id 4/24 loss 3.3199269771575928 train acc 0.5
epoch 5 batch id 5/24 loss 3.2469935417175293 train acc 0.515625
epoch 5 batch id 6/24 loss 3.534562587738037 train acc 0.453125
epoch 5 batch id 7/24 loss 3.9197888374328613 train acc 0.390625
epoch 5 batch id 8/24 loss 2.9824368953704834 train acc 0.546875
epoch 5 batch id 9/24 loss 3.0393800735473633 train acc 0.546875
epoch 5 batch id 10/24 loss 3.8134477138519287 train acc 0.375
epoch 5 batch id 11/24 loss 3.843388080596924 train acc 0.390625
epoch 5 batch id 12/24 loss 4.219090938568115 train acc 0.3125
epoch 5 batch id 13/24 loss 3.4723236560821533 train acc 0.46875
epoch 5 batch id 14/24 loss 3.632444381713867 train acc 0.4375
epoch 5 batch id 15/24 loss 3.1510159969329834 train acc 0.515625
epoch 5 batch id 1

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:25<00:00,  2.29s/it]

epoch 5 test acc 0.0



