In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from torch.autograd import Variable
import random
import matplotlib.pyplot as plt
import gluonnlp as nlp
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from gluonnlp.data import SentencepieceTokenizer

bertmodel, vocab = get_pytorch_kobert_model()

# train, test dataset 불러오기.
dataset_train = nlp.data.TSVDataset('/Users/waterpurifier/Comment_train.txt',encoding='utf-8', field_indices=[2,3], num_discard_samples=1)
dataset_test = nlp.data.TSVDataset('/Users/waterpurifier/Comment_test.txt',encoding='utf-8', field_indices=[2,3], num_discard_samples=1)

token_path = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(token_path, vocab, lower=False)

# BERTDataset
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):

        transform = nlp.data.BERTSentenceTransform(bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        #self.adsf = transform(dataset)[0]
        #print(self.adsf[0])
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]
        
        # 기존의 방식에서 model에 적용할 때 data type이 안맞아서 <list> (int) 로 맞췄음.
        self.sentences = [i[0].tolist() for i in self.sentences]
        self.labels = [i.tolist() for i in self.labels]
        
    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))
    
    def __len__(self):
        return (len(self.labels))
    

max_len = 50
vocab_size = len(vocab)

data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

using cached model
using cached model
using cached model


In [2]:
def convert_to_long_variable(w2i_ls):
    return Variable(torch.LongTensor(w2i_ls))

In [3]:
x_train = convert_to_long_variable(data_train.sentences)
y_train = convert_to_long_variable(data_train.labels)

x_test = convert_to_long_variable(data_test.sentences)
y_test = convert_to_long_variable(data_test.labels)

# CNN Pytorch

In [3]:
# 케라스 모델의 CNN Summary
# embedding dimmension = 128
# dropout = (0.5, 0.8)
# filter size = 64
# model input = max_len : 50
# z = Embedding(vocab_size, embedding dimmension, input_length = 50, )
# z = dropout(0.5)

# conv 1d = conv1d(filter = 64, kernel 3, 4, 5, 6, padding = valid, activation = Relu)
# Global Max Pooling 1D
# Flatten
# dropout 0.8
# hidden layer =128#
# model output = sigmoid 1

In [480]:
class CNN_text(nn.Module):
    
    def __init__(self, n_words, embed_size, pad_index, hid_size, drop_rate, kernel_size_ls, num_filter, n_class):
        super(CNN_text, self).__init__()
        
        self.pad_index = pad_index              # 단어 embedding 과정에서 제외시킬 padding token
        self.embed_size = embed_size            # 임베딩 차원의 크기
        self.hid_size = hid_size                # 히든 레이어 갯수
        self.drop_rate = drop_rate              # 드롭아웃 비율
        self.num_filter = num_filter            # 필터의 갯수 
        self.kernel_size_ls = kernel_size_ls    # 각기 다른 필터 사이즈가 담긴 리스트
        self.num_kernel = len(kernel_size_ls)   # 필터 사이즈의 종류 수
        self.n_class = n_class                  # 카테고리 갯수
        
        self.embed = nn.Embedding(
            num_embeddings=n_words, 
            embedding_dim=embed_size,
            padding_idx=self.pad_index
        )
        
        
        # kernel size는 (n-gram, embed_size)이다.
        # 커널의 열(column)의 크기는 embed_size와 일치하므로, 단어 임베딩 벡터를 모두 커버한다.
        # 따라서, n의 row 크기를 갖는 커널은 한번에 n개의 단어를 커버하는 n-gram 커널이라고 볼 수 있다.
        self.convs = nn.ModuleList([nn.Conv1d(1, num_filter, (kernel_size, embed_size)) for kernel_size in kernel_size_ls])
        
        self.lin = nn.Sequential(
            nn.Linear(self.num_kernel*num_filter, hid_size), nn.ReLU(), nn.Dropout(drop_rate),
            nn.Linear(hid_size, n_class),
        )
        
    def forward(self, x):
        embed = self.embed(x) # batch_size x max_length x embed_size
        embed.unsqueeze_(1)       # batch_size, 1, max_length, embed_size : convolution을 위해 4D로 차원을 조절
        
        # convolution
        conved = [conv(embed).squeeze(3) for conv in self.convs] # [batch_size, num_filter, max_length -kernel_size +1]
        
        # max_pooling
        pooled = [F.max_pool1d(conv, (conv.size(2))).squeeze(2) for conv in conved] # [batch_size, num_kernel, num_filter]
            
        # dropout
        dropouted = [F.dropout(pool, self.drop_rate) for pool in pooled]
        
        # concatenate
        concated = torch.cat(dropouted, dim = 1) # [batch_size, num_kernel * num_filter]
        
        
        logit = self.lin(m(concated))
        
        return logit

In [487]:
params = {
    'n_words' : vocab_size,        # 고유한 단어 토큰의 갯수
    'embed_size' : 128,                # 임베딩 차원의 크기
    'pad_index' : max_len,  # 패딩 토큰
    'hid_size' : 128,                  # 히든 레이어 갯수
    'drop_rate' : 0.5,                 # 드롭아웃 비율          (원문에서는 0.5를 사용)
    'kernel_size_ls' : [2,3,4,5],      # 커널 사이즈 리스트        (원문애서는 3,4,5를 사용)
    'num_filter' : 32,                 # 각 사이즈 별 커널 갯수 (원문에서는 100을 사용)
    'n_class' : 1,                  # 카테고리 갯수
}

In [488]:
model = CNN_text(**params)

In [532]:
epochs = 5
lr = 0.0003
batch_size = 128*2

train_idx = np.arange(x_train.size(0))
test_idx = np.arange(x_test.size(0))
optimizer = torch.optim.Adam(model.parameters(),lr) # 원문에서는 Adadelta 알고리즘을 사용
#criterion = nn.CrossEntropyLoss(reduction='sum')
m = nn.Sigmoid()
criterion = nn.BCELoss()

train_loss_ls = []
test_loss_ls = []
for epoch in range(epochs):
    model.train()
    
    # input 데이터 순서 섞기
    random.shuffle(train_idx)
    x_train = x_train[train_idx]
    y_train = y_train[train_idx]
    train_loss = 0
    test_loss = 0
    for start_idx, end_idx in zip(range(0, x_train.size(0), batch_size),
                                  range(batch_size, x_train.size(0)+1, batch_size)):
        
        x_batch = x_train[start_idx : end_idx]
        y_batch = y_train[start_idx : end_idx].float()
        
        scores = model(x_batch)
        #predict = F.softmax(scores, dim=1).argmax(dim = 1)
        predict = []
        for i in m(scores):
            if i > 0.5: 
                predict.append(1)
            else:
                predict.append(0)
        predict = convert_to_long_variable(predict)
        acc = (predict == y_batch).sum().item() / batch_size
        
        loss = criterion(m(scores), y_batch)
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print('Train epoch : %s,  loss : %s,  accuracy :%.3f'%(epoch+1, train_loss / batch_size, acc))
    print('=================================================================================================')
    
    
    train_loss_ls.append(acc)
    if (epoch+1) % 1 == 0:
        model.eval()
        scores = model(x_test)
        #predict = F.softmax(scores, dim=1).argmax(dim = 1)
        predict = []
        for i in m(scores):
            if i > 0.5: 
                predict.append(1)
            else:
                predict.append(0)
        predict = convert_to_long_variable(predict)
        
        acc = (predict == y_test.long()).sum().item() / y_test.size(0)
        loss = criterion(m(scores), y_test.float())
        test_loss += loss.item()
        test_loss_ls.append(test_loss)
        print('*************************************************************************************************')
        print('*************************************************************************************************')
        print('Test Epoch : %s, Test Loss : %.03f , Test Accuracy : %.03f'%(epoch+1, test_loss/y_test.size(0), acc))
        print('*************************************************************************************************')
        print('*************************************************************************************************')

Train epoch : 1,  loss : 0.25065280636772513,  accuracy :0.645
*************************************************************************************************
*************************************************************************************************
Test Epoch : 1, Test Loss : 0.000 , Test Accuracy : 0.696
*************************************************************************************************
*************************************************************************************************
Train epoch : 2,  loss : 0.2235829319106415,  accuracy :0.793
*************************************************************************************************
*************************************************************************************************
Test Epoch : 2, Test Loss : 0.000 , Test Accuracy : 0.805
*************************************************************************************************
*************************************************************************

In [463]:
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
def predict_fn(text):
    Text = text[0]
    text = BERTDataset([[Text, '1']], 0, 1, tok, 50, True, False)
    text = convert_to_long_variable(text.sentences[0])
    return m(model(text.unsqueeze(0)))

explainer = LimeTextExplainer()
predict_fn('뭐어쩌라구욧!')
explanation = explainer.explain_instance('뭐 어쩌라구욧', predict_fn, num_features = 2)

IndexError: index 1 is out of bounds for dimension 1 with size 1

In [543]:
test_text = '뭐하자는 거지요~>~?'
#data_train = ElectraDataset(dataset_train['comment'], dataset_train['label'], max_len)
tests = BERTDataset([[test_text, '1']], 0, 1, tok, 50, True, False)
print(tests.sentences)
tests = convert_to_long_variable(tests.sentences[0])
#tests.unsqueeze(0).shape
#print(predict_fn(tests))
predict = model(tests.unsqueeze(0))
print(dataset_train[1])
print(predict)
print(m(predict))


[[2, 2145, 7818, 5760, 862, 7318, 6999, 517, 463, 632, 517, 463, 633, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
['ㅂ2염', '0']
tensor([[-3.4441]], grad_fn=<AddmmBackward>)
tensor([[0.0309]], grad_fn=<SigmoidBackward>)


tensor([[0.6489]], grad_fn=<SigmoidBackward>)