In [1]:
import pandas as pd
import mxnet
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

from sklearn.model_selection import train_test_split


device = torch.device("cuda:1")
print(f"Using {device}")

2022-06-23 12:39:31.968173: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


Using cuda:1


In [2]:
data = pd.read_pickle("../data/sample5_df.pkl")

In [33]:
pred_data = data.iloc[1000000:, :].sample(frac = 0.1)

In [34]:
pred_data = pred_data[['제목', '본문', '민원발생지', '접수기관']]

In [35]:
pred_data.reset_index(inplace = True, drop = True)

In [36]:
pred_data['본문'] = pred_data['본문'].str.strip()
pred_data['본문'] = pred_data['본문'].str.replace("\r\n", '')

In [37]:
pred_data.head(1)

Unnamed: 0,제목,본문,민원발생지,접수기관
0,구로구 도림로 47-1 (구로동 769-267) 금영커피숍 ...,구로구 도림로 47-1 (구로동 769-267) 금영커피숍 앞 파란투명봉투에 음식물...,NONE,서울특별시


In [38]:
from konlpy.tag import Okt
okt = Okt()

okt.nouns("토크나이저 import가 잘 되었는지 확인합니다")

['토크', '나이', '저', '확인']

In [39]:
pred_data['token'] = pred_data['본문'].copy()

In [40]:
import copy
from tqdm import tqdm
pred_data['token'] = copy.deepcopy(pred_data['본문'])
for idx, text in enumerate(tqdm(pred_data['token'])):
    text = str(text)
    if len(text) > 3000:
        tok = "TOO LONG"
    else:
        tok = okt.nouns(text)
        if len(tok) < 512:
            pred_data['token'][idx] = tok
        else:
            pred_data['token'][idx] = "TOO LONG"

100%|██████████| 70528/70528 [57:39<00:00, 20.39it/s]  


In [45]:
label_data = data[:1000000]

label_to_int = {}
for i, item in enumerate(label_data['접수기관'].unique()):
    label_to_int[item] = i

In [50]:
idx_list = []
for idx, item in enumerate(pred_data['접수기관']):
    if item in label_to_int.keys():
        idx_list.append(idx)

In [72]:
pred_data = pred_data.iloc[idx_list, :]

In [73]:
def department_to_int(x):
    try:
        return label_to_int[x]
    except:
        print('error')

In [74]:
pred_data['접수기관'] = pred_data['접수기관'].apply(lambda x : department_to_int(x))
pred_data = pred_data[['token', '접수기관']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_data['접수기관'] = pred_data['접수기관'].apply(lambda x : department_to_int(x))


In [76]:
class BERTDataset(Dataset):
    def __init__(self, dataset,bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([" ".join(dataset.iloc[i]['token'])]) for i in range(len(dataset))]
        self.labels = [np.int32(dataset.iloc[i]['접수기관']) for i in range(len(dataset))]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))



print('get bertmodel and vocab')
bertmodel, vocab = get_pytorch_kobert_model()


get bertmodel and vocab
using cached model. /home/mglee/VSCODE/git_folder/complain_department_classification/code/.cache/kobert_v1.zip
using cached model. /home/mglee/VSCODE/git_folder/complain_department_classification/code/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [77]:
max_grad_norm = 1
log_interval = 1000
warmup_ratio = 0.1
batch_size = 4
max_len = 512


print("data setting")
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

pred_data_B = BERTDataset(pred_data, tok, max_len, True, False)


pred_dataloader = torch.utils.data.DataLoader(
    pred_data_B, batch_size = batch_size, num_workers = 8)


data setting
using cached model. /home/mglee/VSCODE/git_folder/complain_department_classification/code/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [81]:

class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = len(label_data['접수기관'].unique()),   ##클래스 수 조정##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)


In [82]:
PATH = '../result/kobert/'

pred_model = torch.load(PATH + 'KoBERT_0623_e5.pt')  # 전체 모델을 통째로 불러옴, 클래스 선언 필수
pred_model.load_state_dict(torch.load(PATH + 'Kobert_0623_e5_state_dict.pt'))  # state_dict를 불러 온 후, 모델에 저장

<All keys matched successfully>

In [83]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [85]:
out_lst = []
pred_acc = 0
pred_model.eval()
for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(pred_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    label = label.long().to(device)
    out = pred_model(token_ids, valid_length, segment_ids)
    out_lst.append(out)
    pred_acc += calc_accuracy(out, label)
pred_acc / (batch_id+1)

  0%|          | 8/17632 [00:01<1:08:11,  4.31it/s]


RuntimeError: CUDA out of memory. Tried to allocate 48.00 MiB (GPU 1; 23.70 GiB total capacity; 20.22 GiB already allocated; 27.06 MiB free; 20.82 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF