In [1]:
import pandas as pd
import mxnet
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

from sklearn.model_selection import train_test_split


device = torch.device("cuda:1")
print(f"Using {device}")

2023-03-10 15:51:06.954716: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


Using cuda:1


In [2]:
data = pd.read_csv("../../data/news.csv")

In [3]:
data.head(2)

Unnamed: 0.1,Unnamed: 0,url,title,body,key
0,0,https://n.news.naver.com/mnews/article/021/000...,카드업계도…‘모바일 페이’ 수수료 전쟁 시작된다,[\n\n\n\n\n\n간편결제 시장 日평균 2500억7개 카드사 결제대행업체 설립...,021/0002303358
1,1,https://n.news.naver.com/mnews/article/015/000...,카카오페이도 소상공인에 '활짝'…결제 API 공개,"[\n쉽게 가져다 쓰는 오픈 플랫폼 전략""온라인 결제시스템 도입 부담 덜 것""\n\...",015/0003716660


In [5]:
from konlpy.tag import Okt
okt = Okt()

In [6]:
def tokenizing(x):
    x = str(x)
    if len(x) < 3000:
        return okt.nouns(x)
    else:
        return ['Too Long']

In [7]:
data['token'] = data['body'].apply(lambda x : tokenizing(x))

In [8]:
data.to_csv("../../data/news_token.csv")

In [4]:
data = pd.read_csv("../../data/news_token.csv")

In [5]:
data['label'] = 1
data.tail(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,body,key,portal,token,label
62465,62465,62465,내년부터 '깜깜이' 간편결제 수수료 공시된다,"[ 금감원, 전자금융업자 수수료 공시 가이드라인 시행네이버페이, 카카오페이 10개사...",https://v.daum.net/v/20221228120203288,daum,"['금감원', '전자금융업', '수수료', '공시', '가이드라인', '시행', '...",1
62466,62466,62466,"파리바게뜨, 1월1일 '파바데이'서 3000원 할인 받으세요","[ 1월1일 하루, 1만5000원 이상 구매 시 3천원 혜택 파...",https://v.daum.net/v/20221230091108475,daum,"['하루', '이상', '구매', '시', '혜택', '파', '바데', '프로모션...",1


In [6]:

pred_data = data[['token', 'label']]

pred_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62467 entries, 0 to 62466
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   token   62467 non-null  object
 1   label   62467 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 976.2+ KB


In [12]:
train = pd.read_csv("../../data/ESG_split/ESG_label_token.csv")

In [13]:
label_to_int = {}
for i, item in enumerate(train['keyword'].unique()):
    label_to_int[item] = i

In [14]:
label_to_int

{'풍수해보험MOU': 0,
 '동네무료보험서비스': 1,
 'QR체크인재활용': 2,
 '영세중소상공인수수료우대': 3,
 '아름다운재단 기부': 4,
 '자립준비청년금융안심보험기부': 5,
 '같이가치와연말기부캠페인': 6,
 '상생기금 10억 조성': 7,
 '금융취약계층 응대 매뉴얼': 8,
 '금융사기및송금피해예방': 9,
 '악성앱탐지솔류션적용': 10,
 '계좌지킴이서비스': 11,
 'UNGC가입': 12,
 'TCFD지지선언': 13,
 'ESG추진위원회': 14,
 '지속가능경영보고서약속': 15,
 'normal': 16}

In [16]:
def department_to_int(x):
    try:
        return label_to_int[x]
    except:
        print('error')

In [7]:
class BERTDataset(Dataset):
    def __init__(self, dataset,bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([" ".join(dataset.iloc[i]['token'])]) for i in range(len(dataset))]
        self.labels = [np.int32(dataset.iloc[i]['label']) for i in range(len(dataset))]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

print('get bertmodel and vocab')
bertmodel, vocab = get_pytorch_kobert_model()


get bertmodel and vocab
using cached model. /home/mglee/VSCODE/git_folder/KakaoPay_ESG/code/Kobert/.cache/kobert_v1.zip
using cached model. /home/mglee/VSCODE/git_folder/KakaoPay_ESG/code/Kobert/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [8]:
max_grad_norm = 1
log_interval = 1000
warmup_ratio = 0.1
batch_size = 16
max_len = 512

print("data setting")
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

pred_data_B = BERTDataset(pred_data, tok, max_len, True, False)

pred_dataloader = torch.utils.data.DataLoader(
    pred_data_B, batch_size = batch_size, num_workers = 8)


data setting
using cached model. /home/mglee/VSCODE/git_folder/KakaoPay_ESG/code/Kobert/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [20]:
for i in pred_dataloader:
    print(i[0]) # input
    print(i[1]) # ?
    print(i[2]) # label
    print(i[3]) # ?
    break

tensor([[   2,  777, 7720,  ...,    1,    1,    1],
        [   2, 3432, 4907,  ...,    1,    1,    1],
        [   2, 1684, 1720,  ...,    1,    1,    1],
        ...,
        [   2,  693,  427,  ...,    1,    1,    1],
        [   2, 1832, 1706,  ...,    1,    1,    1],
        [   2, 4092, 7318,  ...,    1,    1,    1]], dtype=torch.int32)
tensor([365, 256,  99, 512, 177, 108, 195, 263, 512, 262, 512, 116, 109,   8,
        315, 123], dtype=torch.int32)
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32)
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=torch.int32)


In [9]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 2,   ##클래스 수 조정##
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)


In [10]:

pred_model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
pred_model.load_state_dict(torch.load('/home/mglee/VSCODE/git_folder/comments_sentiment_label/result/model/ourB_e5_0307.pt'))  # state_dict를 불러 온 후, 모델에 저장

# checkpoint = torch.load(PATH + 'all.tar')   # dict 불러오기
# pred_model.load_state_dict(checkpoint['model'])
#optimizer.load_state_dict(checkpoint['optimizer'])

<All keys matched successfully>

In [11]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    #print(f"acc {train_acc} /  mi {max_indices} ==  {Y}")
    return train_acc

### Data Set만 준비한다면 여기를 계속 다른 인풋으로 돌려보면서 확인 가능

In [12]:
MAX_VALS_LST = []
MAX_IND_LST = []

out_lst = []
pred_acc = 0
pred_model.eval()
for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(pred_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length= valid_length
    label = label.long().to(device)
    out = pred_model(token_ids, valid_length, segment_ids)
    out_lst.append(out.data.cpu())
    pred_acc += calc_accuracy(out, label)
    max_vals, max_indices = torch.max(out, 1)
    #MAX_VALS, MAX_IND = torch.topk(out, k =3)
    #MAX_VALS_LST.append(MAX_VALS)
    #MAX_IND_LST.append(MAX_IND)
final_acc = pred_acc / (batch_id+1)
print(f"accuracy of test data is {final_acc}")

  0%|          | 0/3905 [00:00<?, ?it/s]

accuracy of test data is 0.0


In [26]:
int_to_label = {v:k for k,v in label_to_int.items()}

In [13]:
pred = []
for batch in out_lst:
    for item in batch:
        pred.append(int(np.argmax(item)))

In [16]:
data['pred'] = pred

In [32]:
potential_dep = []

for batch in out_lst:
    for item in batch:
        poten = np.argsort(item)[-1:]
        temp = []
        for dep in poten:
            temp.append(int_to_label[int(dep)])
        potential_dep.append(temp)

In [33]:
pred_data['pred'] = potential_dep

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_data['pred'] = potential_dep


In [30]:
actual = []

for i in pred_data['접수기관']:
    actual.append(int_to_label[i])

In [70]:
pred_data['real'] = actual

In [34]:
pred_data

Unnamed: 0,token,label,pred
0,"[간편, 결제, 시장, 평균, 개, 카드, 사, 결제, 대행, 업체, 설립, 업체,...",1,[normal]
1,"[오픈, 플랫폼, 전략, 온라인, 결제, 시스템, 도입, 부담, 덜, 것, 카카오,...",1,[normal]
2,"[대한항공, 도입, 온라인, 결제, 시스템, 카카오, 페이, 페이, 코, 서비스, ...",1,[normal]
3,"[연합뉴스, 제공, 국세청, 전국, 무관, 서장, 회의, 종합소득세, 전화, 번, ...",1,[normal]
4,"[카카오, 페이, 타이틀, 화면, 카카오, 제공, 서울, 연합뉴스, 김태균, 기자,...",1,[normal]
...,...,...,...
62462,"[이상, 구매, 시, 혜택, 설, 선물, 계, 이상, 구매, 시, 할인, 이데일리,...",1,[normal]
62463,"[용인, 청년, 전세, 보증금, 반환, 보증, 보증, 료, 지원, 사업, 홍보, 포...",1,[normal]
62464,"[금융, 중심, 서민, 자금, 달이, 금융, 당국, 대응, 당부, 사진, 이미지, ...",1,[normal]
62465,"[금감원, 전자금융업, 수수료, 공시, 가이드라인, 시행, 네이버, 페이, 카카오,...",1,[normal]


In [45]:
pred_data.to_csv("../../result/news_pred.csv")

In [38]:
pred_data['prediction'] = pred_data['pred'].apply(lambda x : x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_data['prediction'] = pred_data['pred'].apply(lambda x : x[0])


In [43]:
pred_data = pred_data.drop('pred', axis =1)

In [75]:
score = 0
for pred, real in zip(pred_data['pred'], pred_data['real']):
    if real in pred:
        score += 1


In [39]:
print(f"accuracy of this model is {score / len(pred_data)} when recomend 3 best department")