In [1]:
import re
import pandas as pd
import numpy as np
import os
import glob
import json
import time
from datetime import datetime, timedelta
import pickle
import statsmodels.api as sm

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from collections import Counter
from itertools import chain
import matplotlib.pyplot as plt
import seaborn as sns
from kss import split_sentences
from operator import itemgetter

def pickle_reader(filename) :
    """Return the object stored in the pickle file at ``filename``.

    The file is opened in binary mode and closed again before returning.
    Note: unpickling can execute arbitrary code, so only load trusted files.
    """
    with open(filename, 'rb') as handle :
        return pickle.load(handle)

def pickle_writer(objecttosave, filename) :
    """Serialize ``objecttosave`` into ``filename`` using pickle protocol 5.

    The target file is opened in binary write mode, so an existing file is
    overwritten. Nothing is returned.
    """
    with open(filename, 'wb') as handle :
        pickle.Pickler(handle, protocol=5).dump(objecttosave)
        
# HTTP request headers that imitate a desktop browser.
# The User-Agent must not contain a backslash: "\A" is an invalid escape
# sequence in a normal string literal and would send a literal "\" in the header.
basic_header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}

### 1. 데이터 호출 및 전처리

In [3]:
np.random.seed(0)  # DataFrame.sample() below has no random_state, so it draws from NumPy's global RNG

# Datasets that are used in full.
jobplanetdf = pd.read_csv('./data/jobplanet_review.csv', index_col=0)
welfarenewsdf = pd.read_csv('./data/welfare_news.csv', index_col=0)

# Datasets reduced to 500 random rows each (sampling order kept: movie, shopping, enter).
moviedf = pd.read_csv('./data/naver_movie.csv', index_col=0).sample(500)
shoppingdf = pd.read_csv('./data/naver_shopping.csv', index_col=0).sample(500)
enterdf = pd.read_csv('./data/beep_dataset.csv', index_col=0).sample(500)

# Give the three sampled frames the same two column names.
for sampled_df in (moviedf, shoppingdf, enterdf) :
    sampled_df.columns = ['text', 'label']

### 2. 사전 기반 알고리즘은 왜 성능이 좋지 않을까?

In [4]:
np.random.seed(0)

# Rows of jobplanetdf with label == 1 contribute their 'good' column as text,
# rows with label == 0 contribute their 'bad' column.
positive_reviews = (
    jobplanetdf.loc[jobplanetdf['label'] == 1, 'good']
    .rename('text')
    .to_frame()
    .assign(label=1)
)
negative_reviews = (
    jobplanetdf.loc[jobplanetdf['label'] == 0, 'bad']
    .rename('text')
    .to_frame()
    .assign(label=0)
)

# Stack all labelled sources into one frame with a fresh 0..n-1 index.
stacked_df = pd.concat(
    [positive_reviews, negative_reviews, moviedf, shoppingdf, enterdf],
    axis=0,
    ignore_index=True,
)

# Shuffle every row (frac=1) and renumber the index.
textdf = stacked_df.sample(frac=1).reset_index(drop=True)

In [5]:
# Hand-picked negative keywords for the dictionary-based baseline.
negative_dict = [
    '나쁜', '노잼', '쓰레기', '별로', '최악', '구린',
    '꼰대', '박봉', '수직적', '하락', '위계적', '하향',
    '부족', '정치질', '지루', '졸림', '불편',
]

In [6]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [7]:
# Dictionary baseline: predict 0 when the text contains any negative_dict entry, otherwise 1.
# Each entry is escaped so it is matched literally even if it contains regex metacharacters.
negative_pattern = '|'.join(map(re.escape, negative_dict))
dict_pred = np.where(textdf['text'].str.contains(negative_pattern), 0, 1)

# scikit-learn metrics take (y_true, y_pred) in that order.
f1_score(textdf['label'], dict_pred)

0.6990846681922197

### 3. 토큰화(Tokenization)

In [9]:
# Combine the labelled texts with the news titles and bodies into a single Series.
corpus = pd.concat([textdf['text'], welfarenewsdf['title'], welfarenewsdf['content']], axis=0, ignore_index=True)

# Remove bracketed tags such as "[...]".
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default; the raw string avoids the invalid "\[" escape sequence.
corpus = corpus.str.replace(r'\[(.*?)\]', '', regex=True)

# Collapse every run of whitespace into a single space and trim both ends.
corpus = corpus.str.split().str.join(' ')
corpus

0        복리후생이 좋습니다. 도서관, 헬스장, 구내식당, 시간선택제 근무 등 직원들이 누릴...
1                          B급도 아닌 C급 영화...하나도 안무서우니 참고하길..
2        일정에 차질이 없다면 연차 사용에 대한 터치 제로. 워라밸이 잘 지켜짐. 다들 야근...
3                   자체개발 Erp 거지같음, 결재하나 하려면 해야할 과정이 너무 많음,
4                                     마지막의 억지 신파극만 아니면 볼만함
                               ...                        
13450    공무원이 무급 육아휴직 기간 기초생활수급비를 수령한 것은 정당하다는 판결이 나왔다....
13451    무급 육아휴직 기간에 기초생활수급비를 수령해 반환 명령을 받은 전 강릉시 공무원이 ...
13452    무급 육아휴직 중 기초생활수급비를 수령해 급여 반환 명령을 받은 전 강릉시 공무원이...
13453    ESG(환경·사회·지배구조) 열풍이 불면서 벤처캐피털(VC) 사이에서 ‘임팩트 투자...
13454    춘천지법 강릉지원 행정1부는 무급 육아휴직 중에 기초생활수급비를 받은 강릉시 전 공...
Length: 13455, dtype: object

In [10]:
# Train a SentencePiece-style BPE tokenizer from scratch on `corpus`.
# No training options are passed, so the library defaults apply.
from tokenizers import SentencePieceBPETokenizer
bpe_tokenizer = SentencePieceBPETokenizer()
bpe_tokenizer.train_from_iterator(corpus)

In [11]:
# Inspect the subword tokens for a sentence whose first words are written without spaces.
bpe_tokenizer.encode('이회사는연봉상승률이 너무 짭니다').tokens

['▁이', '회사', '는', '연봉', '상승', '률이', '▁너무', '▁', '니다']

In [14]:
# Inspect the subword tokens for a sentence written entirely without spaces.
bpe_tokenizer.encode('KT는보상정책빨리내라').tokens

['▁KT는', '보상', '정책', '빨리', '내', '라']

In [15]:
# Save the trained tokenizer files under ./model with the 'company_tokenizer' prefix;
# the call returns the paths of the written vocab and merges files.
bpe_tokenizer.save_model('./model', 'company_tokenizer')

['./model\\company_tokenizer-vocab.json',
 './model\\company_tokenizer-merges.txt']

In [16]:
# Rebuild the tokenizer from the vocab/merges files saved earlier.
vocab_path = './model/company_tokenizer-vocab.json'
merges_path = './model/company_tokenizer-merges.txt'
bpe_tokenizer = SentencePieceBPETokenizer(vocab_path, merges_path)

In [17]:
# Tokenize every corpus entry; each element of pos_corpus is that entry's list of token strings.
pos_corpus = []
for sentence in tqdm(corpus) :
    pos_corpus.append(bpe_tokenizer.encode(sentence).tokens)

100%|██████████████████████████████████████████████████████████████████████████| 13455/13455 [00:05<00:00, 2278.19it/s]


### 4. 단어-벡터 임베딩(Word Embedding)

In [18]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec, FastText

# Both models are trained on the tokenized corpus with the same settings:
# skip-gram (sg=1) for 7 epochs.
shared_params = {'sg': 1, 'epochs': 7}
w2v_model = Word2Vec(pos_corpus, **shared_params)
ft_model = FastText(pos_corpus, **shared_params)

# Persist both fitted models so later cells can reload them without retraining.
for fitted_model, model_path in ((w2v_model, './model/word2vec.model'),
                                 (ft_model, './model/fasttext.model')) :
    fitted_model.save(model_path)

In [19]:
# Reload both embedding models from the files written by the training cell.
w2v_model = Word2Vec.load('./model/word2vec.model')
ft_model = FastText.load('./model/fasttext.model')

In [20]:
# Look up the Word2Vec vector stored for the token '연봉'.
w2v_model.wv['연봉']

array([-6.98279560e-01,  3.41737062e-01,  4.81373429e-01,  3.54369789e-01,
       -2.79701859e-01,  1.10048808e-01, -1.22517772e-01, -5.21665497e-04,
        1.12048402e-01,  1.17490590e-01, -1.56347454e-01, -6.73366129e-01,
       -3.80719360e-03,  3.58526319e-01, -1.47793680e-01,  6.98661357e-02,
       -1.11944415e-02,  3.43430080e-02,  2.76096612e-01,  1.71718538e-01,
       -2.39261791e-01,  7.06743121e-01, -2.96038598e-01,  1.93332553e-01,
        2.76378971e-02, -2.52391368e-01, -1.96680278e-01,  3.61599505e-01,
       -2.63215840e-01, -3.40067118e-01,  1.65190861e-01,  1.17417775e-01,
       -1.76941484e-01,  3.36162060e-01, -3.97318453e-01, -3.13215256e-01,
        3.53982449e-01, -2.79376924e-01,  2.68941730e-01,  2.26636097e-01,
        2.97791541e-01,  2.77890172e-02, -8.18398148e-02, -1.17654547e-01,
       -1.14365876e-01,  1.55705705e-01,  4.62470949e-02, -5.12289286e-01,
        6.46722270e-03, -1.27233341e-01, -1.40132651e-01,  3.88348498e-03,
        1.42480075e-01, -

In [21]:
# Word2Vec has no vector for keys it never saw during training.
# Catch the KeyError and print it so the demonstration stays visible
# without stopping a "Restart & Run All".
try :
    w2v_model.wv['고인물']
except KeyError as err :
    print(f'KeyError: {err}')

KeyError: "Key '고인물' not present"

In [22]:
# Same lookup with the FastText model; unlike the Word2Vec lookup of this key,
# the recorded output here is a vector.
ft_model.wv['고인물']

array([-0.09940056, -0.03564939,  0.09870532,  0.02966512, -0.01790801,
       -0.01233355, -0.00055947,  0.09083677, -0.05760286, -0.0041881 ,
        0.02426949, -0.13433151,  0.00862485,  0.10676658, -0.12296668,
        0.04470526,  0.06632576, -0.01397197,  0.01218883, -0.05793754,
       -0.1553409 ,  0.14258975, -0.05602446, -0.02750884,  0.00367039,
       -0.00680569, -0.04691146,  0.12103314,  0.00774853, -0.00323641,
        0.0120238 , -0.08482864, -0.0089274 ,  0.05476988, -0.06147758,
       -0.05811347, -0.10383523, -0.00151175,  0.04954945,  0.05613916,
       -0.02653487,  0.02346732, -0.01719736,  0.10358842,  0.02268302,
        0.04163911, -0.06888217, -0.15899532, -0.04606161,  0.00088532,
       -0.06294958,  0.05990667,  0.01108983,  0.01025069, -0.01414967,
       -0.07395557,  0.0735611 ,  0.00243848,  0.04878221,  0.12280196,
        0.00050942, -0.00198849, -0.07220486,  0.08857828,  0.00114007,
       -0.03337764,  0.04306204,  0.03597564, -0.073215  ,  0.10

In [30]:
# List the FastText neighbours most similar to '데이터', as (token, score) pairs.
ft_model.wv.most_similar('데이터')

[('빅데이터', 0.9183898568153381),
 ('▁빅데이터', 0.87186199426651),
 ('▁데이터베이스', 0.8555660843849182),
 ('중앙화', 0.8530203104019165),
 ('▁데이터센터', 0.8470616936683655),
 ('▁DLP', 0.8467522263526917),
 ('▁보안관제', 0.8399450778961182),
 ('위협', 0.835642397403717),
 ('접근', 0.8345462679862976),
 ('▁데이터', 0.832171618938446)]

In [31]:
# List the FastText neighbours most similar to '보상', as (token, score) pairs.
ft_model.wv.most_similar('보상')

[('징수', 0.8502567410469055),
 ('계약서', 0.8480408787727356),
 ('여금', 0.8358345031738281),
 ('유지', 0.8314090371131897),
 ('보험의', 0.8235677480697632),
 ('청구', 0.8230589628219604),
 ('▁산업재해보상', 0.8089543581008911),
 ('▁살생물', 0.808185875415802),
 ('개선', 0.8065752983093262),
 ('▁징수', 0.7960982918739319)]

In [37]:
# Similarity score between the FastText vectors of '계약직' and '비정규직'.
ft_model.wv.similarity('계약직', '비정규직')

0.77200586

In [38]:
# Similarity score between the FastText vectors of '보상' and '청구'.
ft_model.wv.similarity('보상', '청구')

0.8230589

In [39]:
# Similarity score between the FastText vectors of '보상' and '계약직'.
ft_model.wv.similarity('보상', '계약직')

0.5089019

In [40]:
# Similarity score between the FastText vectors of '보상' and '할리우드'.
ft_model.wv.similarity('보상', '할리우드')

0.28621542

### 5. 문장 분류 (Sentence Classification)

In [41]:
# Worked example: sentence -> subword tokens -> one FastText vector per token -> element-wise mean.
tokens = bpe_tokenizer.encode('이 회사는연봉상승이 너무 짜다').tokens
token_vectors = [ft_model.wv[tk] for tk in tokens]
embedding = np.mean(token_vectors, axis=0)
embedding.shape

(100,)

In [42]:
def text_to_embedding(input_sent, tokenizer=None, model=None) :
    """Embed a sentence as the mean of its tokens' word vectors.

    Parameters
    ----------
    input_sent : str
        Sentence to embed.
    tokenizer : optional
        Object whose ``encode(text).tokens`` yields the token strings.
        Defaults to the notebook-level ``bpe_tokenizer``.
    model : optional
        Object exposing ``wv[token]`` and ``wv.vector_size``.
        Defaults to the notebook-level ``ft_model``.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.wv.vector_size``. When the sentence
        yields no tokens (for example an empty string) a zero vector is
        returned; ``np.mean`` over an empty list would give a scalar NaN
        that breaks stacking the embeddings into a matrix.
    """
    if tokenizer is None :
        tokenizer = bpe_tokenizer
    if model is None :
        model = ft_model

    tokens = tokenizer.encode(input_sent).tokens
    if not tokens :
        return np.zeros(model.wv.vector_size, dtype=np.float32)

    return np.mean([model.wv[tk] for tk in tokens], axis=0)

In [43]:
# Compute one sentence embedding per row and store it in a new 'embedding' column.
sentence_embeddings = []
for sentence in tqdm(textdf['text']) :
    sentence_embeddings.append(text_to_embedding(sentence))
textdf['embedding'] = sentence_embeddings

100%|████████████████████████████████████████████████████████████████████████████| 3623/3623 [00:00<00:00, 5919.97it/s]


In [44]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

In [45]:
np.random.seed(0)

# Stack the per-row embedding vectors into a 2-D feature matrix.
feature_matrix = np.vstack(textdf['embedding'].values)
label_vector = textdf['label'].values

# Hold out 25% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, label_vector, test_size=.25, random_state=0
)

In [46]:
# Logistic Regression
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# Support Vector Machine
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

# Random Forest
rf_clf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=0)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(oob_score=True, random_state=0)

In [47]:
# Logistic regression on the held-out split: (accuracy, F1).
lr_pred = lr_clf.predict(X_test)
(accuracy_score(y_test, lr_pred), f1_score(y_test, lr_pred))

(0.7737306843267108, 0.7683615819209039)

In [48]:
# SVM on the held-out split: (accuracy, F1).
svm_pred = svm_clf.predict(X_test)
(accuracy_score(y_test, svm_pred), f1_score(y_test, svm_pred))

(0.7924944812362031, 0.7772511848341231)

In [49]:
# Random forest on the held-out split: (accuracy, F1).
rf_pred = rf_clf.predict(X_test)
(accuracy_score(y_test, rf_pred), f1_score(y_test, rf_pred))

(0.7770419426048565, 0.7645687645687647)

In [50]:
def hr_sentiment_classifier(input_sent) :
    """Predict the label of a single sentence with the fitted ``svm_clf``.

    The sentence is embedded with ``text_to_embedding`` and reshaped to one
    row, because ``predict`` expects a 2-D array. The single prediction is
    returned.
    """
    single_row = text_to_embedding(input_sent).reshape(1, -1)
    predictions = svm_clf.predict(single_row)
    return predictions[0]

In [51]:
# Spot check on a hand-written sentence (roughly: "free-spirited, flat atmosphere").
hr_sentiment_classifier('자유분방하고 수평적인 분위기')

1

In [52]:
# Spot check on a hand-written sentence (roughly: "salary barely goes up").
hr_sentiment_classifier('연봉이 거의 안 오름')

0

In [53]:
# Spot check on a hand-written sentence (roughly: "satisfied because the starting salary is high").
# NOTE(review): the recorded prediction is 0 although the sentence reads as positive; looks like a misclassification.
hr_sentiment_classifier('초봉이 높아서 만족')

0

In [54]:
# Spot check on a hand-written sentence using strongly negative slang about the writer's team.
hr_sentiment_classifier('적어도 내가 속한 팀은 꼰대 쌉쓰레기임')

0

In [55]:
# Spot check on a sentence that is not a company review (roughly: "KT, compensate, compensate").
hr_sentiment_classifier('KT는 배상하라 배상하라')

0

In [56]:
# Spot check on a hand-written sentence (roughly: "trustworthy company LG U+").
hr_sentiment_classifier('믿음직한 회사 LG U+')

1

In [57]:
# Spot check on a hand-written sentence (roughly: "SKT, which you can trust and use").
hr_sentiment_classifier('믿고 쓰는 SKT')

1