In [1]:
import re
import pandas as pd
import numpy as np
import os
import glob
import json
import time
from datetime import datetime, timedelta
import pickle
import statsmodels.api as sm

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from collections import Counter
from itertools import chain
import matplotlib.pyplot as plt
import seaborn as sns
from kss import split_sentences
from operator import itemgetter

def pickle_reader(filename):
    """Load and return the object stored in the pickle file ``filename``.

    Parameters
    ----------
    filename : path-like
        Path of the pickle file, opened in binary read mode.

    Returns
    -------
    object
        Whatever object was serialized into the file.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while deserializing, so only
    use this on files from a trusted source.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

def pickle_writer(objecttosave, filename):
    """Serialize ``objecttosave`` into the file ``filename`` with pickle.

    Parameters
    ----------
    objecttosave : object
        Any picklable object.
    filename : path-like
        Destination path; the file is opened in binary write mode, so an
        existing file is overwritten.

    Returns
    -------
    None
    """
    # Protocol 5 is requested explicitly rather than relying on the default.
    protocol_version = 5
    with open(filename, 'wb') as sink:
        pickle.dump(objecttosave, sink, protocol=protocol_version)
        
# Default HTTP request headers (browser-like User-Agent plus an Accept list).
# The original User-Agent contained a stray backslash ("...10_9_5)\AppleWebKit"),
# which is an invalid escape sequence in a normal string literal and put a
# literal "\" into the header value; it is replaced by a space here.
# NOTE(review): presumably meant for `requests` calls - not used in the visible cells.
basic_header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}

### 1. 데이터 호출 및 전처리

In [2]:
# Review data (per the file name, Job Planet reviews); later cells read its
# 'good', 'bad' and 'label' columns. The first CSV column becomes the index.
jobplanetdf = pd.read_csv('./data/20211122_jobplanet_review.csv', index_col=0)
# News data (per the file name, company-welfare articles); later cells read its
# 'title' and 'content' columns. The first CSV column becomes the index.
newsdf = pd.read_csv('./data/20211124_companywelfare_newsdf.csv', index_col=0)

### 2. 사전 기반 알고리즘은 왜 성능이 좋지 않을까?

In [3]:
# Build a labelled text set: the 'good' text of rows with label 1 becomes a
# positive example, the 'bad' text of rows with label 0 a negative example.
positive_reviews = (
    jobplanetdf.loc[jobplanetdf['label'] == 1, 'good']
    .rename('text')
    .to_frame()
    .assign(label=1)
)
negative_reviews = (
    jobplanetdf.loc[jobplanetdf['label'] == 0, 'bad']
    .rename('text')
    .to_frame()
    .assign(label=0)
)
textdf = pd.concat([positive_reviews, negative_reviews], axis=0, ignore_index=True)

# Shuffle all rows; the global NumPy seed makes the permutation repeatable.
np.random.seed(0)
textdf = textdf.sample(frac=1).reset_index(drop=True)

In [4]:
# Hand-picked negative keywords for the dictionary-based baseline.
# Despite the name this is a list; it is joined with '|' into one pattern later.
negative_dict = [
    '나쁜', '별로', '최악', '구린',
    '꼰대', '박봉', '수직적', '하락',
    '위계적', '하향', '부족', '정치',
]

In [5]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [6]:
# Dictionary baseline: a text is predicted negative (0) when it contains any
# keyword from negative_dict, otherwise positive (1).
dict_pattern = '|'.join(negative_dict)
dict_pred = np.where(textdf['text'].str.contains(dict_pattern), 0, 1)
# f1_score expects (y_true, y_pred); the original call passed them the other
# way round. Binary F1 is symmetric in the two, so the value is unchanged, but
# the order now matches the later evaluation cells.
f1_score(textdf['label'], dict_pred)

0.7627494456762749

### 3. 토큰화(Tokenization)

In [7]:
# Training corpus for the tokenizer and embeddings: review texts plus news
# titles and bodies, stacked into one Series.
corpus = (
    pd.concat(
        [jobplanetdf['good'], jobplanetdf['bad'], newsdf['title'], newsdf['content']],
        axis=0,
        ignore_index=True,
    )
    # Strip bracketed tags such as "[...]". regex=True is explicit because the
    # default of Series.str.replace is literal matching in pandas >= 2.0, and
    # the raw string avoids the invalid "\[" escape of the original literal.
    .str.replace(r'\[(.*?)\]', '', regex=True)
    # Collapse every run of whitespace into a single space.
    .str.split()
    .str.join(' ')
)

In [8]:
from tokenizers import SentencePieceBPETokenizer
# Train a SentencePiece-style BPE tokenizer on the corpus; no arguments are
# passed, so the library defaults are used throughout.
bpe_tokenizer = SentencePieceBPETokenizer()
bpe_tokenizer.train_from_iterator(corpus)

In [9]:
# Sanity check: segment a sentence written partly without spaces.
# In the recorded output the syllable '짭' is missing from the token list.
bpe_tokenizer.encode('이회사는연봉상승률이 너무 짭니다').tokens

['▁이', '회사는', '연봉', '상승', '률이', '▁너무', '▁', '니다']

In [10]:
# Persist the trained tokenizer (vocab + merges). The target directory is
# created first so this cell also works on a fresh checkout where ./model
# does not exist yet.
os.makedirs('./model', exist_ok=True)
bpe_tokenizer.save_model('./model', 'company_tokenizer')

['./model\\company_tokenizer-vocab.json',
 './model\\company_tokenizer-merges.txt']

In [11]:
# Rebuild the tokenizer from the vocab/merges files written by save_model.
tokenizer_vocab_path = './model/company_tokenizer-vocab.json'
tokenizer_merges_path = './model/company_tokenizer-merges.txt'
bpe_tokenizer = SentencePieceBPETokenizer(tokenizer_vocab_path, tokenizer_merges_path)

In [12]:
# Tokenize every corpus entry into its BPE token strings.
# (The name says "pos", but the values are tokenizer tokens.)
pos_corpus = []
for sent in tqdm(corpus):
    pos_corpus.append(bpe_tokenizer.encode(sent).tokens)

100%|██████████████████████████████████████████████████████████████████████████| 14078/14078 [00:06<00:00, 2253.87it/s]


### 4. 단어-벡터 임베딩(Word Embedding)

In [13]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec, FastText

# Train skip-gram (sg=1) Word2Vec and FastText models on the tokenized corpus
# for 7 epochs each; all other hyperparameters are the gensim defaults.
# NOTE(review): with the default multi-worker training the vectors may not be
# bit-for-bit reproducible between runs - confirm if exact outputs matter.
w2v_model = Word2Vec(pos_corpus, sg=1, epochs=7)
ft_model = FastText(pos_corpus, sg=1, epochs=7)

# Persist both models next to the tokenizer files.
w2v_model.save('./model/word2vec.model')
ft_model.save('./model/fasttext.model')

In [14]:
# Reload the saved models so the cells below can run without retraining.
w2v_model = Word2Vec.load('./model/word2vec.model')
ft_model = FastText.load('./model/fasttext.model')

In [15]:
# Word2Vec vector for a token that is in the vocabulary
# (the recorded output below is truncated).
w2v_model.wv['연봉']

array([-0.3045936 ,  0.24693088,  0.14991562, -0.19427846,  0.02942798,
       -0.5580456 ,  0.36470547,  0.603408  ,  0.02112223, -0.2710036 ,
        0.23175699, -0.51069605, -0.12321895, -0.06243563,  0.25831795,
       -0.48998237, -0.12501289, -0.09887471,  0.21377285, -0.17097631,
        0.05593832,  0.03498149, -0.5302111 , -0.03486763, -0.36997467,
       -0.00781262, -0.14409682,  0.30812708,  0.4147889 , -0.10641888,
       -0.3635585 ,  0.28883496,  0.30548534,  0.1190175 , -0.55669063,
        0.1307146 ,  0.37754717, -0.33167517, -0.27467594,  0.6216755 ,
        0.01451701,  0.01517205, -0.3401161 , -0.4441151 ,  0.1682369 ,
        0.11139352,  0.27253142, -0.08971144, -0.41238874,  0.1733015 ,
        0.2512298 , -0.23599195, -0.1531766 ,  0.3504402 , -0.40995514,
       -0.02472387,  0.32006145, -0.28731182,  0.05471123,  0.47436896,
       -0.09398881, -0.11160287, -0.05029441, -0.2770624 ,  0.65167373,
        0.21116589,  0.13485175,  0.57634723,  0.01546386,  0.17

In [16]:
# Word2Vec has no vector for a token missing from its vocabulary. The KeyError
# is the point of this demo, so it is caught and printed; an uncaught
# exception would stop a "Restart & Run All" at this cell.
try:
    w2v_model.wv['고인물']
except KeyError as err:
    print(f'KeyError: {err}')

KeyError: "Key '고인물' not present"

In [17]:
# FastText returns a vector for the same string that the Word2Vec lookup
# rejected (the recorded output below is truncated).
ft_model.wv['고인물']

array([-0.0965476 ,  0.04265649,  0.20970912, -0.11436263, -0.07362945,
       -0.15878826, -0.08139955,  0.08280712, -0.08363578, -0.07037161,
        0.08758793, -0.01722459, -0.04463173,  0.03919538,  0.10181395,
       -0.04534522,  0.1079892 ,  0.005405  , -0.02099776, -0.05251576,
       -0.08790803,  0.05344651,  0.02138226, -0.16532232, -0.03179331,
       -0.09447569,  0.01245358,  0.19224785,  0.17586099,  0.08797318,
       -0.07642836, -0.06394865, -0.19346134,  0.17152007, -0.07800832,
       -0.01796177,  0.0741407 ,  0.01764975, -0.10188123,  0.11379147,
       -0.02929472, -0.02628394, -0.10384399, -0.08732098,  0.02019322,
        0.04238122,  0.01734072, -0.19451149, -0.06206492,  0.04358501,
       -0.1237733 , -0.0689846 , -0.00496455,  0.04128155, -0.0274299 ,
       -0.10521095,  0.13660187, -0.1518419 ,  0.0666372 ,  0.13695988,
        0.09532055, -0.05728863, -0.06408585,  0.15336484,  0.03646451,
        0.02260339,  0.00561686,  0.06553128, -0.17305465,  0.03

In [18]:
# Nearest vocabulary entries to the query under the FastText model, with scores.
ft_model.wv.most_similar('고인물')

[('▁고인물', 0.8431247472763062),
 ('▁꼰대', 0.7812214493751526),
 ('으신', 0.7570650577545166),
 ('▁맛집', 0.7554405331611633),
 ('아서', 0.7437090277671814),
 ('▁윗', 0.739393949508667),
 ('▁꼰대가', 0.7362608909606934),
 ('▁고인물이', 0.733004629611969),
 ('시절', 0.725588321685791),
 ('▁싸', 0.7252355217933655)]

In [19]:
# Nearest vocabulary entries to a second query term, with scores.
ft_model.wv.most_similar('비정규직')

[('▁비정규직', 0.9578745365142822),
 ('정규직', 0.917824923992157),
 ('▁정규직', 0.837492048740387),
 ('▁정규직과', 0.7432001829147339),
 ('계약직', 0.7379484176635742),
 ('▁자영업', 0.7310437560081482),
 ('▁교사의', 0.7232523560523987),
 ('▁계약직', 0.7094197869300842),
 ('업종', 0.704563319683075),
 ('▁저임금', 0.6973295211791992)]

In [20]:
# Similarity score between two employment-type terms (related pair).
ft_model.wv.similarity('계약직', '비정규직')

0.7379484

In [21]:
# Similarity score for a less related pair; the recorded value is much lower.
ft_model.wv.similarity('계약직', '육아휴직')

0.29617518

In [22]:
# Similarity score between two childcare-related terms.
ft_model.wv.similarity('출산', '육아휴직')

0.6617119

### 5. 문장 분류 (Document Classification)

In [23]:
# Worked example: tokenize one sentence, look up the FastText vector of each
# token, and average them into a single sentence vector.
tokens = bpe_tokenizer.encode('이 회사는연봉상승이 너무 짜다').tokens
embedding = np.mean([ft_model.wv[tk] for tk in tokens], axis=0)
# The shape shows the dimensionality of the averaged vector.
embedding.shape

(100,)

In [24]:
def text_to_embedding(input_sent) :
    """Turn a sentence into one vector by averaging its token embeddings.

    The sentence is tokenized with the module-level ``bpe_tokenizer`` and each
    token is looked up in the module-level FastText model ``ft_model``.

    Parameters
    ----------
    input_sent : str
        Sentence to embed.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``ft_model.wv.vector_size``. If the sentence
        yields no tokens (e.g. an empty string), a zero vector is returned.
    """
    tokens = bpe_tokenizer.encode(input_sent).tokens
    if not tokens :
        # np.mean over an empty list returns a scalar NaN (with a warning),
        # which breaks np.vstack / reshape(1, -1) downstream. Return a
        # fixed-size zero vector instead.
        # float32 is assumed to match gensim's vector dtype - TODO confirm.
        return np.zeros(ft_model.wv.vector_size, dtype=np.float32)
    return np.mean([ft_model.wv[tk] for tk in tokens], axis=0)

In [25]:
# Embed every labelled text and store the vectors in a new 'embedding' column.
sentence_vectors = []
for text in tqdm(textdf['text']):
    sentence_vectors.append(text_to_embedding(text))
textdf['embedding'] = sentence_vectors

100%|████████████████████████████████████████████████████████████████████████████| 2123/2123 [00:00<00:00, 5203.23it/s]


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

In [27]:
np.random.seed(0)
# Stack the per-row vectors into a 2-D feature matrix and pull out the labels.
feature_matrix = np.vstack(textdf['embedding'].values)
label_vector = textdf['label'].values
# Hold out 25% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix, label_vector, test_size=.25, random_state=0
)

In [28]:
# Logistic Regression with default settings, fitted on the training split.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# Support Vector Machine classifier with default settings.
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

# Random Forest: 100 trees, out-of-bag scoring enabled, fixed random_state.
rf_clf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=0)
# Last expression of the cell, so the fitted estimator's repr is displayed.
rf_clf.fit(X_train, y_train)

RandomForestClassifier(oob_score=True, random_state=0)

In [29]:
# (accuracy, F1) of logistic regression on the held-out split.
# Predict once and reuse the result instead of running inference twice.
lr_pred = lr_clf.predict(X_test)
accuracy_score(y_test, lr_pred), f1_score(y_test, lr_pred)

(0.8926553672316384, 0.905785123966942)

In [30]:
# (accuracy, F1) of the SVM on the held-out split.
# Predict once and reuse the result instead of running inference twice.
svm_pred = svm_clf.predict(X_test)
accuracy_score(y_test, svm_pred), f1_score(y_test, svm_pred)

(0.9001883239171374, 0.9115191986644408)

In [31]:
# (accuracy, F1) of the random forest on the held-out split.
# Predict once and reuse the result instead of running inference twice.
rf_pred = rf_clf.predict(X_test)
accuracy_score(y_test, rf_pred), f1_score(y_test, rf_pred)

(0.864406779661017, 0.8842443729903537)

In [32]:
def hr_sentiment_classifier(input_sent):
    """Predict the sentiment label of one sentence with the fitted SVM.

    The sentence is embedded with ``text_to_embedding`` and passed to the
    module-level ``svm_clf`` as a single-row feature matrix.

    Parameters
    ----------
    input_sent : str
        Sentence to classify.

    Returns
    -------
    The predicted label for the sentence: 1 for positive, 0 for negative,
    following the labels the classifier was trained on.
    """
    single_row = text_to_embedding(input_sent).reshape(1, -1)
    predictions = svm_clf.predict(single_row)
    return predictions[0]

In [33]:
# Positive remark about a free, horizontal atmosphere; recorded prediction: 1.
hr_sentiment_classifier('자유분방하고 수평적인 분위기')

1

In [34]:
# Complaint that salary barely rises; recorded prediction: 0.
hr_sentiment_classifier('연봉이 거의 안 오름')

0

In [35]:
# Satisfaction with a high starting salary; recorded prediction: 1.
hr_sentiment_classifier('초봉이 높아서 만족')

1

In [36]:
# Slang-heavy complaint about the team; recorded prediction: 0.
hr_sentiment_classifier('적어도 내가 속한 팀은 꼰대 쌉쓰레기임')

0

In [37]:
# Slang complaint about entrenched long-timers; recorded prediction: 0.
hr_sentiment_classifier('월급 꿀빨 고인물들 너무 많음')

0

In [38]:
# Praise for reliable seniors one can learn from; the recorded prediction is
# 0 (negative), i.e. this example appears to be misclassified.
hr_sentiment_classifier('믿음직하고 배울점 많은 선배들')

0

In [39]:
# "This place is heaven" - positive wording, yet the recorded prediction is
# 0 (negative), i.e. this example appears to be misclassified.
hr_sentiment_classifier('여긴 천국이야')

0

In [40]:
# Sarcastic remark that nobody should stay longer than two years;
# recorded prediction: 0.
hr_sentiment_classifier('여기서는 2년 넘게 있으면 안 되는거 다들 알죠? ^오^')

0