In [1]:
import numpy as np
import pandas as pd
import tokenizers # 토큰화(tokenization)
import gensim # 임베딩(embedding) : 토큰을 숫자(벡터/좌표)로 변환
from tqdm import tqdm

### 1. 데이터 기초 전처리 및 사전 기반 알고리즘의 활용

In [2]:
np.random.seed(0)
moviedf = pd.read_csv('./data/naver_movie.csv', index_col=0).sample(500)
shoppingdf = pd.read_csv('./data/naver_shopping.csv', index_col=0).sample(500)
beepdf = pd.read_csv('./data/beep_dataset.csv', index_col=0).sample(500)
jobplanetdf = pd.read_csv('./data/jobplanet_review.csv', index_col=0)

moviedf.columns = ['text', 'label']
shoppingdf.columns = ['text', 'label']

In [3]:
textdf =\
pd.concat([moviedf, shoppingdf, beepdf,
           jobplanetdf[jobplanetdf['label']==1][['good', 'label']].rename({'good':'text'}, axis=1),
           jobplanetdf[jobplanetdf['label']==0][['bad', 'label']].rename({'bad':'text'}, axis=1)],
          axis=0, ignore_index=True)
textdf['label'] = textdf['label'].replace({0:1, 1:0})

In [4]:
bad_words = ['나쁜', '구린', '썩은', '별로', '최악', '끔찍', 
             '지루', '잠', '환불', '늑장', '늦장', '꼰대', '정치질', '억압'
             '하향', '수직']
textdf['pred'] = textdf['text'].str.contains('|'.join(bad_words)).astype(int)
textdf['correct'] = (textdf['label']==textdf['pred']).astype(int)
np.mean(textdf['correct'])

0.5465084184377588

### 2. 토큰화

In [5]:
mobiledf = pd.read_csv('./data/mobilecarrier_news.csv', index_col=0)
welfaredf = pd.read_csv('./data/welfare_news.csv', index_col=0)

In [6]:
corpus = pd.concat([textdf['text'], 
                    welfaredf['title'], welfaredf['content'], 
                    mobiledf['title'].drop_duplicates()], axis=0, ignore_index=True)
corpus = corpus.str.replace('\[(.*?)\]', '')
corpus = corpus.str.split().str.join(' ')

In [7]:
from tokenizers import SentencePieceBPETokenizer
bpe_tokenizer = SentencePieceBPETokenizer()
bpe_tokenizer.train_from_iterator(corpus)

In [8]:
bpe_tokenizer.encode('안녕하세요저는방송사에서PD로일을하고있습니다').tokens

['▁안녕', '하세요', '저', '는', '방송', '사에서', 'P', 'D', '로', '일을', '하고', '있', '습니다']

In [9]:
from tqdm import tqdm

In [10]:
token_corpus = []
for text in tqdm(corpus) :
    token_corpus.append(bpe_tokenizer.encode(text).tokens)
    
token_corpus = pd.Series(token_corpus)

100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:11<00:00, 1314.44it/s]


In [11]:
token_corpus = pd.Series([bpe_tokenizer.encode(text).tokens for text in tqdm(corpus)])

100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:11<00:00, 1371.39it/s]


In [12]:
token_corpus

0        [▁가르, 시아, 를, ▁알게, ▁해, 준, ▁영화, .., 마, 지막, ▁장면, ...
1        [▁이영, 화를, 보면서, ▁아프, 겠다, 라는, 생각, 이, 들, 엇, 다, ▁ᄏ...
2                          [▁이, ▁영화를, ▁보고, ▁암, 이, ▁나, 았다.]
3        [▁민, 들, 레, 부모가, ▁전하는, ▁아, 가, 페, 적인, 사랑, ..., 가...
4          [▁어린이집, ▁재, 롱, 잔, 치를, ▁만, 원, 주고, ▁보는, ▁기분, 이라면]
                               ...                        
15583            [▁SKT,, ▁3년, ▁연속, ▁', 최고, ▁이통사, ', ▁영, 예]
15584                [▁KT,, ▁AI, ▁코딩, ▁초등, 생, ▁꿈, 나무, ▁시상]
15585    [▁', 딸, ▁KT, ▁특별, 채용, ', ▁김성, 태, ▁사퇴, 에, .., 민...
15586    [▁SKT,, ▁내일, ▁분할, ▁후, ▁재, 상장, .., 주가, ▁날아, 오를, 까]
15587     [▁LG유플러스,, ▁인공지능, ▁관리, ▁'스마트, ▁양, 계, 장, ', ▁만든다]
Length: 15588, dtype: object

### 3. 임베딩

In [None]:
from gensim.models import Word2Vec, FastText

w2v_model = Word2Vec(token_corpus, sg=1, epochs=7)
ft_model = FastText(token_corpus, sg=1, epochs=7)

w2v_model.save('./model/word2vec.model')
ft_model.save('./model/fasttext.model')

In [13]:
from gensim.models import Word2Vec, FastText
w2v_model = Word2Vec.load('./model/word2vec.model')
ft_model = FastText.load('./model/fasttext.model')

FileNotFoundError: [Errno 2] No such file or directory: './model/word2vec.model'

### 4. 머신러닝

In [41]:
tokens = bpe_tokenizer.encode('이 회사는 분위기가 너무 별로다').tokens
embedding = np.mean([ft_model.wv[tk] for tk in tokens], axis=0)
embedding.shape

(100,)

In [42]:
def text_to_embedding(input_sent) :
    tokens = bpe_tokenizer.encode(input_sent).tokens
    embedding = np.mean([ft_model.wv[tk] for tk in tokens], axis=0)
    return embedding

In [43]:
textdf['embedding'] = [text_to_embedding(text) for text in tqdm(textdf['text'])]

100%|████████████████████████████████████████████████████████████████████████████| 3623/3623 [00:00<00:00, 5919.97it/s]


In [44]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

In [45]:
np.random.seed(0)
X_train, X_test, y_train, y_test =\
train_test_split(np.vstack(textdf['embedding'].values), textdf['label'].values, test_size=.25, random_state=0)

In [46]:
# Logistic Regression
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# Support Vector Machine
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

# Random Forest
rf_clf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=0)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(oob_score=True, random_state=0)

In [47]:
accuracy_score(y_test, lr_clf.predict(X_test)), f1_score(y_test, lr_clf.predict(X_test))

(0.7737306843267108, 0.7683615819209039)

In [48]:
accuracy_score(y_test, svm_clf.predict(X_test)), f1_score(y_test, svm_clf.predict(X_test))

(0.7924944812362031, 0.7772511848341231)

In [49]:
accuracy_score(y_test, rf_clf.predict(X_test)), f1_score(y_test, rf_clf.predict(X_test))

(0.7770419426048565, 0.7645687645687647)

In [50]:
def hr_sentiment_classifier(input_sent) :
    embedding = text_to_embedding(input_sent)
    output = svm_clf.predict(embedding.reshape(1,-1))[0]
    return output    

In [51]:
hr_sentiment_classifier('자유분방하고 수평적인 분위기')

1

In [52]:
hr_sentiment_classifier('연봉이 거의 안 오름')

0

In [53]:
hr_sentiment_classifier('초봉이 높아서 만족')

0

In [54]:
hr_sentiment_classifier('적어도 내가 속한 팀은 꼰대 쌉쓰레기임')

0

In [55]:
hr_sentiment_classifier('KT는 배상하라 배상하라')

0

In [56]:
hr_sentiment_classifier('믿음직한 회사 LG U+')

1

In [57]:
hr_sentiment_classifier('믿고 쓰는 SKT')

1