In [1]:
import numpy as np
import pandas as pd
import tokenizers # 토큰화(tokenization)
import gensim # 임베딩(embedding) : 토큰을 숫자(벡터/좌표)로 변환
from tqdm import tqdm

### 1. 데이터 기초 전처리 및 사전 기반 알고리즘의 활용

In [2]:
np.random.seed(0)
moviedf = pd.read_csv('./data/naver_movie.csv', index_col=0).sample(500)
shoppingdf = pd.read_csv('./data/naver_shopping.csv', index_col=0).sample(500)
beepdf = pd.read_csv('./data/beep_dataset.csv', index_col=0).sample(500)
jobplanetdf = pd.read_csv('./data/jobplanet_review.csv', index_col=0)

moviedf.columns = ['text', 'label']
shoppingdf.columns = ['text', 'label']

In [3]:
textdf =\
pd.concat([moviedf, shoppingdf, beepdf,
           jobplanetdf[jobplanetdf['label']==1][['good', 'label']].rename({'good':'text'}, axis=1),
           jobplanetdf[jobplanetdf['label']==0][['bad', 'label']].rename({'bad':'text'}, axis=1)],
          axis=0, ignore_index=True)
textdf['label'] = textdf['label'].replace({0:1, 1:0})

In [4]:
bad_words = ['나쁜', '구린', '썩은', '별로', '최악', '끔찍', 
             '지루', '잠', '환불', '늑장', '늦장', '꼰대', '정치질', '억압'
             '하향', '수직']
textdf['pred'] = textdf['text'].str.contains('|'.join(bad_words)).astype(int)
textdf['correct'] = (textdf['label']==textdf['pred']).astype(int)
np.mean(textdf['correct'])

0.5465084184377588

### 2. 토큰화

In [5]:
mobiledf = pd.read_csv('./data/mobilecarrier_news.csv', index_col=0)
welfaredf = pd.read_csv('./data/welfare_news.csv', index_col=0)

In [6]:
corpus = pd.concat([textdf['text'], 
                    welfaredf['title'], welfaredf['content'], 
                    mobiledf['title'].drop_duplicates()], axis=0, ignore_index=True)
corpus = corpus.str.replace('\[(.*?)\]', '')
corpus = corpus.str.split().str.join(' ')

In [7]:
from tokenizers import SentencePieceBPETokenizer
bpe_tokenizer = SentencePieceBPETokenizer()
bpe_tokenizer.train_from_iterator(corpus)

In [8]:
bpe_tokenizer.encode('안녕하세요저는방송사에서PD로일을하고있습니다').tokens

['▁안녕', '하세요', '저', '는', '방송', '사에서', 'P', 'D', '로', '일을', '하고', '있', '습니다']

In [9]:
from tqdm import tqdm

In [10]:
token_corpus = []
for text in tqdm(corpus) :
    token_corpus.append(bpe_tokenizer.encode(text).tokens)
    
token_corpus = pd.Series(token_corpus)

100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:11<00:00, 1314.44it/s]


In [11]:
token_corpus = pd.Series([bpe_tokenizer.encode(text).tokens for text in tqdm(corpus)])

100%|██████████████████████████████████████████████████████████████████████████| 15588/15588 [00:11<00:00, 1371.39it/s]


In [12]:
token_corpus

0        [▁가르, 시아, 를, ▁알게, ▁해, 준, ▁영화, .., 마, 지막, ▁장면, ...
1        [▁이영, 화를, 보면서, ▁아프, 겠다, 라는, 생각, 이, 들, 엇, 다, ▁ᄏ...
2                          [▁이, ▁영화를, ▁보고, ▁암, 이, ▁나, 았다.]
3        [▁민, 들, 레, 부모가, ▁전하는, ▁아, 가, 페, 적인, 사랑, ..., 가...
4          [▁어린이집, ▁재, 롱, 잔, 치를, ▁만, 원, 주고, ▁보는, ▁기분, 이라면]
                               ...                        
15583            [▁SKT,, ▁3년, ▁연속, ▁', 최고, ▁이통사, ', ▁영, 예]
15584                [▁KT,, ▁AI, ▁코딩, ▁초등, 생, ▁꿈, 나무, ▁시상]
15585    [▁', 딸, ▁KT, ▁특별, 채용, ', ▁김성, 태, ▁사퇴, 에, .., 민...
15586    [▁SKT,, ▁내일, ▁분할, ▁후, ▁재, 상장, .., 주가, ▁날아, 오를, 까]
15587     [▁LG유플러스,, ▁인공지능, ▁관리, ▁'스마트, ▁양, 계, 장, ', ▁만든다]
Length: 15588, dtype: object

### 3. 임베딩

In [None]:
from gensim.models import Word2Vec, FastText

w2v_model = Word2Vec(token_corpus, sg=1, epochs=7)
ft_model = FastText(token_corpus, sg=1, epochs=7)

In [48]:
w2v_model.wv.similarity('연봉', '코로나')

0.1151544

In [49]:
w2v_model.wv['행복']

array([ 0.16521913, -0.18701477, -0.39698306,  0.0355202 ,  0.05798006,
       -0.13161759, -0.37331873, -0.11356381, -0.10481597, -0.17221938,
        0.06244675, -0.2788491 , -0.12742016, -0.0970618 , -0.03446411,
        0.08549043,  0.4998588 ,  0.12588818,  0.4782302 , -0.7285214 ,
        0.02492709,  0.23192786, -0.75775456,  0.07073252,  0.11411864,
        0.25108618,  0.03886351,  0.06889918, -0.69241524, -0.28796047,
       -0.06283448,  0.09872311,  0.07186755,  0.12980345,  0.0334126 ,
        0.05788888, -0.05179298, -0.00893757,  0.09150636,  0.10221846,
        0.03644586,  0.29304215, -0.04010261,  0.08229563,  0.22286223,
       -0.38315177, -0.5096461 , -0.09943447,  0.21590006,  0.10774019,
        0.03086062, -0.06111997, -0.2096073 ,  0.5522317 , -0.610717  ,
       -0.15873753, -0.44212157, -0.17393778, -0.12312325,  0.09924411,
        0.00348979, -0.07833613, -0.5191681 ,  0.21401459, -0.03272829,
       -0.10123712,  0.09055684,  0.3246691 , -0.04846293,  0.39

### 4. 머신러닝

In [50]:
tokens = bpe_tokenizer.encode('이 회사는 분위기가 너무 별로다').tokens
embedding = np.mean([ft_model.wv[tk] for tk in tokens], axis=0)

In [52]:
tokens

['▁이', '▁회사는', '▁분위기가', '▁너무', '▁별로', '다']

In [53]:
ft_model.wv['▁이']

array([-1.03703193e-01,  1.69759005e-01,  1.23933546e-01,  8.58766019e-01,
       -2.31962800e-01, -1.75621659e-01, -4.91009712e-01,  4.05360132e-01,
       -2.08736733e-02, -7.65724182e-02, -1.16519898e-01,  2.12718487e-01,
        5.64340055e-02, -2.61876851e-01, -9.41539407e-02, -4.33692902e-01,
        3.42092156e-01,  3.70937586e-02,  3.45549166e-01, -2.68702686e-01,
        6.30878136e-02,  1.94062203e-01,  4.24055099e-01,  3.10822845e-01,
       -1.54041573e-01,  8.44970942e-02, -2.47456565e-01,  1.96155161e-01,
        2.00928092e-01,  9.69892740e-01,  5.60990393e-01, -5.56201935e-01,
        1.32961914e-01, -5.05102426e-02,  1.26770914e-01,  3.62415195e-01,
       -1.93500459e-01,  3.75687853e-02,  4.75911945e-02, -2.59729683e-01,
       -2.50874996e-01,  1.78010389e-01,  2.44793147e-01, -2.59054899e-01,
       -4.21470135e-01, -8.07233036e-01,  1.21662267e-01,  3.34195524e-01,
       -6.93352103e-01,  5.15397549e-01, -6.21641397e-01, -3.13594043e-02,
        1.16999477e-01,  

In [54]:
def text_to_embedding(input_sent) :
    tokens = bpe_tokenizer.encode(input_sent).tokens
    embedding = np.mean([ft_model.wv[tk] for tk in tokens], axis=0)
    return embedding

In [57]:
textdf['embedding'] = [text_to_embedding(text) for text in textdf['text']]

In [64]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm


from sklearn.metrics import accuracy_score, f1_score

In [66]:
np.random.seed(0)
X_train, X_test, y_train, y_test =\
train_test_split(np.vstack(textdf['embedding'].values), textdf['label'].values, test_size=.25, random_state=0)

In [67]:
# Logistic Regression
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# Support Vector Machine
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)

# Random Forest
rf_clf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=0)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(oob_score=True, random_state=0)

In [70]:
# 로지스틱 회귀
accuracy_score(y_test, lr_clf.predict(X_test)), f1_score(y_test, lr_clf.predict(X_test))

(0.7527593818984547, 0.7533039647577092)

In [71]:
# 서포트벡터머신
accuracy_score(y_test, svm_clf.predict(X_test)), f1_score(y_test, svm_clf.predict(X_test))

(0.7781456953642384, 0.7895287958115182)

In [72]:
# 랜덤포레스트
accuracy_score(y_test, rf_clf.predict(X_test)), f1_score(y_test, rf_clf.predict(X_test))

(0.7770419426048565, 0.7823275862068965)

In [74]:
def sentiment_classifier(input_sent) :
    embedding = text_to_embedding(input_sent)
    output = svm_clf.predict(embedding.reshape(1,-1))[0]
    return output    

In [82]:
sentiment_classifier('7ㅐㄴㅏ븐ㅅH77ㅣ')

0