## 네이버 영화평 감성분석
- Tokenizer function
- TfidfVectorzier + Logistic Regression

In [27]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [28]:
# 네이버 영화 리뷰 데이터로 검색
train_df = pd.read_csv('./data/Naver_movie_train_preprocesssed,tsv', sep='\t')
test_df = pd.read_csv('./data/Naver_movie_test_preprocessed.tsv', sep='\t')
train_df.shape, test_df.shape

FileNotFoundError: [Errno 2] No such file or directory: './data/naver_movie_train_전처리완료,tsv'

In [29]:
from konlpy.tag import Okt
okt = Okt()

In [30]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을','ㅋㅋ','ㅠㅠ','ㅎㅎ']
a = '은 는 이 가'.split() # stop words 추가
stopwords.append(a)

In [31]:
def okt_tokenizer(text):
    morphs = okt.morphs(text, stem=True)
    tokens = [word for word in morphs if word not in stopwords]
    return tokens


In [32]:
okt_tokenizer('열심히 일한 당신 주말엔 여행을 떠나봐요.')

['열심히', '일', '당신', '주말', '엔', '여행', '떠나다', '보다', '.']

- Pipeline으로 특성 변환과 분류를 동시에

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [36]:
train_df.dropna(how='any', inplace=True)
train_df.isna().sum()

id          0
document    0
label       0
dtype: int64

In [37]:
# pipeline을 활용하면 한글이나 영문이나 동일하게 취급할 수 있음
pipeline = Pipeline([
    ('TFIDF', TfidfVectorizer(tokenizer=okt_tokenizer)),
    ('LR',LogisticRegression(random_state=2022))
])
pipeline.fit(train_df.document, train_df.label)

Pipeline(steps=[('TFIDF',
                 TfidfVectorizer(tokenizer=<function okt_tokenizer at 0x000001E7D22AF8B0>)),
                ('LR', LogisticRegression(random_state=2022))])

In [39]:
test_df.dropna(how='any', inplace=True)
test_df.isna().sum()

id          0
document    0
label       0
dtype: int64

In [41]:
pipeline.score(test_df.document, test_df.label)

0.8474508470508231

- Test

In [42]:
import re
reviews = ['모든 국민이 봤으면 하는 영화입니다.',
            '생각보다 지루하고 별로였네요... 보면서 좀 졸았습니다.']
reviews = map(lambda x: re.sub('[^가-힣]',' ',x), reviews)

In [43]:
pipeline.predict(reviews)

array([1, 0], dtype=int64)

- Parameter Tuning

In [44]:
from sklearn.model_selection import GridSearchCV
params = {
    'TFIDF__ngram_range':[(1,1), (1,2)],
    'TFIDF__max_df':[0.95, 0.98],
    'LR__C': [1, 5]
}

In [47]:
grid_pipe = GridSearchCV(pipeline, params, cv=5, scoring='accuracy')
# %time grid_pipe.fit(train_df.document, train_df.label)

- Eval by Parameters of CountVectorizer exam

In [48]:
pipeline = Pipeline([
    ('TFIDF', TfidfVectorizer(tokenizer=okt_tokenizer, max_df=0.95, ngram_range=(1,2))),
    ('LR', LogisticRegression(random_state=2022))
])
%time pipeline.fit(train_df.document, train_df.label)

CPU times: total: 12min 56s
Wall time: 11min 58s


Pipeline(steps=[('TFIDF',
                 TfidfVectorizer(max_df=0.95, ngram_range=(1, 2),
                                 tokenizer=<function okt_tokenizer at 0x000001E7D22AF8B0>)),
                ('LR', LogisticRegression(random_state=2022))])

In [49]:
pipeline.score(test_df.document, test_df.label)

0.8619717183030982