# KoNLPy를 사용한 영화 리뷰 분석

In [None]:
from preamble import *
%config InlineBackend.figure_format='retina'

import konlpy
konlpy.__version__

In [None]:
# Load the training split from a tab-separated text file into a DataFrame.
# keep_default_na=False stops pandas from turning empty or "NA"-like fields
# into NaN, so such fields stay as plain strings.
df_train = pd.read_csv("data/ratings_train.txt", delimiter="\t", keep_default_na=False)

# Preview the first rows (displayed as the cell output).
df_train.head()

In [None]:
# Pull the review text and the labels out of the DataFrame as NumPy arrays.
text_train = df_train["document"].values
y_train = df_train["label"].values

같은 방식으로 테스트 데이터를 읽습니다.

In [None]:
# Load the test split with the same options as the training split:
# tab-separated, and empty / "NA"-like fields kept as strings rather than NaN.
df_test = pd.read_csv("data/ratings_test.txt", delimiter="\t", keep_default_na=False)
# Review text and labels as NumPy arrays.
text_test = df_test["document"].values
y_test = df_test["label"].values

훈련 데이터와 테스트 데이터의 크기를 확인합니다.

In [None]:
# Number of training reviews, and how many times each label value occurs.
len(text_train), np.bincount(y_train)

In [None]:
# Number of test reviews, and how many times each label value occurs.
len(text_test), np.bincount(y_test)

#### Okt

In [None]:
from konlpy.tag import Okt


class PicklableOkt(Okt):
    """Okt tagger that survives pickling.

    Only the constructor arguments are serialized. Unpickling re-runs
    ``__init__`` with them, so the tagger is rebuilt from scratch in the
    receiving process instead of being copied. (KoNLPy's ``Okt`` wraps a Java
    object, which is presumably why default pickling is avoided.)

    Args:
        *args: Positional arguments forwarded unchanged to ``Okt.__init__``.
        **kwargs: Keyword arguments forwarded unchanged to ``Okt.__init__``.
    """

    def __init__(self, *args, **kwargs):
        # The constructor arguments are the entire pickled state, so keep both
        # the positional and the keyword ones.
        self.args = args
        self.kwargs = kwargs
        Okt.__init__(self, *args, **kwargs)

    def __setstate__(self, state):
        # Rebuild the tagger by calling the constructor again. "kwargs" is
        # missing from states written before keyword support existed.
        self.__init__(*state["args"], **state.get("kwargs", {}))

    def __getstate__(self):
        return {"args": self.args, "kwargs": self.kwargs}


# Tokenizer instance built with default arguments; its `morphs` method is
# passed to TfidfVectorizer as the tokenizer in the grid-search cell.
okt = PicklableOkt()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Search space. Keys follow the "<step name>__<parameter>" convention, where
# make_pipeline names each step after its lowercased class name.
param_grid = {
    "tfidfvectorizer__min_df": [3, 5, 7],
    "tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "logisticregression__C": [0.1, 1, 10],
}

# TF-IDF features tokenized by Okt's `morphs`, followed by logistic regression.
pipe = make_pipeline(
    TfidfVectorizer(tokenizer=okt.morphs),
    LogisticRegression(),
)
# n_jobs=-1 runs the search on all available cores.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, n_jobs=-1)

# Run the grid search on the first 1,000 training reviews only.
grid.fit(text_train[:1000], y_train[:1000])
print("최상의 크로스 밸리데이션 점수: {:.3f}".format(grid.best_score_))
print("최적의 크로스 밸리데이션 파라미터: ", grid.best_params_)

In [None]:
# Take the fitted steps out of the best pipeline found by the grid search.
best_pipeline = grid.best_estimator_
tfidfvectorizer = best_pipeline.named_steps["tfidfvectorizer"]
logisticregression = best_pipeline.named_steps["logisticregression"]

# Vectorize the first 1,000 test reviews with the fitted vectorizer, then
# score the fitted classifier on them.
X_test = tfidfvectorizer.transform(text_test[:1000])
score = logisticregression.score(X_test, y_test[:1000])

print("테스트 세트 점수: {:.3f}".format(score))