In [6]:
# konipy 설치
pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 42.6MB/s 
[?25hCollecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 8.0MB/s 
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 47.1MB/s 
Installing collected packages: colorama

In [1]:
import pandas as pd
import io
from google.colab import files

# 파일을 통해 업로드
uploaded = files.upload()

Saving ratings_test.txt to ratings_test.txt
Saving ratings_train.txt to ratings_train.txt


In [5]:
train_df = pd.read_csv(io.BytesIO(uploaded['ratings_test.txt']), sep = "\t")
test_df = pd.read_csv(io.BytesIO(uploaded['ratings_train.txt']), sep = "\t")

In [7]:
train_df['label'].value_counts()

1    25173
0    24827
Name: label, dtype: int64

In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        50000 non-null  int64 
 1   document  49997 non-null  object
 2   label     50000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [9]:
# 정규표현식(특정한 규칙을 가진 문자열의 집합을 표현하는 데에 사용)을 사용하여 앞 숫자를 없애는 과정
# \d+ 의미: 1개이상 숫자가 있는 것은 " "으로 치환

import re

train_df = train_df.fillna(' ')
train_df['document'] = train_df['document'].apply( lambda x : re.sub(r"\d+", " ", str(x)) ) 
train_df.drop('id', axis=1, inplace=True)

test_df = test_df.fillna(' ')
test_df['document'] = test_df['document'].apply( lambda x : re.sub(r"\d+", " ", str(x)))
test_df.drop('id', axis=1, inplace=True)

In [10]:
# 형태소 분석 - 리스트로 된 형태소들을 반환하게 됨
from konlpy.tag import Twitter

twitter = Twitter()

def tw_tokenizer(text):
    tokens_ko = twitter.morphs(text) #morphs:형태소 분석을 해주는 함수
        return tokens_ko

        tw_tokenizer('아버지가 방에 들어가신다.')
from konlpy.tag import Twitter

twitter = Twitter()

def tw_tokenizer(text):
    tokens_ko = twitter.morphs(text) #morphs:형태소 분석을 해주는 함수
    return tokens_ko

tw_tokenizer('아버지가 방에 들어가신다.')

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


['아버지', '가', '방', '에', '들어가신다', '.']

In [12]:
# 텍스트를 Feature Vector화
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

#형태소 분석함수(앞서 만든)를 tokenizer에 삽입. 그걸로 각 조건 맞춰 필터링. min_df:3보다 작은횟수 feature 삭제. max_df: 90%이상 언급되는 feature 삭제
tfidf_vect = TfidfVectorizer(tokenizer=tw_tokenizer, ngram_range=(1,2), min_df=3, max_df=0.9)
tfidf_vect.fit(train_df['document'])
tfidf_matrix_train = tfidf_vect.transform(train_df['document'])



In [None]:
# 피처 벡터화가 완료되었으면, 로지스틱회귀를 통해 감성분석 classification 수행
lg_clf = LogisticRegression(random_state=0)

# 파라미터 C 최적화 위해 GridSearchCV 활용
# C 값: 로지스틱 회귀의 알파 역수
params = { 'C':[1, 3.5, 4.5, 5.5, 10] }
grid_cv = GridSearchCV(lg_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1 )
grid_cv.fit(tfidf_matrix_train, train_df['label'] )
print(grid_cv.best_params_ , round(grid_cv.best_score_, 4))

In [None]:
from sklearn.metrics import accuracy_score

#주의: 여기서 fit_transform으로 할 수 없다. 이유: 찾아보기
tfidf_matrix_test = tfidf_vect.transform(test_df['document'])
# 최적값이 학습된 것으로 test 데이터 학습
best_estimator = grid_cv.best_estimator_
preds = best_estimator.predict(tfidf_matrix_test)