# 감성분석 모델 구축

In [None]:
# ! pip install scikit-learn
# ! pip install konlpy

In [1]:
import re
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.request
from collections import Counter
from konlpy.tag import Okt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# from konlpy.tag import Mecab
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

warnings.filterwarnings(action = 'ignore') # warning 메시지 표시x

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### 4. 훈련모델Ⅳ - 214,060개로 먼저 선훈련
- 네이버+쿠팡 리뷰합친 데이터로 먼저 훈련을 진행하고,
- 쿠팡데이터로만 재평가 실시
- 훈련 데이터 : 평가 데이터 = 214,060개 : 14,060개

In [2]:
# Load the combined Naver + Coupang reviews used for training (214,060 rows)
# and the Coupang-only reviews used for evaluation (14,060 rows).
model4_naver_coupang = pd.read_excel('(4)all_reviews_214060.xlsx')
test_data = pd.read_excel('(3)coupang_reviews.xlsx')

# 1. No train/test split happens for this model: the whole combined file is the
#    training set and the Coupang file is the evaluation set.
#    `train_data` is an alias (not a copy), so the cleaning below also changes
#    `model4_naver_coupang`.
# NOTE(review): the combined file presumably already contains the Coupang
# reviews loaded into `test_data` (214,060 - 14,060 = 200,000); if so, the
# evaluation set is a subset of the training set -- confirm.
train_data = model4_naver_coupang

print('훈련용 리뷰의 개수 :', len(train_data))
# Recorded output: 214060


# 2. Remove everything that is not a space or a Hangul character.
_NON_KOREAN = re.compile(r'[^ ㄱ-ㅣ가-힣]+')


def _keep_korean_only(text):
    """Replace each run of characters other than spaces/Hangul with one space.

    Missing cells (NaN/None) become an empty string and other non-string
    cells are converted with ``str`` first, so a stray empty or numeric
    Excel cell no longer raises ``TypeError`` inside ``re.sub``.
    """
    if pd.isna(text):
        return ""
    return _NON_KOREAN.sub(" ", str(text))


train_data['reviews'] = train_data['reviews'].apply(_keep_korean_only)
test_data['reviews'] = test_data['reviews'].apply(_keep_korean_only)

# train_data.head()
# test_data.head()

훈련용 리뷰의 개수 : 214060


In [3]:
# 3. Preparation before building the model

# 1) Morphological analysis: split a sentence into tokens
okt = Okt()


def okt_tokenizer(text):
    """Tokenize ``text`` with the module-level ``okt`` analyzer.

    Args:
        text: the value handed to ``okt.morphs``.

    Returns:
        The result of ``okt.morphs(text)``.
    """
    return okt.morphs(text)

# 2) Create the TF-IDF vectorizer (Okt tokens, unigrams + bigrams; terms must
#    appear in at least 3 documents and in at most 90% of them).
tfidf = TfidfVectorizer(tokenizer=okt_tokenizer, ngram_range=(1,2), min_df=3, max_df=0.9)

# fit_transform learns the vocabulary/IDF weights and vectorizes the corpus in
# a single pass. The previous fit(...) followed by transform(...) on the same
# data ran the tokenizer over every review twice for the same result.
model4_train_tfidf = tfidf.fit_transform(train_data['reviews'])
model4_train_tfidf

# Recorded runtime with the separate fit + transform calls: 20 min 41 s

<214060x205025 sparse matrix of type '<class 'numpy.float64'>'
	with 5607484 stored elements in Compressed Sparse Row format>

In [4]:
# 4. Build the sentiment model: logistic regression tuned with a grid search

# Base estimator. It is NOT fitted directly: GridSearchCV clones it for every
# candidate/fold and (with the default refit) retrains the best candidate on
# the full training set, so a separate SA_lr.fit(...) only repeated work whose
# result was never used. After this cell SA_lr itself stays unfitted; use
# SA_lr_best1 for predictions.
SA_lr = LogisticRegression(random_state=0)

# Candidate values for C, evaluated with 3-fold cross-validated accuracy.
params = {'C': [1,3,3.5,4,4.5,5]}
SA_lr_grid_cv = GridSearchCV(SA_lr, param_grid=params, cv=3, scoring='accuracy', verbose=1)

SA_lr_grid_cv.fit(model4_train_tfidf, train_data['label'])

print(SA_lr_grid_cv.best_params_, round(SA_lr_grid_cv.best_score_, 4))

# Best model, refitted on all training data by the grid search.
SA_lr_best1 = SA_lr_grid_cv.best_estimator_

# Recorded result from the original run: {'C': 3.5} 0.8977

Fitting 3 folds for each of 6 candidates, totalling 18 fits
{'C': 3.5} 0.8977


In [5]:
# 5. Evaluate the model: check accuracy on the evaluation reviews

# Vectorize the evaluation reviews with the already-fitted TF-IDF vectorizer.
model4_test_tfidf = tfidf.transform(test_data['reviews'])

# Predict labels with the best estimator selected by the grid search.
test_predict = SA_lr_best1.predict(model4_test_tfidf)

print(
    '감성 분석 정확도 : ',
    round(accuracy_score(y_true=test_data['label'], y_pred=test_predict), 3),
)
# Recorded result: 98% accuracy -- the original author reads this as extreme overfitting.
# NOTE(review): the training file is described as Naver + Coupang combined, so
# these Coupang reviews were presumably seen during training -- confirm.
# (Recorded runtime: 2 min 2 s)

감성 분석 정확도 :  0.98


### Why? 
- model1에 비해 정확도가 떨어질 것이라 예상했는데, 오히려 올랐음
- 평가에 사용한 쿠팡 리뷰가 훈련 데이터(네이버+쿠팡)에 이미 포함되어 있어, 정확도가 실제 일반화 성능보다 높게 측정되었을 가능성이 큼