In [None]:
# pip install konlpy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
from konlpy.tag import Okt
from collections import Counter
from konlpy.tag import Mecab
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Map the Korean CSV headers ('리뷰' = review text, '라벨' = label) to the
# English column names used by the rest of the notebook.
column_renames = {'리뷰': 'review', '라벨': 'label'}

# Load both review sets and rename their columns in a single chained step.
# NOTE(review): the two file names are very similar — confirm the training
# and test files do not share rows.
review_train = pd.read_csv('data/convenience_food_reviews_50.csv').rename(columns=column_renames)
review_test = pd.read_csv('data/convenience_food_reviews.csv').rename(columns=column_renames)

In [None]:
# Clean both splits identically: remove every character that is not a Hangul
# jamo/syllable or a space.
# regex=True must be passed explicitly for BOTH calls: since pandas 2.0 the
# default is regex=False, in which case the pattern is treated as a literal
# string and nothing would be removed.
NON_HANGUL_PATTERN = "[^ㄱ-ㅎㅏ-ㅣ가-힣 ]"

# Training data
review_train['review'] = review_train['review'].str.replace(NON_HANGUL_PATTERN, "", regex=True)

# Test data
review_test['review'] = review_test['review'].str.replace(NON_HANGUL_PATTERN, "", regex=True)

In [None]:
# Tokenization setup: morphological analyzer plus the stopword list that is
# filtered out of every tokenized review.
okt = Okt()

# Note: '듯' is listed twice; the duplicate has no effect on membership tests.
stopwords = [
    '도', '는', '다', '의', '가', '이', '은', '한',
    '에', '하', '고', '을', '를', '인', '듯', '과',
    '와', '네', '들', '듯', '지', '임', '게',
]


In [None]:
# Split each training review into morphemes and drop stopwords in one pass.
review_train['tokenized'] = review_train['review'].apply(
    lambda text: [morph for morph in okt.morphs(text) if morph not in stopwords]
)

In [None]:
# Split each test review into morphemes and drop stopwords in one pass.
review_test['tokenized'] = review_test['review'].apply(
    lambda text: [morph for morph in okt.morphs(text) if morph not in stopwords]
)

In [None]:
# Flatten the per-review token lists into one flat array per label
# (rows with label 0 vs. rows with label 1).
is_negative = review_train['label'] == 0
is_positive = review_train['label'] == 1

negative_words = np.hstack(review_train.loc[is_negative, 'tokenized'].values)
positive_words = np.hstack(review_train.loc[is_positive, 'tokenized'].values)

In [None]:
# Token frequencies for each label.
negative_word_count, positive_word_count = Counter(negative_words), Counter(positive_words)

# Print the 20 most frequent tokens: label-0 reviews first, then label-1.
for word_count in (negative_word_count, positive_word_count):
    print(word_count.most_common(20))

In [None]:
# Features are the token lists, targets are the labels, for each split.
X_train, y_train = review_train['tokenized'].values, review_train['label'].values
X_test, y_test = review_test['tokenized'].values, review_test['label'].values

In [None]:
# Fit an unrestricted tokenizer on the training tokens. A later cell replaces
# this object with a tokenizer limited to a fixed vocabulary size.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=X_train)

In [None]:
# Convert token sequences to integer sequences. Words whose index falls
# outside the vocabulary limit are mapped to the OOV token.
# NOTE(review): 632 is hard-coded; per the original note it is
# "total - rare + 2" — presumably total vocabulary minus rare words, plus
# padding and OOV. The computation is not in this notebook; confirm it still
# matches the current training data.
vocab_size = 632

tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(texts=X_train)

# Both splits are encoded with the tokenizer fitted on the training data only.
X_train, X_test = (
    tokenizer.texts_to_sequences(X_train),
    tokenizer.texts_to_sequences(X_test),
)

In [None]:
# Every encoded review is padded/truncated to this fixed length.
max_len = 10

X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (X_train, X_test)
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Embedding, Dense, GRU
from tensorflow.keras.models import Sequential, load_model

# Model hyperparameters.
embedding_dim = 100
hidden_units = 128

# Embedding -> GRU -> single sigmoid unit (binary sentiment output).
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GRU(hidden_units),
    Dense(1, activation='sigmoid'),
])

# Stop training once validation loss has not improved for 4 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
# Keep only the weights with the best validation accuracy in 'best_model.h5'.
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# NOTE(review): Keras takes the validation split from the tail of the arrays
# without shuffling — confirm the training CSV is not ordered by label.
history = model.fit(
    X_train,
    y_train,
    epochs=70,
    callbacks=[es, mc],
    batch_size=64,
    validation_split=0.2,
)

In [None]:
def sentiment_predict(new_sentence, model=None):
    """Print the predicted sentiment of a single review sentence.

    The sentence goes through the same steps as the training data: all
    characters other than Hangul and spaces are removed, the text is split
    into morphemes with ``okt``, stopwords are dropped, and the tokens are
    integer-encoded with ``tokenizer`` and padded to ``max_len``.

    Args:
        new_sentence: Raw review text (str).
        model: Optional model exposing ``predict``. When omitted, the global
            ``loaded_model`` is used, which must have been defined before the
            call (e.g. by loading the saved checkpoint).

    Returns:
        None. The result is printed: the positive probability as a percentage
        when the score is above 0.5, otherwise the negative probability.
    """
    if model is None:
        # Backward-compatible default: rely on the notebook-level model.
        model = loaded_model

    cleaned = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','', new_sentence)
    tokens = [word for word in okt.morphs(cleaned) if word not in stopwords]
    encoded = tokenizer.texts_to_sequences([tokens])
    pad_new = pad_sequences(encoded, maxlen=max_len)

    # predict() returns a 2-D array for the one-sentence batch; index the
    # scalar explicitly because float() on an array with ndim > 0 is
    # deprecated since NumPy 1.25.
    score = float(model.predict(pad_new)[0][0])
    if score > 0.5:
        print("{:.2f}% 긍정 리뷰입니다.".format(score * 100))
    else:
        print("{:.2f}% 부정 리뷰입니다.".format((1 - score) * 100))
