감성 분석

- 케라스의 GRU 모델을 이용한 IMDB 영화 리뷰 감성 분석

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

In [None]:
# Load the IMDB review dataset that ships with Keras.
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    num_words=10000,  # keep only the 10,000 most frequent words
)
print(X_train.shape, y_train.shape, sep="\n")
print(X_train[:2])  # smaller ids correspond to more frequent words
print(y_train[:2])  # 0: negative, 1: positive

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
(25000,)
(25000,)
[list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283,

In [None]:
# Turn the id list of the first training review back into text.
word_index = imdb.get_word_index()
# Invert the word -> id mapping into id -> word.
word_index2 = dict(zip(word_index.values(), word_index.keys()))

# Ids 0 (padding), 1 (start) and 2 (unknown word) are reserved, so the
# stored ids are shifted by 3; ids with no entry are printed as None.
for token_id in X_train[0]:
    print(word_index2.get(token_id - 3), end=" ")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
None this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert None is an amazing actor and now the same being director None father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for None and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also None to the two little boy's that played the None of norman and paul they were just brilliant children are often left out of the None list i think because the stars that play them all grown up are such

In [None]:
# Bring every review to length 1000: longer reviews are cut,
# shorter ones are filled with 0.
X_train, X_test = (
    pad_sequences(split, maxlen=1000) for split in (X_train, X_test)
)

In [None]:
# Build the deep-learning model: embedding -> GRU -> single sigmoid unit.
model = Sequential([
    Embedding(10000, 100),
    GRU(128),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=['acc'],
)

In [None]:
# Train the model.
# The checkpoint saves the model whenever validation accuracy beats the
# previous best.
mc = ModelCheckpoint(
    'my_model.h5',
    monitor='val_acc',
    mode='max',
    verbose=1,
    save_best_only=True,
)
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=5,
    callbacks=[mc],
)

Epoch 1/5
Epoch 1: val_acc improved from -inf to 0.77320, saving model to my_model.h5
Epoch 2/5
Epoch 2: val_acc improved from 0.77320 to 0.87420, saving model to my_model.h5
Epoch 3/5
Epoch 3: val_acc improved from 0.87420 to 0.88820, saving model to my_model.h5
Epoch 4/5
Epoch 4: val_acc did not improve from 0.88820
Epoch 5/5
Epoch 5: val_acc improved from 0.88820 to 0.88920, saving model to my_model.h5


In [None]:
# Load the saved model and print the accuracy entry (index 1) of its
# evaluation on the test split.
loaded_model = load_model('my_model.h5')
test_metrics = loaded_model.evaluate(X_test, y_test)
print("정확도:", test_metrics[1])

정확도: 0.8823999762535095


In [None]:
# Predict positive/negative sentiment for a real review.
import re

# Arcane reviews source: https://www.imdb.com/title/tt11126994/reviews?ref_=tt_urv

# Negative review prediction

# Arcane 1-star review
bad_sentence = '''What the hell was that really wasted my time there. Everyone tries their best to do something and viola that jinx is there to ruin it all,can there be more annoying character than her wtf writing.'''

# Replace everything except word characters, apostrophes and spaces with a
# space. Substituting a space (instead of deleting) keeps "all,can" as two
# words, and apostrophes are kept because the vocabulary contains forms such
# as "everyone's".
bad_sentence = re.sub(r"[^\w' ]", " ", bad_sentence).lower()

num_words = 10000  # vocabulary size used for load_data() and the Embedding layer
index_from = 3     # offset of real words; 0: padding, 1: start, 2: unknown word
oov_id = 2

encoded = []
for word in bad_sentence.split():
    # Shift the raw word rank the same way the training data was shifted.
    token_id = word_index[word] + index_from if word in word_index else oov_id
    # Only shifted ids below num_words exist in the training data and in the
    # embedding table; anything else is encoded as the unknown-word id.
    encoded.append(token_id if token_id < num_words else oov_id)

pad_sequence = pad_sequences([encoded], maxlen=1000)
# predict() returns one row with one value for the single review; index the
# scalar explicitly instead of converting the whole array with float().
score = float(loaded_model.predict(pad_sequence)[0][0])  # predict with the loaded model

print(f"긍정적인 리뷰일 확률: {score * 100}% ")


긍정적인 리뷰일 확률: 1.0024458169937134% 


In [None]:
# Positive review prediction
import re  # imported here so this cell does not rely on an earlier cell

# Arcane 10-star review
good_sentence = '''So many positives i don't know where to start! This is unique, gripping probably won't happen again like this! Phenomenal. Art is an anomaly; i don't get how but i know it is reality.'''

# Replace everything except word characters, apostrophes and spaces with a
# space. Apostrophes are kept so contractions such as "don't" can match the
# vocabulary, and substituting a space keeps words separated by punctuation
# from being glued together.
good_sentence = re.sub(r"[^\w' ]", " ", good_sentence).lower()

num_words = 10000  # vocabulary size used for load_data() and the Embedding layer
index_from = 3     # offset of real words; 0: padding, 1: start, 2: unknown word
oov_id = 2

encoded = []
for word in good_sentence.split():
    # Shift the raw word rank the same way the training data was shifted.
    token_id = word_index[word] + index_from if word in word_index else oov_id
    # Only shifted ids below num_words exist in the training data and in the
    # embedding table; anything else is encoded as the unknown-word id.
    encoded.append(token_id if token_id < num_words else oov_id)

pad_sequence = pad_sequences([encoded], maxlen=1000)
# predict() returns one row with one value for the single review; index the
# scalar explicitly instead of converting the whole array with float().
score = float(loaded_model.predict(pad_sequence)[0][0])  # predict with the loaded model

print(f"긍정적인 리뷰일 확률: {score * 100}% ")

긍정적인 리뷰일 확률: 97.97621965408325% 
