In [6]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb

In [7]:
# Load the IMDB train/test splits with no vocabulary cap (no num_words is
# passed); the first training review is decoded back to words further down.
(X_train, y_train), (X_test, y_test) = imdb.load_data()

In [8]:
# Invert the vocabulary table (word -> index becomes index -> word), shifting
# every index by 3 so that ids 0, 1 and 2 stay free for the special tokens.
word_to_index = imdb.get_word_index()
index_to_word = {rank + 3: word for word, rank in word_to_index.items()}

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [9]:
# Register the reserved ids: 0 -> "<pad>", 1 -> "<sos>", 2 -> "<unk>".
index_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

In [10]:
# Decode the first training review back into words to sanity-check the mapping.
print(' '.join(index_to_word[token_id] for token_id in X_train[0]))

<sos> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and shoul

감성 분류

In [11]:
import re
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

In [12]:
# Vocabulary cap and fixed review length; both are reused by the model
# definition and by sentiment_predict().
vocab_size = 10000
max_len = 500

# Reload the dataset, this time capped with num_words=vocab_size.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

# Bring every review in both splits to exactly max_len ids.
X_train, X_test = (
    pad_sequences(reviews, maxlen=max_len) for reviews in (X_train, X_test)
)

In [13]:
# Model hyperparameters.
embedding_dim = 100
hidden_units = 128

# Embedding -> GRU -> one sigmoid unit (binary sentiment score).
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GRU(hidden_units),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Stop once val_loss has gone 4 epochs without improving, and write the model
# to GRU_model.h5 only when val_acc reaches a new best.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('GRU_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

# Hold out 20% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=15,
    batch_size=64,
    validation_split=0.2,
    callbacks=[es, mc],
)

Epoch 1/15
Epoch 1: val_acc improved from -inf to 0.77820, saving model to GRU_model.h5
Epoch 2/15
Epoch 2: val_acc improved from 0.77820 to 0.84240, saving model to GRU_model.h5
Epoch 3/15
Epoch 3: val_acc did not improve from 0.84240
Epoch 4/15
Epoch 4: val_acc improved from 0.84240 to 0.86960, saving model to GRU_model.h5
Epoch 5/15
Epoch 5: val_acc improved from 0.86960 to 0.88400, saving model to GRU_model.h5
Epoch 6/15
Epoch 6: val_acc did not improve from 0.88400
Epoch 7/15
Epoch 7: val_acc did not improve from 0.88400
Epoch 8/15
Epoch 8: val_acc did not improve from 0.88400
Epoch 9/15
Epoch 9: val_acc did not improve from 0.88400
Epoch 10/15
Epoch 10: val_acc did not improve from 0.88400
Epoch 10: early stopping


In [14]:
# Reload the checkpoint file written by ModelCheckpoint during training.
loaded_model = load_model('GRU_model.h5')
# Element 1 of evaluate()'s result is printed as the test accuracy (the model
# was compiled with the single metric 'acc').
print("\n 테스트 정확도: %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))


 테스트 정확도: 0.8802


In [17]:
def sentiment_predict(new_sentence):
  """Print the predicted sentiment of a review together with its confidence.

  The text is stripped of every character other than ASCII letters, digits
  and spaces, lower-cased, split on whitespace and integer-encoded with
  ``word_to_index`` (each rank shifted by 3). The encoded review is padded
  to ``max_len`` and scored by ``loaded_model``.

  Args:
    new_sentence: Raw review text.

  Returns:
    None. The verdict is printed.
  """
  unk_index = 2      # id registered for "<unk>"
  index_offset = 3   # shift applied to raw word ranks (ids 0-2 are reserved)

  # Keep only letters, digits and spaces, then lower-case.
  # NOTE(review): apostrophes are stripped as well, while the decoded training
  # review contains tokens such as "everyone's" -- confirm this is intended.
  new_sentence = re.sub('[^0-9a-zA-Z ]', '', new_sentence).lower()

  # Whitespace tokenisation followed by integer encoding.
  encoded = []
  for word in new_sentence.split():
    rank = word_to_index.get(word)
    # The training data was loaded with num_words=vocab_size and the Embedding
    # layer has vocab_size rows, so the shifted id must stay below vocab_size.
    # Testing the raw rank against 10000 allowed ids up to 10003 through.
    if rank is not None and rank + index_offset < vocab_size:
      encoded.append(rank + index_offset)
    else:
      # Unknown words and words beyond the vocabulary cap become <unk>.
      encoded.append(unk_index)

  pad_sequence = pad_sequences([encoded], maxlen=max_len)
  # One input row and a Dense(1) output: pick the single element explicitly
  # rather than converting a 2-D array to float, which NumPy has deprecated.
  score = float(loaded_model.predict(pad_sequence)[0][0])

  if score > 0.5:
    print("{:.2f}% 확률로 긍정 리뷰입니다.".format(score * 100))
  else:
    print("{:.2f}% 확률로 부정 리뷰입니다.".format((1 - score) * 100))

In [20]:
# John Wick 4: a review rated 10 points (used as the positive example).
sentences_pos = "As Mr Wick nears the end of his journey, we are given a delight of a story well structured across its constant world building. It gives a never-ending stream of new structural and actionable aspects that feel natural as it progresses, from the action, to plot points, and personal development. An intertwining of interests rooted in common found circumstances, history, connection, as well as coincidence and luck. As he came back into this life a free man, he wills to fight to leave just as he came no matter the cost. As the nature of it expands with new and unsurprising aspects, this feature more than delivers and entertains."

sentiment_predict(sentences_pos)

99.69% 확률로 긍정 리뷰입니다.


In [21]:
# John Wick 4: a review rated 1 point (used as the negative example).
sentences_neg = "Wick needs to go from spot A to spot B. There are hundreds of enemies trying to kill him. He will fight against them all alone and will kill them with his weapon. Now he is at the spot B and needs to go to spot C. There are hundreds of enemies trying to kill him. He will fight against them all alone and will kill them with his weapon. Now he is at the spot C and needs to go to spot D. There are hundreds of enemies trying to kill him. He will fight against them all alone and will kill them with his weapon. Now he is at the spot D and needs to go to spot E. Yeah that's the same over and over again for three hours. Make yourself a favor a skip this movie. Thank me later."

sentiment_predict(sentences_neg)

97.38% 확률로 부정 리뷰입니다.
