# 影評資料集(IMDB movie review)情緒分析 

In [55]:
# 載入相關套件
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [56]:
# 參數設定
batch_size = 128            # 批量
embedding_output_dims = 15  # 嵌入層輸出維度
max_sequence_length = 300   # 句子最大字數
num_distinct_words = 5000   # 字典
number_of_epochs = 5        # 訓練執行週期
validation_split = 0.20     # 驗證資料比例
verbosity_mode = 1          # 訓練資料訊息顯示程度

In [57]:
# 載入 IMDB 影評資料集，TensorFlow 已將資料轉為索引值
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=num_distinct_words)
print(x_train.shape)
print(x_test.shape)

# 長度不足時補 0
padded_inputs = pad_sequences(x_train, maxlen=max_sequence_length
                              , value = 0.0) 
padded_inputs_test = pad_sequences(x_test, maxlen=max_sequence_length
                                   , value = 0.0) 

# 建立模型
model = Sequential()
model.add(Embedding(num_distinct_words, embedding_output_dims, 
                    input_length=max_sequence_length))
model.add(LSTM(10))
model.add(Dense(1, activation='sigmoid'))

# 指定優化器、損失函數
model.compile(optimizer=Adam(), loss="binary_crossentropy", metrics=['accuracy'])

# 模型彙總資訊
model.summary()

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


(25000,)
(25000,)
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 15)           75000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 10)                1040      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 76,051
Trainable params: 76,051
Non-trainable params: 0
_________________________________________________________________


In [58]:
y_test

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [59]:
# 訓練模型
history = model.fit(padded_inputs, y_train, batch_size=batch_size, 
            epochs=number_of_epochs, verbose=verbosity_mode, 
            validation_split=validation_split)

# 模型評估
test_results = model.evaluate(padded_inputs_test, y_test, verbose=False)
print(f'Loss: {test_results[0]}, Accuracy: {100*test_results[1]}%')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 0.3329777419567108, Accuracy: 87.00399994850159%


In [60]:
# 模型存檔
model.save('LSTM_IMDB.h5')

In [61]:
# 取得字詞與索引的對照表字典
imdb_dict = imdb.get_word_index()
list(imdb_dict.keys())[:10]

['fawn',
 'tsukino',
 'nunnery',
 'sonja',
 'vani',
 'woods',
 'spiders',
 'hanging',
 'woody',
 'trawling']

In [62]:
# 反轉字典，變成索引與字詞的對照表
imdb_dict_reversed = {}
for k, v in imdb_dict.items():
    imdb_dict_reversed[v] = k

In [64]:
# 顯示測試資料前兩筆為文字
text = []
for i, line in enumerate(padded_inputs_test[:2]):
    text.append('')
    for j, word in enumerate(line):
        if word != 0:
            text[i] += imdb_dict_reversed[word]+' '

print('\n\n\n'.join(text))

the wonder own as by is sequence i i and and to of hollywood br of down and getting boring of ever it sadly sadly sadly i i was then does don't close and after one carry as by are be and all family turn in does as three part in another some to be probably with world and her an have and beginning own as is sequence 


the as you world's is quite br and most that quest are chase to being quickly of little it time hell to plot br of something long put are of every place this and and of and storytelling being nasty not of you warren in is failed club i i of films pay so sequences and film okay uses to received and if time done for room and viewer as cartoon of gives to forgettable br be because many these of and and contained gives it wreck scene to more was two when had find as you another it of themselves probably who and storytelling if itself by br about 1950's films not would effects that her box to miike for if hero close seek end is very together movie of and got say kong and fred c

In [65]:
imdb_dict_reversed[488]

'close'

In [66]:
imdb_dict['close']

488

In [67]:
import nltk

# 以上述語句測試
X_tokens = []
for line in text:
    tokens = nltk.word_tokenize(line)
    tokens = [token.strip() for token in tokens]
    X_tokens.append(tokens)
    
# 轉為索引值
import numpy as np
X_index = np.zeros((len(text), max_sequence_length))
for i, line in enumerate(X_tokens):
    for j, word in enumerate(line):
        if j >= max_sequence_length:
            break
        if word in imdb_dict:
            # 因為num_distinct_words=5000, 怕反轉為數字時會出錯，超過5000時設為0
            if imdb_dict[word] < num_distinct_words:
                X_index[i, j] = imdb_dict[word]


In [68]:
X_index[0, :65].astype(np.int)

array([   1,  591,  202,   14,   31,    6,  717,   10,   10,    2,    2,
          5,    4,  360,    7,    4,  177,    2,  394,  354,    4,  123,
          9, 1035, 1035, 1035,   10,   10,   13,   92,  124,   78,    0,
        488,    2,  100,   28, 1668,   14,   31,   23,   27,    2,   29,
        220,  468,    8,  124,   14,  286,  170,    8,  157,   46,    5,
         27,  239,   16,  179,    2,   38,   32,   25,    2,  451])

In [69]:
# 長度不足時補 0
padded_inputs = pad_sequences(X_index, maxlen=max_sequence_length, 
                      value = 0.0) 

# 預測
np.argmax(model.predict(padded_inputs), axis=-1)



array([[0],
       [1]])

In [70]:
# 以原資料預測，確認答案相同
np.argmax(model.predict(padded_inputs_test[:2]), axis=-1)

array([[0],
       [1]])