# 情緒分析(Sentiment Analysis)

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Activation, Dense, Embedding, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.models import load_model


## 資料前置處理

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# https://www.kdnuggets.com/2020/03/tensorflow-keras-tokenization-text-data-prep.html
num_words = 1000
oov_token = '<UNK>'
pad_type = 'post'
trunc_type = 'post'

In [4]:
# 將訓練資料轉成索引
# 讀檔
with open('./data/Sentiment_data.txt','r+', encoding='UTF-8') as f:
    train_data = f.readlines()

# 取得標註(y)、語句(x)
x=[]
y=[]
for line in train_data:
    label, sentence = line.strip().split("\t")
    x.append(sentence)
    y.append(int(label))

# 分詞    
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(x)

# 取得單字與索引對照表
word_index = tokenizer.word_index

# Encode training data sentences into sequences
train_sequences = tokenizer.texts_to_sequences(x)

# 計算訓練資料的字句最大字數
maxlen = max([len(i) for i in train_sequences])

# 補 0
train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

# Output the results of our work
print("Word index:\n", len(word_index.keys()))
# print("\nTraining sequences:\n", train_sequences)
print("\nPadded training sequences:\n", train_padded)
print("\nPadded training shape:", train_padded.shape)
print("Training sequences data type:", type(train_sequences))
print("Padded Training sequences data type:", type(train_padded))

Word index:
 2222

Padded training sequences:
 [[  3   8   7 ...   0   0   0]
 [ 57  15   3 ...   0   0   0]
 [  2 109   3 ...   0   0   0]
 ...
 [ 29   2 290 ...   0   0   0]
 [ 85  10  11 ...   0   0   0]
 [ 81   4  10 ...   0   0   0]]

Padded training shape: (7086, 40)
Training sequences data type: <class 'list'>
Padded Training sequences data type: <class 'numpy.ndarray'>


In [5]:
train_padded[-10:]

array([[ 10,  11,  13,  78,  71,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [100, 206,  51,  10,  11,  52,  13,   3, 108,  64,  21,   2,  90,
        121, 128,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [ 83, 285,  58, 286,  30, 287, 153, 205,  20,  30, 213,   4,  83,
        288,  99,  30,  35,  10,  11, 204, 214,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [ 30, 289,  82,  35,  91,  10,  11,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0],
       [ 81,   4,  10,  11,  13,  14,  79,  21,   0,   0,   0,   0, 

## 建立模型

In [6]:
# 可輸入不定長度的整數陣列
inputs = keras.Input(shape=(None,), dtype="int32")

x = Embedding(len(word_index.keys()), 128)(inputs)
# 使用 2 個 bidirectional LSTM
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = Bidirectional(LSTM(64))(x)
# 分類
outputs = Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.summary()


Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 128)         284416    
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         98816     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 482,177
Trainable params: 482,177
Non-trainable params: 0
_________________________________________________________________


## 訓練模型

In [7]:
Xtrain, Xtest, ytrain, ytest = train_test_split(train_padded, np.array(y), test_size=0.2, random_state=42)
type(Xtrain), type(ytrain[0])

(numpy.ndarray, numpy.int32)

In [8]:
model.compile(loss="binary_crossentropy", optimizer="adam",metrics=["accuracy"])
model.fit(Xtrain, ytrain, batch_size=64, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1e084744d00>

In [9]:
model.evaluate(Xtest, ytest)



[0.0427795872092247, 0.9873060584068298]

## 將測試的語句轉為索引後，預測

In [10]:
x = ['i like the movie very much', 'i hate it']
test_sequences = tokenizer.texts_to_sequences(x)

# Pad the testing sequences
test_padded = pad_sequences(test_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)

model.predict(test_padded)

array([[9.8375064e-01],
       [9.6081186e-04]], dtype=float32)