# 自然語言處理(NLP)實作

## 程式參考來源：
- https://keras.io/api/layers/core_layers/embedding/
- https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
- https://keras.io/guides/working_with_rnns/


## Embedding

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [38]:
# A model consisting of a single Embedding layer: token ids drawn from a
# vocabulary of 1000 are each mapped to a 64-dimensional vector.
# The sequence length (10) is determined by the input data below, not by the layer.
model = tf.keras.Sequential([
    layers.Embedding(input_dim=1000, output_dim=64),
])

# An optimizer and a loss are passed to compile(); nothing is trained in this cell.
model.compile(optimizer='rmsprop', loss='mse')

# Random integer ids in [0, 1000): 32 samples, 10 ids per sample.
input_array = np.random.randint(0, 1000, size=(32, 10))

# Look up the embeddings and show the overall shape plus the first sample.
output_array = model.predict(input_array)
print(output_array.shape)
output_array[0]

(32, 10, 64)


array([[ 1.58804767e-02,  1.71773918e-02, -1.62376054e-02,
        -2.38022935e-02,  3.62489112e-02,  2.38510408e-02,
         4.03696336e-02,  6.44271448e-03,  3.51262204e-02,
        -2.70358566e-02,  4.50535864e-03,  4.37803976e-02,
        -4.43189740e-02,  2.06765272e-02, -1.67950280e-02,
         4.75644954e-02,  2.89392732e-02,  8.01891088e-03,
        -1.17008574e-02,  2.79867686e-02,  7.28737563e-04,
         4.34629992e-03,  3.50525863e-02,  3.18518169e-02,
        -3.75759602e-03, -2.35618278e-03,  4.21545841e-02,
        -1.17904320e-02, -1.81266889e-02, -2.18390301e-03,
         2.88045295e-02,  2.50958316e-02,  3.09901945e-02,
        -2.30692513e-02, -2.09498405e-02,  3.55203412e-02,
         1.25880949e-02, -3.52970958e-02, -4.21873108e-02,
        -1.78522244e-02, -3.74946371e-02, -2.22765934e-02,
        -4.33247574e-02,  2.64650844e-02, -1.74780004e-02,
        -7.14511797e-03, -4.38716263e-03, -3.69833484e-02,
        -3.89942639e-02,  5.34845516e-03,  2.33540200e-0

## 使用真實的資料

In [39]:
import tensorflow as tf
from tensorflow.keras import layers
from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample documents
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]

vocab_size = 50
maxlen = 4

# Despite its name, one_hot() does not build one-hot vectors: it encodes each
# document as a list of integer word indices (hash-based per the Keras docs,
# so distinct words may collide).
encoded_docs = [one_hot(text, vocab_size) for text in docs]

# Bring every sequence to a fixed length of `maxlen`; padding='post' pads
# shorter sequences at the end.
padded_docs = pad_sequences(encoded_docs, maxlen=maxlen, padding='post')

# The model is just an Embedding layer producing 64-dimensional vectors.
model = tf.keras.Sequential([
    layers.Embedding(input_dim=vocab_size, output_dim=64, input_length=maxlen),
])
model.compile(optimizer='rmsprop', loss='mse')

# Embed the padded documents and report the output shape.
output_array = model.predict(padded_docs)
output_array.shape




(10, 4, 64)

## 加上Dense

In [23]:
import tensorflow as tf
from tensorflow.keras import layers
from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample documents
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]

# Class labels, one per document in the same order
# (the first five are labelled 1, the last five 0).
labels = array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

vocab_size = 50
maxlen = 4

# Encode each document as a list of integer word indices, then pad every
# sequence at the end to a fixed length of `maxlen`.
encoded_docs = [one_hot(d, vocab_size) for d in docs]
padded_docs = pad_sequences(encoded_docs, maxlen=maxlen, padding='post')

model = tf.keras.Sequential()
# Each word index becomes an 8-dimensional vector -> (maxlen, 8) per document.
model.add(layers.Embedding(vocab_size, 8, input_length=maxlen))
# Flatten the (maxlen, 8) embeddings into a single vector per document.
model.add(layers.Flatten())
# Ordinary fully connected (Dense) layer with a sigmoid output for the binary label.
model.add(layers.Dense(1, activation='sigmoid'))

# Compile for binary classification.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

# Train silently, then evaluate on the same documents used for training.
model.fit(padded_docs, labels, epochs=50, verbose=0)
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))


Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 4, 8)              400       
_________________________________________________________________
flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 80.000001


## 加上 RNN

In [25]:
import tensorflow as tf
from tensorflow.keras import layers
from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample documents
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]

# Class labels, one per document in the same order
# (the first five are labelled 1, the last five 0).
labels = array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

vocab_size = 50
maxlen = 4

# Encode each document as a list of integer word indices, then pad every
# sequence at the end to a fixed length of `maxlen`.
encoded_docs = [one_hot(d, vocab_size) for d in docs]
padded_docs = pad_sequences(encoded_docs, maxlen=maxlen, padding='post')

model = tf.keras.Sequential()
# Each word index becomes an 8-dimensional vector -> (maxlen, 8) per document.
model.add(layers.Embedding(vocab_size, 8, input_length=maxlen))
# Recurrent layer with 128 internal units reading the embedded sequence.
model.add(layers.SimpleRNN(128))
# Sigmoid output for the binary label.
model.add(layers.Dense(1, activation='sigmoid'))

# Compile for binary classification.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

# Train silently, then evaluate on the same documents used for training.
model.fit(padded_docs, labels, epochs=50, verbose=0)
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))


Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 4, 8)              400       
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               17536     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 18,065
Trainable params: 18,065
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 100.000000


## 使用預先訓練的詞向量(GloVe)

In [29]:
import tensorflow as tf
from tensorflow.keras import layers
from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


## 讀取 GloVe 100維的詞向量，產生字典資料型的變數，方便搜尋

In [28]:
# Load the whole GloVe file into memory as a {word: vector} dictionary.
# Each line is split on whitespace: the first field is the word and the
# remaining fields are parsed as float32 coefficients.
embeddings_index = dict()
# The context manager guarantees the file is closed even if a line fails to parse.
with open('./glove/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.array(values[1:], dtype='float32')
        embeddings_index[word] = coefs

## 分詞

In [34]:
# Sample documents
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]

# Class labels, one per document in the same order.
labels = array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

# Size of the embedding table and the fixed sequence length.
vocab_size = 50
maxlen = 4

# Fit a tokenizer on the documents so that it builds its word index from them.
t = Tokenizer()
t.fit_on_texts(docs)

# Convert each document to a sequence of integers using the fitted tokenizer.
encoded_docs = t.texts_to_sequences(docs)

# Pad every sequence at the end to a fixed length of `maxlen`.
padded_docs = pad_sequences(encoded_docs, maxlen=maxlen, padding='post')

## 轉換為GloVe 100維的詞向量

In [32]:
# Build the weight matrix for the Embedding layer: row i holds the
# 100-dimensional GloVe vector of the word whose tokenizer index is i.
# Rows for words that have no GloVe entry stay all-zero.
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in t.word_index.items():
    if i >= vocab_size:
        # The matrix only has `vocab_size` rows; skip indices beyond it
        # instead of raising IndexError when the vocabulary outgrows vocab_size.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Embedding 設為不需訓練，直接輸入轉換後的向量

In [36]:
model = tf.keras.Sequential()

# The Embedding layer is initialised from embedding_matrix and frozen
# (trainable=False), so the supplied vectors are used as-is and are not
# updated during training.
model.add(layers.Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False))

# LSTM layer with 128 internal units reading the embedded sequence.
model.add(layers.LSTM(128))
# Sigmoid output for the binary label.
model.add(layers.Dense(1, activation='sigmoid'))

# Compile for binary classification.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

# Train silently, then evaluate on the same documents used for training.
model.fit(padded_docs, labels, epochs=50, verbose=0)
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 4, 100)            5000      
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 122,377
Trainable params: 117,377
Non-trainable params: 5,000
_________________________________________________________________
None
Accuracy: 100.000000
