In [1]:
from pythonbq import pythonbq

import os

# BigQuery project and dataset that hold the news table queried by this notebook.
my_project_id='maximal-cabinet-254805'
my_dataset_id='news_keywords'

# Service-account key file used to authenticate against BigQuery.
# Prefer the standard GOOGLE_APPLICATION_CREDENTIALS environment variable so the
# notebook is not tied to one machine's directory layout; the original local
# path is kept as the fallback so existing setups keep working unchanged.
credentials_path = (
    os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
    or 'C:/Users/GameToGo/MyCredentials/My First Project-7226ff9d97d3.json'
)

In [2]:
# pythonbq handle configured with the key file and project id from the config cell.
myProject=pythonbq(
  bq_key_path=credentials_path,
  project_id=my_project_id
)

# Build the fully-qualified table name from the configured ids instead of
# repeating them as literals, so changing my_project_id / my_dataset_id in one
# place is enough. With the current config this yields exactly the same SQL text.
SQL_CODE="""
    SELECT title_array, content_array, label
    FROM `{project}.{dataset}.news`
""".format(project=my_project_id, dataset=my_dataset_id)

# Run the query and report the (rows, columns) shape of the result.
output=myProject.query(sql=SQL_CODE)
print(output.shape)

Downloading: 100%|████████████████████████████████████████████████████████████| 8400/8400 [00:00<00:00, 10739.52rows/s]


(8400, 3)


In [3]:
import keras
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import ast

# Fixed lengths that the title / content token sequences are brought to.
title_SEQUENCE_LENGTH = 5
content_SEQUENCE_LENGTH = 20
num_classes = 7  # width of the one-hot label vectors

# The *_array columns hold Python-literal strings that are parsed into token sequences.
# SECURITY: these strings come from an external table, so they are parsed with
# ast.literal_eval (literals only) rather than eval(), which would execute
# arbitrary code embedded in a row. literal_eval raises ValueError/SyntaxError
# on anything that is not a plain literal.
title = [ast.literal_eval(x) for x in output['title_array']]
content = [ast.literal_eval(x) for x in output['content_array']]
title_seq = keras.preprocessing.sequence.pad_sequences(title, maxlen = title_SEQUENCE_LENGTH)
content_seq = keras.preprocessing.sequence.pad_sequences(content, maxlen = content_SEQUENCE_LENGTH)

# One-hot encode the integer class labels.
label = keras.utils.to_categorical(output['label'],num_classes=num_classes)

# Shuffle titles, contents and labels together so the rows stay aligned.
x1, x2, y = shuffle(title_seq, content_seq, label, random_state=42)

# Hold out 20% of the rows as the test split (fixed seed for repeatability).
x1_train, x1_test, x2_train, x2_test, y_train, y_test = \
                                        train_test_split(x1, x2, y, test_size=0.2, random_state=42)

# Sanity-check the shapes of the content inputs and the labels.
print(x2_train.shape)
print(x2_test.shape)
print(y_train.shape)
print(y_test.shape)

Using TensorFlow backend.


(6720, 20)
(1680, 20)
(6720, 7)
(1680, 7)


In [4]:
# Build the LSTM classifier.
# The title and content branches share one embedding layer and one LSTM
# (Siamese-style), but the model that is actually assembled and compiled below
# only uses the content branch (x2); the two-input variant is kept commented out.
from keras import Input
from keras.layers import Embedding, LSTM, concatenate, Dense
from keras.models import Model


MAX_NUM_WORDS = 30843   # number of words in the tokenizer vocabulary
NUM_EMBEDDING_DIM = 256 # dimensionality of each word vector
NUM_LSTM_UNITS = 128    # dimensionality of the LSTM output vector

# Define one input per sequence type (title tokens, content tokens).
x1_input = Input(shape=(title_SEQUENCE_LENGTH, ), dtype='int32')
x2_input = Input(shape=(content_SEQUENCE_LENGTH, ), dtype='int32')

# Embedding layer (shared by both inputs):
# turns each sequence of token ids into a sequence of 256-dimensional word vectors.
embedding_layer = Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
x1_embedded = embedding_layer(x1_input)
x2_embedded = embedding_layer(x2_input)

# LSTM layer (shared by both inputs): each embedded sequence is reduced to a single 128-dimensional vector.
shared_lstm = LSTM(NUM_LSTM_UNITS)
x1_output = shared_lstm(x1_embedded)
x2_output = shared_lstm(x2_embedded)

# Softmax classification head; it is fed the content branch only.
dense = Dense(units = num_classes, activation='softmax')
predictions = dense(x2_output)

# Single-input model: content tokens in, class probabilities out.
model = Model(inputs = x2_input, outputs = predictions)
model.summary()

# ---------------------------------------------------------------------------------------------
# Two-input variant (disabled, kept for reference)
# 
# Concatenate both LSTM outputs into one vector so it can feed the dense layer.
# NOTE(review): units = 5 below does not match num_classes used by the active model.
# merged = concatenate([x1_output, x2_output], axis = -1)
# dense = Dense(units = 5, activation='softmax')
# predictions = dense(merged)
# 
# model = Model(inputs=[x1_input, x2_input], outputs=predictions)
# model.summary()

# Compile with the Adam optimizer (an earlier note here said rmsprop; the code passes 'adam').
model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])




Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 20)                0         
_________________________________________________________________
embedding_1 (Embedding)      multiple                  7895808   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 903       
Total params: 8,093,831
Trainable params: 8,093,831
Non-trainable params: 0
_________________________________________________________________




In [5]:
# Training hyper-parameters.
BATCH_SIZE = 512  # number of samples fed to the model per batch
EPOCHS = 10       # number of full passes over the training set

# Fit the single-input model on the content sequences. The held-out split is
# passed as validation data, so the val_* metrics are computed on it after
# every epoch; verbose=2 prints one summary line per epoch.
history = model.fit(
    x=x2_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    shuffle=True,
    validation_data=(x2_test, y_test),
)

# ---------------------------------------------------------------------------------------------
# Two-input variant (disabled, kept for reference).
# NOTE(review): x1_val / x2_val / y_val are not defined in the cells shown here.
# history = model.fit(x=[x1_train, x2_train], y=y_train,
#     batch_size=BATCH_SIZE,epochs=EPOCHS,
#     validation_data=([x1_val, x2_val], y_val),shuffle=True)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 6720 samples, validate on 1680 samples
Epoch 1/10
 - 8s - loss: 1.9051 - acc: 0.4482 - val_loss: 1.7411 - val_acc: 0.6054
Epoch 2/10
 - 6s - loss: 1.3406 - acc: 0.5503 - val_loss: 1.0801 - val_acc: 0.6435
Epoch 3/10
 - 6s - loss: 0.6800 - acc: 0.7863 - val_loss: 0.6054 - val_acc: 0.8268
Epoch 4/10
 - 6s - loss: 0.2753 - acc: 0.9415 - val_loss: 0.4383 - val_acc: 0.8821
Epoch 5/10
 - 6s - loss: 0.1271 - acc: 0.9690 - val_loss: 0.4233 - val_acc: 0.8875
Epoch 6/10
 - 6s - loss: 0.0877 - acc: 0.9765 - val_loss: 0.4347 - val_acc: 0.8833
Epoch 7/10
 - 6s - loss: 0.0621 - acc: 0.9820 - val_loss: 0.4265 - val_acc: 0.8881
Epoch 8/10
 - 6s - loss: 0.0516 - acc: 0.9835 - val_loss: 0.4204 - val_acc: 0.8917
Epoch 9/10
 - 6s - loss: 0.0510 - acc: 0.9838 - val_loss: 0.4343 - val_acc: 0.8881
Epoch 10/10
 - 6s - loss: 0.0392 - acc: 0.9845 - val_loss: 0.4493 - val_acc: 0.8917


In [6]:
from sklearn.metrics import confusion_matrix

# Display names of the news categories.
# NOTE(review): assumed to be listed in label-id order (entry i names class i) — confirm
# against the code that produced the `label` column.
s = ["政治", "運動", "財經", "社會", "房產", "國際", "娛樂"]

# Row labels mark the true class ("答案"), column labels the predicted class ("預測").
row = [c + "(答案)" for c in s]
col = [c + "(預測)" for c in s]

# Class probabilities for the held-out content sequences.
pre = model.predict(x2_test)

# Pin the label set explicitly: without `labels`, a class that appears in neither
# the true nor the predicted ids is dropped from the matrix, and the 7x7 frame
# below would then fail with a shape mismatch.
pd.DataFrame(confusion_matrix(y_test.argmax(axis=1), pre.argmax(axis=1),
                              labels=list(range(len(s)))),
             columns=col,index=row)

Unnamed: 0,政治(預測),運動(預測),財經(預測),社會(預測),房產(預測),國際(預測),娛樂(預測)
政治(答案),206,0,20,6,0,11,16
運動(答案),1,254,1,0,0,1,7
財經(答案),13,1,206,3,16,3,5
社會(答案),4,0,11,222,0,2,2
房產(答案),0,0,11,1,207,0,0
國際(答案),1,8,7,4,0,184,3
娛樂(答案),11,5,1,5,0,2,219


### Save Model ###

In [7]:
# Write the trained model to an .h5 file; the path is relative, so it lands in the
# current working directory.
model.save('7news_lstm_acc89.h5')

In [None]:
# Reload the model from the file written by the save cell; this rebinds `model`.
from keras.models import load_model
model = load_model('7news_lstm_acc89.h5')