In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pickle
import numpy as np
import pandas as pd
from gensim.models import word2vec
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from OpenFabLibrary import JeibaCutWords
from OpenFabLibrary import AppendKeywordCheck

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Create the word-to-ID mapping and load the word vectors

In [2]:
# Directory (relative to the notebook) that holds the input data files.
data_dir = "./data"

In [3]:
# Load the word2vec model used for the vocabulary.
# Alternative model file: 'word2vec_model/CBOW'
w2v = word2vec.Word2Vec.load('word2vec_model/zh.bin')

# Number the vocabulary words consecutively from 0, in the vocabulary's
# iteration order, then build the reverse (id -> word) lookup.
word2id = dict(zip(w2v.wv.vocab.keys(), range(len(w2v.wv.vocab))))
id2word = dict(zip(word2id.values(), word2id.keys()))

# Despite the name this is the largest assigned id (vocabulary size minus one).
word2id_len = len(word2id) - 1
print('word2id_len:', word2id_len)


word2id_len: 50100


##  AI預測 + 關鍵字檢查

In [4]:
def jieba_validation(input_text):
    """Classify an advertisement text and list the keywords flagged in it.

    The text is wrapped in a one-row DataFrame, segmented with
    ``JeibaCutWords``, checked with ``AppendKeywordCheck``, encoded as a
    fixed-length sequence of word ids (via the module-level ``word2id``) and
    fed to the frozen TensorFlow graph stored in ``./model/frozen_model.pb``.

    Args:
        input_text: Advertisement description to evaluate.

    Returns:
        A 3-tuple ``(result, probabilities, keywords)``.

        * ``result`` -- in single-ad mode, ``"合法"`` when the rounded
          probability of the first prediction equals 0, otherwise ``"違法"``.
          In batch mode, the list of rounded prediction arrays.
        * ``probabilities`` -- list holding one array per processed batch, as
          returned by the graph's ``class_probability`` tensor.
        * ``keywords`` -- the keyword list returned by ``AppendKeywordCheck``;
          replaced by an empty list when ``result`` is ``"合法"``.
    """
    # True  -> classify only `input_text`.
    # False -> classify every row of the CSV file named below.
    single_ad = True

    if single_ad:
        # Single advertisement: placeholder id / name / class around the text.
        test_data_df = pd.DataFrame({'ID': [0],
                                     'Name': ["測試產品"],
                                     'Description': [input_text],
                                     'Class': [0]})
    else:
        # Batch of advertisements read from disk; the context manager makes
        # sure the file handle is closed once pandas has parsed it.
        test_data_source = "test_private.csv"
        with open(data_dir + '/' + test_data_source, 'r', encoding='utf8') as csv_file:
            test_data_df = pd.read_csv(csv_file, delimiter=',')

    # Word segmentation.
    test_df = JeibaCutWords(test_data_df)

    # Keyword check.
    test_df['keyword_flag'], keywords_list = AppendKeywordCheck(test_df)

    # Only the first PICK_WORDS words of each document are fed to the model;
    # this length has to match the sequence length used to train the model.
    PICK_WORDS = 40
    # Number of rows sent through the model per session run.
    batch_size = 16

    # The same id is used both for padding and for out-of-vocabulary words.
    filler_id = word2id_len + 1

    docs_pred_id = []
    for doc in test_df['sentence']:
        ids = [word2id.get(w, filler_id) for w in doc[:PICK_WORDS]]
        ids.extend([filler_id] * (PICK_WORDS - len(ids)))
        docs_pred_id.append(ids)

    # Keep the encoded sequences alongside the rest of the per-row data.
    test_df['sentence_seq'] = docs_pred_id
    X_pred = np.array(docs_pred_id)

    #
    # Load the trained (frozen) model and feed the data through it.
    #
    output_class = []
    output_probability = []

    with tf.gfile.GFile("./model/frozen_model.pb", "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    with tf.Graph().as_default() as graph:
        # Every imported op/tensor name is prefixed with "prefix/".
        tf.import_graph_def(graph_def, name="prefix")

    with tf.Session(graph=graph) as sess:
        inputs = graph.get_tensor_by_name('prefix/input_layer/input_data:0')
        keep_prob = graph.get_tensor_by_name('prefix/input_layer/keep_prob:0')
        class_prob = graph.get_tensor_by_name('prefix/output_layer/class_probability:0')

        for start in range(0, len(X_pred), batch_size):
            # Slicing clamps at the end of the array, so the last batch may
            # simply be shorter than batch_size.
            x_pred_batch = X_pred[start:start + batch_size]

            # keep_prob is fed as 1 for prediction.
            batch_prob = sess.run(class_prob,
                                  feed_dict={inputs: x_pred_batch, keep_prob: 1})

            # One array per batch is stored in each list. Rounding turns a
            # probability above 0.5 into class 1.
            output_probability.append(batch_prob)
            output_class.append(np.around(batch_prob))

    # Predicted classes (rounded probabilities).
    y_pred_class = output_class

    if single_ad:
        # Single advertisement verdict.
        if y_pred_class[0] == 0:
            # A legal advertisement does not need its flagged keywords listed.
            keywords_list = []
            return "合法", output_probability, keywords_list
        else:
            return "違法", output_probability, keywords_list
    else:
        # Batch verdicts.
        return y_pred_class, output_probability, keywords_list

### 載入測試資料集，並進行預測

In [5]:
# Single advertisement: the description text to be classified.
ad_text = (
    "含500億活菌數及八種益生菌，排便不順，氣味難聞，當心健康拉警報"
    "服用本產品可達到體內環保、增強抵抗力並強化細胞功能，可改善體質、促進新陳代謝、幫助維持消化道機能、促進食慾、開胃，促進腸道蠕動改變細菌叢生態，使排便順暢。"
)

result, probability, keywords = jieba_validation(ad_text)

# Report the verdict, the model's probability output and the flagged keywords.
for label, value in (("辨識結果: ", result),
                     ("違規機率: ", probability),
                     ("違規字詞: ", keywords)):
    print(label, value)

Building prefix dict from C:\Users\User\Desktop\AIGO\Jeiba\dict.txt.big ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.ub75bbb3384af150c32db207c6bfbd71d.cache
Loading model cost 1.451 seconds.
Prefix dict has been built succesfully.


辨識結果:  違法
違規機率:  [array([[0.998085]], dtype=float32)]
違規字詞:  ['服用', '增強', '強化', '改善', '開胃']
