In [None]:
# Mount Google Drive when running on Colab. Off Colab the google.colab package
# does not exist; skip the mount instead of aborting the cell, otherwise the
# local fallback path below could never be reached.
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/drive')

import os
import platform
import datetime,pytz

# Project root: the mounted Drive folder on Linux, a local folder otherwise.
root_ = '/content/drive/My Drive/colab/' if platform.system() == 'Linux' else '/Users/love/Test/'

# Working directories used by the rest of the notebook.
chinese_text_classifier_ = os.path.join(root_, 'ChineseTextClassifier')
data_ = os.path.join(chinese_text_classifier_, 'data')             # dataset, stop words, word2vec model
wordJson_ = os.path.join(data_, 'wordJson')                        # word <-> index JSON mappings
bigru_model_ = os.path.join(chinese_text_classifier_, 'bigru_model')  # trained model artefacts

# exist_ok=True makes the cell idempotent and replaces the repeated
# "if not exists: makedirs" pairs.
for directory_ in (chinese_text_classifier_, data_, wordJson_, bigru_model_):
    os.makedirs(directory_, exist_ok=True)

Mounted at /content/drive


# GRU+attention实现中文商品评论二分类

In [None]:
import tensorflow as tf

# Print the TensorFlow version and the version of its bundled Keras, one per line.
print(tf.__version__, tf.keras.__version__, sep='\n')

2.3.0
2.4.0


In [None]:
import os
import csv
import time
import datetime
import random
import json
from collections import Counter
from math import sqrt
import gensim
import pandas as pd
import numpy as np


from tensorflow.keras import backend
from tensorflow.keras.layers import Layer,TimeDistributed,Input,Conv2D,MaxPool2D,concatenate,Flatten,Dense,Dropout,Embedding,Reshape,GRU
from tensorflow.keras import Sequential,optimizers,losses
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import regularizers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from bs4 import BeautifulSoup
import logging
import gensim
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec

import multiprocessing
import yaml
import jieba

# 参数配置

In [None]:
class Config(object):
    """Hyper-parameters and file locations shared by the cells of this notebook."""

    # ---- Input files (both live in the data directory) ----
    dataSource = os.path.join(data_, 'dataset.txt')
    stopWordSource = os.path.join(data_, 'stopword.txt')

    # ---- Vocabulary / embeddings ----
    # Keep only tokens whose frequency is at least this value.
    miniFreq = 1
    # Dimensionality of the word vectors.
    embeddingSize = 150

    # ---- Sequences and training ----
    # Fixed length of every input sequence: longer reviews are truncated,
    # shorter ones are padded with 0. (Original author's note: chosen from the
    # mean length of all sequences.)
    sequenceLength = 200
    batchSize = 64
    epochs = 10
    numClasses = 2
    # Fraction of the data used for the training set.
    rate = 0.8

    # ---- Convolution settings ----
    # Number of convolution filters and their sizes. Not referenced by the
    # code visible in this notebook.
    numFilters = 30
    filterSizes = [2, 3, 4, 5]

    # ---- Regularisation ----
    dropoutKeepProb = 0.5
    # L2 regularisation coefficient.
    l2RegLambda = 0.1


# Instantiate the configuration object.
config = Config()

In [None]:
# Display the configured batch size as a quick check that `config` is usable.
config.batchSize

64

# 预训练词向量

In [None]:
# Train word2vec embeddings on the Chinese corpus.
# Uncomment to enable log output during training:
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Each line of the dataset is "<text>,@=,<label>"; only the text part is
# tokenised here. The file is opened in a `with` block so the handle is closed
# even if tokenisation fails, and with an explicit encoding so decoding does
# not depend on the platform default.
with open(os.path.join(data_, 'dataset.txt'), encoding='utf-8') as file:
    sentences = [jieba.lcut(line.replace('\n', '').split(',@=,')[0]) for line in file]

# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs`. Adjust if the environment is upgraded.
model = word2vec.Word2Vec(sentences,size=config.embeddingSize,
                     min_count=config.miniFreq,
                     window=10,
                     workers=multiprocessing.cpu_count(),sg=1,
                     iter=20)
model.save(os.path.join(data_, 'word2VecModel'))


Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.979 seconds.
Prefix dict has been built successfully.
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Load the word2vec model stored in the data directory.
model = Word2Vec.load(os.path.join(data_, 'word2VecModel'))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


# 数据预处理

In [None]:
# Data pre-processing class: builds the training and evaluation sets
class Dataset(object):
    """Turn the labelled text corpus into fixed-length index sequences.

    `dataGen` is the entry point: it reads the stop words and the corpus,
    builds the vocabulary together with the matching pretrained embedding
    matrix, and fills `trainReviews` / `trainLabels` / `evalReviews` /
    `evalLabels`.
    """

    def __init__(self, config):
        """Copy the settings this class needs from `config` and reset all state."""
        self.dataSource = config.dataSource
        self.stopWordSource = config.stopWordSource

        # Every input sequence is truncated / zero-padded to this length
        self.sequenceLength = config.sequenceLength

        self.embeddingSize = config.embeddingSize
        self.batchSize = config.batchSize
        self.rate = config.rate
        self.miniFreq = config.miniFreq
        # Number of label classes; falls back to 2 for configs without the field
        self.numClasses = getattr(config, 'numClasses', 2)

        self.stopWordDict = {}

        self.trainReviews = []
        self.trainLabels = []

        self.evalReviews = []
        self.evalLabels = []

        self.wordEmbedding = None
        self.n_symbols = 0

        self.wordToIndex = {}
        self.indexToWord = {}

    def readData(self, filePath):
        """Read "<text>,@=,<label>" lines and tokenise the text with jieba.

        Blank lines are skipped.

        Returns:
            (texts, label): a list of token lists and the list of raw label strings.

        Raises:
            ValueError: if a non-blank line does not contain the ",@=," separator.
        """
        text = []
        label = []
        # `with` closes the handle even on error; the explicit encoding keeps
        # decoding independent of the platform default.
        with open(filePath, encoding='utf-8') as file:
            for lineNo, line in enumerate(file, start=1):
                line = line.replace('\n', '')
                if not line.strip():
                    continue
                temp = line.split(',@=,')
                if len(temp) < 2:
                    raise ValueError(
                        "{}: line {} has no ',@=,' separator".format(filePath, lineNo))
                text.append(temp[0])
                label.append(temp[1])

        print('data:',len(text),len(label))
        texts = [jieba.lcut(document) for document in text]

        return texts, label

    def readStopWord(self, stopWordPath):
        """Read the stop-word file (one word per line) into `self.stopWordDict`."""
        with open(stopWordPath, "r", encoding="utf-8") as f:
            stopWordList = f.read().splitlines()
        # A dict (word -> line index) gives constant-time membership tests
        self.stopWordDict = dict(zip(stopWordList, range(len(stopWordList))))

    def getWordEmbedding(self, words):
        """Look up the pretrained word2vec vector of every word in `words`.

        Row 0 is an all-zero vector for "pad" and row 1 a random vector for
        "UNK". Words missing from the word2vec model are reported and skipped.

        Returns:
            (vocab, wordEmbedding): the token list and an array whose i-th row
            is the vector of vocab[i].
        """
        model = gensim.models.Word2Vec.load(os.path.join(data_, 'word2VecModel'))
        # Query the KeyedVectors object; indexing the model itself is deprecated
        wordVectors = model.wv

        vocab = ["pad", "UNK"]
        wordEmbedding = [np.zeros(self.embeddingSize),
                         np.random.randn(self.embeddingSize)]

        for word in words:
            try:
                vector = wordVectors[word]
            except KeyError:
                # Only "word not in the model" is expected here; any other
                # error should surface instead of being swallowed.
                print(word + " : 不存在于词向量中")
                continue
            vocab.append(word)
            wordEmbedding.append(vector)

        return vocab, np.array(wordEmbedding)

    def genVocabulary(self, reviews):
        """Build the embedding matrix and the word <-> index mappings.

        Stop words and words rarer than `self.miniFreq` are dropped. Both
        mappings are written as JSON into the wordJson directory so inference
        code can reload them.
        """
        allWords = [word for review in reviews for word in review]

        # Remove stop words
        subWords = [word for word in allWords if word not in self.stopWordDict]

        # Count frequencies and sort, most frequent first
        wordCount = Counter(subWords)
        sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)

        # Remove low-frequency words
        words = [item[0] for item in sortWordCount if item[1] >= self.miniFreq]

        # Word list plus the pretrained weight matrix in the same order
        vocab, wordEmbedding = self.getWordEmbedding(words)

        self.wordEmbedding = wordEmbedding

        self.wordToIndex = dict(zip(vocab, range(len(vocab))))
        self.indexToWord = dict(zip(range(len(vocab)), vocab))
        # The Embedding layer's input_dim must equal the number of rows in the
        # weight matrix. The previous `len(wordToIndex) + 1` only matched when
        # exactly one corpus token collided with "pad" / "UNK" in the dict.
        self.n_symbols = len(wordEmbedding)

        # Persist the mappings so inference can load them directly
        with open(os.path.join(wordJson_, 'wordToIndex.json'), "w", encoding="utf-8") as f:
            json.dump(self.wordToIndex, f)

        with open(os.path.join(wordJson_, 'indexToWord.json'), "w", encoding="utf-8") as f:
            json.dump(self.indexToWord, f)

    def reviewProcess(self, review, sequenceLength, wordToIndex):
        """Map one tokenised review to a fixed-length array of vocabulary indices.

        Tokens beyond `sequenceLength` are dropped, missing positions stay 0,
        and tokens absent from `wordToIndex` get the "UNK" index.
        """
        reviewVec = np.zeros((sequenceLength))

        for i, word in enumerate(review[:sequenceLength]):
            # "UNK" is looked up only when needed, as before
            reviewVec[i] = wordToIndex[word] if word in wordToIndex else wordToIndex["UNK"]

        return reviewVec

    def genTrainEvalData(self, x, y, rate):
        """Split into train / eval sets.

        The first `rate` fraction of the samples (in file order, no shuffling)
        becomes the training set and the rest the evaluation set. Labels are
        one-hot encoded.

        Returns:
            trainReviews, trainLabels, evalReviews, evalLabels
        """
        # Convert every tokenised text to its index representation
        reviews = [self.reviewProcess(x[i], self.sequenceLength, self.wordToIndex)
                   for i in range(len(x))]
        labels = [[y[i]] for i in range(len(x))]

        trainIndex = int(len(x) * rate)

        trainReviews = np.asarray(reviews[:trainIndex], dtype="int64")
        trainLabels = np.array(labels[:trainIndex], dtype="float32")
        trainLabels = to_categorical(trainLabels, num_classes=self.numClasses)

        evalReviews = np.asarray(reviews[trainIndex:], dtype="int64")
        evalLabels = np.array(labels[trainIndex:], dtype="float32")
        # Show a few labels before and after one-hot encoding
        print(evalLabels[:3])
        evalLabels = to_categorical(evalLabels, num_classes=self.numClasses)
        print(evalLabels[:3])
        return trainReviews, trainLabels, evalReviews, evalLabels

    def dataGen(self):
        """Run the whole pipeline and store the train / eval arrays on `self`."""
        # Read the stop words
        self.readStopWord(self.stopWordSource)

        # Read and tokenise the dataset
        reviews, labels = self.readData(self.dataSource)

        # Build and save the word-index mappings and the pretrained weight matrix
        self.genVocabulary(reviews)

        # Initialise the training and evaluation sets
        trainReviews, trainLabels, evalReviews, evalLabels = self.genTrainEvalData(reviews, labels, self.rate)
        self.trainReviews = trainReviews
        self.trainLabels = trainLabels

        self.evalReviews = evalReviews
        self.evalLabels = evalLabels
        
        
# Build the dataset from the files named in `config`: dataGen() reads the stop
# words and the corpus, builds the vocabulary and embedding matrix, and fills
# data.trainReviews / trainLabels / evalReviews / evalLabels.
data = Dataset(config)
data.dataGen()

data: 119988 119988


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


真以 : 不存在于词向量中
多咯 : 不存在于词向量中
快完了 : 不存在于词向量中
Atat : 不存在于词向量中
rk : 不存在于词向量中
Havaalan : 不存在于词向量中
zO1RKFo : 不存在于词向量中
冷得直 : 不存在于词向量中
inches : 不存在于词向量中
土银 : 不存在于词向量中
高卢 : 不存在于词向量中
laurendancer : 不存在于词向量中
YeemanL : 不存在于词向量中
彭惠娟 : 不存在于词向量中
Christo : 不存在于词向量中
he1r76 : 不存在于词向量中
aoS1HS : 不存在于词向量中
汽车网 : 不存在于词向量中
横评 : 不存在于词向量中
武文涛 : 不存在于词向量中
晚景凄凉 : 不存在于词向量中
[[1.]
 [0.]
 [1.]]
[[0. 1.]
 [1. 0.]
 [0. 1.]]


In [None]:
# Report the shapes of the arrays produced by the dataset pipeline.
for caption, array in (("train data", data.trainReviews),
                       ("train label", data.trainLabels),
                       ("eval data", data.evalReviews)):
    print("{} shape: {}".format(caption, array.shape))

train data shape: (95990, 200)
train label shape: (95990, 2)
eval data shape: (23998, 200)


# 定义网络结构

In [None]:
class AttentionLayer(Layer):
    """Attention pooling over axis 1 of a rank-3 input.

    For an input of shape (batch, steps, features) the layer learns a
    (steps, steps) weight matrix and a (steps,) bias, turns them into softmax
    weights along the steps axis, and returns the weighted sum over steps with
    shape (batch, features).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight matrix and bias, both sized by input_shape[1]."""
        assert len(input_shape)==3
        steps = input_shape[1]
        self.W = self.add_weight(name='att_weight',
                                 shape=(steps, steps),
                                 initializer='uniform',
                                 trainable=True)
        self.b = self.add_weight(name='att_bias',
                                 shape=(steps,),
                                 initializer='uniform',
                                 trainable=True)
        super().build(input_shape)

    def call(self, inputs):
        """Return the attention-weighted sum of `inputs` over axis 1."""
        # (batch, steps, features) -> (batch, features, steps)
        swapped = backend.permute_dimensions(inputs, (0, 2, 1))
        # Scores and softmax weights along the last (steps) axis
        scores = backend.tanh(backend.dot(swapped, self.W) + self.b)
        attention = backend.softmax(scores)
        # Weight the inputs, restore (batch, steps, features), sum out the steps
        weighted = backend.permute_dimensions(attention * swapped, (0, 2, 1))
        return backend.sum(weighted, axis=1)

    def compute_output_shape(self, input_shape):
        """Output drops axis 1: (input_shape[0], input_shape[2])."""
        batch_dim, feature_dim = input_shape[0], input_shape[2]
        return batch_dim, feature_dim



def bigru(n_symbols,embedding_weights,config):
    """Build and compile the GRU + attention text classifier.

    Despite the name, the recurrent layer is a single forward GRU; it is not
    wrapped in Bidirectional.

    Args:
        n_symbols: size of the embedding table; must equal the number of rows
            in `embedding_weights`.
        embedding_weights: pretrained embedding matrix used to initialise the
            Embedding layer.
        config: object providing `embeddingSize`, `sequenceLength`,
            `dropoutKeepProb` and, optionally, `numClasses` (defaults to 2).

    Returns:
        A compiled Sequential model with a softmax output over the classes.
    """
    numClasses = getattr(config, 'numClasses', 2)

    model = Sequential([
        Embedding(input_dim=n_symbols, output_dim=config.embeddingSize,
                  weights=[embedding_weights],
                  input_length=config.sequenceLength),
        # return_sequences=True keeps every time step for the attention layer
        GRU(50, activation='tanh', dropout=0.5, recurrent_dropout=0.5, return_sequences=True),
        # Keras Dropout expects the fraction to DROP, while the config stores
        # the fraction to KEEP, so convert. Identical for the default of 0.5.
        Dropout(1.0 - config.dropoutKeepProb),
        AttentionLayer(),
        Dense(numClasses, activation='softmax')])

    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model
    
    
    
# Build the network from the dataset's embedding-table size and pretrained
# embedding matrix, then print the layer summary.
n_symbols = data.n_symbols
wordEmbedding = data.wordEmbedding
model = bigru(n_symbols=n_symbols, embedding_weights=wordEmbedding, config=config)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 150)          30130800  
_________________________________________________________________
gru (GRU)                    (None, 200, 50)           30300     
_________________________________________________________________
dropout (Dropout)            (None, 200, 50)           0         
_________________________________________________________________
attention_layer (AttentionLa (None, 50)                40200     
_________________________________________________________________
dense (Dense)                (None, 2)                 102       
Total params: 30,201,402
Trainable params: 30,201,402
Non-trainable params: 0
_________________________________________________________________


# 训练模型

In [None]:
x_train = data.trainReviews
y_train = data.trainLabels
x_eval = data.evalReviews
y_eval = data.evalLabels

wordEmbedding = data.wordEmbedding
n_symbols=data.n_symbols

# NOTE(review): reduce_lr waits 10 stagnant epochs but early stopping ends
# training after 5 (and config.epochs is 10), so with these settings the
# learning-rate reduction looks like it can never trigger. Confirm the intent.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=10, mode='auto')
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# Create the checkpoint directory up front, like every other directory in this
# notebook; writing the .hdf5 file fails if the folder is missing and the
# installed TensorFlow release does not create it itself.
best_model_dir_ = os.path.join(bigru_model_, 'best_model')
os.makedirs(best_model_dir_, exist_ok=True)
model_checkpoint = ModelCheckpoint(os.path.join(best_model_dir_, 'model_{epoch:02d}-{val_accuracy:.2f}.hdf5'), save_best_only=True, save_weights_only=True)

history = model.fit(x_train, y_train, batch_size=config.batchSize, epochs=config.epochs, validation_split=0.3,shuffle=True, callbacks=[reduce_lr,early_stopping,model_checkpoint])

# Evaluate on the held-out set
scores = model.evaluate(x_eval, y_eval)

# Save the architecture and the final weights.
# NOTE(review): to_yaml() already returns YAML text and yaml.dump() encodes
# that string a second time. Left as is, because whatever loads bigru.yml
# presumably expects this format.
yaml_string = model.to_yaml()
with open(os.path.join(bigru_model_, 'bigru.yml'), 'w') as outfile:
    outfile.write( yaml.dump(yaml_string, default_flow_style=True) )
model.save_weights(os.path.join(bigru_model_, 'bigru.h5'))


print('test_loss: %f, accuracy: %f' % (scores[0], scores[1]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
test_loss: 0.107263, accuracy: 0.973539


In [None]:
from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Predicted class ids for the evaluation set
result = np.argmax(model.predict(x_eval), axis=1)

# True class ids, always derived from the one-hot labels stored on `data`.
# The previous `y_eval = np.argmax(y_eval, axis=1)` consumed its own output,
# so running this cell a second time failed on the already 1-D array.
y_eval = np.argmax(data.evalLabels, axis=1)

print('acc:',accuracy_score(y_eval, result))
print('pc',precision_score(y_eval, result))
print('rc:',recall_score(y_eval, result))
print('f1:',f1_score(y_eval, result))

acc: 0.9255794616218018
pc 0.9033518121715545
rc: 0.951666242813099
f1: 0.9268882066473383
