In [1]:
import jieba
from tensorflow.keras import Model, utils
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout

In [2]:
# --- Training hyper-parameters -------------------------------------------
EPOCHS = 7            # maximum number of passes over the training split
BATCH_SIZE = 128      # samples per gradient step
MAXLEN = 600          # every document is padded/truncated to this many tokens
embedding_dims = 50   # width of each learned word vector

# --- Data location ---------------------------------------------------------
# Tab-separated training file; each line is read as "<label>\t<content>".
filename = "./data/cnews.train.txt"

#  预处理

In [3]:
%%time

# Text categories and the integer id assigned to each of them.
categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
cat_to_id = {category: idx for idx, category in enumerate(categories)}

# contents[i] is the jieba-segmented token list of document i,
# labels[i] is the integer category id of the same document.
contents, labels = [], []
skipped = 0  # records dropped because they were malformed or had an unknown label

with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
    for line in f:
        # Each record is "<label>\t<content>". Split on the first tab only so
        # that a tab inside the article body does not invalidate the record.
        parts = line.strip().split('\t', 1)
        if len(parts) != 2 or not parts[1]:
            skipped += 1
            continue

        label, content = parts
        label_id = cat_to_id.get(label)
        if label_id is None:
            # An unknown category would otherwise put None into `labels`
            # and break the one-hot encoding step further down.
            skipped += 1
            continue

        contents.append(list(jieba.cut(content)))
        labels.append(label_id)
        if len(labels) % 5000 == 0:
            print(f'已处理 {len(labels)} 条！！！')

# Report dropped records instead of hiding them behind a bare `except`.
if skipped:
    print(f'已跳过 {skipped} 条格式错误或类别未知的记录')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ZCF\AppData\Local\Temp\jieba.cache
Loading model cost 1.149 seconds.
Prefix dict has been built successfully.


已处理 5000 条！！！
已处理 10000 条！！！
已处理 15000 条！！！
已处理 20000 条！！！
已处理 25000 条！！！
已处理 30000 条！！！
已处理 35000 条！！！
已处理 40000 条！！！
已处理 45000 条！！！
已处理 50000 条！！！
Wall time: 4min 48s


In [4]:
# Map every token in `contents` to its integer index in the fitted vocabulary.
# `contents` already holds token lists, so no character filtering is requested.
lang_tokenizer = Tokenizer(filters='')
lang_tokenizer.fit_on_texts(contents)

# Convert each document to a sequence of token ids, then pad/truncate every
# sequence to exactly MAXLEN entries with Keras' pad_sequences.
contents_tensor = lang_tokenizer.texts_to_sequences(contents)
contents_tensor = pad_sequences(contents_tensor, maxlen=MAXLEN)

# Fail early with a clear message if any label could not be mapped to an id;
# to_categorical cannot encode None.
if any(label is None for label in labels):
    raise ValueError("labels contains entries that are not in `categories`")

# Features and one-hot targets. The number of classes follows `categories`
# so the encoding stays in sync if the category list changes.
X = contents_tensor
Y = utils.to_categorical(labels, num_classes=len(categories))

# Hold out 20% of the samples; random_state fixes the shuffle for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

# +1 because the fitted word indices start at 1 and index 0 is the padding value.
vocab_size = len(lang_tokenizer.index_word) + 1

# 定义模型

* 1. TextCNN 模型

In [5]:
class TextCNN(Model):
    """Convolutional text classifier with one Conv1D branch per kernel size.

    Token ids are embedded, each branch applies ``Conv1D`` followed by global
    max-pooling over the sequence, the pooled branch outputs are concatenated
    and a dense layer produces the final prediction.

    Args:
        maxlen: Length of the padded input sequences; forwarded to the
            embedding layer as ``input_length``.
        vocab_size: Number of rows in the embedding table.
        embedding_dims: Size of each embedding vector.
        kernel_sizes: Iterable of convolution window widths; one branch is
            created per entry.
        class_num: Number of units in the output layer.
        last_activation: Activation used by the output layer.
    """

    def __init__(self,
                 maxlen,
                 vocab_size,
                 embedding_dims,
                 kernel_sizes=(3, 4, 5),
                 class_num=10,
                 last_activation='softmax'):
        super().__init__()
        # Copy into a fresh list so the instance never shares (or mutates)
        # the caller's sequence or the default argument.
        kernel_sizes = list(kernel_sizes)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel size")

        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embedding_dims = embedding_dims
        self.kernel_sizes = kernel_sizes
        self.class_num = class_num
        self.last_activation = last_activation

        # NOTE(review): newer Keras releases appear to deprecate `input_length`;
        # confirm against the installed version before upgrading.
        self.embedding = Embedding(self.vocab_size, self.embedding_dims, input_length=self.maxlen)

        # One convolution + pooling pair per kernel size, all with 128 filters.
        self.convs = [Conv1D(128, size, activation='relu') for size in kernel_sizes]
        self.max_poolings = [GlobalMaxPooling1D() for _ in kernel_sizes]

        # Created once here rather than on every forward pass.
        self.concat = Concatenate()
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        """Run the forward pass on a batch of padded token-id sequences.

        Args:
            inputs: Batch of integer token ids fed to the embedding layer.

        Returns:
            The output of the final dense layer for each sample in the batch.
        """
        embedded = self.embedding(inputs)
        pooled = [pool(conv(embedded))
                  for conv, pool in zip(self.convs, self.max_poolings)]
        # With a single branch there is nothing to concatenate.
        x = pooled[0] if len(pooled) == 1 else self.concat(pooled)
        return self.classifier(x)

# 训练模型

In [6]:
optimizer = 'adam'
# The targets are one-hot vectors over mutually exclusive categories and the
# model ends in a softmax layer, so the matching loss is categorical
# cross-entropy. Binary cross-entropy would score the outputs as independent
# yes/no decisions and make the reported accuracy misleading.
loss = 'categorical_crossentropy'
metrics = ['accuracy']
# Stop when validation accuracy has not improved for 3 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')

In [7]:
# TextCNN

# Build the classifier from the notebook-level configuration and compile it
# with explicit keyword arguments so the argument roles are unambiguous.
model = TextCNN(MAXLEN, vocab_size, embedding_dims)
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Keep the returned History object so the per-epoch metrics can be inspected
# or plotted later instead of only appearing as an opaque cell output.
# The held-out split is used for validation and drives early stopping.
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    callbacks=[early_stopping],
                    validation_data=(x_test, y_test))

Train on 40000 samples, validate on 10000 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x200a0afbc48>