In [1]:
import numpy as np
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras import Sequential, layers, utils, models, regularizers
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

# Characters handed to the Keras Tokenizer as its `filters` argument.
# NOTE(review): the texts fitted below are already token lists, and the
# Tokenizer presumably applies `filters` only to raw strings — confirm
# whether punctuation tokens are meant to stay in the vocabulary.
TOKEN_FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'


def cj(x):
    """Segment the text ``x`` with jieba and return the tokens as a list."""
    return list(jieba.cut(x))


# Load the corpus; this script reads its 'words' and 'label' columns.
data = pd.read_csv('chaobigyuliaoku.csv')

# Replace every raw text in 'words' with its list of jieba tokens.
data['words'] = data['words'].apply(cj)

# Fit the word -> integer id mapping on the segmented corpus.
tokenizer = Tokenizer(filters=TOKEN_FILTERS)
tokenizer.fit_on_texts(data['words'])
vocab = tokenizer.word_index

def yl_preprocessing(maxlen=1000, test_size=0.1, random_state=24):
    """Split the corpus and turn it into padded id sequences plus labels.

    Reads the module-level ``data`` frame (columns 'words' and 'label')
    and the module-level fitted ``tokenizer``.

    Args:
        maxlen: Length every sequence is padded or truncated to by
            ``pad_sequences``. Defaults to 1000, the value the rest of the
            script uses.
        test_size: Fraction of samples held out for testing, passed to
            ``train_test_split``.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        A tuple ``(x_train_sequences, x_test_sequences, y_train, y_test)``
        where the first two are the padded id matrices and the last two
        are the integer labels produced by ``LabelEncoder``.
    """
    # Map the label column to integer class ids.
    encoder = LabelEncoder()
    y = encoder.fit_transform(data['label'].values)

    x_train, x_test, y_train, y_test = train_test_split(
        data['words'].values, y,
        test_size=test_size, random_state=random_state,
    )

    # Token lists -> id lists -> fixed-length rows.
    x_train_wordids = tokenizer.texts_to_sequences(x_train)
    x_test_wordids = tokenizer.texts_to_sequences(x_test)
    x_train_sequences = pad_sequences(x_train_wordids, maxlen=maxlen)
    x_test_sequences = pad_sequences(x_test_wordids, maxlen=maxlen)
    return x_train_sequences, x_test_sequences, y_train, y_test


def CNN_model(x_train_sequences, x_test_sequences, y_train, y_test, vocab):
    """Build, train, save and evaluate the 1-D convolutional text classifier.

    The network is trained for 10 epochs with batch size 500, written to
    'textcnn.h5', and then scored on the test set; the predicted classes,
    accuracy and weighted F1 are printed.

    Args:
        x_train_sequences: 2-D array of padded word ids used for training;
            its second dimension sets the embedding input length.
        x_test_sequences: 2-D array of padded word ids used for evaluation.
        y_train: Integer class ids for the training rows. They are one-hot
            encoded with 8 classes, so every id must be below 8.
        y_test: Integer class ids for the test rows.
        vocab: Word -> id mapping; only its size is used (embedding rows).

    Returns:
        None.
    """
    num_classes = 8
    # Take the sequence length from the data rather than repeating the
    # padding constant, so the model always matches its input.
    sequence_length = x_train_sequences.shape[1]

    model = Sequential()
    # +1 on the vocabulary size: one extra embedding row beyond len(vocab).
    model.add(layers.Embedding(len(vocab)+1, 300, input_length=sequence_length))
    model.add(layers.Conv1D(256, 5, padding='same'))
    model.add(layers.MaxPooling1D(3, 3, padding='same'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Conv1D(128, 5, padding='same'))
    model.add(layers.MaxPooling1D(3, 3, padding='same'))
    model.add(layers.Conv1D(64, 3, padding='same'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Flatten())
    model.add(layers.BatchNormalization())
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # categorical_crossentropy expects one-hot targets.
    one_hot_label = utils.to_categorical(y_train, num_classes=num_classes)
    print(x_train_sequences.shape)
    print(y_train.shape)
    model.fit(x_train_sequences, one_hot_label, batch_size=500, epochs=10)
    model.save('textcnn.h5')

    # predict_classes() was removed from recent Keras/TensorFlow releases;
    # argmax over the softmax output yields the same class ids.
    y_predict = np.argmax(model.predict(x_test_sequences), axis=-1)
    print(y_predict)
    print('accuracy: ', metrics.accuracy_score(y_test, y_predict))
    print('F1-Score: ', metrics.f1_score(y_test, y_predict, average='weighted'))

# Run preprocessing once and keep the four arrays together so they can be
# forwarded to the training routine in the order it expects.
train_test_data = yl_preprocessing()
x_train_sequences, x_test_sequences, y_train, y_test = train_test_data

# Show the encoded training labels before training starts.
print(y_train)

# Train, save and evaluate the classifier.
CNN_model(*train_test_data, vocab)

# Reload the trained classifier from disk and classify one sample text.
model = models.load_model('textcnn.h5')
sample_text = '6月27日晚，一则“成都某小区8岁男童被暴打至血流不止”的消息在不少群里流传。网友截屏显示，位于成都北门的某小区，一名少年将一名男童拖入地下室暴打两小时，致男童身受重伤，现场血迹斑斑。'

# Encode through the fitted tokenizer, exactly as the training data was
# encoded. A direct vocab[word] lookup raises KeyError for any token that
# was not seen during fitting; texts_to_sequences skips such tokens.
test_x = tokenizer.texts_to_sequences([jieba.lcut(sample_text)])[0]
word_pad = pad_sequences([test_x], maxlen=1000)

# predict_classes() was removed from recent Keras/TensorFlow releases;
# argmax over the softmax output yields the same class id.
print(np.argmax(model.predict(word_pad), axis=-1))

# Label names, presumably in class-id order 0..7 — confirm against the encoder:
# sports (体育), entertainment (娱乐), real estate (房产), education (教育),
# politics (时政), society (社会), technology (科技), finance (财经)




Using TensorFlow backend.
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.749 seconds.
Prefix dict has been built succesfully.


[6 5 3 ... 5 7 3]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 300)         56602800  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1000, 256)         384256    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 334, 256)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 334, 256)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 334, 128)          163968    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 112, 128)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 112, 64)           246