In [None]:
import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import glorot_normal

In [None]:
# Hyper-parameters and paths shared by the cells below.
MAX_SEQUENCE_LENGTH = 500  # every text is padded/truncated to this many tokens
MAX_NUM_WORDS = 28000  # vocabulary size cap used for the tokenizer and the embedding matrix
EMBEDDING_DIM = 200  # dimensionality of each word vector
VALIDATION_SPLIT = 0.16  # fraction of all samples held out for validation
TEST_SPLIT = 0.2  # fraction of all samples held out for testing
# Directory holding the GloVe text files. Set the GLOVE_DIR environment variable
# to point elsewhere; the original machine-specific location stays the default.
GLOVE_DIR = os.environ.get(
    'GLOVE_DIR',
    r'D:\v-yanx\masijia\text_mood_classification\glove.6B',
)

In [None]:
print("Indexing word vectors.")
# Maps each GloVe token to its float32 vector.
embeddings_index = {}
# Derive the file name from EMBEDDING_DIM so the loaded vectors always match the
# width of the embedding matrix (glove.6B.200d.txt when EMBEDDING_DIM is 200).
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.%dd.txt' % EMBEDDING_DIM)
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # Skip blank or malformed rows: values[0] would raise on an empty row, and a
        # vector of the wrong width cannot be copied into the embedding matrix later.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        word = values[0]  # the token
        coefs = np.asarray(values[1:], dtype='float32')  # its vector
        embeddings_index[word] = coefs
print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Load the object stored in words_dic.npy; .item() extracts it from the loaded array.
# NOTE(review): allow_pickle=True unpickles the file on load - only use a trusted file.
words_train = np.load('words_dic.npy', allow_pickle=True).item()
# One space-joined document per key, in iteration order.
x_train = [' '.join(words_train[name]) for name in words_train]
# The label is the last character of each key, parsed as an integer.
y_train = np.array([int(name[-1]) for name in words_train])

In [None]:
# Tokenizer whose vocabulary is capped through num_words=MAX_NUM_WORDS.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
# fit_on_texts builds the token dictionary from the documents (a list, one string per document).
tokenizer.fit_on_texts(x_train)
# texts_to_sequences turns every document into a list of word ids: one list per document.
sequences = tokenizer.texts_to_sequences(x_train)
# Show the first encoded document, then the document count (24500 in the author's run).
print(sequences[0], len(sequences), sep="\n")
 


In [None]:
# word_index: dict from word to its integer id (ids start at 1 per the original author's note).
word_index = tokenizer.word_index
print("Found %s unique tokens." % len(word_index))  # 72955 in the author's run
# Preview the first ten entries as two parallel lists, e.g.
# ['the', 'to', 'of', 'a', 'and', 'in', 'i', 'is', 'that', "'ax"] [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
preview = list(word_index.items())[:10]
print([token for token, _ in preview], [idx for _, idx in preview])


In [None]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH ids: longer ones are truncated,
# shorter ones are padded with 0. padding/truncating are spelled out at their
# Keras default value ('pre'), so behaviour is unchanged.
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='pre',
    truncating='pre',
)

In [None]:
labels = y_train
print("训练数据大小为：", data.shape)  # (24500, 500) in the author's run
print("标签大小为:", labels.shape)  # 1-D, one label per sample

num_samples = data.shape[0]
if labels.shape[0] != num_samples:
    raise ValueError(
        "data and labels disagree on sample count: %d vs %d"
        % (num_samples, labels.shape[0])
    )

# Shuffle samples and labels with the same permutation. The generator is seeded
# so that Restart & Run All reproduces the exact same train/val/test split.
SPLIT_SEED = 42
indices = np.arange(num_samples)
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data_shuffle = data[indices]
labels_shuffle = labels[indices]

num_validation_samples = int(VALIDATION_SPLIT * num_samples)
num_test_samples = int(TEST_SPLIT * num_samples)

# Use explicit non-negative boundaries. Negative slicing breaks when a held-out
# count is 0, because x[:-0] is empty and x[-0:] is the whole array.
train_end = num_samples - num_validation_samples - num_test_samples
val_end = num_samples - num_test_samples

# Training data
X_train = data_shuffle[:train_end]
Y_train = labels_shuffle[:train_end]

# Validation data
x_val = data_shuffle[train_end:val_end]
y_val = labels_shuffle[train_end:val_end]

# Test data
x_test = data_shuffle[val_end:]
y_test = labels_shuffle[val_end:]

In [None]:
# 准备词向量矩阵
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)  # 词汇表数量
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))  # 28000*200
 
for word, i in word_index.items():
    if i>= MAX_NUM_WORDS:  # 过滤掉根据频数排序后排28000以后的词
        continue
    embedding_vector = embeddings_index.get(word)  # 根据词向量字典获取该单词对应的词向量
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# 加载预训练的词向量到Embedding layer
embedding_layer = Embedding(input_dim=num_words,  # 词汇表单词数量
                            output_dim=EMBEDDING_DIM,  # 词向量维度
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,  # 文本或者句子截断长度
                            trainable=False)  # 词向量矩阵不进行训练

In [None]:
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import LearningRateScheduler
 
def scheduler(epoch, lr=None, decay=0.8):
    """Learning-rate schedule: scale the rate by ``decay`` at every epoch after the first.

    Args:
        epoch: Zero-based epoch index supplied by the callback.
        lr: Current learning rate. When the callback passes it, the function is
            self-contained. When omitted, the rate is read from (and written back
            to) the global ``model``'s optimizer, as the original one-argument
            version did.
        decay: Multiplicative factor applied per epoch (default 0.8).

    Returns:
        The learning rate to use for ``epoch`` as a Python float.
    """
    read_from_model = lr is None
    if read_from_model:
        lr = K.get_value(model.optimizer.lr)
    lr = float(lr)
    # Epoch 0 keeps the optimizer's initial rate untouched.
    if epoch == 0:
        return lr
    new_lr = lr * decay
    if read_from_model:
        # Legacy path: keep the optimizer in sync, matching the previous side effect.
        K.set_value(model.optimizer.lr, new_lr)
    print("lr changed to {}".format(new_lr))
    return new_lr
 
# Wrap the schedule function in a Keras LearningRateScheduler callback.
# NOTE(review): none of the model.fit(...) calls visible in this notebook pass
# callbacks=[reduce_lr], so the schedule is currently unused - confirm intent.
reduce_lr = LearningRateScheduler(scheduler)


In [None]:
print("开始训练模型.....")
# Baseline text CNN on top of the frozen embedding layer.
# Input: integer id sequences of shape (batch, MAX_SEQUENCE_LENGTH).
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# Embedded output: (batch, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM).
embedded_sequences = embedding_layer(sequence_input)

# Feature extractor, applied in order: three 128-filter convolutions with window 5,
# interleaved with pooling and dropout, then a 128-unit dense layer.
cnn_layers = [
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Dropout(0.2),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
]
x = embedded_sequences
for layer in cnn_layers:
    x = layer(x)
# Single sigmoid unit for the binary label.
preds = Dense(1, activation='sigmoid')(x)

model = Model(inputs=sequence_input, outputs=preds)
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

model.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
)
model.summary()
# NOTE(review): the later model cell saves to this same path and will overwrite it.
model.save("textClassifier.h5")

In [None]:
print("开始训练模型.....")


def _relu_glorot():
    """Keyword arguments shared by the trainable layers: ReLU plus a fresh Glorot-normal initializer."""
    return dict(activation='relu', kernel_initializer=glorot_normal(seed=None))


# Smaller text CNN: dropout on the embeddings, two 32-filter convolutions, a 32-unit dense layer.
# Input: integer id sequences of shape (batch, MAX_SEQUENCE_LENGTH).
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# Embedded output: (batch, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM).
embedded_sequences = embedding_layer(sequence_input)
x = Dropout(0.2)(embedded_sequences)
x = Conv1D(32, 5, **_relu_glorot())(x)  # 32 filters, window of 5
x = MaxPooling1D(5)(x)
x = Conv1D(32, 3, **_relu_glorot())(x)  # 32 filters, window of 3
x = GlobalMaxPooling1D()(x)
x = Dense(32, **_relu_glorot())(x)
# Single sigmoid unit for the binary label.
preds = Dense(1, activation='sigmoid')(x)

model = Model(inputs=sequence_input, outputs=preds)
model.summary()
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

model.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
)

model.save("textClassifier.h5")
# Report the values returned by evaluate() on the held-out test split.
test_metrics = model.evaluate(x_test, y_test)
print(test_metrics)