In [1]:
import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, Flatten
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import glorot_normal

Using TensorFlow backend.


# word2vec

In [2]:
MAX_SEQUENCE_LENGTH = 500  # 每个文本或者句子的截断长度，只保留500个单词
MAX_NUM_WORDS = 28000  # 用于构建词向量的词汇表数量
EMBEDDING_DIM = 200  # 词向量维度
VALIDATION_SPLIT = 0.2

GLOVE_DIR = r'D:\v-yanx\masijia\text_mood_classification\glove.6B'

In [3]:
print("Indexing word vectors.")
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]  # 单词
        coefs = np.asarray(values[1:], dtype='float32')  # 单词对应的向量
        embeddings_index[word] = coefs  # 单词及对应的向量

Indexing word vectors.


In [4]:
words_train = np.load('words_dic.npy',allow_pickle=True).item()
x_train = []
y_train = []
for key in words_train:
    sentance = ' '.join(words_train[key])
    x_train.append(sentance)
    y_train.append(int(key[-1]))
y_train = np.array(y_train)

In [5]:
words_test = np.load('test_words_dic.npy',allow_pickle=True).item()
x_test = []
for key in words_test:
    sentance = ' '.join(words_test[key])
    x_test.append(sentance)

In [6]:
len(x_test)

22000

In [7]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
# fit_on_text(texts) 使用一系列文档来生成token词典，texts为list类，每个元素为一个文档。就是对文本单词进行去重后
tokenizer.fit_on_texts(x_train)
# texts_to_sequences(texts) 将多个文档转换为word在词典中索引的向量形式,shape为[len(texts)，len(text)] -- (文档数，每条文档的长度)
sequences = tokenizer.texts_to_sequences(x_train)
t_sequences = tokenizer.texts_to_sequences(x_test)
print(sequences[0])
print(len(sequences))  # 24500
print(len(t_sequences))
 


[11, 47, 5, 29, 1, 1501, 737, 56, 208, 10, 28, 124, 109, 1730, 8522, 6, 3, 1228, 25, 2846, 9, 8978, 1482, 2, 296, 9, 200, 27, 398, 6, 337, 1208, 2, 337, 624, 10, 807, 12, 251, 97, 844, 145, 11, 2, 24, 457, 67, 3, 123, 105, 349, 8, 13, 163]
24500
22000


In [8]:
word_index = tokenizer.word_index  # word_index 一个dict，保存所有word对应的编号id，从1开始
print("Founnd %s unique tokens." % len(word_index))  # 72955个单词
# ['the', 'to', 'of', 'a', 'and', 'in', 'i', 'is', 'that', "'ax"] [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
print(list(word_index.keys())[0:10], list(word_index.values())[0:10])  #


Founnd 72955 unique tokens.
['the', 'and', 'a', 'of', 'to', 'is', 'br', 'it', 'in', 'i'] [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [9]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)  # 长度超过MAX_SEQUENCE_LENGTH则截断，不足则补0

In [10]:
X_test  = pad_sequences(t_sequences, maxlen=MAX_SEQUENCE_LENGTH)  # 长度超过MAX_SEQUENCE_LENGTH则截断，不足则补0

In [11]:
X_test.shape

(22000, 500)

In [12]:
data.shape

(24500, 500)

# train_val_split

In [13]:
labels = y_train
print("训练数据大小为：", data.shape)  # (24500, 500)
print("标签大小为:", labels.shape)  # (24500, 1)
 
# 将训练数据划分为训练集和验证集
indices = np.arange(data.shape[0])
np.random.seed(10)
np.random.shuffle(indices)  # 打乱数据
data_shuffle = data[indices]
labels_shuffle = labels[indices]
 
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
#num_test_samples = int(TEST_SPLIT * data.shape[0])
 
# 训练数据
X_train = data_shuffle[:-num_validation_samples]
Y_train = labels_shuffle[:-num_validation_samples]
 
# 验证数据
x_val = data_shuffle[-num_validation_samples:]
y_val = labels_shuffle[-num_validation_samples:]
#test data
# x_test = data_shuffle[-num_test_samples:]
# y_test = labels_shuffle[-num_test_samples:]


训练数据大小为： (24500, 500)
标签大小为: (24500,)


In [14]:
pos = 0
for i in y_val:
    if i==1:
        pos+=1
print(pos/y_val.shape[0])

0.5042857142857143


In [17]:
# 准备词向量矩阵
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)  # 词汇表数量
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))  # 28000*200
 
for word, i in word_index.items():
    if i>= MAX_NUM_WORDS:  # 过滤掉根据频数排序后排28000以后的词
        continue
    embedding_vector = embeddings_index.get(word)  # 根据词向量字典获取该单词对应的词向量
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# 加载预训练的词向量到Embedding layer
embedding_layer = Embedding(input_dim=num_words,  # 词汇表单词数量
                            output_dim=EMBEDDING_DIM,  # 词向量维度
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,  # 文本或者句子截断长度
                            trainable=False)  # 词向量矩阵不进行训练

# Sample CNN

In [59]:
from tensorflow.keras.callbacks import ModelCheckpoint
save_dir = r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_MLP'
weight_path = 'model_{epoch:02d}-{val_acc:.4f}_glorot_normal.hdf5'
checkpoints = ModelCheckpoint(os.path.join(save_dir,weight_path), monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='max')

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  # 返回一个张量，长度为500，也就是模型的输入为batch_size*500
embedded_sequences = embedding_layer(sequence_input)  

x = Conv1D(32, 5, activation='relu',kernel_initializer=glorot_normal(seed=10))(embedded_sequences)  # 输出的神经元个数为32，卷积的窗口大小为5
x = GlobalMaxPooling1D()(x)
x = Dense(16, activation='relu',kernel_initializer=glorot_normal(seed=10))(x)
preds = Dense(1, activation='sigmoid')(x)
model = Model(sequence_input, preds)
model.summary()


Model: "model_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_28 (InputLayer)        [(None, 500)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 500, 200)          5600000   
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 496, 32)           32032     
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 32)                0         
_________________________________________________________________
dense_43 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_44 (Dense)             (None, 1)                 17        
Total params: 5,632,577
Trainable params: 32,577
Non-trainable params: 5,600,000
___________________________________________

In [None]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.fit(X_train ,Y_train, batch_size=128, epochs=10, validation_data=(x_val, y_val),callbacks=[checkpoints])

In [61]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.fit(X_train ,Y_train, batch_size=128, epochs=10, validation_data=(x_val, y_val),callbacks=[checkpoints])

Train on 19600 samples, validate on 4900 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0xa994568d0>

In [75]:
from tensorflow.keras.models import load_model
model = load_model(r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_MLP\model_06-0.8641_glorot_normal.hdf5')
loss,acc = model.evaluate(x_val, y_val,batch_size=128,verbose=0)
print(acc)
prediction = model.predict(X_test)
np.savetxt(r'submit\word2vec\submission_model_06-0.8641_glorot_normal.txt',prediction, fmt='%.9f')

0.8640816


In [62]:
from tensorflow.keras.callbacks import ModelCheckpoint
save_dir = r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_MLP\2'
weight_path = 'model_{epoch:02d}-{val_acc:.4f}_glorot_normal.hdf5'
checkpoints = ModelCheckpoint(os.path.join(save_dir,weight_path), monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='max')

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  # 返回一个张量，长度为500，也就是模型的输入为batch_size*500
embedded_sequences = embedding_layer(sequence_input)  
x = Conv1D(16, 5, activation='relu',kernel_initializer=glorot_normal(seed=10))(embedded_sequences)  # 输出的神经元个数为16，卷积的窗口大小为5
x = GlobalMaxPooling1D()(x)
x = Dense(16, activation='relu',kernel_initializer=glorot_normal(seed=10))(x)
preds = Dense(1, activation='sigmoid')(x)
model = Model(sequence_input, preds)
model.summary()

Model: "model_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_29 (InputLayer)        [(None, 500)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 500, 200)          5600000   
_________________________________________________________________
conv1d_27 (Conv1D)           (None, 496, 16)           16016     
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 16)                0         
_________________________________________________________________
dense_45 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_46 (Dense)             (None, 1)                 17        
Total params: 5,616,305
Trainable params: 16,305
Non-trainable params: 5,600,000
___________________________________________

In [63]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.fit(X_train ,Y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val),callbacks=[checkpoints])

Train on 19600 samples, validate on 4900 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x64b9ca7da0>

In [76]:
model = load_model(r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_MLP\2\model_06-0.8602_glorot_normal.hdf5')
loss,acc = model.evaluate(x_val, y_val,batch_size=128,verbose=0)
print(acc)
prediction = model.predict(X_test)
np.savetxt(r'submit\word2vec\submission_CNN2_06-0.8602_glorot_normal.txt',prediction, fmt='%.9f')

0.8602041


# LSTM200

In [54]:
from tensorflow.keras.layers import LSTM, Embedding
from tensorflow.keras.models import Sequential

from tensorflow.keras.callbacks import ModelCheckpoint
save_dir = r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_LSTM'
weight_path = 'model_{epoch:02d}-{val_acc:.4f}_glorot_normal.hdf5'
checkpoints = ModelCheckpoint(os.path.join(save_dir,weight_path), monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='max')

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  # 返回一个张量，长度为500
embedded_sequences = embedding_layer(sequence_input)  
x = LSTM(200, dropout=0.2, recurrent_dropout=0.2)(embedded_sequences)  
x = Dropout(0.2)(x)
preds = Dense(1, activation='sigmoid')(x)
model = Model(sequence_input, preds)
model.summary()

Model: "model_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_26 (InputLayer)        [(None, 500)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 500, 200)          5600000   
_________________________________________________________________
lstm_4 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout_5 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_41 (Dense)             (None, 1)                 201       
Total params: 5,921,001
Trainable params: 321,001
Non-trainable params: 5,600,000
_________________________________________________________________


In [55]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.fit(X_train ,Y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val),callbacks=[checkpoints])

Train on 19600 samples, validate on 4900 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x64b26c2d30>

In [77]:
model = load_model(r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_LSTM\model_20-0.8831_glorot_normal.hdf5')
loss,acc = model.evaluate(x_val, y_val,batch_size=128,verbose=0)
print(acc)
prediction = model.predict(X_test)
np.savetxt(r'submit\word2vec\submission_LSTM200_20-0.8831_glorot_normal.txt',prediction, fmt='%.9f')

0.88306123


# LSTM350

In [56]:
from tensorflow.keras.layers import LSTM, Embedding
from tensorflow.keras.models import Sequential

from tensorflow.keras.callbacks import ModelCheckpoint
save_dir = r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_LSTM\2'
weight_path = 'model_{epoch:02d}-{val_acc:.4f}_glorot_normal.hdf5'
checkpoints = ModelCheckpoint(os.path.join(save_dir,weight_path), monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='max')

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  # 返回一个张量，长度为500
embedded_sequences = embedding_layer(sequence_input)  
x = LSTM(350, dropout=0.2, recurrent_dropout=0.2)(embedded_sequences)  
x = Dropout(0.2)(x)
preds = Dense(1, activation='sigmoid')(x)
model = Model(sequence_input, preds)
model.summary()

Model: "model_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_27 (InputLayer)        [(None, 500)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 500, 200)          5600000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 350)               771400    
_________________________________________________________________
dropout_6 (Dropout)          (None, 350)               0         
_________________________________________________________________
dense_42 (Dense)             (None, 1)                 351       
Total params: 6,371,751
Trainable params: 771,751
Non-trainable params: 5,600,000
_________________________________________________________________


In [57]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.fit(X_train ,Y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val),callbacks=[checkpoints])

Train on 19600 samples, validate on 4900 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x64b926f6d8>

In [78]:
model = load_model(r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_LSTM\2\model_19-0.8831_glorot_normal.hdf5')
loss,acc = model.evaluate(x_val, y_val,batch_size=128,verbose=0)
print(acc)
prediction = model.predict(X_test)
np.savetxt(r'submit\word2vec\submission_LSTM350_19-0.8831_glorot_normal.txt',prediction, fmt='%.9f')

0.88306123


# LSTM100

In [64]:
from tensorflow.keras.layers import LSTM, Embedding
from tensorflow.keras.models import Sequential

from tensorflow.keras.callbacks import ModelCheckpoint
save_dir = r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_LSTM\3'
weight_path = 'model_{epoch:02d}-{val_acc:.4f}_glorot_normal.hdf5'
checkpoints = ModelCheckpoint(os.path.join(save_dir,weight_path), monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='max')

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  
embedded_sequences = embedding_layer(sequence_input)  
x = LSTM(100, dropout=0.2, recurrent_dropout=0.2)(embedded_sequences)  # 输出的神经元个数为100
x = Dropout(0.2)(x)
preds = Dense(1, activation='sigmoid')(x)
model = Model(sequence_input, preds)
model.summary()

Model: "model_28"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_30 (InputLayer)        [(None, 500)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 500, 200)          5600000   
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dropout_7 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_47 (Dense)             (None, 1)                 101       
Total params: 5,720,501
Trainable params: 120,501
Non-trainable params: 5,600,000
_________________________________________________________________


In [65]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.fit(X_train ,Y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val),callbacks=[checkpoints])

Train on 19600 samples, validate on 4900 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0xa99c5f9e8>

In [79]:
model = load_model(r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_LSTM\3\model_17-0.8884_glorot_normal.hdf5')
loss,acc = model.evaluate(x_val, y_val,batch_size=128,verbose=0)
print(acc)
prediction = model.predict(X_test)
np.savetxt(r'submit\word2vec\submission_LSTM100_17-0.8884_glorot_normal.txt',prediction, fmt='%.9f')

0.88836735


# LSTM250

In [66]:
from tensorflow.keras.layers import LSTM, Embedding
from tensorflow.keras.models import Sequential

from tensorflow.keras.callbacks import ModelCheckpoint
save_dir = r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_LSTM\250'
weight_path = 'model_{epoch:02d}-{val_acc:.4f}_glorot_normal.hdf5'
checkpoints = ModelCheckpoint(os.path.join(save_dir,weight_path), monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='max')

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  
embedded_sequences = embedding_layer(sequence_input)  
x = LSTM(250, dropout=0.2, recurrent_dropout=0.2)(embedded_sequences)  
x = Dropout(0.2)(x)
preds = Dense(1, activation='sigmoid')(x)
model = Model(sequence_input, preds)
model.summary()

Model: "model_29"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_31 (InputLayer)        [(None, 500)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 500, 200)          5600000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 250)               451000    
_________________________________________________________________
dropout_8 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_48 (Dense)             (None, 1)                 251       
Total params: 6,051,251
Trainable params: 451,251
Non-trainable params: 5,600,000
_________________________________________________________________


In [67]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.fit(X_train ,Y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val),callbacks=[checkpoints])

Train on 19600 samples, validate on 4900 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x64c3164320>

In [80]:
model = load_model(r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_LSTM\250\model_15-0.8902_glorot_normal.hdf5')
loss,acc = model.evaluate(x_val, y_val,batch_size=128,verbose=0)
print(acc)
prediction = model.predict(X_test)
np.savetxt(r'submit\word2vec\submission_LSTM250_15-0.8902_glorot_normal.txt',prediction, fmt='%.9f')

0.8902041


# LSTM150

In [68]:
from tensorflow.keras.layers import LSTM, Embedding
from tensorflow.keras.models import Sequential

from tensorflow.keras.callbacks import ModelCheckpoint
save_dir = r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_LSTM\150'
weight_path = 'model_{epoch:02d}-{val_acc:.4f}_glorot_normal.hdf5'
checkpoints = ModelCheckpoint(os.path.join(save_dir,weight_path), monitor='val_acc', verbose=0, save_best_only=False, save_weights_only=False, mode='max')

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32') 
embedded_sequences = embedding_layer(sequence_input)  
x = LSTM(150, dropout=0.2, recurrent_dropout=0.2)(embedded_sequences)  
x = Dropout(0.2)(x)
preds = Dense(1, activation='sigmoid')(x)
model = Model(sequence_input, preds)
model.summary()

Model: "model_30"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_32 (InputLayer)        [(None, 500)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 500, 200)          5600000   
_________________________________________________________________
lstm_8 (LSTM)                (None, 150)               210600    
_________________________________________________________________
dropout_9 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_49 (Dense)             (None, 1)                 151       
Total params: 5,810,751
Trainable params: 210,751
Non-trainable params: 5,600,000
_________________________________________________________________


In [69]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.fit(X_train ,Y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val),callbacks=[checkpoints])

Train on 19600 samples, validate on 4900 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x64c7ddfbe0>

In [81]:
model = load_model(r'D:\v-yanx\masijia\text_mood_classification\text_mood_classification\word2vec_LSTM\150\model_19-0.8820_glorot_normal.hdf5')
loss,acc = model.evaluate(x_val, y_val,batch_size=128,verbose=0)
print(acc)
prediction = model.predict(X_test)
np.savetxt(r'submit\word2vec\submission_LSTM150_19-0.8820_glorot_normal.txt',prediction, fmt='%.9f')

0.8820408


# Word2vec + SVM

In [83]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  # 返回一个张量，长度为500
embedded_sequences = embedding_layer(sequence_input)
model = Model(sequence_input, embedded_sequences)
model.summary()

Model: "model_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_33 (InputLayer)        [(None, 500)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 500, 200)          5600000   
Total params: 5,600,000
Trainable params: 0
Non-trainable params: 5,600,000
_________________________________________________________________


In [84]:
X = model.predict(data)

X.shape

(24500, 500, 200)

In [85]:
labels = y_train
print("训练数据大小为：", X.shape)  # (24500, 500)
print("标签大小为:", labels.shape)  # (24500, 1)
 
# 将训练数据划分为训练集和验证集
indices = np.arange(X.shape[0])
np.random.seed(10)
np.random.shuffle(indices)  # 打乱数据
X_shuffle = X[indices]
labels_shuffle = labels[indices]
 
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
#num_test_samples = int(TEST_SPLIT * data.shape[0])
 
# 训练数据
X_train = X_shuffle[:-num_validation_samples]
Y_train = labels_shuffle[:-num_validation_samples]
print(X_train.shape)
 
# 验证数据
x_val = X_shuffle[-num_validation_samples:]
y_val = labels_shuffle[-num_validation_samples:]

训练数据大小为： (24500, 500, 200)
标签大小为: (24500,)
(19600, 500, 200)


In [88]:
m = np.sum(X_train,axis = 1)

m.shape

(19600, 500)

In [92]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='lbfgs',
                               multi_class='ovr').fit(np.sum(X_train,axis = 1)/500, Y_train)
y_pred = clf.predict(np.sum(x_val,axis = 1)/500)
y_pred_prob = clf.predict_proba(np.sum(x_val,axis = 1)/500) 
acc = clf.score(np.sum(x_val,axis = 1)/500, y_val)
print(acc)
acc = clf.score(np.sum(X_train,axis = 1)/500, Y_train)
print(acc)



0.8071428571428572
0.8131632653061225


In [93]:
# rbf SVM
from sklearn.svm import SVC
clf = SVC(kernel='rbf')
clf.fit(np.sum(X_train,axis = 1)/500, Y_train) 
y_pred = clf.predict(np.sum(x_val,axis = 1)/500)
acc = clf.score(np.sum(x_val,axis = 1)/500, y_val)
print(acc)
acc = clf.score(np.sum(X_train,axis = 1)/500, Y_train)
print(acc)



0.656530612244898
0.6657142857142857


In [24]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(np.sum(X_train,axis = 1)/X_train.shape[0], Y_train) 
y_pred = clf.predict(np.sum(x_val,axis = 1)/x_val.shape[0])
acc = clf.score(np.sum(x_val,axis = 1)/x_val.shape[0], y_val)
print(acc)



0.4982142857142857
