In [30]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import numpy as np
from keras.callbacks import EarlyStopping
from keras import Input,Model
from keras.layers import Embedding, GlobalAveragePooling1D, Dense
from sklearn.metrics import accuracy_score

In [28]:
# Hyper-parameters for the unigram experiment.
maxlen = 400          # every review is padded / truncated to this many token ids
max_features = 2000   # passed to imdb.load_data as num_words

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data('./imdb.npz',num_words=max_features)

print(len(x_train),'train_lenth')
# Fixed: this line reports the size of the *test* split, so it must not be
# labelled "train" as it was before.
print(len(x_test),'test_lenth')

# dtype=int makes np.mean return an integer average.
print('Average train sequence length:{}'.format(np.mean(list(map(len,x_train)),dtype=int)))
print('Average test sequence length:{}'.format(np.mean(list(map(len,x_test)),dtype=int)))

# Pad (or truncate) every review to the same length so the data becomes a
# rectangular (num_samples, maxlen) array.
print('pad sequences....')
x_train = sequence.pad_sequences(x_train,maxlen = maxlen)
x_test = sequence.pad_sequences(x_test,maxlen = maxlen)
print('x_train:',x_train.shape)
print('x_test:',x_test.shape)

Loading data...
25000 train_lenth
25000 train_lenth
Average train sequence length:238
Average test sequence length:230
pad sequences....
x_train: (25000, 400)
x_test: (25000, 400)


In [24]:
embedding_dims = 50


class FastText(object):
    """Builder for a fastText-style classifier.

    The network is: Embedding -> GlobalAveragePooling1D -> Dense.

    Args:
        maxlen: number of token ids per input sample.
        max_features: first argument of the Embedding layer (input dimension).
        embedding_dims: size of each embedding vector.
        class_num: number of units in the final Dense layer.
        last_activation: activation of the final Dense layer.
    """

    def __init__(self, maxlen, max_features, embedding_dims, class_num=1,
                 last_activation='sigmoid'):
        # Plain configuration holder; the Keras graph is only built in get_model().
        self.maxlen, self.max_features = maxlen, max_features
        self.embedding_dims = embedding_dims
        self.class_num, self.last_activation = class_num, last_activation

    def get_model(self):
        """Assemble and return the (uncompiled) Keras Model."""
        token_ids = Input((self.maxlen,))
        # The embedding layer maps each (sparse) token id to a dense vector; it
        # can be seen as a fully connected layer without an activation function.
        embedded = Embedding(
            self.max_features,
            self.embedding_dims,
            input_length=self.maxlen,
        )(token_ids)
        # Global average pooling averages over the sequence axis, leaving one
        # value per embedding dimension for each sample.
        pooled = GlobalAveragePooling1D()(embedded)
        # Final fully connected layer producing the prediction.
        prediction = Dense(self.class_num, activation=self.last_activation)(pooled)
        return Model(inputs=token_ids, outputs=prediction)

In [31]:
batch_size =32
epochs =20

print('构造模型...')
model = FastText(maxlen, max_features, embedding_dims).get_model()
# The metric is requested as 'acc' (not 'accuracy') on purpose: newer Keras
# releases name the logged metric after the string given here, so 'accuracy'
# would be logged as 'val_accuracy' and the EarlyStopping callback below
# (monitor='val_acc') would silently never trigger. 'acc' yields 'val_acc'
# on old and new releases alike.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])


print('训练模型.....')
# Accuracy should be maximised, hence mode='max'. A validation set is passed
# to fit(), so the validation metric (val_acc) is monitored rather than acc.
# EarlyStopping parameter notes: https://blog.csdn.net/silent56_th/article/details/72845912
early_stopping = EarlyStopping(monitor='val_acc', patience = 3, mode = 'max')
# NOTE(review): the test split is used as validation data, so early stopping
# is tuned on the same samples the final accuracy is reported on. Consider
# holding out part of the training data instead.
model.fit(x_train,y_train,
         batch_size = batch_size,
         epochs = epochs,
         callbacks = [early_stopping],
         validation_data = (x_test,y_test))

print('模型测试.......')
# Threshold the sigmoid outputs at 0.5 and flatten (n, 1) -> (n,) so the
# predictions have the same shape and integer type as y_test.
result = (model.predict(x_test) >= 0.5).astype('int32').ravel()
# accuracy_score expects (y_true, y_pred) in that order.
acc = accuracy_score(y_test, result)
print('test data accuracy is:', acc)

构造模型...
训练模型.....
Train on 25000 samples, validate on 25000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
模型测试.......
test data accuracy is: 0.87896


## 2-gram 增加二元词组信息

In [37]:
def create_ngram_set(input_list, ngram_value =2):
    """
    Collect the distinct n-grams that occur in a sequence of integers.

    Every n-gram is a tuple of `ngram_value` consecutive items of `input_list`.
    A sequence shorter than `ngram_value` yields an empty set.

    # >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(1, 4), (4, 9), (9, 4), (4, 1)}
    # >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    """
    # One slice per offset 0 .. ngram_value-1. Zipping the slices lines up
    # consecutive items and stops as soon as the shortest slice is exhausted.
    shifted = [input_list[offset:] for offset in range(ngram_value)]
    windows = zip(*shifted)
    return set(windows)

def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment each sequence by appending the ids of the n-grams it contains.

    For every n in 2..ngram_range, each window of n consecutive items of the
    ORIGINAL sequence is looked up in `token_indice`; when present, the mapped
    id is appended to the output sequence. The inputs are not modified.

    Args:
        sequences: iterable of integer sequences (lists, tuples, arrays, ...).
        token_indice: dict mapping n-gram tuples to integer ids.
        ngram_range: largest n-gram size to look up (values below 2 add nothing).

    Returns:
        A list with one new list per input sequence.

    Example: adding bi-gram
    # >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    # >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    # >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    # >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    # >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    # >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        # list(...) copies the sequence and also accepts tuples / arrays,
        # which a plain slice copy followed by .append() would not.
        original = list(input_list)
        new_list = list(original)
        for ngram_value in range(2, ngram_range + 1):
            # Scan the original tokens only. Scanning new_list (as before)
            # would, for ngram_range >= 3, build windows that mix real tokens
            # with n-gram ids appended by an earlier pass and could match
            # n-grams that never occur in the sequence.
            for i in range(len(original) - ngram_value + 1):
                ngram = tuple(original[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences

In [41]:
# Hyper-parameters for the n-gram experiment. They are re-declared here so the
# cell does not depend on values left over from the unigram cells above
# (max_features in particular is overwritten further down).
ngram_range = 2
max_features = 2000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 20

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data('./imdb.npz',num_words=max_features)

print(len(x_train),'train_lenth')
# Fixed: this line reports the size of the *test* split, so it must not be
# labelled "train" as it was before.
print(len(x_test),'test_lenth')

# dtype=int makes np.mean return an integer average.
print('Average train sequence length:{}'.format(np.mean(list(map(len,x_train)),dtype=int)))
print('Average test sequence length:{}'.format(np.mean(list(map(len,x_test)),dtype=int)))


if ngram_range>1:
    print('add {}-gram features.....'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2,ngram_range+1):
            set_of_ngram =  create_ngram_set(input_list, ngram_value = i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features +1
    token_indice = {v:k+start_index for k,v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys()))+1

    # Augmenting x_train and x_test with n-grams features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))


print('*'*20)
# Pad (or truncate) every review to the same length.
# NOTE(review): the n-gram ids are appended at the END of each review and the
# augmented reviews are on average longer than maxlen (476 vs 400 in the
# recorded run). pad_sequences truncates from the FRONT by default
# (truncating='pre'), so long reviews lose their leading word tokens.
# Consider raising maxlen for this experiment.
print('pad sequences....')
x_train = sequence.pad_sequences(x_train,maxlen = maxlen)
x_test = sequence.pad_sequences(x_test,maxlen = maxlen)
print('x_train:',x_train.shape)
print('x_test:',x_test.shape)

print('*'*20)
print('构造模型...')
model = FastText(maxlen, max_features, embedding_dims).get_model()
# The metric is requested as 'acc' (not 'accuracy') on purpose: newer Keras
# releases name the logged metric after the string given here, so 'accuracy'
# would be logged as 'val_accuracy' and the EarlyStopping callback below
# (monitor='val_acc') would silently never trigger. 'acc' yields 'val_acc'
# on old and new releases alike.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print('*'*20)
print('训练模型.....')
# Accuracy should be maximised, hence mode='max'. A validation set is passed
# to fit(), so the validation metric (val_acc) is monitored rather than acc.
# EarlyStopping parameter notes: https://blog.csdn.net/silent56_th/article/details/72845912
early_stopping = EarlyStopping(monitor='val_acc', patience = 3, mode = 'max')
# NOTE(review): the test split is used as validation data, so early stopping
# is tuned on the same samples the final accuracy is reported on. Consider
# holding out part of the training data instead.
model.fit(x_train,y_train,
         batch_size = batch_size,
         epochs = epochs,
         callbacks = [early_stopping],
         validation_data = (x_test,y_test))

print('*'*20)
print('模型测试.......')
# Threshold the sigmoid outputs at 0.5 and flatten (n, 1) -> (n,) so the
# predictions have the same shape and integer type as y_test.
result = (model.predict(x_test) >= 0.5).astype('int32').ravel()
# accuracy_score expects (y_true, y_pred) in that order.
acc = accuracy_score(y_test, result)
print('test data accuracy is:', acc)

Loading data...
25000 train_lenth
25000 train_lenth
Average train sequence length:238
Average test sequence length:230
add 2-gram features.....
Average train sequence length: 476
Average test sequence length: 452
********************
pad sequences....
x_train: (25000, 400)
x_test: (25000, 400)
********************
构造模型...
********************
训练模型.....
Train on 25000 samples, validate on 25000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
********************
模型测试.......
test data accuracy is: 0.896
