#### 一. 购物评价情感分析
1. 本数据集包含两万多条中文标注语料，涉及六个领域的评论数据  
 对这些评论数据先分词, 再导入预训练好的词向量, 构建RNN模型进行情感分析
<img src='img/zhongwenpingluing.jpg' height='60%' width='60%'>

#### 二. 训练过程
1. 读取文件, 构建标签
2. 对文件内容分词, 使用gensim训练词向量, 并使用词向量中的单词创建词典
3. 使用词典将文本内容转换成index
4. 训练神经网络,保存模型和参数
5. 加载模型文件,对新的评论作出预测

In [1]:
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
import pandas as pd
import numpy as np
import jieba
from gensim.models import word2vec
from gensim.corpora.dictionary import Dictionary

#### 第一步:读取文件, 构建标签

In [2]:
# 1. Load the data files.
# Both workbooks are read without a header row; column 0 is used as the text.
# `index_col` is the read_excel keyword for the index column; the previous
# `index=None` is not a documented read_excel parameter.
neg_df = pd.read_excel('/home/lj/Downloads/neg.xls', header=None, index_col=None)
pos_df = pd.read_excel('/home/lj/Downloads/pos.xls', header=None, index_col=None)

# Positive reviews come first, then negative ones, so the label vector
# (ones followed by zeros) lines up row-for-row with the corpus.
corpus = pd.concat((pos_df[0], neg_df[0]))
labels = np.concatenate((np.ones(pos_df.shape[0]), np.zeros(neg_df.shape[0])))

print('corpus:',corpus.shape,'labels:',labels.shape)
corpus.head()

corpus: (21105,) labels: (21105,)


0    做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持一...
1    作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到...
2    作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点。为什么荷兰曾经县有欧洲最高的生产...
3    作者在战几时之前用了＂拥抱＂令人叫绝．日本如果没有战败，就有会有美军的占领，没胡官僚主义的延...
4    作者在少年时即喜阅读，能看出他精读了无数经典，因而他有一个庞大的内心世界。他的作品最难能可贵...
Name: 0, dtype: object

#### 第二步:分词

In [3]:
# 2.分词
# Translation table used by strQ2B: the full-width forms U+FF01..U+FF5E map to
# their ASCII counterparts (code point minus 65248), and the full-width space
# U+3000 maps to the ASCII space.
_FULLWIDTH_TO_HALFWIDTH = {code: code - 65248 for code in range(65281, 65375)}
_FULLWIDTH_TO_HALFWIDTH[12288] = 32


def strQ2B(ustring):
    """Convert full-width characters to their half-width equivalents.

    Args:
        ustring: the string to convert. An iterable of strings is also
            accepted; its pieces are converted and concatenated.

    Returns:
        A single string in which the full-width space and the full-width
        forms U+FF01..U+FF5E are replaced by their half-width counterparts.
        Every other character is kept unchanged.
    """
    if isinstance(ustring, str):
        # One C-level pass instead of rebuilding the string char by char.
        return ustring.translate(_FULLWIDTH_TO_HALFWIDTH)
    return ''.join(piece.translate(_FULLWIDTH_TO_HALFWIDTH) for piece in ustring)

def jieba_cut(_sentence):
    """Tokenize a sentence with jieba and drop stop words.

    Each token returned by ``jieba.lcut`` is passed through ``strQ2B`` and
    kept only if the converted token is not in ``stp_list``.

    Returns:
        The list of remaining (converted) tokens, in their original order.
    """
    kept_tokens = []
    for raw_token in jieba.lcut(_sentence):
        token = strQ2B(raw_token)
        if token in stp_list:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Load the stop words, one per line.
# - The context manager closes the file handle once the content is read.
# - The encoding is given explicitly (the file name indicates UTF-8) instead
#   of depending on the platform default.
# - A set is used because jieba_cut tests every token for membership; a list
#   would be scanned linearly for each token.
with open('../../data/stop_words_utf8.txt', encoding='utf-8') as stop_words_file:
    stp_list = set(stop_words_file.read().splitlines())
corpus_cut = corpus.apply(jieba_cut)

corpus_cut.head()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.661 seconds.
Prefix dict has been built succesfully.


0    [做, 父母, 一定, 刘墉, 心态, 不断, 学习, 不断, 进步, 不断, 补充, 新鲜...
1    [作者, 真有, 英国人, 严谨, 风格, 提出, 观点, 进行, 论述, 论证, 物理学,...
2    [作者, 长篇大论, 借用, 详细, 报告, 数据处理, 工作, 计算结果, 支持, 其新,...
3    [作者, 战, 之前, 拥抱, 令人, 叫绝, 日本, 战败, 美军, 占领, 没胡, 官僚...
4    [作者, 少年, 时即, 喜, 阅读, 看出, 精读, 无数, 经典, 一个, 庞大, 内心...
Name: 0, dtype: object

#### 第三步:gensim训练词向量, 转换文本文件为index, 构造嵌入矩阵

In [23]:
# 3. Train word embeddings with gensim and build a dictionary from them.
gensim_model = word2vec.Word2Vec(
    corpus_cut,
    size=100,
    min_count=5,
    workers=8,
)
gensim_model.save('Word2vec_model.pkl')

# gensim_model.wv.vocab is a dict {word: <gensim.models.keyedvectors.Vocab>};
# only its keys (the words) are needed here.
all_tockens = gensim_model.wv.vocab.keys()
gensim_dictionary = Dictionary()
# The bag-of-words result is discarded: doc2bow is called with
# allow_update=True so that the tokens are added to the dictionary.
gensim_dictionary.doc2bow(all_tockens, allow_update=True)
dictionary_items = list(gensim_dictionary.items())
print('gensim_dictionary中的元素为(index,word):', dictionary_items[2000:2010])

gensim_dictionary中的元素为(index,word) [(2000, '众所周知'), (2001, '优'), (2002, '优于'), (2003, '优优'), (2004, '优先'), (2005, '优势'), (2006, '优化'), (2007, '优惠'), (2008, '优派'), (2009, '优点')]


In [None]:
# Intermediate lookups used to convert text to indices and to fill the
# embedding matrix.
# word -> index, shifted by one so that index 0 is never assigned to a word.
word_index = {
    word: dictionary_id + 1
    for dictionary_id, word in gensim_dictionary.items()
}
# word -> embedding vector taken from the trained gensim model.
word_vector = {word: gensim_model.wv[word] for word in word_index}

In [28]:
# 4. Convert the tokenized text to index sequences.
def wordlist2indexlist(wordlist):
    """Map tokens to their ``word_index`` values, skipping unknown tokens."""
    indices = []
    for token in wordlist:
        if token in word_index:
            indices.append(word_index[token])
    return indices
corpus_index = corpus_cut.apply(wordlist2indexlist)
corpus_index.head()

0    [2334, 9358, 747, 2919, 6184, 1185, 5081, 1185...
1    [2159, 10022, 1390, 13332, 7156, 11558, 12450,...
2    [2159, 12929, 2288, 11810, 6857, 5614, 7286, 1...
3    [2159, 1538, 6937, 1941, 7671, 3316, 5885, 661...
4    [2159, 5504, 4098, 13003, 9969, 10550, 7614, 1...
Name: 0, dtype: object

In [29]:
# Build the embedding matrix: the row at a word's index holds that word's
# vector. Row 0 is never assigned and stays all zeros.
m = len(word_index) + 1  # one row per word, plus the unassigned row 0
n = 100                  # number of columns (embedding width)
embedding_matrix = np.zeros(shape=(m, n))
for token, row in word_index.items():
    embedding_matrix[row] = word_vector[token]

#### 第四步:构建神经网络

In [30]:
# Build the neural network.
from keras.models import Sequential
from keras.layers import Embedding,Flatten,LSTM,Dense,Dropout
from keras import preprocessing
from sklearn.model_selection import train_test_split

# Sentence length: every index sequence is padded/truncated to this length.
maxlen = 100
corpus_index_pad = preprocessing.sequence.pad_sequences(corpus_index, maxlen=maxlen)
# Hold out 20% of the samples as the test split.
x_train, x_test, y_train, y_test = train_test_split(corpus_index_pad, labels, test_size=0.2)

# Embedding -> two stacked LSTMs -> dropout -> single sigmoid output.
model = Sequential()
# The Embedding layer is created with default arguments apart from its shape,
# so mask_zero is left at its default.
model.add(Embedding(m, n, input_length=maxlen))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Load the Word2Vec vectors into the Embedding layer only. The weights are set
# on the layer itself: the model-level set_weights expects one array per
# weight of every layer, not just the embedding matrix.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False  # freeze the Embedding layer

model.summary()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          1356700   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 128)          117248    
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,523,421
Trainable params: 166,721
Non-trainable params: 1,356,700
_________________________________________________________________


In [31]:
# Configure training: binary cross-entropy loss, RMSprop optimizer, and
# accuracy reported as the metric.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs in batches of 32; the test split is passed as the
# validation data reported after each epoch.
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)

Train on 16884 samples, validate on 4221 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fb7d00a3b38>

#### 第五步: 评价新的评论文字

In [32]:
# Evaluate on the test split and print the result.
test_scores = model.evaluate(x_test, y_test)
print(test_scores)
# Save the model together with its weights.
model.save('model_chinese_comment.h5')

[0.3664279420972859, 0.8578535892533566]
