In [1]:
import pandas as pd
import csv

In [2]:
pos_path = "C:/Users/Oscar/Desktop/jd_xiaomi9_pos.csv"
neg_path = "C:/Users/Oscar/Desktop/jd_xiaomi9_neg.csv"

In [3]:
pos_file = open(pos_path)
neg_file = open(neg_path)

In [4]:
pos_reader_lines = csv.reader(pos_file)
neg_reader_lines = csv.reader(neg_file)

In [5]:
# 现在我们将所有的评价内容放置到一个list里
train_texts_orig = []
# 文本所对应的labels, 也就是标记
train_target = []
for line in pos_reader_lines:
    train_texts_orig.append(line[1])
    train_target.append(line[2])
for line in neg_reader_lines:
    train_texts_orig.append(line[1])
    train_target.append(line[2])

In [6]:
# 首先加载必用的库
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import re
import jieba # 结巴分词
# gensim用来加载预训练word vector
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings("ignore")
# 用来解压
import bz2

In [7]:
# 请将下载的词向量压缩包放置在根目录 embeddings 文件夹里
# 解压词向量, 有可能需要等待1-2分钟
with open("dataset/NLP/sgns.zhihu.bigram", 'wb') as new_file, open("dataset/NLP/sgns.zhihu.bigram.bz2", 'rb') as file:
    decompressor = bz2.BZ2Decompressor()
    for data in iter(lambda : file.read(100 * 1024), b''):
        new_file.write(decompressor.decompress(data))

In [8]:
# 使用gensim加载预训练中文分词embedding, 有可能需要等待1-2分钟
cn_model = KeyedVectors.load_word2vec_format('dataset/NLP/sgns.zhihu.bigram', binary=False, unicode_errors="ignore")

In [9]:
# 由此可见每一个词都对应一个长度为300的向量
embedding_dim = cn_model['深圳'].shape[0]
print('词向量的长度为{}'.format(embedding_dim))
cn_model['深圳']

词向量的长度为300


array([ 2.023750e-01, -3.708000e-03, -7.565430e-01,  8.173280e-01,
       -1.443002e+00, -1.169990e-01,  3.027310e-01,  7.402350e-01,
        1.360870e-01,  1.137900e-01, -4.726100e-01,  6.012690e-01,
       -4.950800e-02,  1.690520e-01,  4.720550e-01, -1.202050e-01,
       -2.324800e-02, -7.101900e-02, -4.373060e-01,  5.065900e-02,
        4.102140e-01, -6.221000e-03, -1.446990e-01,  6.171340e-01,
       -1.099430e-01, -5.144020e-01, -2.406990e-01, -5.681100e-01,
       -5.727620e-01,  1.198359e+00, -1.361800e-02,  1.860260e-01,
       -2.040300e-02,  5.441100e-02, -1.790730e-01, -2.648950e-01,
       -5.086800e-02, -3.325420e-01, -3.877390e-01,  1.861580e-01,
       -8.618210e-01,  4.052920e-01,  3.045350e-01, -8.005700e-02,
        4.662680e-01,  1.387900e-01, -4.225740e-01, -3.337170e-01,
       -9.828600e-02,  5.087270e-01, -1.998500e-02,  3.137330e-01,
        2.240040e-01,  1.746050e-01,  1.361020e-01,  9.837300e-02,
       -1.522923e+00,  7.467910e-01,  1.645960e-01,  6.225840e

In [10]:
cn_model.similarity('酒店','宾馆')

0.77130115

In [11]:
cn_model.most_similar(positive=['烤肉'], topn=10)

[('涮羊肉', 0.7356440424919128),
 ('烤鱼', 0.7211310267448425),
 ('麻辣锅', 0.7164487838745117),
 ('烤全羊', 0.7161980867385864),
 ('炒年糕', 0.7136646509170532),
 ('咖喱饭', 0.7114191651344299),
 ('羊排', 0.7113378047943115),
 ('干锅', 0.7113052606582642),
 ('川味', 0.7098793387413025),
 ('麻辣火锅', 0.7087246179580688)]

In [12]:
train_texts_orig[1700]

'这个花屏是什么鬼啊。拿到第二天'

In [13]:
# 我们使用tensorflow的keras接口来建模
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau

In [14]:
# 进行分词和tokenize
# train_tokens是一个长长的list，其中含有4000个小list，对应每一条评价
train_tokens = []
for text in train_texts_orig:
    # 去掉标点
    text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+", "",text)
    # 结巴分词
    cut = jieba.cut(text)
    # 结巴分词的输出结果为一个生成器
    # 把生成器转换为list
    cut_list = [ i for i in cut ]
    for i, word in enumerate(cut_list):
        try:
            # 将词转换为索引index
            cut_list[i] = cn_model.vocab[word].index
        except KeyError:
            # 如果词不在字典中，则输出0
            cut_list[i] = 0
    train_tokens.append(cut_list)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Oscar\AppData\Local\Temp\jieba.cache
Loading model cost 0.751 seconds.
Prefix dict has been built succesfully.


In [15]:
num_tokens = [len(token) for token in train_tokens]
num_tokens = np.array(num_tokens)

In [16]:
np.mean(num_tokens)

34.716751269035534

In [17]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

98

In [18]:
# 取tokens的长度为98时，大约95%的样本被涵盖
# 我们对长度不足的进行padding，超长的进行修剪
np.sum( num_tokens < max_tokens ) / len(num_tokens)

0.9527918781725888

In [19]:
# 用来将tokens转换为文本
def reverse_tokens(tokens):
    text = ''
    for i in tokens:
        if i != 0:
            text = text + cn_model.index2word[i]
        else:
            text = text + ' '
    return text

In [20]:
reverse = reverse_tokens(train_tokens[1000])

In [21]:
reverse

'物流挺好手机各种问题都是共性问题都别买了'

In [22]:
embedding_dim

300

In [23]:
# 只使用前20000个词
num_words = 60000
# 初始化embedding_matrix，之后在keras上进行应用
embedding_matrix = np.zeros((num_words, embedding_dim))
# embedding_matrix为一个 [num_words，embedding_dim] 的矩阵
# 维度为 50000 * 300
for i in range(num_words):
    embedding_matrix[i,:] = cn_model[cn_model.index2word[i]]
embedding_matrix = embedding_matrix.astype('float32')

In [24]:
# 检查index是否对应，
# 输出300意义为长度为300的embedding向量一一对应
np.sum(cn_model[cn_model.index2word[333]] == embedding_matrix[333] )

300

In [25]:
# embedding_matrix的维度，
# 这个维度为keras的要求，后续会在模型中用到
embedding_matrix.shape

(60000, 300)

In [26]:
# 进行padding和truncating， 输入的train_tokens是一个list
# 返回的train_pad是一个numpy array
train_pad = pad_sequences(train_tokens, maxlen=max_tokens, padding='pre', truncating='pre')

In [27]:
# 超出五万个词向量的词用0代替
train_pad[ train_pad>=num_words ] = 0

In [28]:
# 可见padding之后前面的tokens全变成0，文本在最后面
train_pad[31]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,  4738,  1851,     3,    71,     3,    72,     0,     3,
         470,    72,     1,  7859,    34,    72, 11056,  1304,  2350,
       14441,   227,     0,    71, 16068, 14441, 11056,     0,     3,
         470,    72,     1,   300,    34, 18151,    34,   772,  1575,
        8046,  2657, 51127, 55422,     0, 14594,   401,  4858,   262,
           1,   300,     4,  5378,     0,     1,    36,  4376,  1007,
         348,    95,   223,  3465,  1304,    82,  2811,    98,     0,
         616,     7,   371,   515,   470,    72,     1,   518])

In [29]:
train_target = np.array(train_target)

In [30]:
train_target = train_target.astype('int')

In [31]:
train_target

array([1, 1, 1, ..., 0, 0, 0])

In [32]:
# 进行训练和测试样本的分割
from sklearn.model_selection import train_test_split

In [33]:
# 90%的样本用来训练，剩余10%用来测试
X_train, X_test, y_train, y_test = train_test_split(train_pad,
                                                    train_target,
                                                    test_size=0.1,
                                                    random_state=12)

In [34]:
# 查看训练样本，确认无误
print(reverse_tokens(X_train[100]))
print('class: ',y_train[100])

充电快速度快感觉就像送外卖我很喜欢米酒功能好我很喜欢玩游戏一点都 拍照功能也差不多很好太喜欢了喜欢的朋友可以考虑一下哦我真的不是吹的功能太好了玩游戏一点都 指纹开锁也挺快的音质也很好小爱同学也很听话充电充一个小时就充满了特别的快买的时候就是上午下午就到了说了那么多因为我不是托感谢大家来观看希望 个赞 
class:  1


In [35]:
model = Sequential()
model.add(Embedding(num_words,
                   embedding_dim,
                   weights=[embedding_matrix],
                   input_length = max_tokens,
                   trainable = False))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(LSTM(units=16, return_sequences=False))

In [36]:
model.add(Dense(1, activation='sigmoid'))
# 我们使用adam以0.001的learning rate进行优化
optimizer = Adam(lr=1e-3)

In [37]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 98, 300)           18000000  
_________________________________________________________________
bidirectional (Bidirectional (None, 98, 128)           186880    
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                9280      
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 18,196,177
Trainable params: 196,177
Non-trainable params: 18,000,000
_________________________________________________________________


In [38]:
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

In [39]:
# 建立一个权重的存储点
path_checkpoint = 'dataset/NLP/sentiment_checkpoint.keras'
checkpoint = ModelCheckpoint(filepath=path_checkpoint, monitor='val_loss',
                                      verbose=1, save_weights_only=True,
                                      save_best_only=True)

In [40]:
# 尝试加载已训练模型
try:
    model.load_weights(path_checkpoint)
except Exception as e:
    print(e)

In [41]:
# 定义early stoping如果3个epoch内validation loss没有改善则停止训练
earlystopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

In [42]:
# 自动降低learning rate
lr_reduction = ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_lr=1e-8, patience=0, verbose=1)

In [43]:
# 定义callback函数
callbacks = [
    earlystopping, 
    checkpoint,
    lr_reduction
]

In [44]:
# 开始训练
model.fit(X_train, y_train,
          validation_split=0.1, 
          epochs=20,
          batch_size=256,
          callbacks=callbacks)

Train on 1595 samples, validate on 178 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.11786, saving model to dataset/NLP/sentiment_checkpoint.keras
Epoch 2/20
Epoch 00002: val_loss did not improve from 0.11786

Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 3/20
Epoch 00003: val_loss improved from 0.11786 to 0.11692, saving model to dataset/NLP/sentiment_checkpoint.keras
Epoch 4/20
Epoch 00004: val_loss did not improve from 0.11692

Epoch 00004: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 5/20
Epoch 00005: val_loss did not improve from 0.11692

Epoch 00005: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 6/20
Epoch 00006: val_loss did not improve from 0.11692

Epoch 00006: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 7/20
Epoch 00007: val_loss did not improve from 0.11692

Epoch 00007: ReduceLROnPlateau reducing learning rate to 1.000000082740371

<tensorflow.python.keras.callbacks.History at 0x22780b11d30>

In [45]:
result = model.evaluate(X_test, y_test)
print('Accuracy:{0:.2%}'.format(result[1]))

Accuracy:96.95%


In [46]:
def predict_sentiment(text):
    print(text)
    # 去标点
    text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+", "",text)
    # 分词
    cut = jieba.cut(text)
    cut_list = [ i for i in cut ]
    # tokenize
    for i, word in enumerate(cut_list):
        try:
            cut_list[i] = cn_model.vocab[word].index
            if cut_list[i] >= 30000:
                cut_list[i] = 0
        except KeyError:
            cut_list[i] = 0
    # padding
    tokens_pad = pad_sequences([cut_list], maxlen=max_tokens,
                           padding='pre', truncating='pre')
    # 预测
    result = model.predict(x=tokens_pad)
    coef = result[0][0]
    if coef >= 0.5:
        print('是一例正面评价','output=%.2f'%coef)
    else:
        print('是一例负面评价','output=%.2f'%coef)

In [47]:
predict_sentiment('品控不好，还没到一个月就坏了')

品控不好，还没到一个月就坏了
是一例负面评价 output=0.03


In [48]:
y_pred = model.predict(X_test)
y_pred = y_pred.T[0]
y_pred = [1 if p>= 0.5 else 0 for p in y_pred]
y_pred = np.array(y_pred)

In [49]:
y_actual = np.array(y_test)

In [50]:
misclassified = np.where( y_pred != y_actual )[0]

In [51]:
# 输出所有错误分类的索引
len(misclassified)
print(len(X_test))

197


In [52]:
# 我们来找出错误分类的样本看看
idx=101
print(reverse_tokens(X_test[idx]))
print('预测的分类', y_pred[idx])
print('实际的分类', y_actual[idx])

                                                                          GPS无效不能导航 硬伤手机发烫这是小米通病电池耗电较快其他还行京东自营快递还是 
预测的分类 0
实际的分类 0


In [53]:
misclassified

array([  8,  21,  74,  87, 183, 188], dtype=int64)

In [54]:
predict_sentiment('手机用起来很舒服，外观造型漂亮，很流畅，屏幕很清晰')

手机用起来很舒服，外观造型漂亮，很流畅，屏幕很清晰
是一例正面评价 output=0.96
