In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
import platform
import datetime,pytz

root_ = '/content/drive/My Drive/colab/' if platform.system() == 'Linux' else '/Users/love/Test/'

WeiboSentiment_ = os.path.join(root_, 'WeiboSentiment')
if not os.path.exists(WeiboSentiment_):
    os.makedirs(WeiboSentiment_)

model_ = os.path.join(WeiboSentiment_, 'model')
if not os.path.exists(model_):
    os.makedirs(model_)

lstm_model_ = os.path.join(model_, 'lstm')
if not os.path.exists(lstm_model_):
    os.makedirs(lstm_model_)


import jieba
import re
import numpy as np

def tokenize(text):
    """
    带有语料清洗功能的分词函数, 包含数据预处理, 可以根据自己的需求重载
    """
    text = re.sub("\{%.+?%\}", " ", text)           # 去除 {%xxx%} (地理定位, 微博话题等)
    text = re.sub("@.+?( |$)", " ", text)           # 去除 @xxx (用户名)
    text = re.sub("【.+?】", " ", text)              # 去除 【xx】 (里面的内容通常都不是用户自己写的)
    icons = re.findall("\[.+?\]", text)             # 提取出所有表情图标
    text = re.sub("\[.+?\]", "IconMark", text)      # 将文本中的图标替换为`IconMark`

    tokens = []
    for k, w in enumerate(jieba.lcut(text)):
        w = w.strip()
        if "IconMark" in w:                         # 将IconMark替换为原图标
            for i in range(w.count("IconMark")):
                tokens.append(icons.pop(0))
        elif w and w != '\u200b' and w.isalpha():   # 只保留有效文本
                tokens.append(w)
    return tokens


def load_curpus(path):
    """
    加载语料库
    """
    data = []
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            [_, seniment, content] = line.split(",", 2)
            content = tokenize(content)             # 分词
            data.append((content, int(seniment)))
    return data


Mounted at /content/drive


#### 加载数据

In [2]:
import pandas as pd
train_data = load_curpus(os.path.join(WeiboSentiment_, 'train.txt'))
test_data = load_curpus(os.path.join(WeiboSentiment_, 'test.txt'))
train_df = pd.DataFrame(train_data, columns=["content", "sentiment"])
test_df = pd.DataFrame(test_data, columns=["content", "sentiment"])

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.055 seconds.
Prefix dict has been built successfully.


加载停用词

In [3]:
stopwords = []
with open(os.path.join(WeiboSentiment_, 'stopwords.txt'), "r", encoding="utf8") as f:
    for w in f:
        stopwords.append(w.strip())

TfIdf

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
data_str = [" ".join(content) for content, sentiment in train_data] + \
            [" ".join(content) for content, sentiment in test_data]
tfidf = TfidfVectorizer(token_pattern='\[?\w+\]?', stop_words=stopwords)
tfidf_fit = tfidf.fit_transform(data_str)

  'stop_words.' % sorted(inconsistent))


加载之前训练好的FastText模型

In [5]:
from gensim.models import FastText
model = FastText.load(os.path.join(model_, 'model_100.txt'))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [6]:
max_length = 128

#### 为保证输入神经网络的向量长度一致, 要对长度不足max_length的句子用零向量补齐, 对长度超过max_length的句子进行截断

In [7]:
X_train, train_length, y_train = [], [], []
for content, sentiment in train_data:
    X, y = [], sentiment
    for w in content[:max_length]:
        if w in model:
            X.append(np.expand_dims(model[w], 0))
    if X:
        length = len(X)
        X = X + [np.zeros_like(X[0])] * (max_length - length)
        X = np.concatenate(X)
        X = np.expand_dims(X, 0)
        X_train.append(X)
        train_length.append(length)
        y_train.append(y)

  """
  


In [8]:
X_test, test_length, y_test = [], [], []
for content, sentiment in test_data:
    X, y = [], sentiment
    for w in content[:max_length]:
        if w in model:
            X.append(np.expand_dims(model[w], 0))
    if X:
        length = len(X)
        X = X + [np.zeros_like(X[0])] * (max_length - length)
        X = np.concatenate(X)
        X = np.expand_dims(X, 0)
        X_test.append(X)
        test_length.append(length)
        y_test.append(y)

  """
  


### LSTM
网络结构：两层LSTM+两层MLP

In [9]:
!pip install tensorflow==1.14.0 
!pip install 'tensorflow-estimator<1.15.0rc0,>=1.14.0rc0' --force-reinstall

Collecting tensorflow==1.14.0
[?25l  Downloading https://files.pythonhosted.org/packages/de/f0/96fb2e0412ae9692dbf400e5b04432885f677ad6241c088ccc5fe7724d69/tensorflow-1.14.0-cp36-cp36m-manylinux1_x86_64.whl (109.2MB)
[K     |████████████████████████████████| 109.2MB 120kB/s 
[?25hCollecting keras-applications>=1.0.6
[?25l  Downloading https://files.pythonhosted.org/packages/71/e3/19762fdfc62877ae9102edf6342d71b28fbfd9dea3d2f96a882ce099b03f/Keras_Applications-1.0.8-py3-none-any.whl (50kB)
[K     |████████████████████████████████| 51kB 5.7MB/s 
Collecting tensorboard<1.15.0,>=1.14.0
[?25l  Downloading https://files.pythonhosted.org/packages/91/2d/2ed263449a078cd9c8a9ba50ebd50123adf1f8cfbea1492f9084169b89d9/tensorboard-1.14.0-py3-none-any.whl (3.1MB)
[K     |████████████████████████████████| 3.2MB 44.9MB/s 
Collecting tensorflow-estimator<1.15.0rc0,>=1.14.0rc0
[?25l  Downloading https://files.pythonhosted.org/packages/3c/d5/21860a5b11caf0678fbc8319341b0ae21a07156911132e0e71bffed05

In [45]:
from tensorflow.contrib import rnn

In [10]:
import tensorflow as tf
from tensorflow.contrib import rnn
batch_size = 512
lr = 1e-3
hidden_size = 100

X = tf.placeholder(shape=(batch_size, max_length, 100), dtype=tf.float32, name="X")
L = tf.placeholder(shape=(batch_size), dtype=np.int32, name="L")
y = tf.placeholder(shape=(batch_size, 1), dtype=np.float32, name="y")
dropout = tf.placeholder(shape=(), dtype=np.float32, name="dropout")
with tf.variable_scope("lstm", reuse=tf.AUTO_REUSE):
    def lstm_cell(hidden_size, cell_id=0):
        # LSTM细胞生成器
        cell = rnn.LSTMCell(hidden_size, reuse=tf.AUTO_REUSE, name='cell%d' % cell_id)
        cell = rnn.DropoutWrapper(cell, output_keep_prob=dropout)
        return cell
    
    cell = rnn.MultiRNNCell([lstm_cell(hidden_size, 0),
                              lstm_cell(hidden_size, 1)], state_is_tuple=True)
    initial_state = cell.zero_state(batch_size, tf.float32)
    cell_output, cell_state = tf.nn.dynamic_rnn(cell, X, 
                                                sequence_length=L, 
                                                initial_state=initial_state, 
                                                dtype=tf.float32)
    W1 = tf.get_variable("W1", shape=(hidden_size, 50))
    b1 = tf.get_variable("b1", shape=(50,))
    W2 = tf.get_variable("W2", shape=(50, 1))
    b2 = tf.get_variable("b2", shape=(1,))
    fcn1 = tf.nn.xw_plus_b(cell_state[1][1], W1, b1)
    logists = tf.nn.xw_plus_b(fcn1, W2, b2)
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logists, labels=y))
    op = tf.train.AdamOptimizer(lr).minimize(loss)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [11]:
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=config)

In [12]:
total_step = 1001
step = 0
cursor = 0
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(max_to_keep=1)
while step < total_step:
    _X, _L, _y = X_train[cursor: cursor + batch_size], train_length[cursor: cursor + batch_size], y_train[cursor: cursor + batch_size]
    cursor += batch_size
    if len(_X) < batch_size:
        cursor = batch_size - len(_X)
        _X += X_train[: cursor]
        _L += train_length[: cursor]
        _y += y_train[: cursor]
    _X = np.concatenate(_X)
    _L = np.reshape(np.array(_L, dtype=np.int32), (-1))
    _y = np.reshape(np.array(_y, dtype=np.float32), (batch_size, 1))
    _, l = sess.run([op, loss], feed_dict={X: _X, L:_L, y: _y, dropout:.75})
    if step % 100 == 0:
        print("step:", step, " loss:", l)
        saver.save(sess,os.path.join(lstm_model_, 'model'), global_step=step)
    step += 1

step: 0  loss: 0.83615035
step: 100  loss: 0.17628229
Instructions for updating:
Use standard file APIs to delete files with this prefix.
step: 200  loss: 0.1992946
step: 300  loss: 0.20099834
step: 400  loss: 0.16002187
step: 500  loss: 0.1692469
step: 600  loss: 0.1593749
step: 700  loss: 0.13127106
step: 800  loss: 0.13999704
step: 900  loss: 0.16018492
step: 1000  loss: 0.14407545


In [16]:
_X = np.concatenate(X_test + [np.zeros_like(X_test[0])] * (batch_size - len(X_test)))
_L = np.array(test_length + [0] * (batch_size - len(test_length)))

In [44]:
result = sess.run(tf.nn.sigmoid(logists), feed_dict={X: _X[:512], L: _L[:512], dropout:1.})
prediction = []
for i in result[:len(X_test)]:
    if i > 0.5:
        prediction.append(1)
    else:
        prediction.append(0)

#### 效果测评
比DNN效果好很多，一方面是因为网络结构更复杂，另一方面是因为LSTM考虑了语序信息

In [39]:
from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
print(metrics.classification_report(y_test, prediction))

print('acc:',accuracy_score(y_test, prediction))
print('pc',precision_score(y_test, prediction))
print('rc:',recall_score(y_test, prediction))
print('f1:',f1_score(y_test, prediction))

acc: 0.90625
pc 0.9213483146067416
rc: 0.9010989010989011
f1: 0.9111111111111112
