## 基于word2vec+LSTM的情感分析

In [3]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

#from nltk.corpus import stopwords

from gensim.models.word2vec import Word2Vec

Using TensorFlow backend.


### 读取数据

In [4]:
def load_dataset(name, nrows=None):
    """Read one of the review TSV files under ``../data`` into a DataFrame.

    Args:
        name: Dataset key; one of ``'unlabeled_train'``, ``'labeled_train'``
            or ``'test'``.
        nrows: Optional cap on the number of rows to read (``None`` reads
            the whole file).

    Returns:
        The parsed ``pandas.DataFrame``. The number of rows read is printed
        as a side effect.

    Raises:
        ValueError: If ``name`` is not a known dataset key.
    """
    filenames = dict(
        unlabeled_train='unlabeledTrainData.tsv',
        labeled_train='labeledTrainData.tsv',
        test='testData.tsv',
    )
    filename = filenames.get(name)
    if filename is None:
        raise ValueError(name)

    # Files are tab-separated and use a backslash as the escape character.
    reviews = pd.read_csv(
        os.path.join('..', 'data', filename),
        sep='\t',
        escapechar='\\',
        nrows=nrows,
    )
    print('Number of reviews: {}'.format(len(reviews)))
    return reviews

### 清洗评论文本

In [6]:
# Alternative source (NLTK): eng_stopwords = set(stopwords.words('english'))
# Load the stop-word list, one word per line, from ../stopwords.txt.
# The `with` block closes the file handle once the list has been read.
# The words are stored as dict keys (all values None) for fast `in` lookups.
with open('../stopwords.txt') as stopwords_file:
    eng_stopwords = dict.fromkeys(line.rstrip() for line in stopwords_file)

def clean_text(text, remove_stopwords=False):
    """Turn a raw review into a list of lower-case word tokens.

    HTML markup is stripped with BeautifulSoup, every character that is not
    an ASCII letter is replaced by a space, and the result is lower-cased
    and split on whitespace.

    Args:
        text: Raw review text, possibly containing HTML.
        remove_stopwords: When true, drop tokens found in ``eng_stopwords``.

    Returns:
        List of word tokens.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    return [token for token in tokens if token not in eng_stopwords]

### 读入之前训练好的Word2Vec模型

In [7]:
# Load the previously trained Word2Vec model from ../models.
model_name = '300features_40minwords_10context.model'
model_path = os.path.join('..', 'models', model_name)
model = Word2Vec.load(model_path)

### 根据word2vec的结果去对影评文本进行编码
编码方式比较粗暴：简单来说，就是对每条评论中所有（出现在word2vec词表中的）词的词向量取平均，得到一个与词向量同维度的评论向量。

In [8]:
# Load the labelled training reviews and preview the first rows.
df = load_dataset(name='labeled_train')
df.head(5)

Number of reviews: 25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [11]:
# Look the word up through `model.wv`: indexing the Word2Vec object directly
# is deprecated and no longer supported by gensim 4.
model.wv['love'].shape  # each word is a 300-dimensional vector

(300,)

In [12]:
# Turn each review (one row of the DataFrame) into a vector.
def to_review_vector(review):
    """Encode one review as the mean of its words' Word2Vec vectors.

    The review is tokenised with ``clean_text`` (stop words removed), tokens
    missing from the Word2Vec vocabulary are skipped, and the remaining word
    vectors are averaged component-wise.

    Args:
        review: Raw review text.

    Returns:
        ``pd.Series`` holding the averaged vector. If no token of the review
        is in the vocabulary, a zero vector of length ``model.vector_size``
        is returned, so every review yields a vector of the same width
        instead of a single NaN.
    """
    words = clean_text(review, remove_stopwords=True)
    # Go through `model.wv`: `model[w]` / `w in model` on the Word2Vec object
    # itself is deprecated and no longer supported by gensim 4.
    word_vectors = model.wv
    vectors = [word_vectors[w] for w in words if w in word_vectors]
    if not vectors:
        return pd.Series(np.zeros(model.vector_size, dtype=np.float32))
    return pd.Series(np.array(vectors).mean(axis=0))

In [13]:
# Encode every review; each returned Series becomes one row of the result.
train_data_features = df['review'].apply(to_review_vector)
train_data_features.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.030023,0.029956,-0.016156,0.007592,0.001252,-0.007221,0.001652,0.002545,0.00679,0.000278,...,-0.019643,-0.000346,-0.007629,-0.015923,0.012045,0.003244,0.003341,-0.012681,-0.000918,0.004719
1,0.030643,0.011501,0.001342,0.01753,0.008891,-0.017623,0.00567,-0.013038,-0.004778,0.031069,...,-0.021898,-0.014636,-0.001904,-0.051971,-0.001877,0.007129,-0.006705,-0.009252,0.01614,0.013918
2,-0.002264,0.008187,-0.010805,0.003473,-0.012158,-0.00075,-0.00727,0.039489,-0.001391,0.000328,...,-0.016351,0.012109,-0.048683,-0.016565,0.01617,0.007857,-0.00015,0.010387,-0.023958,0.001242
3,0.008067,0.019937,-0.002238,-0.000273,-0.012168,-0.008075,0.004959,-0.007353,0.018808,0.01005,...,0.004207,-0.00185,-0.021498,-0.018876,0.008945,0.004867,0.025828,-0.00841,-0.009932,0.016853
4,0.007507,0.018863,-0.015902,-0.003176,-0.006588,0.006482,0.005634,0.015489,0.006553,-0.00999,...,-0.016796,0.006233,-0.05626,-0.013591,0.015625,-0.003683,0.005536,0.008538,-0.025258,0.011272


### 构建LSTM分类器

In [15]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape
from keras.layers import LSTM

In [17]:
# Inspect the container type of the encoded review features.
type(train_data_features)

pandas.core.frame.DataFrame

In [18]:
max_features = 300  # matches the Word2Vec vector size shown above
batch_size = 32     # mini-batch size passed to model.fit
print('Pad sequences(samples x time)')
# pad_sequences defaults to dtype='int32', which would truncate the averaged
# word-vector components (small floats) to integers. Request float32 so the
# feature values survive the conversion.
x_train = sequence.pad_sequences(train_data_features.values, dtype='float32')

Pad sequences(samples x time)


In [None]:
print('Build model...')
# NOTE: this rebinds `model`, replacing the Word2Vec model loaded earlier
# under the same name. to_review_vector() reads the global `model`, so it
# must not be called again after this cell has run.
model = Sequential()
# Each review is already a dense vector of `max_features` averaged Word2Vec
# components, not a sequence of positive integer word indices, so an
# Embedding layer (an index lookup that must be the first layer) does not
# apply here. Present each review to the LSTM as a single time step carrying
# `max_features` features instead.
model.add(Reshape((1, max_features), input_shape=(max_features,)))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: probability of the positive sentiment class.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
print('Train...')
# Sentiment labels as a NumPy array, taken from the same DataFrame the
# features were computed from.
y_train = df['sentiment'].values
model.fit(x_train, y_train, batch_size=batch_size, epochs=5)

### 查看结果

In [None]:
from sklearn.metrics import confusion_matrix
# The final layer is a single sigmoid unit, so model.predict returns
# probabilities of shape (n_samples, 1). confusion_matrix needs hard class
# labels, so threshold at 0.5 and flatten to 1-D. Predict on x_train, the
# same array the model was fitted on. This is a training-set evaluation, so
# the result is optimistic.
y_pred = (model.predict(x_train) > 0.5).astype('int32').ravel()
confusion_matrix(df.sentiment, y_pred)  # confusion matrix: correct predictions lie on the diagonal