In [1]:
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.models.word2vec import Word2Vec
import nltk
from nltk.corpus import stopwords

In [87]:
# 读入数据
def load_dataset(name,nrows=None):
    datasets = {
        'unlabeled_train':'unlabeledTrainData.tsv',
        'labeled_train':'labeledTrainData.tsv',
        'test':'testData.tsv'
    }
    if name not in datasets:
        raise ValueError(name)
    datafile = os.path.join('.','datas',datasets[name])
    df = pd.read_csv(datafile,sep="\t",escapechar="\\",nrows=nrows)
    return df
df = load_dataset('unlabeled_train',50000)
print("Number of lines: {}".format(len(df)))
df.head()

Number of lines: 50000


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


In [56]:
# 数据清洗
def clean_sentence(sentence):
    sentence = sentence.lower() #0,归一化处理，全部转为小写
    sentence = BeautifulSoup(sentence,'html.parser').get_text()#1，去掉html标签
    sentence = re.sub('[\W]',' ',sentence)#2，移除标点
    tokens = nltk.word_tokenize(sentence)#3，切分词/token
    stop_words = set(stopwords.words('english'))
    sentence_words = [i for i in sentence.split() if i not in stop_words]#4,去掉停用词
    return ' '.join(sentence_words)#5，重组为新的句子

# count = 0 # 打印进度
def split_sentences(review):
#     global count
#     count += 1
#     if count % 100 == 0:
#          print(count//100,end=",")
    sentence = clean_sentence(review)
    sentences_words = sentence.split()
    return sentences_words
sentences = [x for x in df.review.apply(split_sentences)]

1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,

In [57]:
print(len(sentences))
print(sentences[:1])# sentences 应该是一个二维数组！！！

50000
[['watching', 'time', 'chasers', 'obvious', 'made', 'bunch', 'friends', 'maybe', 'sitting', 'around', 'one', 'day', 'film', 'school', 'said', 'hey', 'let', 'pool', 'money', 'together', 'make', 'really', 'bad', 'movie', 'something', 'like', 'ever', 'said', 'still', 'ended', 'making', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'barrel', 'stock', 'music', 'etc', 'corners', 'cut', 'except', 'one', 'would', 'prevented', 'film', 'release', 'life', 'like']]


In [58]:
"""使用gensim训练词嵌入模型"""
# 设定词向量训练的参数
num_features = 300# word vector dimensionality ——建议300-500维度 
min_word_count = 40# minimum word count  推荐40
num_workers = 4#number of threads to run in parallel
context = 10# context window size
downsampling = 1e-3 # downsample setting for frequnet words
model_name = '{}featurs_{}minwords_{}context.model'.format(num_features,min_word_count,context)

print('training model...')
model = Word2Vec(sentences,workers=num_workers,size=num_features,min_count = min_word_count,window=context,sample=downsampling)
model.init_sims(replace=True)
model.save(os.path.join('.','models',model_name))

training model...


In [60]:
print(model.doesnt_match("man woman child cat".split())) # 找出不匹配的词语
print(model.wv.similarity("woman","man")) # 打印两个词的相关性
print(model.wv.most_similar("woman")) # 关联度最高的词

cat
0.5969352
[('lady', 0.6617084741592407), ('conchita', 0.6416019797325134), ('prostitute', 0.6292552947998047), ('husband', 0.6276531219482422), ('lover', 0.6242899894714355), ('sexually', 0.6181201934814453), ('man', 0.5969352126121521), ('women', 0.5967299938201904), ('whore', 0.5816379189491272), ('pregnant', 0.5698956847190857)]


  """Entry point for launching an IPython kernel.


In [61]:
# 删除不需要的变量
del model
del sentences
del df

接下来是使用之前保存好的模型，结合labeledData来做训练

In [62]:
"""在word2vec上训练情感分析模型"""
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

# 读入之前训练好的word2vec模型
model_name = model_name
model = Word2Vec.load(os.path.join('.','models',model_name))

In [79]:
df = load_dataset('labeled_train')
print(df.head())


def to_review_vector(review):
    words = clean_sentence(review).split()
    array = np.array([model[w] for w in words if w in model])
    return pd.Series(array.mean(axis=0))

train_data_features = df.review.apply(to_review_vector)
train_data_features.head()

       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  "The Classic War of the Worlds" by Timothy Hin...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...


  import sys
  import sys


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.001585,-0.000567,0.032747,-0.00392,-0.023202,0.015403,-0.008614,-0.010279,0.012291,-0.00323,...,-0.012411,0.007878,0.003054,-0.002946,-0.008337,-0.005934,-0.001328,0.012154,-0.0047,-0.017998
1,-0.011214,-0.010502,-0.001437,-0.005376,-0.031425,-0.003963,0.003215,0.006335,0.02074,-0.013269,...,0.011174,0.002982,0.000119,-0.004681,0.006098,-0.012677,0.015745,0.00192,-0.005046,-0.021951
2,-0.004807,-0.010583,-0.024568,0.001196,0.029349,-0.008517,-0.000997,-0.002218,-0.013086,-0.005278,...,-0.000991,-0.017428,-0.00068,-0.020089,0.029553,-0.004461,-0.004151,-0.015569,-0.0098,0.008643
3,0.000204,-0.011484,-0.006328,-0.019119,-0.010072,-0.006927,-0.002075,0.008648,0.003871,-0.016238,...,-0.015881,-0.001592,-0.008767,-0.01348,0.005728,0.015701,0.00993,0.018884,0.002761,-0.014817
4,-0.004331,-0.002867,-0.007977,-0.006313,0.001415,0.008218,-0.006616,-0.014684,-0.000503,0.008105,...,-0.012473,-0.014938,0.001253,-0.015154,0.009919,0.001044,-0.008464,0.003701,-0.008495,0.000546


mean()函数功能：求取均值
mean() 函数定义：numpy.mean(a, axis, dtype, out，keepdims )
经常操作的参数为axis，以m * n矩阵举例

axis 不设置值，对 m*n 个数求均值，返回一个实数
axis = 0：压缩行，对各列求均值，返回 1* n 矩阵 1行n列[x,x,x,x...]
axis =1 ：压缩列，对各行求均值，返回 m *1 矩阵 n行1列[y,y,y,y...]的转置


In [80]:
"""用随机森林构建分类器"""
forest = RandomForestClassifier(n_estimators=100,random_state=42) # 构建100棵树
forest = forest.fit(train_data_features,df.sentiment)#训练词向量（词向量的特征值）

In [81]:
# 在训练集上试试，确保模型正常work
confusion_matrix(df.sentiment,forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]], dtype=int64)

In [83]:
# 删除内存占用
del df
del train_data_features

In [89]:
"""预测测试集"""
df = load_dataset('test')
print(df.head())

test_data_features = df.review.apply(to_review_vector)
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id,'sentiment':result})
output.head()

         id                                             review
0  12311_10  Naturally in a film who's main themes are of m...
1    8348_2  This movie is a disaster within a disaster fil...
2    5828_4  All in all, this is a movie for kids. We saw i...
3    7186_2  Afraid of the Dark left me with the impression...
4   12128_7  A very accurate depiction of small time mob li...


NameError: name 'test_data_features' is not defined