# 在word2vec上训练情感分析模型

In [4]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

#from nltk.corpus import stopwords

from gensim.models.word2vec import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

### 和之前的操作一致

In [5]:
def load_dataset(name, nrows=None):
    datasets = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv'
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join('..', 'data', datasets[name])
    df = pd.read_csv(data_file, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(df)))
    return df

In [7]:
# eng_stopwords = set(stopwords.words('english'))
eng_stopwords = {}.fromkeys([ line.rstrip() for line in open('../stopwords.txt')])

def clean_text(text, remove_stopwords=False):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words

### 读入之前训练好的Word2Vec模型

In [8]:
model_name = '300features_40minwords_10context.model'
model = Word2Vec.load(os.path.join('..', 'models', model_name))

### 我们可以根据word2vec的结果去对影评文本进行编码

编码方式有一点粗暴，简单说来就是把这句话中的词的词向量做平均

In [9]:
df = load_dataset('labeled_train')
df.head()

Number of reviews: 25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [10]:
model['love']

array([ 0.04394694,  0.05369674,  0.08092983, -0.03452746, -0.05322076,
       -0.02460469, -0.08408138,  0.06128323,  0.10407756, -0.07784242,
        0.07673959,  0.06874307,  0.06069328, -0.05997264,  0.02069032,
       -0.08889063,  0.11769066, -0.00737705,  0.00590136,  0.05115112,
        0.15071088,  0.0262636 ,  0.01458188, -0.00783722,  0.00996073,
        0.04211447,  0.09647947, -0.0935035 ,  0.05298699, -0.02045441,
       -0.00226827, -0.10501024,  0.09173818, -0.02053106,  0.00167883,
       -0.0346115 ,  0.02186703, -0.01394878,  0.10207272,  0.01215976,
        0.13881618, -0.04215522, -0.07350793, -0.11474081,  0.02626938,
        0.11274648, -0.01416363,  0.00535241,  0.12130103,  0.0291233 ,
        0.04322159, -0.08536672,  0.01844861,  0.07645814, -0.00040104,
        0.01078126,  0.08877202,  0.0970154 ,  0.0716213 ,  0.03593608,
        0.05678654,  0.01333988, -0.01599465, -0.00681712,  0.03623066,
        0.06369489, -0.03352832,  0.00556041,  0.03320808,  0.01

In [11]:
def to_review_vector(review):
    words = clean_text(review, remove_stopwords=True)
    array = np.array([model[w] for w in words if w in model])
    return pd.Series(array.mean(axis=0))

In [12]:
train_data_features = df.review.apply(to_review_vector)
train_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.030023,0.029956,-0.016156,0.007592,0.001252,-0.007221,0.001652,0.002545,0.00679,0.000278,...,-0.019643,-0.000346,-0.007629,-0.015923,0.012045,0.003244,0.003341,-0.012681,-0.000918,0.004719
1,0.030643,0.011501,0.001342,0.01753,0.008891,-0.017623,0.00567,-0.013038,-0.004778,0.031069,...,-0.021898,-0.014636,-0.001904,-0.051971,-0.001877,0.007129,-0.006705,-0.009252,0.01614,0.013918
2,-0.002264,0.008187,-0.010805,0.003473,-0.012158,-0.00075,-0.00727,0.039489,-0.001391,0.000328,...,-0.016351,0.012109,-0.048683,-0.016565,0.01617,0.007857,-0.00015,0.010387,-0.023958,0.001242
3,0.008067,0.019937,-0.002238,-0.000273,-0.012168,-0.008075,0.004959,-0.007353,0.018808,0.01005,...,0.004207,-0.00185,-0.021498,-0.018876,0.008945,0.004867,0.025828,-0.00841,-0.009932,0.016853
4,0.007507,0.018863,-0.015902,-0.003176,-0.006588,0.006482,0.005634,0.015489,0.006553,-0.00999,...,-0.016796,0.006233,-0.05626,-0.013591,0.015625,-0.003683,0.005536,0.008538,-0.025258,0.011272


### 用随机森林构建分类器

In [13]:
forest = RandomForestClassifier(n_estimators = 100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

##### 同样在训练集上试试，确保模型能正常work

In [14]:
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])

### 清理占用内容的变量

In [15]:
del df
del train_data_features

### 预测测试集结果并上传kaggle

In [16]:
df = load_dataset('test')
df.head()

Number of reviews: 25000


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [17]:
test_data_features = df.review.apply(to_review_vector)
test_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.044937,0.02664,-0.020985,-0.000668,-0.016686,-0.003924,-0.00193,0.000945,0.006424,0.005443,...,-0.004149,0.010839,-0.063309,-0.055783,0.028198,-0.001743,0.027613,-0.0135,-0.013135,0.027243
1,0.048151,0.062075,-0.027988,-0.008301,0.002779,0.006056,-0.004327,-0.018348,0.009714,0.024614,...,-0.022285,-0.010061,-0.023892,-0.032577,0.02227,0.014868,0.033391,-0.013999,0.024013,0.002791
2,0.04756,0.043983,-0.004736,-0.002596,0.004812,-0.025201,0.013004,-0.030051,0.00857,0.018096,...,0.006398,-0.006216,-0.011681,-0.04586,0.007112,0.001249,0.016637,-0.023928,0.004249,0.014391
3,0.039483,0.036809,-0.012046,0.001131,-0.001806,0.005945,-0.002101,-0.004565,-0.00199,0.018447,...,-0.01249,-0.009538,-0.035018,-0.019758,0.026294,0.007208,0.008355,0.001082,0.001661,0.021524
4,0.029964,0.0139,0.000292,0.004766,0.000128,0.016663,-0.001327,0.022154,-0.012872,-0.00149,...,-0.021609,-0.004259,-0.019412,-0.011326,0.017897,-0.008278,0.033263,-0.00207,-0.007387,0.002127


In [18]:
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_model.csv'), index=False)
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,0
4,12128_7,1


In [19]:
del df
del test_data_features
del forest

------------------
### 对词向量进行聚类研究和编码
使用Kmeans进行聚类

In [52]:
model.syn1neg.shape

(13056, 300)

In [50]:
model.syn0_lockf.shape

(13056,)

In [53]:
word_vectors = model.syn1neg
num_clusters = word_vectors.shape[0] // 10

In [56]:
num_clusters

1305

In [57]:
%%time

kmeans_clustering = KMeans(n_clusters = num_clusters, n_jobs=4)
idx = kmeans_clustering.fit_predict(word_vectors)

CPU times: user 920 ms, sys: 136 ms, total: 1.06 s
Wall time: 3min 26s


In [59]:
idx.shape

(13056,)

In [65]:
len(model.wv.index2word)

13056

In [67]:
word_centroid_map = dict(zip(model.wv.index2word, idx)) # 将词典中的单词 和 他们的分类所属的类别进行映射

In [68]:
import pickle

filename = 'word_centroid_map_10avg.pickle'
with open(os.path.join('..', 'models', filename), 'bw') as f:  # 持久化到文件中
    pickle.dump(word_centroid_map, f)
    
#with open(os.path.join('..', 'models', filename), 'br') as f:
#    word_centroid_map = pickle.load(f)    

### 输出一些clusters看

In [69]:
for cluster in range(0,10):
    print("\nCluster %d" % cluster)
    print([w for w,c in word_centroid_map.items() if c == cluster])


Cluster 0
['watched']

Cluster 1
['exposure', 'goods', 'arguments', 'agony', 'speeches', 'layer', 'observation', 'misplaced', 'interpretations', 'weirdness', 'nuances', 'envelope', 'cynicism', 'demeanor', 'gesture', 'incomplete', 'gloss', 'oneself', 'alarm', 'characteristic', 'monologues', 'annoyance', 'insider', 'andromeda', 'nuance', 'barrier', 'ridicule', 'incompetence', 'indifference', 'disdain', 'responses', 'endeavor', 'agreement', 'finesse', 'thematic', 'grit', 'amuse', 'lingers', 'occurrence', 'essay', 'strains', 'gears', 'spectator', 'amazement', 'drastic', 'dismay', 'reflective', 'insistence', 'trait', 'spellbound', 'incessant', 'aplomb', 'climaxes', 'speculation', 'misunderstandings', 'curve', 'chord', 'expertise', 'paradox', 'histrionics', 'glances', 'surrealist', 'fore', 'craftsmanship', 'discomfort', 'ceases', 'ingenuity', 'filmic', 'foreshadowing', 'analogy', 'ridiculousness', 'epilogue', 'irritation', 'competence', 'empathise', 'lows', 'schizophrenia', 'damaging', 'scr

### 把评论数据转成cluster bag vectors

In [70]:
wordset = set(word_centroid_map.keys())

def make_cluster_bag(review):
    words = clean_text(review, remove_stopwords=True)
    return (pd.Series([word_centroid_map[w] for w in words if w in wordset])
              .value_counts()
              .reindex(range(num_clusters+1), fill_value=0)) # 找到各个单词属于哪个类，计算每个类别出现的次数，作为句子的特征向量该位置处的值

In [71]:
df = load_dataset('labeled_train')
df.head()

Number of reviews: 25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [72]:
train_data_features = df.review.apply(make_cluster_bag)
train_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1296,1297,1298,1299,1300,1301,1302,1303,1304,1305
0,2,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### 再用随机森林算法建模

In [73]:
forest = RandomForestClassifier(n_estimators = 100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

##### 在训练集上试一试效果

In [74]:
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])

#### 去掉无用的占内存的量

In [75]:
del df
del train_data_features

### 载入测试数据做预测

In [76]:
df = load_dataset('test')
df.head()

Number of reviews: 25000


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [77]:
test_data_features = df.review.apply(make_cluster_bag)
test_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1296,1297,1298,1299,1300,1301,1302,1303,1304,1305
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_BagOfClusters.csv'), index=False)
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,0
4,12128_7,0


In [79]:
del df
del test_data_features
del forest