# 在word2vec上训练情感分析模型

In [7]:
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from nltk.corpus import stopwords

from gensim.models import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

### 和之前的操作一致

In [8]:
def load_dataset(name, nrows=None):
    datasets = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv'
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join('../../../', 'data/nlpword2vecdata', datasets[name])
    df = pd.read_csv(data_file, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(df)))
    return df

In [9]:
eng_stopwords = set(stopwords.words('english'))

def clean_text(text, remove_stopwords=False):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words

LookupError: 
**********************************************************************
  Resource 'corpora/stopwords' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - '/root/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

### 读入之前训练好的Word2Vec模型

In [None]:
model_name = '300features_40minwords_10context.model'
model = Word2Vec.load(os.path.join('../../../', 'data/nlpword2vecdata/models', model_name))

### 我们可以根据word2vec的结果去对影评文本进行编码

编码方式有一点粗暴，简单说来就是把这句话中的词的词向量做平均

In [None]:
df = load_dataset('labeled_train')
df.head()

In [None]:
def to_review_vector(review):
    words = clean_text(review, remove_stopwords=True)
    array = np.array([model[w] for w in words if w in model])
    return pd.Series(array.mean(axis=0))

In [None]:
train_data_features = df.review.apply(to_review_vector)
train_data_features.head()

### 用随机森林构建分类器

In [None]:
forest = RandomForestClassifier(n_estimators = 100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

##### 同样在训练集上试试，确保模型能正常work

In [None]:
confusion_matrix(df.sentiment, forest.predict(train_data_features))

### 清理占用内容的变量

In [None]:
del df
del train_data_features

### 预测测试集结果并上传kaggle

In [None]:
df = load_dataset('test')
df.head()

In [None]:
test_data_features = df.review.apply(to_review_vector)
test_data_features.head()

In [None]:
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_model.csv'), index=False)
output.head()

In [None]:
del df
del test_data_features
del forest

------------------
### 对词向量进行聚类研究和编码
使用Kmeans进行聚类

In [None]:
word_vectors = model.syn0
num_clusters = word_vectors.shape[0] // 10

In [None]:
%%time

kmeans_clustering = KMeans(n_clusters = num_clusters, n_jobs=4)
idx = kmeans_clustering.fit_predict(word_vectors)

In [None]:
word_centroid_map = dict(zip(model.index2word, idx))

In [None]:
import pickle

filename = 'word_centroid_map_10avg.pickle'
with open(os.path.join('..', 'models', filename), 'bw') as f:
    pickle.dump(word_centroid_map, f)
    
#with open(os.path.join('..', 'models', filename), 'br') as f:
#    word_centroid_map = pickle.load(f)    

### 输出一些clusters看

In [None]:
for cluster in range(0,10):
    print("\nCluster %d" % cluster)
    print([w for w,c in word_centroid_map.items() if c == cluster])

### 把评论数据转成cluster bag vectors

In [None]:
wordset = set(word_centroid_map.keys())

def make_cluster_bag(review):
    words = clean_text(review, remove_stopwords=True)
    return (pd.Series([word_centroid_map[w] for w in words if w in wordset])
              .value_counts()
              .reindex(range(num_clusters+1), fill_value=0))

In [None]:
df = load_dataset('labeled_train')
df.head()

In [None]:
train_data_features = df.review.apply(make_cluster_bag)
train_data_features.head()

### 再用随机森林算法建模

In [None]:
forest = RandomForestClassifier(n_estimators = 100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

##### 在训练集上试一试效果

In [None]:
confusion_matrix(df.sentiment, forest.predict(train_data_features))

#### 去掉无用的占内存的量

In [None]:
del df
del train_data_features

### 载入测试数据做预测

In [None]:
df = load_dataset('test')
df.head()

In [None]:
test_data_features = df.review.apply(make_cluster_bag)
test_data_features.head()

In [None]:
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id, 'sentiment':result})
output.to_csv(os.path.join('..', 'data', 'Word2Vec_BagOfClusters.csv'), index=False)
output.head()

In [None]:
del df
del test_data_features
del forest