# word2vec训练词向量

In [1]:
import functools
import os
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

import nltk.data
#nltk.download()
#from nltk.corpus import stopwords

from gensim.models.word2vec import Word2Vec

Using TensorFlow backend.


In [2]:
def load_dataset(name, nrows=None):
    """Load one of the review TSV files from ``../data`` as a DataFrame.

    Args:
        name: Dataset key: 'unlabeled_train', 'labeled_train' or 'test'.
        nrows: Optional row limit passed through to ``pd.read_csv``;
            ``None`` reads the whole file.

    Returns:
        The parsed ``pandas.DataFrame``. The number of rows read is printed
        as a side effect.

    Raises:
        ValueError: If ``name`` is not one of the known dataset keys.
    """
    filenames = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv',
    }
    filename = filenames.get(name)
    if filename is None:
        raise ValueError(name)

    # The files are tab-separated and use a backslash as the escape character.
    path = os.path.join('..', 'data', filename)
    reviews = pd.read_csv(path, sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews: {}'.format(len(reviews)))
    return reviews

### 读入无标签数据
用于训练生成word2vec词向量

In [3]:
# Load the unlabeled reviews (the corpus used below to train the word vectors)
# and preview the first rows.
df = load_dataset('unlabeled_train')
df.head()

Number of reviews: 50000


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


### 和第一个ipython notebook一样做数据的预处理
与之前稍有不同的是，这里保留了一个可选参数 `remove_stopwords`：可以选择去除停用词，也可以不去除（默认不去除）。

In [5]:
# Stop words are read from ../stopwords.txt, one entry per line, and stored as
# dict keys so that membership tests are constant-time.
# Alternative source: set(nltk.corpus.stopwords.words('english')).
# The file is opened in a context manager so the handle is closed right after
# reading instead of being leaked.
with open('../stopwords.txt') as stopwords_file:
    eng_stopwords = dict.fromkeys(line.rstrip() for line in stopwords_file)

def clean_text(text, remove_stopwords=False):
    """Turn a raw text fragment into a list of lowercase word tokens.

    Steps: strip HTML markup with BeautifulSoup, replace every character
    that is not an ASCII letter with a space, lowercase, and split on
    whitespace.

    Args:
        text: Raw text, possibly containing HTML tags.
        remove_stopwords: When true, tokens present in ``eng_stopwords``
            are dropped from the result.

    Returns:
        A list of lowercase tokens (possibly empty).
    """
    # Strip HTML tags.
    plain = BeautifulSoup(text, 'html.parser').get_text()
    # Keep letters only: punctuation, digits and other symbols become spaces.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    # Tokenize on whitespace.
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    # Drop stop words.
    return [token for token in tokens if token not in eng_stopwords]

# Sentence tokenizer loaded from NLTK's 'tokenizers/punkt/english.pickle' resource.
# NOTE(review): presumably requires the punkt data to be installed beforehand
# (e.g. via nltk.download) — confirm for a fresh environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def print_call_counts(f):
    """Decorator that reports progress every 1000 calls of ``f``.

    A line is printed on the 1st, 1001st, 2001st, ... call, just before ``f``
    itself is invoked. All positional and keyword arguments are forwarded and
    the result of ``f`` is returned unchanged.

    Args:
        f: The callable to wrap.

    Returns:
        A wrapper around ``f`` with its own independent call counter. Thanks
        to ``functools.wraps`` the wrapper keeps the ``__name__`` and
        ``__doc__`` of ``f`` instead of showing up as ``wrapped``.
    """
    n = 0  # number of calls so far; lives in the closure, one counter per decorated function

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        nonlocal n
        n += 1
        # n % 1000 == 1 fires on the very first call and then every 1000 calls.
        if n % 1000 == 1:
            print('method {} called {} times'.format(f.__name__, n))
        return f(*args, **kwargs)
    return wrapped

@print_call_counts
def split_sentences(review):
    """Split one review into sentences and tokenize each of them.

    The review is stripped of surrounding whitespace, cut into sentences by
    the module-level ``tokenizer``, and every non-empty raw sentence is passed
    through ``clean_text`` (stop words are kept).

    Args:
        review: The raw review text.

    Returns:
        A list with one token list per sentence.
    """
    tokenized = []
    # Sentence segmentation first, then per-sentence cleaning and tokenizing.
    for raw_sentence in tokenizer.tokenize(review.strip()):
        if raw_sentence:
            tokenized.append(clean_text(raw_sentence))
    return tokenized

In [6]:
%time sentences = sum(df.review.apply(split_sentences), [])
print('{} reviews -> {} sentences'.format(len(df), len(sentences)))

method split_sentences called 1 times


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)


method split_sentences called 1001 times
method split_sentences called 2001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 3001 times
method split_sentences called 4001 times
method split_sentences called 5001 times
method split_sentences called 6001 times
method split_sentences called 7001 times
method split_sentences called 8001 times
method split_sentences called 9001 times
method split_sentences called 10001 times
method split_sentences called 11001 times
method split_sentences called 12001 times
method split_sentences called 13001 times
method split_sentences called 14001 times
method split_sentences called 15001 times
method split_sentences called 16001 times
method split_sentences called 17001 times
method split_sentences called 18001 times
method split_sentences called 19001 times
method split_sentences called 20001 times
method split_sentences called 21001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 22001 times
method split_sentences called 23001 times
method split_sentences called 24001 times
method split_sentences called 25001 times
method split_sentences called 26001 times
method split_sentences called 27001 times
method split_sentences called 28001 times
method split_sentences called 29001 times
method split_sentences called 30001 times
method split_sentences called 31001 times
method split_sentences called 32001 times
method split_sentences called 33001 times
method split_sentences called 34001 times
method split_sentences called 35001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 36001 times
method split_sentences called 37001 times
method split_sentences called 38001 times
method split_sentences called 39001 times
method split_sentences called 40001 times
method split_sentences called 41001 times
method split_sentences called 42001 times
method split_sentences called 43001 times
method split_sentences called 44001 times
method split_sentences called 45001 times
method split_sentences called 46001 times
method split_sentences called 47001 times
method split_sentences called 48001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 49001 times
CPU times: user 8min 6s, sys: 18.5 s, total: 8min 24s
Wall time: 8min 23s
50000 reviews -> 537851 sentences


In [10]:
# Sanity check: total number of tokenized sentences available for training.
len(sentences)

537851

### 用gensim训练词嵌入模型

In [11]:
# Configure root logging at INFO level (timestamp : level : message) so that
# progress messages logged during training are shown in the notebook.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [12]:
# Hyperparameters for word-vector training
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# The file name encodes the dimensionality, minimum word count and window size.
model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)

In [15]:
print('Training model...')

# Make sure the output directory exists before spending minutes on training;
# otherwise model.save() would only fail at the very end on a fresh checkout.
model_dir = os.path.join('..', 'models')
os.makedirs(model_dir, exist_ok=True)

# Train the word vectors on the tokenized sentences with the hyperparameters
# defined in the configuration cell.
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model.save(os.path.join(model_dir, model_name))

2017-12-29 11:22:34,568 : INFO : collecting all words and their counts
2017-12-29 11:22:34,569 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-12-29 11:22:34,630 : INFO : PROGRESS: at sentence #10000, processed 225072 words, keeping 17237 word types
2017-12-29 11:22:34,696 : INFO : PROGRESS: at sentence #20000, processed 443536 words, keeping 24570 word types
2017-12-29 11:22:34,762 : INFO : PROGRESS: at sentence #30000, processed 666343 words, keeping 29785 word types


Training model...


2017-12-29 11:22:34,829 : INFO : PROGRESS: at sentence #40000, processed 886903 words, keeping 33939 word types
2017-12-29 11:22:34,897 : INFO : PROGRESS: at sentence #50000, processed 1103863 words, keeping 37503 word types
2017-12-29 11:22:34,966 : INFO : PROGRESS: at sentence #60000, processed 1327231 words, keeping 40738 word types
2017-12-29 11:22:35,036 : INFO : PROGRESS: at sentence #70000, processed 1550828 words, keeping 43603 word types
2017-12-29 11:22:35,105 : INFO : PROGRESS: at sentence #80000, processed 1772824 words, keeping 46155 word types
2017-12-29 11:22:35,172 : INFO : PROGRESS: at sentence #90000, processed 1987492 words, keeping 48328 word types
2017-12-29 11:22:35,242 : INFO : PROGRESS: at sentence #100000, processed 2210772 words, keeping 50551 word types
2017-12-29 11:22:35,311 : INFO : PROGRESS: at sentence #110000, processed 2435496 words, keeping 52762 word types
2017-12-29 11:22:35,380 : INFO : PROGRESS: at sentence #120000, processed 2658449 words, keepin

2017-12-29 11:22:55,106 : INFO : PROGRESS: at 6.41% examples, 166232 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:22:56,122 : INFO : PROGRESS: at 6.81% examples, 166246 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:22:57,137 : INFO : PROGRESS: at 7.21% examples, 166253 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:22:58,156 : INFO : PROGRESS: at 7.61% examples, 166264 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:22:59,172 : INFO : PROGRESS: at 8.03% examples, 166617 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:23:00,182 : INFO : PROGRESS: at 8.43% examples, 166675 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:23:01,184 : INFO : PROGRESS: at 8.82% examples, 166460 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:23:02,202 : INFO : PROGRESS: at 9.23% examples, 166451 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:23:03,218 : INFO : PROGRESS: at 9.63% examples, 166459 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:23:04,235 : INFO : PROGRESS: at 10.03% examples, 166467 words/s, in_

2017-12-29 11:24:16,106 : INFO : PROGRESS: at 38.76% examples, 167270 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:24:17,124 : INFO : PROGRESS: at 39.15% examples, 167264 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:24:18,139 : INFO : PROGRESS: at 39.55% examples, 167264 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:24:19,167 : INFO : PROGRESS: at 39.97% examples, 167305 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:24:20,175 : INFO : PROGRESS: at 40.36% examples, 167309 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:24:21,186 : INFO : PROGRESS: at 40.77% examples, 167308 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:24:22,201 : INFO : PROGRESS: at 41.17% examples, 167305 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:24:23,203 : INFO : PROGRESS: at 41.58% examples, 167322 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:24:24,216 : INFO : PROGRESS: at 41.98% examples, 167320 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:24:25,255 : INFO : PROGRESS: at 42.40% examples, 167341 wor

2017-12-29 11:25:37,564 : INFO : PROGRESS: at 71.05% examples, 166924 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:25:38,590 : INFO : PROGRESS: at 71.43% examples, 166877 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:25:39,632 : INFO : PROGRESS: at 71.83% examples, 166854 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:25:40,663 : INFO : PROGRESS: at 72.23% examples, 166837 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:25:41,670 : INFO : PROGRESS: at 72.64% examples, 166845 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:25:42,701 : INFO : PROGRESS: at 73.05% examples, 166828 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:25:43,739 : INFO : PROGRESS: at 73.45% examples, 166807 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:25:44,749 : INFO : PROGRESS: at 73.86% examples, 166812 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:25:45,774 : INFO : PROGRESS: at 74.27% examples, 166803 words/s, in_qsize 7, out_qsize 0
2017-12-29 11:25:46,804 : INFO : PROGRESS: at 74.67% examples, 166789 wor

2017-12-29 11:26:52,028 : INFO : saved ../models/300features_40minwords_10context.model


### 看看训练的词向量结果如何

In [16]:
# Odd-one-out sanity checks: print the word the model considers the least
# related to the others in each list.
print(model.doesnt_match("man woman child kitchen".split()))
print(model.doesnt_match('france england germany berlin'.split()))

kitchen
berlin


In [17]:
# Nearest neighbours of "man" in the learned vector space, as (word, score) pairs.
model.most_similar("man")

[('woman', 0.651971697807312),
 ('lady', 0.6159939765930176),
 ('lad', 0.5849895477294922),
 ('guy', 0.5439465641975403),
 ('soldier', 0.5393285751342773),
 ('chap', 0.536977231502533),
 ('person', 0.5364387035369873),
 ('monk', 0.5182623863220215),
 ('gentleman', 0.5176940560340881),
 ('boy', 0.5167154669761658)]

In [18]:
# Nearest neighbours of "queen", as (word, score) pairs.
# NOTE(review): the scores recorded for this query exceed 1.0, which a cosine
# similarity should not — possibly related to init_sims(replace=True); confirm.
model.most_similar("queen")

[('belle', 1.2401440143585205),
 ('blonde', 1.2320035696029663),
 ('catherine', 1.2289553880691528),
 ('regina', 1.2160236835479736),
 ('angela', 1.2042949199676514),
 ('virgin', 1.2029750347137451),
 ('marlene', 1.1997132301330566),
 ('mistress', 1.1830108165740967),
 ('goddess', 1.1782617568969727),
 ('madame', 1.1761484146118164)]

In [19]:
# Nearest neighbours of "awful", as (word, score) pairs.
# NOTE(review): the scores recorded for this query exceed 1.0, which a cosine
# similarity should not — confirm the vectors are normalized as expected.
model.most_similar("awful")

[('terrible', 1.567256212234497),
 ('horrible', 1.5290379524230957),
 ('dreadful', 1.4438960552215576),
 ('abysmal', 1.4089943170547485),
 ('embarrassing', 1.2835288047790527),
 ('crappy', 1.2247406244277954),
 ('lame', 1.220945119857788),
 ('ridiculous', 1.1674494743347168),
 ('incomprehensible', 1.1456104516983032),
 ('unbelievable', 1.1332685947418213)]