# 实现word2vec


In [56]:
from gensim.models import Word2Vec

1. 读取数据

In [77]:
file_path = '../data/word2vec/icwb2-data/training/msr_training.utf8'

# Read the corpus into a list of str, one entry per line of the file
# (each entry keeps its trailing newline).
with open(file_path, encoding='utf-8') as f:
    sen_list = list(f)

2. 清理数据

In [78]:
import re

In [79]:
# Keep only the multi-character, word-like tokens of every sentence.
#
# How the pattern r'\W(?!\S)' works: it deletes each non-word character
# (punctuation or whitespace) that is immediately followed by whitespace or by
# the end of the string. A punctuation mark standing alone as a token is
# therefore removed, as are surplus separators and the trailing newline, while
# the whitespace directly in front of the next token survives as a delimiter.
# NOTE(review): this presumes the corpus is already segmented with whitespace
# between tokens — confirm against the data file.
#
# split() without an argument splits on runs of whitespace; split(' ') must
# not be used because it splits on every single space.
# Tokens of length 1 are discarded afterwards.
data_list = [
    [token for token in re.sub(r'\W(?!\S)', '', sen).split() if len(token) != 1]
    for sen in sen_list
]

# Load the stop words, one entry per line of the file.
stopword_file_path = '../data/word2vec/cn_stopwords.txt'
# Use a context manager so the file handle is closed deterministically, and
# store the entries in a set so that the per-token membership test performed
# during filtering is O(1) instead of a linear scan of a list.
with open(stopword_file_path, encoding='utf-8') as stopword_file:
    stopwords = {entry.strip() for entry in stopword_file}

def pretty_cut(sen, stop_words=None):
    '''
    Remove stop words from a tokenized sentence.

    :param sen: list of str, the tokens of one sentence
    :param stop_words: optional container of words to drop; when None the
        module-level ``stopwords`` collection is used
    :return: list of str, the tokens of ``sen`` that are not stop words,
        in their original order
    '''
    # Compare against None (not truthiness) so that an explicitly supplied
    # empty container means "filter nothing" rather than falling back.
    if stop_words is None:
        stop_words = stopwords
    return [word for word in sen if word not in stop_words]

# Strip the stop words from every tokenized sentence.
data_list = [pretty_cut(tokens) for tokens in data_list]

3. 统计数据

In [80]:
from collections import Counter

In [81]:
# Count how often each token occurs across all cleaned sentences.
counter = Counter(token for tokens in data_list for token in tokens)

# Show the number of distinct tokens and the ten most frequent ones.
len(counter), counter.most_common(10)

(83863,
 [('发展', 5976),
  ('经济', 5063),
  ('中国', 4976),
  ('一个', 4739),
  ('工作', 4445),
  ('企业', 4433),
  ('问题', 4276),
  ('国家', 3636),
  ('进行', 3619),
  ('政府', 3144)])

4. 构建模型

In [82]:
# Train 100-dimensional CBOW (sg=0) embeddings on the cleaned sentences,
# ignoring tokens that occur fewer than 5 times.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm the installed version before re-running.
model = Word2Vec(
    sentences=data_list,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
)

In [83]:
# Ten tokens whose vectors are closest to '北京' in the trained model.
model.wv.most_similar(positive=['北京'], topn=10)

[('上海', 0.9082496166229248),
 ('近日', 0.9048246145248413),
 ('本报', 0.8812147974967957),
 ('４月１４日', 0.8725495934486389),
 ('主办', 0.8707386255264282),
 ('浦东', 0.8684402704238892),
 ('广州', 0.867500901222229),
 ('日前', 0.8642042875289917),
 ('中国青年政治学院', 0.8616960048675537),
 ('天津', 0.8601998090744019)]

In [90]:
# Ten tokens whose vectors are closest to '老师' in the trained model.
model.wv.most_similar(positive=['老师'], topn=10)

[('战士', 0.9523284435272217),
 ('同学', 0.9513710737228394),
 ('教给', 0.931329607963562),
 ('孝敬', 0.9291872382164001),
 ('后来', 0.9266383051872253),
 ('因材施教', 0.9154665470123291),
 ('喜欢', 0.9122165441513062),
 ('感动', 0.9115886688232422),
 ('回家', 0.9093741774559021),
 ('终日', 0.9086799621582031)]

76