# Word2Vec

Step 1: Download the latest Chinese Wikipedia dump into `data/`:
+ zhwiki-latest-pages-articles.xml.bz2

In [7]:
from urllib.request import urlretrieve
import os 

# Source of the latest Chinese Wikipedia article dump; the local file keeps
# the name of the last URL path component and is stored under data/.
url = 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
filename = url.split('/')[-1]
fullname = os.path.join('data', filename)
if not os.path.exists(fullname):
    # Make sure the target directory exists before writing into it.
    os.makedirs(os.path.dirname(fullname), exist_ok=True)
    print('Downloading {} from {}...'.format(filename, url))
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer is never mistaken for a complete dump by the
    # os.path.exists() check above on the next run.
    partial_name = fullname + '.part'
    urlretrieve(url, filename=partial_name)
    os.replace(partial_name, fullname)

Step 2: Preprocess zhwiki (convert Traditional Chinese to Simplified, segment into words) and save the corpus to `data/reduce_zhiwiki.txt`

In [8]:
from gensim.corpora import WikiCorpus
import jieba
from utils.langconv import *
import os


def preprocessing(zhwiki_name='./data/zhwiki-latest-pages-articles.xml.bz2',
                  output_name='./data/reduce_zhiwiki.txt'):
    """Turn the zhwiki XML dump into a space-separated, one-article-per-line text file.

    Every token yielded for an article is converted to Simplified Chinese,
    segmented with jieba, and the resulting terms are written on a single
    line joined by spaces.

    Args:
        zhwiki_name: Path of the compressed Wikipedia dump to read.
        output_name: Path of the text corpus to create.

    The corpus is written to ``output_name + '.tmp'`` and renamed to
    ``output_name`` only after every article has been written, so a run that
    fails part-way does not leave a truncated file at ``output_name``.
    """
    progress_step = 200  # print a progress line every this many articles
    tmp_name = output_name + '.tmp'

    # Convert the XML-format Chinese Wikipedia dump into plain text.
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})

    # `with` guarantees the handle is closed even if conversion raises;
    # UTF-8 is set explicitly so the output does not depend on the locale.
    with open(tmp_name, 'w', encoding='utf-8') as f:
        for i, text in enumerate(wiki.get_texts(), start=1):
            terms = []
            for temp_sentence in text:
                # Convert Traditional Chinese characters to Simplified.
                temp_sentence = Converter('zh-hans').convert(temp_sentence)
                # Word segmentation.
                terms.extend(jieba.cut(temp_sentence))
            # Join the segmented terms with spaces, because gensim's Word2Vec
            # expects this format.
            f.write(' '.join(terms) + '\n')

            if i % progress_step == 0:
                print('Saved ' + str(i) + ' articles')

    # Publish the finished corpus under its final name.
    os.replace(tmp_name, output_name)

# Build the corpus only when it is not already on disk.
pre_corpus_filename = 'data/reduce_zhiwiki.txt'
corpus_is_cached = os.path.exists(pre_corpus_filename)
if not corpus_is_cached:
    preprocessing()

Step 3: Train the Word2Vec model and save it to `data/zhiwiki_news.word2vec`

In [3]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import logging

def train_model(model_filename='data/zhiwiki_news.word2vec'):
    """Train a Word2Vec model on the preprocessed corpus and save it.

    The corpus is read from the module-level ``pre_corpus_filename`` as one
    sentence per line (via ``LineSentence``).

    Args:
        model_filename: Path the trained model is saved to.
    """
    # Read the corpus as UTF-8 explicitly rather than relying on the
    # platform's default encoding, which is not UTF-8 on every system.
    with open(pre_corpus_filename, 'r', encoding='utf-8') as wiki_news:
        # sg=0 selects CBOW; vectors have 192 dimensions.
        model = Word2Vec(LineSentence(wiki_news), sg=0, size=192, window=5, min_count=5, workers=9)
        model.save(model_filename)

# Train only when no saved model is present at the target path.
model_filename = 'data/zhiwiki_news.word2vec'

model_is_cached = os.path.exists(model_filename)
if not model_is_cached:
    train_model(model_filename)


Step 4: Load the trained model and query it

In [6]:
from gensim.models import Word2Vec

model = Word2Vec.load(model_filename)

# Query through the keyed vectors (model.wv): calling similarity() and
# most_similar() on the model object itself is deprecated and emits a
# DeprecationWarning.
print(model.wv.similarity('西红柿', '香蕉'))

word = '中国'

# Membership test on the keyed vectors instead of scanning the index2word list.
if word in model.wv:
    print(model.wv.most_similar(word))


  """
  # Remove the CWD from sys.path while we load stuff.


0.5083116286754799
[('我国', 0.6470304727554321), ('北京', 0.5798619985580444), ('礁层', 0.5606339573860168), ('中国政府', 0.5527150630950928), ('台湾', 0.5525480508804321), ('中华民国', 0.5394100546836853), ('亚洲', 0.5239245295524597), ('全国', 0.5231877565383911), ('台商', 0.5222627520561218), ('中华人民共和国', 0.5218644738197327)]
