### 1. 把維基百科資料讀出

In [1]:
# -*- coding: utf-8 -*-

import gensim
import logging

# Step 1: dump every article of the zhwiki XML dump as one line of
# space-separated tokens in wiki_article/wiki_text.txt.
input_file = "wiki_article/zhwiki-20171201-pages-articles-multistream.xml.bz2"

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')

# NOTE(review): dictionary={} presumably makes WikiCorpus skip building its own
# vocabulary up front, since only the raw token stream is needed here -- confirm
# against the installed gensim version.
wiki = gensim.corpora.WikiCorpus(input_file, lemmatize=False, dictionary={})

# texts_num ends up holding the number of articles written (0 if the corpus is empty).
texts_num = 0
with open("wiki_article/wiki_text.txt", 'w', encoding='utf-8') as output:
    for texts_num, text in enumerate(wiki.get_texts(), start=1):
        article_line = ' '.join(text)
        output.write(article_line + '\n')
        # Progress heartbeat every 10,000 articles.
        if texts_num % 10000 == 0:
            logging.info("已處理 %d 篇文章", texts_num)

2017-12-05 03:15:40,070 : INFO : 已處理 10000 篇文章
2017-12-05 03:16:11,646 : INFO : 已處理 20000 篇文章
2017-12-05 03:16:40,125 : INFO : 已處理 30000 篇文章
2017-12-05 03:17:08,763 : INFO : 已處理 40000 篇文章
2017-12-05 03:17:38,526 : INFO : 已處理 50000 篇文章
2017-12-05 03:18:08,070 : INFO : 已處理 60000 篇文章
2017-12-05 03:18:35,903 : INFO : 已處理 70000 篇文章
2017-12-05 03:19:04,648 : INFO : 已處理 80000 篇文章
2017-12-05 03:19:32,788 : INFO : 已處理 90000 篇文章
2017-12-05 03:20:03,534 : INFO : 已處理 100000 篇文章
2017-12-05 03:20:41,418 : INFO : 已處理 110000 篇文章
2017-12-05 03:21:16,119 : INFO : 已處理 120000 篇文章
2017-12-05 03:21:51,791 : INFO : 已處理 130000 篇文章
2017-12-05 03:22:27,692 : INFO : 已處理 140000 篇文章
2017-12-05 03:23:02,943 : INFO : 已處理 150000 篇文章
2017-12-05 03:23:37,486 : INFO : 已處理 160000 篇文章
2017-12-05 03:24:14,164 : INFO : 已處理 170000 篇文章
2017-12-05 03:24:53,872 : INFO : 已處理 180000 篇文章
2017-12-05 03:27:47,548 : INFO : 已處理 190000 篇文章
2017-12-05 03:29:08,702 : INFO : 已處理 200000 篇文章
2017-12-05 03:30:18,716 : INFO : 已處理 210000 篇文章
2

### 2. 使用OpenCC把簡體文字轉為繁體

In [2]:
# This step is not executed by the notebook: it is a shell command for the
# OpenCC command-line tool, which converts the extracted Simplified Chinese
# text to Traditional Chinese using the s2tw.json configuration.
# NOTE(review): presumably run from inside the wiki_article/ directory, since
# the other cells read and write these files there -- confirm.
#
#   opencc -i wiki_text.txt -o zh_wiki_text.txt -c s2tw.json
#
# Input: wiki_text.txt -> Output: zh_wiki_text.txt

### 3. 用Jieba斷詞

In [1]:
# -*- coding: utf-8 -*-

import jieba
import logging

# Step 3: segment every line of the Traditional Chinese text with jieba and
# write the tokens, separated by single spaces, to wiki_article/wiki_seg.txt.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Point jieba at the dictionary file kept under jieba_dict/.
jieba.set_dictionary('jieba_dict/dict.txt.big')

# Number of input lines segmented so far.
texts_num = 0

# Both files are managed by a single `with`, so they are always closed. No
# other handle is opened: an extra bare open('wiki_seg.txt', 'w') would leak
# and leave an empty stray file in the working directory.
# The input is opened first so that a missing input file raises before the
# output file is created or truncated.
with open('wiki_article/zh_wiki_text.txt', 'r', encoding='utf8') as content, \
        open('wiki_article/wiki_seg.txt', 'w', encoding='utf8') as output:
    for line in content:
        # cut_all=False selects jieba's default (non-exhaustive) segmentation mode.
        words = jieba.cut(line, cut_all=False)

        # NOTE(review): no explicit newline is written; the line breaks in the
        # output appear to rely on jieba passing the trailing '\n' of `line`
        # through as its own token -- confirm.
        for word in words:
            output.write(word + ' ')

        texts_num += 1

        # Progress heartbeat every 10,000 lines.
        if texts_num % 10000 == 0:
            logging.info("已完成前 %d 行的斷詞", texts_num)

Building prefix dict from C:\Users\User\pythonRepository\SemanticAnalysis\jieba_dict\dict.txt.big ...
2017-12-06 01:42:26,089 : DEBUG : Building prefix dict from C:\Users\User\pythonRepository\SemanticAnalysis\jieba_dict\dict.txt.big ...
Dumping model to file cache C:\Users\User\AppData\Local\Temp\jieba.ua442fad0dbeebb90d0fa2ffe39e75bc0.cache
2017-12-06 01:42:27,512 : DEBUG : Dumping model to file cache C:\Users\User\AppData\Local\Temp\jieba.ua442fad0dbeebb90d0fa2ffe39e75bc0.cache
Loading model cost 1.543 seconds.
2017-12-06 01:42:27,640 : DEBUG : Loading model cost 1.543 seconds.
Prefix dict has been built succesfully.
2017-12-06 01:42:27,642 : DEBUG : Prefix dict has been built succesfully.
2017-12-06 01:44:52,187 : INFO : 已完成前 10000 行的斷詞
2017-12-06 01:46:36,249 : INFO : 已完成前 20000 行的斷詞
2017-12-06 01:48:10,034 : INFO : 已完成前 30000 行的斷詞
2017-12-06 01:49:37,785 : INFO : 已完成前 40000 行的斷詞
2017-12-06 01:51:00,941 : INFO : 已完成前 50000 行的斷詞
2017-12-06 01:52:20,792 : INFO : 已完成前 60000 行的斷詞
2017

### 4. Word2vec

In [3]:
# 訓練詞向量

from gensim.models import word2vec
import logging
import multiprocessing

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')

# Corpus reader over the segmented text file.
sentences = word2vec.LineSentence("wiki_article/wiki_seg.txt")

# Training hyper-parameters, gathered in one place so they are easy to tune.
w2v_params = dict(
    size=250,
    window=5,
    min_count=5,
    # Use one worker thread per CPU core reported by the OS.
    workers=multiprocessing.cpu_count(),
    # sg=0 selects the CBOW algorithm (author's note: suited to large corpora).
    sg=0,
    # negative=5 enables negative sampling with 5 noise words
    # (author's rule of thumb: 5-20 for small corpora, 2-5 for large ones).
    negative=5,
)
model = word2vec.Word2Vec(sentences, **w2v_params)

2017-12-06 02:31:36,248 : INFO : collecting all words and their counts
2017-12-06 02:31:36,250 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-12-06 02:31:40,557 : INFO : PROGRESS: at sentence #10000, processed 12046516 words, keeping 591559 word types
2017-12-06 02:31:43,799 : INFO : PROGRESS: at sentence #20000, processed 20756837 words, keeping 850351 word types
2017-12-06 02:31:46,752 : INFO : PROGRESS: at sentence #30000, processed 28591411 words, keeping 1031404 word types
2017-12-06 02:31:49,519 : INFO : PROGRESS: at sentence #40000, processed 35870443 words, keeping 1199056 word types
2017-12-06 02:31:52,246 : INFO : PROGRESS: at sentence #50000, processed 42755939 words, keeping 1344188 word types
2017-12-06 02:31:54,843 : INFO : PROGRESS: at sentence #60000, processed 49239461 words, keeping 1468634 word types
2017-12-06 02:31:57,254 : INFO : PROGRESS: at sentence #70000, processed 55383610 words, keeping 1583078 word types
2017-12-06 02:31:59,

2017-12-06 02:33:41,908 : INFO : PROGRESS: at 1.34% examples, 548327 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:33:42,910 : INFO : PROGRESS: at 1.39% examples, 549099 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:33:43,929 : INFO : PROGRESS: at 1.43% examples, 549748 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:33:44,944 : INFO : PROGRESS: at 1.49% examples, 549753 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:33:45,951 : INFO : PROGRESS: at 1.54% examples, 549272 words/s, in_qsize 0, out_qsize 1
2017-12-06 02:33:46,952 : INFO : PROGRESS: at 1.59% examples, 550457 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:33:47,960 : INFO : PROGRESS: at 1.63% examples, 550225 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:33:48,969 : INFO : PROGRESS: at 1.69% examples, 549195 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:33:49,973 : INFO : PROGRESS: at 1.74% examples, 549677 words/s, in_qsize 6, out_qsize 0
2017-12-06 02:33:51,002 : INFO : PROGRESS: at 1.80% examples, 550383 words/s, in_q

2017-12-06 02:35:03,856 : INFO : PROGRESS: at 6.56% examples, 576627 words/s, in_qsize 5, out_qsize 0
2017-12-06 02:35:04,884 : INFO : PROGRESS: at 6.64% examples, 576985 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:35:05,908 : INFO : PROGRESS: at 6.72% examples, 577527 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:35:06,925 : INFO : PROGRESS: at 6.80% examples, 577639 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:35:07,943 : INFO : PROGRESS: at 6.87% examples, 577152 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:35:08,956 : INFO : PROGRESS: at 6.94% examples, 577094 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:35:09,963 : INFO : PROGRESS: at 7.00% examples, 576565 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:35:10,971 : INFO : PROGRESS: at 7.07% examples, 576531 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:35:11,992 : INFO : PROGRESS: at 7.14% examples, 575910 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:35:13,000 : INFO : PROGRESS: at 7.20% examples, 575306 words/s, in_q

2017-12-06 02:36:25,828 : INFO : PROGRESS: at 12.77% examples, 574764 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:36:26,829 : INFO : PROGRESS: at 12.85% examples, 574985 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:36:27,839 : INFO : PROGRESS: at 12.94% examples, 575071 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:36:28,865 : INFO : PROGRESS: at 13.02% examples, 575189 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:36:29,867 : INFO : PROGRESS: at 13.12% examples, 575352 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:36:30,873 : INFO : PROGRESS: at 13.21% examples, 575557 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:36:31,877 : INFO : PROGRESS: at 13.29% examples, 575484 words/s, in_qsize 5, out_qsize 1
2017-12-06 02:36:32,907 : INFO : PROGRESS: at 13.36% examples, 574946 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:36:33,927 : INFO : PROGRESS: at 13.43% examples, 574239 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:36:34,944 : INFO : PROGRESS: at 13.51% examples, 573691 wor

2017-12-06 02:37:47,012 : INFO : PROGRESS: at 19.10% examples, 570378 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:37:48,013 : INFO : PROGRESS: at 19.20% examples, 570557 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:37:49,020 : INFO : PROGRESS: at 19.29% examples, 570564 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:37:50,037 : INFO : PROGRESS: at 19.40% examples, 570744 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:37:51,043 : INFO : PROGRESS: at 19.49% examples, 570838 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:37:52,071 : INFO : PROGRESS: at 19.60% examples, 570780 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:37:53,087 : INFO : PROGRESS: at 19.69% examples, 570653 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:37:54,091 : INFO : PROGRESS: at 19.79% examples, 570688 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:37:55,116 : INFO : PROGRESS: at 19.88% examples, 570838 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:37:56,145 : INFO : PROGRESS: at 19.99% examples, 570970 wor

2017-12-06 02:39:08,007 : INFO : PROGRESS: at 23.64% examples, 576188 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:39:09,009 : INFO : PROGRESS: at 23.71% examples, 576374 words/s, in_qsize 5, out_qsize 2
2017-12-06 02:39:10,015 : INFO : PROGRESS: at 23.77% examples, 576550 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:39:11,018 : INFO : PROGRESS: at 23.84% examples, 576704 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:39:12,024 : INFO : PROGRESS: at 23.89% examples, 576630 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:39:13,050 : INFO : PROGRESS: at 23.95% examples, 576484 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:39:14,053 : INFO : PROGRESS: at 24.02% examples, 576494 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:39:15,058 : INFO : PROGRESS: at 24.09% examples, 576518 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:39:16,074 : INFO : PROGRESS: at 24.16% examples, 576569 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:39:17,097 : INFO : PROGRESS: at 24.22% examples, 576549 wor

2017-12-06 02:40:29,311 : INFO : PROGRESS: at 28.72% examples, 564931 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:40:30,334 : INFO : PROGRESS: at 28.79% examples, 564805 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:40:31,360 : INFO : PROGRESS: at 28.86% examples, 564828 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:40:32,363 : INFO : PROGRESS: at 28.94% examples, 564694 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:40:33,389 : INFO : PROGRESS: at 29.00% examples, 564438 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:40:34,432 : INFO : PROGRESS: at 29.07% examples, 564228 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:40:35,441 : INFO : PROGRESS: at 29.13% examples, 564008 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:40:36,455 : INFO : PROGRESS: at 29.18% examples, 563554 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:40:37,481 : INFO : PROGRESS: at 29.24% examples, 563410 words/s, in_qsize 6, out_qsize 0
2017-12-06 02:40:38,492 : INFO : PROGRESS: at 29.30% examples, 563164 wor

2017-12-06 02:41:50,512 : INFO : PROGRESS: at 34.50% examples, 558485 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:41:51,520 : INFO : PROGRESS: at 34.56% examples, 558328 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:41:52,544 : INFO : PROGRESS: at 34.63% examples, 558141 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:41:53,559 : INFO : PROGRESS: at 34.68% examples, 557771 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:41:54,581 : INFO : PROGRESS: at 34.73% examples, 557510 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:41:55,604 : INFO : PROGRESS: at 34.80% examples, 557324 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:41:56,610 : INFO : PROGRESS: at 34.87% examples, 557222 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:41:57,614 : INFO : PROGRESS: at 34.94% examples, 557129 words/s, in_qsize 6, out_qsize 0
2017-12-06 02:41:58,640 : INFO : PROGRESS: at 35.01% examples, 557129 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:41:59,648 : INFO : PROGRESS: at 35.09% examples, 557081 wor

2017-12-06 02:43:11,884 : INFO : PROGRESS: at 39.73% examples, 544157 words/s, in_qsize 7, out_qsize 1
2017-12-06 02:43:12,917 : INFO : PROGRESS: at 39.81% examples, 544100 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:43:13,932 : INFO : PROGRESS: at 39.91% examples, 544141 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:43:14,945 : INFO : PROGRESS: at 39.99% examples, 544338 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:43:15,953 : INFO : PROGRESS: at 40.02% examples, 544434 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:43:16,973 : INFO : PROGRESS: at 40.05% examples, 544549 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:43:17,994 : INFO : PROGRESS: at 40.07% examples, 544674 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:43:19,002 : INFO : PROGRESS: at 40.11% examples, 544815 words/s, in_qsize 5, out_qsize 0
2017-12-06 02:43:20,014 : INFO : PROGRESS: at 40.14% examples, 544915 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:43:21,022 : INFO : PROGRESS: at 40.17% examples, 545023 wor

2017-12-06 02:44:32,938 : INFO : PROGRESS: at 43.58% examples, 544019 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:44:33,969 : INFO : PROGRESS: at 43.63% examples, 543818 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:44:34,991 : INFO : PROGRESS: at 43.69% examples, 543748 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:44:36,000 : INFO : PROGRESS: at 43.74% examples, 543725 words/s, in_qsize 6, out_qsize 0
2017-12-06 02:44:37,032 : INFO : PROGRESS: at 43.79% examples, 543642 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:44:38,046 : INFO : PROGRESS: at 43.85% examples, 543622 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:44:39,055 : INFO : PROGRESS: at 43.90% examples, 543596 words/s, in_qsize 6, out_qsize 0
2017-12-06 02:44:40,061 : INFO : PROGRESS: at 43.96% examples, 543552 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:44:41,088 : INFO : PROGRESS: at 44.02% examples, 543597 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:44:42,091 : INFO : PROGRESS: at 44.09% examples, 543634 wor

2017-12-06 02:45:54,172 : INFO : PROGRESS: at 48.93% examples, 543658 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:45:55,198 : INFO : PROGRESS: at 49.01% examples, 543678 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:45:56,198 : INFO : PROGRESS: at 49.09% examples, 543715 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:45:57,207 : INFO : PROGRESS: at 49.16% examples, 543762 words/s, in_qsize 7, out_qsize 1
2017-12-06 02:45:58,224 : INFO : PROGRESS: at 49.23% examples, 543834 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:45:59,241 : INFO : PROGRESS: at 49.32% examples, 543865 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:46:00,247 : INFO : PROGRESS: at 49.39% examples, 543931 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:46:01,256 : INFO : PROGRESS: at 49.45% examples, 543930 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:46:02,259 : INFO : PROGRESS: at 49.51% examples, 543870 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:46:03,283 : INFO : PROGRESS: at 49.58% examples, 543863 wor

2017-12-06 02:47:15,378 : INFO : PROGRESS: at 54.44% examples, 539343 words/s, in_qsize 5, out_qsize 0
2017-12-06 02:47:16,413 : INFO : PROGRESS: at 54.51% examples, 539299 words/s, in_qsize 7, out_qsize 1
2017-12-06 02:47:17,427 : INFO : PROGRESS: at 54.58% examples, 539274 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:47:18,456 : INFO : PROGRESS: at 54.65% examples, 539229 words/s, in_qsize 4, out_qsize 1
2017-12-06 02:47:19,471 : INFO : PROGRESS: at 54.71% examples, 539117 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:47:20,478 : INFO : PROGRESS: at 54.78% examples, 539118 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:47:21,513 : INFO : PROGRESS: at 54.85% examples, 539092 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:47:22,552 : INFO : PROGRESS: at 54.93% examples, 539064 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:47:23,555 : INFO : PROGRESS: at 55.00% examples, 539084 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:47:24,567 : INFO : PROGRESS: at 55.07% examples, 539038 wor

2017-12-06 02:48:36,541 : INFO : PROGRESS: at 60.28% examples, 540095 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:48:37,546 : INFO : PROGRESS: at 60.31% examples, 540101 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:48:38,552 : INFO : PROGRESS: at 60.35% examples, 540125 words/s, in_qsize 6, out_qsize 0
2017-12-06 02:48:39,556 : INFO : PROGRESS: at 60.39% examples, 540137 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:48:40,586 : INFO : PROGRESS: at 60.42% examples, 540164 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:48:41,609 : INFO : PROGRESS: at 60.46% examples, 540195 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:48:42,611 : INFO : PROGRESS: at 60.50% examples, 540233 words/s, in_qsize 5, out_qsize 0
2017-12-06 02:48:43,613 : INFO : PROGRESS: at 60.54% examples, 540253 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:48:44,641 : INFO : PROGRESS: at 60.59% examples, 540296 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:48:45,652 : INFO : PROGRESS: at 60.62% examples, 540329 wor

2017-12-06 02:49:57,712 : INFO : PROGRESS: at 64.04% examples, 537336 words/s, in_qsize 6, out_qsize 0
2017-12-06 02:49:58,745 : INFO : PROGRESS: at 64.10% examples, 537357 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:49:59,746 : INFO : PROGRESS: at 64.17% examples, 537349 words/s, in_qsize 7, out_qsize 1
2017-12-06 02:50:00,762 : INFO : PROGRESS: at 64.23% examples, 537306 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:50:01,787 : INFO : PROGRESS: at 64.29% examples, 537326 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:50:02,799 : INFO : PROGRESS: at 64.35% examples, 537360 words/s, in_qsize 6, out_qsize 0
2017-12-06 02:50:03,804 : INFO : PROGRESS: at 64.42% examples, 537402 words/s, in_qsize 6, out_qsize 0
2017-12-06 02:50:04,823 : INFO : PROGRESS: at 64.48% examples, 537399 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:50:05,820 : INFO : PROGRESS: at 64.54% examples, 537388 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:50:06,823 : INFO : PROGRESS: at 64.61% examples, 537420 wor

2017-12-06 02:51:18,875 : INFO : PROGRESS: at 69.48% examples, 537619 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:51:19,892 : INFO : PROGRESS: at 69.55% examples, 537592 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:51:20,901 : INFO : PROGRESS: at 69.61% examples, 537543 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:51:21,907 : INFO : PROGRESS: at 69.66% examples, 537475 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:51:22,911 : INFO : PROGRESS: at 69.72% examples, 537457 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:51:23,924 : INFO : PROGRESS: at 69.79% examples, 537371 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:51:24,934 : INFO : PROGRESS: at 69.86% examples, 537337 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:51:25,937 : INFO : PROGRESS: at 69.93% examples, 537343 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:51:26,954 : INFO : PROGRESS: at 70.01% examples, 537376 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:51:27,973 : INFO : PROGRESS: at 70.07% examples, 537334 wor

2017-12-06 02:52:39,976 : INFO : PROGRESS: at 74.99% examples, 534787 words/s, in_qsize 5, out_qsize 0
2017-12-06 02:52:40,987 : INFO : PROGRESS: at 75.06% examples, 534775 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:52:41,990 : INFO : PROGRESS: at 75.14% examples, 534786 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:52:43,006 : INFO : PROGRESS: at 75.22% examples, 534803 words/s, in_qsize 6, out_qsize 2
2017-12-06 02:52:44,016 : INFO : PROGRESS: at 75.31% examples, 534836 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:52:45,018 : INFO : PROGRESS: at 75.38% examples, 534846 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:52:46,032 : INFO : PROGRESS: at 75.46% examples, 534842 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:52:47,037 : INFO : PROGRESS: at 75.54% examples, 534837 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:52:48,038 : INFO : PROGRESS: at 75.61% examples, 534793 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:52:49,043 : INFO : PROGRESS: at 75.68% examples, 534737 wor

2017-12-06 02:54:00,986 : INFO : PROGRESS: at 80.45% examples, 534493 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:54:02,009 : INFO : PROGRESS: at 80.48% examples, 534480 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:54:03,024 : INFO : PROGRESS: at 80.52% examples, 534481 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:54:04,053 : INFO : PROGRESS: at 80.56% examples, 534493 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:54:05,066 : INFO : PROGRESS: at 80.60% examples, 534508 words/s, in_qsize 7, out_qsize 1
2017-12-06 02:54:06,097 : INFO : PROGRESS: at 80.64% examples, 534538 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:54:07,101 : INFO : PROGRESS: at 80.68% examples, 534611 words/s, in_qsize 6, out_qsize 0
2017-12-06 02:54:08,123 : INFO : PROGRESS: at 80.72% examples, 534617 words/s, in_qsize 1, out_qsize 0
2017-12-06 02:54:09,127 : INFO : PROGRESS: at 80.76% examples, 534606 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:54:10,140 : INFO : PROGRESS: at 80.81% examples, 534652 wor

2017-12-06 02:55:22,170 : INFO : PROGRESS: at 84.66% examples, 534830 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:55:23,171 : INFO : PROGRESS: at 84.72% examples, 534801 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:55:24,186 : INFO : PROGRESS: at 84.77% examples, 534714 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:55:25,216 : INFO : PROGRESS: at 84.83% examples, 534662 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:55:26,239 : INFO : PROGRESS: at 84.89% examples, 534645 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:55:27,265 : INFO : PROGRESS: at 84.94% examples, 534570 words/s, in_qsize 7, out_qsize 1
2017-12-06 02:55:28,267 : INFO : PROGRESS: at 84.99% examples, 534498 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:55:29,268 : INFO : PROGRESS: at 85.04% examples, 534428 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:55:30,319 : INFO : PROGRESS: at 85.10% examples, 534341 words/s, in_qsize 7, out_qsize 1
2017-12-06 02:55:31,323 : INFO : PROGRESS: at 85.16% examples, 534294 wor

2017-12-06 02:56:43,338 : INFO : PROGRESS: at 89.76% examples, 533010 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:56:44,364 : INFO : PROGRESS: at 89.84% examples, 533009 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:56:45,369 : INFO : PROGRESS: at 89.91% examples, 533004 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:56:46,407 : INFO : PROGRESS: at 89.99% examples, 532995 words/s, in_qsize 7, out_qsize 1
2017-12-06 02:56:47,422 : INFO : PROGRESS: at 90.05% examples, 532991 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:56:48,436 : INFO : PROGRESS: at 90.11% examples, 532950 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:56:49,474 : INFO : PROGRESS: at 90.16% examples, 532875 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:56:50,477 : INFO : PROGRESS: at 90.23% examples, 532871 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:56:51,483 : INFO : PROGRESS: at 90.30% examples, 532860 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:56:52,483 : INFO : PROGRESS: at 90.38% examples, 532868 wor

2017-12-06 02:58:04,525 : INFO : PROGRESS: at 95.99% examples, 534349 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:58:05,526 : INFO : PROGRESS: at 96.07% examples, 534375 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:58:06,556 : INFO : PROGRESS: at 96.13% examples, 534370 words/s, in_qsize 7, out_qsize 1
2017-12-06 02:58:07,588 : INFO : PROGRESS: at 96.20% examples, 534334 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:58:08,619 : INFO : PROGRESS: at 96.26% examples, 534310 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:58:09,629 : INFO : PROGRESS: at 96.32% examples, 534233 words/s, in_qsize 7, out_qsize 0
2017-12-06 02:58:10,671 : INFO : PROGRESS: at 96.38% examples, 534162 words/s, in_qsize 6, out_qsize 1
2017-12-06 02:58:11,682 : INFO : PROGRESS: at 96.46% examples, 534174 words/s, in_qsize 8, out_qsize 0
2017-12-06 02:58:12,708 : INFO : PROGRESS: at 96.54% examples, 534215 words/s, in_qsize 7, out_qsize 1
2017-12-06 02:58:13,714 : INFO : PROGRESS: at 96.62% examples, 534273 wor

In [4]:
import os

# Save the trained model under model/. The directory is created first so the
# save does not fail on a fresh checkout where model/ does not exist yet.
model_path = "model/20171201wiki_model.bin"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# save model
# Large arrays are stored next to the main file as separate .npy files (see the
# log output of this cell), so keep those files together with the .bin file.
model.save(model_path)

# load model
# To reuse the saved model in a later session instead of retraining:
# model = word2vec.Word2Vec.load("model/20171201wiki_model.bin")

2017-12-06 03:00:49,640 : INFO : saving Word2Vec object under 20171201wiki_model.bin, separately None
2017-12-06 03:00:49,642 : INFO : storing np array 'syn0' to 20171201wiki_model.bin.wv.syn0.npy
2017-12-06 03:00:51,595 : INFO : not storing attribute syn0norm
2017-12-06 03:00:51,597 : INFO : storing np array 'syn1neg' to 20171201wiki_model.bin.syn1neg.npy
2017-12-06 03:00:53,562 : INFO : not storing attribute cum_table
2017-12-06 03:00:55,860 : INFO : saved 20171201wiki_model.bin
