In [30]:
# 训练语料库模型，将分词结果对应的词转成向量

import sys
import logging
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


# Emit timestamped INFO-level log records so gensim's training progress is visible.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger('word2Vec_train process...')

# Input: the pre-segmented corpus that LineSentence reads line by line.
# Output: where the trained model is persisted.
seg_word_file = 'D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.seg.txt'
out_model_file = 'D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model'

logger.info('training model...')
# 100-dimensional vectors, context window of 5, keep every word (min_count=1),
# two worker threads.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; this call targets
# the gensim 3.x API — confirm the installed version before upgrading.
sentences = LineSentence(seg_word_file)
model = Word2Vec(
    sentences,
    size=100,
    window=5,
    min_count=1,
    workers=2,
)

logger.info('save model...')
model.save(out_model_file)


2018-06-27 13:41:35,262: INFO: training model...
2018-06-27 13:41:35,265: INFO: collecting all words and their counts
2018-06-27 13:41:35,268: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-27 13:41:35,287: INFO: collected 3053 word types from a corpus of 11970 raw words and 18 sentences
2018-06-27 13:41:35,290: INFO: Loading a fresh vocabulary
2018-06-27 13:41:35,299: INFO: min_count=1 retains 3053 unique words (100% of original 3053, drops 0)
2018-06-27 13:41:35,301: INFO: min_count=1 leaves 11970 word corpus (100% of original 11970, drops 0)
2018-06-27 13:41:35,314: INFO: deleting the raw counts dictionary of 3053 items
2018-06-27 13:41:35,315: INFO: sample=0.001 downsamples 46 most-common words
2018-06-27 13:41:35,317: INFO: downsampling leaves estimated 9874 word corpus (82.5% of prior 11970)
2018-06-27 13:41:35,327: INFO: estimated required memory for 3053 words and 100 dimensions: 3968900 bytes
2018-06-27 13:41:35,329: INFO: resetting layer weigh

In [14]:
# 测试获取词向量

import gensim

# Load the previously trained Word2Vec model and print the vector of one word.
model_file = 'D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model'
model = gensim.models.Word2Vec.load(model_file)
word = 'VIPKID'

try:
    # Look the word up through the KeyedVectors object (`model.wv`).
    # Indexing the model itself (`model[word]`) is deprecated in gensim 3.x
    # and removed in gensim 4.0.
    word_array = model.wv[word]
    print('{w}\t{n}'.format(w=word, n=word_array))
except KeyError as e:
    # Raised when the word is not in the model's vocabulary.
    print(e)
    



2018-06-27 11:32:48,012: INFO: loading Word2Vec object from D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model
2018-06-27 11:32:48,021: INFO: loading wv recursively from D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model.wv.* with mmap=None
2018-06-27 11:32:48,022: INFO: setting ignored attribute vectors_norm to None
2018-06-27 11:32:48,024: INFO: loading vocabulary recursively from D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model.vocabulary.* with mmap=None
2018-06-27 11:32:48,025: INFO: loading trainables recursively from D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model.trainables.* with mmap=None
2018-06-27 11:32:48,026: INFO: setting ignored attribute cum_table to None
2018-06-27 11:32:48,028: INFO: loaded D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model


VIPKID	[ 0.3329409   0.08134193 -0.11953539 -0.28934497  0.39962417  0.08710035
 -0.6130515   0.19526318 -0.14418946 -0.03861113 -0.22079551 -0.05094559
 -0.21329883 -0.06570481  0.5977378   0.366204    0.04207411 -0.49885884
  0.14802441  0.15806544  0.02693257 -0.03268153  0.36738417 -0.27391326
 -0.4165123  -0.27092597  0.77340055  0.04364326  0.695382    0.20128337
  0.0635343  -0.04332969 -0.15539786 -0.05190161 -0.18138081 -0.0722675
 -0.35403642 -0.74074376  0.03385624  0.4485879  -0.21862657 -0.3714158
  0.04636556  0.04635764  0.7754771  -0.43327007  0.47729978 -0.09457775
 -0.19334577 -0.03538497  0.26959366 -0.0240131  -0.4350531  -0.05296588
  0.41148537  0.05942417 -0.6693858  -0.17847358  0.0966382  -0.10470602
  0.36152864 -0.02380741 -0.02180869 -0.08598165 -0.03855221 -0.4171669
  0.14490622  0.53696644 -0.0448973   0.274913    0.25561827  0.56594723
  0.29975194  0.1274576  -0.5091291   0.17897408 -0.10721822 -0.24459949
 -0.2830593   0.3078476  -0.09172637  0.7962195

  # Remove the CWD from sys.path while we load stuff.


In [31]:
# 测试语句向量

import gensim
import jieba
import numpy as np


# Build one fixed-size vector per sentence by averaging the Word2Vec vectors
# of the sentence's in-vocabulary tokens.
model_file = 'D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model'
sentence_file = 'D:/lijiangming/docs/algorithm/library/my_corpus/sentences.txt'
model = gensim.models.Word2Vec.load(model_file)

with open(sentence_file, 'r', encoding='utf-8') as fr:
    # Iterate the file lazily instead of loading every line with readlines().
    for line in fr:
        sentence = line.strip()
        word_list = jieba.lcut(sentence, cut_all=False)
        sentence_array = []
        for word in word_list:
            try:
                # Use the KeyedVectors lookup; `model[word]` is deprecated in
                # gensim 3.x and removed in gensim 4.0.
                word_array = model.wv[word]
                sentence_array.append(word_array)
            except KeyError as e:
                # Out-of-vocabulary token: report it and leave it out of the average.
                print(e)
                continue

        if not sentence_array:
            # Blank line, or every token was out of vocabulary. Averaging would
            # divide by zero, so skip this sentence instead of crashing.
            print('no in-vocabulary words, skipped: {s!r}'.format(s=sentence))
            continue

        # n = number of in-vocabulary tokens, m = word-vector dimension;
        # the stacked sentence matrix has shape (n, m).
        sentence_np_array = np.array(sentence_array, dtype='float')
        print('sentence_np_array shape: {s}'.format(s=sentence_np_array.shape))

        # Collapse the (n, m) matrix into a single m-dimensional vector:
        # sum the rows, then divide by the number of rows.
        sum_of_array = sentence_np_array.sum(axis=0)

        array_length = len(sentence_np_array)
        print('array length: {al}'.format(al=array_length))

        new_sentence_np_array = sum_of_array / array_length  # element-wise mean
        print('new sentence np array shape: {nsnas}'.format(nsnas=new_sentence_np_array.shape))
        

2018-06-27 13:42:00,043: INFO: loading Word2Vec object from D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model
2018-06-27 13:42:00,081: INFO: loading wv recursively from D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model.wv.* with mmap=None
2018-06-27 13:42:00,085: INFO: setting ignored attribute vectors_norm to None
2018-06-27 13:42:00,091: INFO: loading vocabulary recursively from D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model.vocabulary.* with mmap=None
2018-06-27 13:42:00,097: INFO: loading trainables recursively from D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model.trainables.* with mmap=None
2018-06-27 13:42:00,100: INFO: setting ignored attribute cum_table to None
2018-06-27 13:42:00,100: INFO: loaded D:/lijiangming/docs/algorithm/library/my_corpus/my.corpus.text.model


sentence_np_array shape: (4, 100)
array length: 4
new sentence np array shape: (100,)
"word '竖式题' not in vocabulary"
sentence_np_array shape: (4, 100)
array length: 4
new sentence np array shape: (100,)
sentence_np_array shape: (24, 100)
array length: 24
new sentence np array shape: (100,)
sentence_np_array shape: (6, 100)
array length: 6
new sentence np array shape: (100,)
sentence_np_array shape: (7, 100)
array length: 7
new sentence np array shape: (100,)
sentence_np_array shape: (5, 100)
array length: 5
new sentence np array shape: (100,)
sentence_np_array shape: (15, 100)
array length: 15
new sentence np array shape: (100,)


