# Action 3

In [1]:
import jieba
import os

##### 以下的utils库不是python自带的库，而是课件里的utils文件，要放在同一个路径下

In [2]:
from utils import files_processing

In [3]:
# Directory holding the raw source text files (searched for '*.txt' below).
source_folder = './three_kingdoms/source'
# Directory that receives the word-segmented output files.
segment_folder = './three_kingdoms/segment'

##### 字词分割，对整个文件内容进行字词分割

In [4]:
def segment_lines(file_list,segment_out_dir,stopwords=None):
    """Segment each input file with jieba and write the tokens to disk.

    Every file in ``file_list`` is read in full, cut into words with
    ``jieba.cut``, filtered against ``stopwords``, and written as a single
    space-joined, UTF-8 encoded string to
    ``<segment_out_dir>/segment_<index>.txt``, where ``<index>`` is the
    file's position in ``file_list``.

    Args:
        file_list: Iterable of paths to the text files to segment.
        segment_out_dir: Directory for the output files. It is created
            (including parents) when it does not exist yet.
        stopwords: Optional iterable of words to drop from the output.
            ``None`` or an empty iterable keeps every token.

    Raises:
        OSError: If an input file cannot be read or an output file cannot
            be written.
    """
    # A frozenset gives constant-time membership tests per token and avoids
    # sharing a mutable default list between calls.
    excluded = frozenset(stopwords) if stopwords else frozenset()

    # Create the target directory up front so the first write does not fail
    # on a fresh checkout. An empty path means "current directory" and must
    # not be handed to makedirs.
    if segment_out_dir:
        os.makedirs(segment_out_dir, exist_ok=True)

    for i, file in enumerate(file_list):
        segment_out_name = os.path.join(segment_out_dir, 'segment_{}.txt'.format(i))

        # The raw bytes are passed to jieba unchanged, exactly as before;
        # the input handle is closed before the output file is opened.
        with open(file, 'rb') as f:
            document = f.read()

        kept_words = [word for word in jieba.cut(document) if word not in excluded]

        with open(segment_out_name, 'wb') as f2:
            f2.write(' '.join(kept_words).encode('utf-8'))

##### 对source中的txt文件进行分词，输出到segment目录中

In [5]:
# Collect the '*.txt' files under source_folder via the course-provided helper.
# NOTE(review): presumably returns a list of file paths — confirm against utils.files_processing.
file_list=files_processing.get_files_list(source_folder, postfix='*.txt')

In [6]:
# Segment every collected file into segment_folder; no stopwords are passed, so all tokens are kept.
segment_lines(file_list, segment_folder)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\guoxi\AppData\Local\Temp\jieba.cache
Loading model cost 0.541 seconds.
Prefix dict has been built successfully.


##### 导入word2vec库

In [7]:
from gensim.models import word2vec
import multiprocessing

In [8]:
# PathLineSentences can be used when the directory contains multiple files.
# segment_folder is assigned again here with the same value as in the configuration cell.
segment_folder = './three_kingdoms/segment'
sentences = word2vec.PathLineSentences(segment_folder)

##### 设置模型参数，进行训练

In [9]:
# Train a first model: vector size 100, context window 3, min_count=1 so no word is dropped for rarity.
# NOTE(review): the `size` keyword suggests gensim < 4.0 (newer releases name it `vector_size`) — confirm the installed version.
model = word2vec.Word2Vec(sentences, size=100, window=3, min_count=1)

In [10]:
# Similarity of Cao Cao to Zhang Fei and to Cao Zhi under the first model.
print(model.wv.similarity('曹操', '张飞'))
print(model.wv.similarity('曹操', '曹植'))
# Nearest words to the analogy vector Cao Cao + Zhang Fei - Zhang Yide.
print(model.wv.most_similar(positive=['曹操', '张飞'], negative=['张翼德']))

0.9800237
0.9331262
[('门吏', 0.9769235849380493), ('又', 0.9755435585975647), ('玄德', 0.9735317230224609), ('当亲', 0.9723005890846252), ('数声', 0.9718474745750427), ('孔明', 0.9716532230377197), ('关公', 0.9691468477249146), ('李乐', 0.9675998687744141), ('艾问', 0.9672479629516602), ('靳祥', 0.9667261838912964)]


##### 根据课件代码更新了模型参数，进行训练以得到更精确的模型

In [12]:
# Train a second model: vector size 128, context window 5, min_count=5,
# with the worker count set to the value of multiprocessing.cpu_count().
model2 = word2vec.Word2Vec(sentences, size=128, window=5, min_count=5, workers=multiprocessing.cpu_count())


In [13]:
# Similarity of Cao Cao to his courtesy name Cao Mengde and to Liu Bei under the second model.
print(model2.wv.similarity('曹操', '曹孟德'))
print(model2.wv.similarity('曹操', '刘备'))
# Nearest words to the analogy vector Cao Cao + Liu Bei - Zhang Fei.
print(model2.wv.most_similar(positive=['曹操', '刘备'], negative=['张飞']))

0.9388778
0.8632296
[('汝', 0.9874038696289062), ('吾', 0.987073540687561), ('臣', 0.9864275455474854), ('此', 0.9837418794631958), ('丞相', 0.9823691844940186), ('今', 0.981239914894104), ('古今', 0.9794062376022339), ('常理', 0.9777659773826599), ('非', 0.9749569296836853), ('耳', 0.9742422699928284)]


##### 我们发现，曹操和曹孟德的相似度确实大幅高于曹操和刘备的相似度。曹操+刘备-张飞则得到以上的结果，比如汝、吾、臣。