In [1]:
import gensim
import re

In [2]:
document = []
with open("三国演义.txt", 'r') as f:
    document = f.readlines()

In [3]:
docs = []
for doc in document:
    line = doc.replace("\n", "")
    line = line.strip()
    if len(line) != 0:
        docs.append('_'.join(re.findall(r'([\u4e00-\u9fa5\s]+)', line))) # 仅包含中文字符，不包括标点符号

In [4]:
docs[:5]

['天涯在线书库',
 '三国演义',
 '第_回 宴桃园豪杰三结义 斩黄巾英雄首立功',
 '滚滚长江东逝水_浪花淘尽英雄_是非成败转头空',
 '青山依旧在_几度夕阳红_白发渔樵江渚上_惯']

In [5]:
words = []
for line in docs:
    for word in line.split('_'):
        for w in word.split(' '):
            if len(w) > 0:
                w = w.strip()
                words.append(w)

In [6]:
len(words)

90706

In [7]:
from collections import defaultdict
import heapq
word_freq = defaultdict(int)
word_pq= []
for word in words:
    word_freq[word] += 1
for k, v in word_freq.items():
    heapq.heappush(word_pq, (v, k))
heapq.nlargest(17, word_pq)
# （关、张，代称时被、划分）

[(353, '孔明曰'),
 (336, '玄德曰'),
 (281, '躁曰'),
 (206, '次日'),
 (142, '曰'),
 (122, '正是'),
 (104, '肃曰'),
 (101, '瑜曰'),
 (90, '张'),
 (87, '权曰'),
 (82, '懿曰'),
 (80, '且看下文分解'),
 (70, '第'),
 (70, '回'),
 (69, '布曰'),
 (63, '如之奈何'),
 (63, '关')]

In [8]:
# 查看句子长度
len_pq = []
len_freq = defaultdict(int)
for w in words:
    heapq.heappush(len_pq, (len(w), w))
    len_freq[len(w)] += 1
heapq.nlargest(20, len_pq)

[(23, '忽报洞后迤西银冶洞二十一洞主杨锋引三万兵来助战'),
 (21, '于是陆逊总率江南八十一州并荆湖之众七十余万'),
 (20, '遂皆以金珠玩好结构何进弟何苗并其母舞阳君'),
 (20, '密遣人探得关公果然撤荆州大半兵赴樊城听调'),
 (20, '卓命扶何太后并弘农王及帝妃唐氏永安宫闲住'),
 (20, '今将孟获并祝融夫人及宗党数百余人尽皆擒来'),
 (19, '说杨仪令先锋何平引兵自槎山小路抄来搦战'),
 (19, '臣已算定今番诸葛亮必效韩信暗度陈仓之计'),
 (18, '躁乃令曹洪引数十骑径出阵前与韩遂相见'),
 (18, '请降诏遣左车骑将军张翼领兵守护阳安关'),
 (18, '忽报黄巾贼党管亥部领群寇数万杀奔前来'),
 (18, '吕布使人探听得张勋一军从大路径取徐州'),
 (18, '只见一人将着严白虎首级来孙策军前投献'),
 (18, '原来黄忠预先使严颜引军埋伏于山僻去处'),
 (18, '原来孔明果遣赵云引一军埋伏于山僻之中'),
 (18, '即修书令云长同孙乾领五百军往江夏求救'),
 (18, '其祖张陵在西川鹄鸣山中造作道书以惑人'),
 (18, '公就令兴赍诸官立功文书去成都见汉中王'),
 (18, '享于故殁王事蜀中将校及南人亡者阴魂曰'),
 (18, '且说徐庶当晚密使近人去各寨中暗布谣言')]

In [9]:
import gensim.models
model = gensim.models.Word2Vec(sentences=words)

In [10]:
for index, word in enumerate(model.wv.index2word):
    if index == 10:
        break
    print(f"word #{index}/{len(model.wv.index2word)} is {word}")

word #0/2723 is 曰
word #1/2723 is 之
word #2/2723 is 不
word #3/2723 is 人
word #4/2723 is 军
word #5/2723 is 兵
word #6/2723 is 大
word #7/2723 is 一
word #8/2723 is 马
word #9/2723 is 将


In [11]:
print(model.wv.most_similar(positive=['亮'], topn=5))
# 莫名奇妙？

[('葛', 0.8173595666885376), ('诚', 0.7030994296073914), ('臣', 0.6893393993377686), ('窃', 0.6892356872558594), ('智', 0.6688312292098999)]


In [12]:
from gensim import utils
class biCorpus:
    # 传入sentence vector
    def __init__(self, data):
        self.data = data
        
    def __iter__(self):
        for word in self.data:
            len_w = len(word)
            if len_w < 2:
                continue
            idx = 0
            while idx + 2 <= len_w:
                yield utils.simple_preprocess(word[idx:idx+2])
                idx += 1

In [13]:
biword = biCorpus(words)

In [14]:
bimodel = gensim.models.Word2Vec(sentences=biword)

In [15]:
for index, word in enumerate(bimodel.wv.index2word):
    if index == 10:
        break
    print(f"word #{index}/{len(bimodel.wv.index2word)} is {word}")

word #0/14945 is 玄德
word #1/14945 is 孔明
word #2/14945 is 将军
word #3/14945 is 曹躁
word #4/14945 is 却说
word #5/14945 is 司马
word #6/14945 is 不可
word #7/14945 is 二人
word #8/14945 is 丞相
word #9/14945 is 关公


In [16]:
print(bimodel.wv.most_similar(positive=['丞相'], topn=5))
# word2vec 意义不太明确

[('谯郡', 0.3562915027141571), ('马尽', 0.34610530734062195), ('再放', 0.3302830159664154), ('左军', 0.32816624641418457), ('勒住', 0.3261064887046814)]


In [17]:
bimodel.wv.similar_by_word('刘备')

[('勇力', 0.3728499710559845),
 ('而待', 0.3476017117500305),
 ('德背', 0.3424703776836395),
 ('见帝', 0.33940520882606506),
 ('立后', 0.3314281702041626),
 ('各叙', 0.3194499611854553),
 ('故又', 0.31816405057907104),
 ('立营', 0.3167794346809387),
 ('皇帝', 0.3157844543457031),
 ('氏宗', 0.3156091570854187)]

In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
unigram = [[w for w in word] for word in words]

In [20]:
unigram[:10]

[['天', '涯', '在', '线', '书', '库'],
 ['三', '国', '演', '义'],
 ['第'],
 ['回'],
 ['宴', '桃', '园', '豪', '杰', '三', '结', '义'],
 ['斩', '黄', '巾', '英', '雄', '首', '立', '功'],
 ['滚', '滚', '长', '江', '东', '逝', '水'],
 ['浪', '花', '淘', '尽', '英', '雄'],
 ['是', '非', '成', '败', '转', '头', '空'],
 ['青', '山', '依', '旧', '在']]

In [21]:
bow_converter = CountVectorizer(token_pattern='[\u4e00-\u9fa5]')

In [22]:
bow_converter.fit(words)

CountVectorizer(token_pattern='[一-龥]')

In [23]:
bow = bow_converter.get_feature_names()

In [24]:
len(bow)

3871

In [25]:
bow[:10]

['一', '丁', '七', '万', '丈', '三', '上', '下', '不', '与']

In [26]:
bigram_converter = CountVectorizer(ngram_range=(1,3), token_pattern='[\u4e00-\u9fa5]')
bigram_converter.fit(words)

CountVectorizer(ngram_range=(1, 3), token_pattern='[一-龥]')

In [27]:
bigram = bigram_converter.get_feature_names()
bigram[-20:]

['龚 景 牒',
 '龚 景 犒',
 '龚 起',
 '龚 起 应',
 '龚 起 接',
 '龚 都',
 '龚 都 便',
 '龚 都 差',
 '龚 都 已',
 '龚 都 披',
 '龚 都 接',
 '龚 都 数',
 '龚 都 素',
 '龚 都 自',
 '龚 都 运',
 '龛',
 '龛 中',
 '龟',
 '龟 纽',
 '龟 纽 墨']

In [28]:
# converter 看不到词频
def ngram_counter(data, n=1):
    freq = defaultdict(int)
    for word in data:
        idx = 0
        wordlen = len(word)
        while idx + n <= wordlen:
            freq[word[idx:idx+n]] += 1
            idx += 1
    
    return freq

In [29]:
uni_pq = []
unigram = ngram_counter(words, 1)
for k, v in unigram.items():
    heapq.heappush(uni_pq, (v, k))

In [30]:
heapq.nsmallest(10, uni_pq)

[(1, '丐'),
 (1, '乳'),
 (1, '亘'),
 (1, '亟'),
 (1, '仞'),
 (1, '佣'),
 (1, '侠'),
 (1, '侪'),
 (1, '俨'),
 (1, '倥')]

In [31]:
bi_pq = []
bigram = ngram_counter(words, 2)
for k, v in bigram.items():
    heapq.heappush(bi_pq, (v, k))

In [32]:
heapq.nlargest(10, bi_pq)

[(1812, '玄德'),
 (1690, '孔明'),
 (940, '将军'),
 (842, '曹躁'),
 (647, '却说'),
 (570, '司马'),
 (563, '不可'),
 (561, '二人'),
 (546, '丞相'),
 (519, '关公')]

In [33]:
tri_pq = []
trigram = ngram_counter(words, 3)
for k, v in trigram.items():
    heapq.heappush(tri_pq, (v, k))

In [34]:
heapq.nlargest(10, tri_pq)

[(385, '孔明曰'),
 (383, '玄德曰'),
 (287, '司马懿'),
 (157, '诸葛亮'),
 (152, '后人有'),
 (147, '人有诗'),
 (115, '大怒曰'),
 (114, '文分解'),
 (114, '下文分'),
 (93, '引一军')]

In [35]:
four_pq = []
fourgram = ngram_counter(words, 4)
for k, v in fourgram.items():
    heapq.heappush(four_pq, (v, k))

In [36]:
heapq.nlargest(10, four_pq)

[(146, '后人有诗'),
 (114, '下文分解'),
 (80, '看下文分'),
 (80, '且看下文'),
 (71, '如之奈何'),
 (55, '喊声大震'),
 (54, '人有诗赞'),
 (54, '人有诗叹'),
 (51, '如此如此'),
 (48, '却说孔明')]

In [37]:
fsorted = [heapq.heappop(four_pq) for i in range(len(four_pq))]
tsorted = [heapq.heappop(tri_pq) for i in range(len(tri_pq))]
bsorted = [heapq.heappop(bi_pq) for i in range(len(bi_pq))]
usorted = [heapq.heappop(uni_pq) for i in range(len(uni_pq))]

In [38]:
fword = [(v, k) for v, k in fsorted if v > 5]
tword = [(v, k) for v, k in tsorted if v > 5]
bword = [(v, k) for v, k in bsorted if v > 5]
uword = [(v, k) for v, k in usorted if v > 5]

In [39]:
[(k, v) for k, v in bword if v.endswith("曰")]

[(6, '优曰'),
 (6, '净曰'),
 (6, '吏曰'),
 (6, '宪曰'),
 (6, '徽曰'),
 (6, '忖曰'),
 (6, '授曰'),
 (6, '朗曰'),
 (6, '术曰'),
 (6, '楙曰'),
 (6, '温曰'),
 (6, '皓曰'),
 (6, '秀曰'),
 (6, '童曰'),
 (6, '累曰'),
 (6, '谕曰'),
 (6, '谦曰'),
 (6, '钦曰'),
 (6, '震曰'),
 (6, '马曰'),
 (7, '亮曰'),
 (7, '均曰'),
 (7, '恢曰'),
 (7, '琮曰'),
 (7, '瓘曰'),
 (7, '纪曰'),
 (7, '请曰'),
 (7, '足曰'),
 (8, '仓曰'),
 (8, '仪曰'),
 (8, '入曰'),
 (8, '前曰'),
 (8, '灵曰'),
 (8, '爽曰'),
 (8, '绣曰'),
 (8, '翼曰'),
 (8, '老曰'),
 (8, '越曰'),
 (8, '首曰'),
 (9, '任曰'),
 (9, '兴曰'),
 (9, '典曰'),
 (9, '咨曰'),
 (9, '封曰'),
 (9, '张曰'),
 (9, '氏曰'),
 (9, '盛曰'),
 (9, '籍曰'),
 (9, '芳曰'),
 (9, '见曰'),
 (9, '诏曰'),
 (9, '辞曰'),
 (10, '劝曰'),
 (10, '坚曰'),
 (10, '奉曰'),
 (10, '定曰'),
 (10, '文曰'),
 (10, '苞曰'),
 (10, '贺曰'),
 (10, '颜曰'),
 (11, '下曰'),
 (11, '使曰'),
 (11, '太曰'),
 (11, '妻曰'),
 (11, '宁曰'),
 (11, '宠曰'),
 (11, '望曰'),
 (11, '洪曰'),
 (11, '瓒曰'),
 (11, '腾曰'),
 (11, '配曰'),
 (12, '乃曰'),
 (12, '服曰'),
 (12, '歆曰'),
 (12, '歌曰'),
 (12, '淮曰'),
 (12, '琦曰'),
 (12, '禁曰'),
 (12, '蝉曰'),
 (12, '雍曰'),
 (13, '化曰'),

In [40]:
import pandas as pd
bkey = [k for k, v in bword]
bvalue = [v for k, v in bword]
df = pd.DataFrame({
    'word': bvalue,
    'count': bkey
})

In [41]:
df.to_csv('bword.csv', index=False, encoding='utf-8_sig')

In [42]:
tkey = [k for k, v in tword]
tvalue = [v for k, v in tword]
tdf = pd.DataFrame({
    'word': tvalue,
    'count': tkey
})
tdf.to_csv('tword.csv', index=False, encoding='utf-8_sig')

In [43]:
fkey = [k for k, v in fword]
fvalue = [v for k, v in fword]
fdf = pd.DataFrame({
    'word': fvalue,
    'count': fkey
})
fdf.to_csv('fword.csv', index=False, encoding='utf-8_sig')

In [55]:
# fasttext model
from gensim.models.fasttext import FastText
ftmodel = FastText(size=100)

In [56]:
ftmodel.build_vocab(biword)

In [74]:
ftmodel.train(
    sentences=biword, epochs=ftmodel.epochs,
    total_examples=ftmodel.corpus_count, total_words=ftmodel.corpus_total_words,
)

In [106]:
ftmodel.bucket

  """Entry point for launching an IPython kernel.


2000000

In [110]:
dir(wv)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_load_specials',
 '_log_evaluate_word_analogies',
 '_save_specials',
 '_smart_save',
 'accuracy',
 'add',
 'adjust_vectors',
 'bucket',
 'buckets_word',
 'closer_than',
 'compatible_hash',
 'cosine_similarities',
 'distance',
 'distances',
 'doesnt_match',
 'evaluate_word_analogies',
 'evaluate_word_pairs',
 'get_keras_embedding',
 'get_vector',
 'index2entity',
 'index2word',
 'init_ngrams_weights',
 'init_post_load',
 'init_sims',
 'load',
 'log_accuracy',
 'log_evaluate_word_pairs',
 'max_n',
 'min_n',
 'most_similar',
 'most_similar_cosmul',
 'most

In [112]:
wv.vocab

{'三国': <gensim.models.keyedvectors.Vocab at 0x202701b4080>,
 '桃园': <gensim.models.keyedvectors.Vocab at 0x2027323b828>,
 '豪杰': <gensim.models.keyedvectors.Vocab at 0x2027323ba58>,
 '三结': <gensim.models.keyedvectors.Vocab at 0x2027323ba90>,
 '结义': <gensim.models.keyedvectors.Vocab at 0x2027323bac8>,
 '黄巾': <gensim.models.keyedvectors.Vocab at 0x2027323bb00>,
 '英雄': <gensim.models.keyedvectors.Vocab at 0x2027323bb38>,
 '立功': <gensim.models.keyedvectors.Vocab at 0x2027323bc50>,
 '滚滚': <gensim.models.keyedvectors.Vocab at 0x2027323bb70>,
 '长江': <gensim.models.keyedvectors.Vocab at 0x2027323b748>,
 '江东': <gensim.models.keyedvectors.Vocab at 0x2027323bcc0>,
 '成败': <gensim.models.keyedvectors.Vocab at 0x2027323bcf8>,
 '依旧': <gensim.models.keyedvectors.Vocab at 0x2027323bd30>,
 '白发': <gensim.models.keyedvectors.Vocab at 0x2027323bd68>,
 '古今': <gensim.models.keyedvectors.Vocab at 0x2027323bda0>,
 '多少': <gensim.models.keyedvectors.Vocab at 0x2027323bdd8>,
 '临江': <gensim.models.keyedvectors.Vocab

In [100]:
bimodel.wv.most_similar('下文')

[('社稷', 0.35352253913879395),
 ('布回', 0.35087698698043823),
 ('立功', 0.34211254119873047),
 ('即下', 0.33835452795028687),
 ('其马', 0.334804892539978),
 ('整兵', 0.33461683988571167),
 ('张松', 0.3335270583629608),
 ('命将', 0.32903629541397095),
 ('忽又', 0.32760921120643616),
 ('会众', 0.31898951530456543)]