### 数据准备

In [18]:
import re
from jieba import posseg

with open('./data/三体.txt', 'r', encoding='utf-8') as f:
    data = f.read().strip()
lines = [i.strip() for i in re.split(r'\n+', data)]
print(len(lines))

12528


In [19]:
all_words = []
for line in lines:
    words  = "\n".join(["%s/%s" % (i.word, i.flag) for i in posseg.cut(line)])
    all_words.append(words)
with open('./data/三体分词.txt', 'w', encoding='utf-8') as f:
    f.write('\n\n'.join(all_words))

### 词性标注

In [3]:
import re
from jieba import posseg

# 建立词典库
tag2id, id2tag = {}, {}
word2id, id2word = {}, {}
with open('./data/三体分词.txt', 'r', encoding='utf-8') as f:
    lines = re.split('\n+', f.read())
    for line in lines:
        items = line.split('/')
        # print(items)
        word, tag = items[0], items[1]
        if word not in word2id:
            id2word[len(id2word)] = word
            word2id[word] = len(word2id)
        
        if tag not in tag2id:
            id2tag[len(tag2id)] = tag
            tag2id[tag] = len(tag2id)

M = len(word2id)  # M: 词典大小
N = len(tag2id)  # N: 词性的种类个数

In [4]:
print(M, N)
print(tag2id)

28363 57
{'x': 0, 'm': 1, 'nr': 2, 'n': 3, 'v': 4, 'eng': 5, 'd': 6, 'nrfg': 7, 'r': 8, 'uj': 9, 'c': 10, 'j': 11, 'p': 12, 'q': 13, 'zg': 14, 'ul': 15, 'a': 16, 'ud': 17, 'i': 18, 'z': 19, 'u': 20, 'f': 21, 'ad': 22, 's': 23, 'df': 24, 'l': 25, 'vn': 26, 'uz': 27, 'y': 28, 't': 29, 'ug': 30, 'ns': 31, 'nt': 32, 'uv': 33, 'nrt': 34, 'b': 35, 'ng': 36, 'nz': 37, 'mq': 38, 'k': 39, 'o': 40, 'an': 41, 'rr': 42, '': 43, 'vd': 44, 'vg': 45, 'e': 46, 'h': 47, 'rz': 48, 'tg': 49, 'ag': 50, 'yg': 51, 'dg': 52, 'g': 53, 'vi': 54, 'rg': 55, 'vq': 56}


In [5]:
# 构建 pi, A, B（状态转移矩阵）
import numpy as np
pi = np.zeros(N)  # 每个词性出现在句子中的概率， pi[i]: tag i出现在句子中第一个位置的概率
A = np.zeros((N, M))  # A[i][j]: 给定tag i,出现单词j的概率
B = np.zeros((N, N))  # B[i][j]: 之前的状态是i，之后转换成j的概率

In [6]:
prev_tag = ""
with open('./data/三体分词.txt', 'r', encoding='utf-8') as f:
    lines = re.split('\n', f.read())
for line in lines:
    # 用于切分句子的空格，不是词
    if line == "":
        prev_tag = ""
        continue
    items = line.split('/')
    wordId, tagId = word2id[items[0]], tag2id[items[1]]
    # 该词是句子的开始
    if prev_tag == '':
        pi[tagId] += 1
        A[tagId][wordId] += 1
    else:
        A[tagId][wordId] += 1
        B[tag2id[prev_tag]][tagId] += 1
    # 句子结尾条件二，该词依然是句子中的词，因此写在处理模块后面
    if re.search(r'。？?!！', items[0]) is not None:
        prev_tag = ""
    else:
        prev_tag = items[1]

# normalize, 之前求的是频数，将其转化为概率 
pi = pi / sum(pi)
for i in range(N):
    A[i] /= sum(A[i])
    B[i] /= sum(B[i])



In [7]:
print(B[0])
print(A[1])
print(id2tag)

[1.71403197e-01 3.71427929e-02 3.17017784e-02 1.14182611e-01
 1.20522967e-01 3.05776019e-03 7.40495087e-02 4.13696967e-03
 1.88737999e-01 1.51763833e-03 9.05186951e-02 1.37149538e-03
 5.41515840e-02 3.69854082e-03 6.90244396e-03 7.86923578e-05
 1.37037120e-02 1.34901185e-04 3.55239787e-03 2.12469366e-03
 4.94637678e-04 1.02524901e-02 2.50691369e-03 3.77723318e-03
 9.78033590e-04 1.00613800e-02 4.74402500e-03 0.00000000e+00
 4.38428851e-04 1.97180565e-02 2.58560604e-04 8.73485172e-03
 8.76857702e-04 6.74505924e-05 4.51918969e-03 4.42925557e-03
 3.26011197e-04 2.31580367e-03 6.74505924e-04 5.62088270e-05
 4.72154147e-04 2.58560604e-04 1.01175889e-04 1.12417654e-05
 3.37252962e-05 7.86923578e-05 5.39604740e-04 2.24835308e-05
 1.01175889e-04 3.37252962e-05 1.46142950e-04 1.34901185e-04
 0.00000000e+00 1.46142950e-04 0.00000000e+00 0.00000000e+00
 0.00000000e+00]
[0.         0.00992198 0.00161126 ... 0.         0.         0.        ]
{0: 'x', 1: 'm', 2: 'nr', 3: 'n', 4: 'v', 5: 'eng', 6: 'd

In [8]:
def log(v):
    if v == 0:
        v = v + 0.000001
    return np.log(v)
    
def viterbi(x: list, pi, A, B):
    """
    param: x: 用户输入的字符串
    param: pi: 标签初始概率
    param: A: 给定词性tag,每个词出现的概率
    param: B: 状态转移矩阵，词性之间的转移概率
    """
    x = [word2id[word] for word in x]
    T = len(x)
    dp = np.zeros((T, N))
    # ptr = np.zeros((T, N)).astype(np.int16)
    ptr = np.array([[0 for x in range(N)] for y in range(T)])
    for j in range(N):
        dp[0][j] = log(pi[j]) + log(A[j][x[0]])
        
    for i in range(1, T):
        for j in range(N):
            dp[i][j] = -999999
            for k in range(N):
                score = dp[i-1][k] + log(B[k][j]) + log(A[j][x[i]])
                if score > dp[i][j]:
                    dp[i][j] = score
                    ptr[i][j] = k
    # 将识别出的最好的词性打印出来
    best_seq = [0] * T
    best_seq[T-1] = np.argmax(dp[T-1])
    for i in range(T-2, -1, -1):
        best_seq[i] = ptr[i+1][best_seq[i+1]]
    print(best_seq)
    for i in best_seq:
        yield id2tag[i]

In [9]:
sentence = '他说着，刚才吸进肚子里的烟都喷到汪淼脸上。'
words, flags = zip(*[[i.word, i.flag] for i in posseg.cut(sentence)])
pred_flags = list(viterbi(words, pi, A, B))
for word, flag, pred_flag in zip(words, flags, pred_flags):
    print("%s/%s/%s" % (word, flag, pred_flag))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\oscar\AppData\Local\Temp\jieba.cache
Loading model cost 1.162 seconds.
Prefix dict has been built succesfully.


[8, 4, 27, 0, 29, 4, 3, 21, 9, 3, 6, 4, 7, 23, 0]
他/r/r
说/v/v
着/uz/uz
，/x/x
刚才/t/t
吸进/v/v
肚子/n/n
里/f/f
的/uj/uj
烟/n/n
都/d/d
喷到/v/v
汪淼/nrfg/nrfg
脸上/s/s
。/x/x
