In [1]:
# Build the lookup tables (word <-> id, tag <-> id) from the tagged corpus.
tag2id, id2tag = {}, {}
word2id, id2word = {}, {}

# Tokens on a line are separated by two spaces and look like "word/tag".
# The `with` block closes the corpus file as soon as the scan finishes.
with open('dataset/pos_tag_dataset.txt', encoding='utf-8') as corpus:
    for line in corpus:
        for items in line.split('  '):
            item = items.split('/')
            if len(item) != 2:
                # Anything that is not exactly one word/tag pair is ignored.
                continue
            word, tag = item[0], item[1].rstrip()

            # Ids are handed out in order of first appearance, starting at 0.
            if word not in word2id:
                word_id = len(word2id)
                word2id[word] = word_id
                id2word[word_id] = word
            if tag not in tag2id:
                tag_id = len(tag2id)
                tag2id[tag] = tag_id
                id2tag[tag_id] = tag

M = len(word2id)  # number of distinct words
N = len(tag2id)   # number of distinct tags

In [2]:
# Report the vocabulary size (M) and the number of distinct tags (N).
print('M:{} N:{}'.format(M, N))

M:55317 N:45


In [3]:
# Show the id -> tag mapping built from the corpus.
print(id2tag)

{0: 'v', 1: 'n', 2: 'u', 3: 'a', 4: 'w', 5: 't', 6: 'm', 7: 'q', 8: 'nt', 9: 'nr', 10: 'Vg', 11: 'k', 12: 'p', 13: 'f', 14: 'r', 15: 'vn', 16: 'ns', 17: 'c', 18: 's', 19: 'd', 20: 'ad', 21: 'j', 22: 'l', 23: 'an', 24: 'b', 25: 'i', 26: 'vd', 27: 'z', 28: 'nz', 29: 'Ng', 30: 'Tg', 31: 'y', 32: 'nx', 33: 'vnz', 34: 'Ag', 35: 'o', 36: 'Dg', 37: 'Bg', 38: 'h', 39: 'Rg', 40: 'e', 41: 'vnt', 42: 'Mg', 43: 'na', 44: 'Yg'}


In [4]:
# Allocate pi, A and B; they hold raw counts first and are normalized later.
import numpy as np
pi = np.zeros(N)  # pi[tag id]: how often the tag starts a sentence
A = np.zeros((N, M))  # A[tag id][word id]: emission counts
B = np.zeros((N, N))  # B[previous tag id][tag id]: transition counts

In [5]:
# Accumulate raw counts for pi (sentence starts), A (emissions) and
# B (transitions) in a second pass over the corpus.
# The `with` block closes the corpus file as soon as the pass finishes.
with open('dataset/pos_tag_dataset.txt', encoding='utf-8') as corpus:
    for line in corpus:
        # None means the next token starts a sentence; every line starts one.
        prev_tag_id = None
        for items in line.split('  '):
            item = items.split('/')
            if len(item) != 2:
                # Anything that is not exactly one word/tag pair is ignored.
                continue
            word = item[0]
            word_id, tag_id = word2id[word], tag2id[item[1].rstrip()]

            # Every token contributes one emission count.
            A[tag_id][word_id] += 1
            if prev_tag_id is None:
                pi[tag_id] += 1
            else:
                B[prev_tag_id][tag_id] += 1

            # A full stop ends the sentence, so the next token starts a new one.
            # Remembering the id (not the raw text) avoids looking up an
            # unstripped tag string in tag2id on the next iteration.
            prev_tag_id = None if word == '。' else tag_id

In [6]:
# Display the raw sentence-initial tag counts (this cell runs before normalization).
pi

array([4.271e+03, 5.414e+03, 9.000e+00, 5.840e+02, 3.544e+03, 3.245e+03,
       2.356e+03, 1.000e+01, 7.950e+02, 3.148e+03, 4.100e+01, 0.000e+00,
       4.735e+03, 1.760e+02, 6.418e+03, 3.760e+02, 3.004e+03, 2.405e+03,
       1.460e+02, 1.413e+03, 1.190e+02, 6.280e+02, 3.980e+02, 1.400e+01,
       2.020e+02, 1.770e+02, 3.000e+00, 5.200e+01, 2.310e+02, 2.400e+01,
       2.600e+01, 1.000e+00, 1.700e+01, 0.000e+00, 5.000e+00, 4.000e+00,
       6.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 1.000e+01, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00])

In [7]:
# Normalize the raw counts into probabilities: pi as one distribution,
# and each row of A and B as its own distribution.
# A total of zero is left untouched instead of dividing by it: 0/0 would
# fill the row with NaN, and NaN is not caught by the `x == 0` floor in log().
pi_total = pi.sum()
if pi_total > 0:
    pi = pi / pi_total
for i in range(N):
    emission_total = A[i].sum()
    if emission_total > 0:
        A[i] /= emission_total
    transition_total = B[i].sum()
    if transition_total > 0:
        B[i] /= transition_total

In [8]:
import jieba

In [9]:
def log(x):
    """Return the natural log of x, substituting 0.00001 when x equals 0.

    The substitution keeps the result finite for zero probabilities.
    """
    return np.log(0.00001 if x == 0 else x)

In [10]:
def viterbi(x, pi, A, B):
    '''
    Print the most likely part-of-speech tag for each word of a sentence.

    The sentence is segmented with jieba and decoded with the Viterbi
    algorithm in log space; one tag is printed per segmented word.

    x:  input sentence (raw, unsegmented string)
    pi: initial tag probabilities, indexed by tag id
    A:  emission probabilities, A[tag id][word id]
    B:  transition probabilities, B[previous tag id][tag id]

    A word that is missing from word2id receives the floored emission score
    (log(0)) for every tag, so its tag is decided by the transitions alone.
    If segmentation yields no words, nothing is printed.
    '''
    # None marks a word that has no entry in word2id.
    word_ids = [word2id.get(word) for word in jieba.cut(x)]
    T = len(word_ids)
    if T == 0:
        return

    def emission_score(tag_id, position):
        # Log emission probability of the word at `position` under `tag_id`.
        word_id = word_ids[position]
        return log(0) if word_id is None else log(A[tag_id][word_id])

    # dp[i][j]: best log score of any tag path for words 0..i ending in tag j.
    dp = np.zeros((T, N))
    # pointer[i][j]: tag of word i-1 on the best path that ends in tag j at word i.
    pointer = np.zeros((T, N), dtype=int)

    # Base case: the first word.
    for j in range(N):
        dp[0][j] = log(pi[j]) + emission_score(j, 0)

    for i in range(1, T):  # each word
        for j in range(N):  # each candidate tag for word i
            emit = emission_score(j, i)  # does not depend on k, so compute once
            dp[i][j] = float("-inf")
            for k in range(N):  # each candidate tag for word i-1
                score = dp[i-1][k] + log(B[k][j]) + emit
                if score > dp[i][j]:
                    dp[i][j] = score
                    pointer[i][j] = k

    # Decoding: recover the best tag sequence.
    best_seq = [0 for _ in range(T)]
    # Step 1: pick the best tag for the last word.
    best_seq[T-1] = np.argmax(dp[T-1])
    # Step 2: walk the back-pointers from the last word to the first.
    for i in range(T-2, -1, -1):
        best_seq[i] = pointer[i+1][best_seq[i+1]]

    # best_seq now holds one tag id per segmented word.
    for tag_id in best_seq:
        print(id2tag[tag_id])

In [11]:
# Demo: print jieba's segmentation of a sample sentence, then let viterbi
# print one predicted tag per segmented word.
sentence = '云南完成党报党刊发行任务'
print(' '.join(jieba.cut(sentence)))
viterbi(sentence, pi, A, B)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Oscar\AppData\Local\Temp\jieba.cache
Loading model cost 0.784 seconds.
Prefix dict has been built succesfully.


云南 完成 党报 党刊 发行 任务
ns
v
n
n
vn
n
