In [0]:
# -*- coding: utf-8 -*-
#双向LSTM
from keras.layers import Input, Dense, Embedding, LSTM, Dropout, TimeDistributed, Bidirectional
from keras.models import Model, load_model
from keras.utils import np_utils
import numpy as np
import re

Using TensorFlow backend.


In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import os
os.chdir('/content/gdrive/My Drive/NLP/Chinese_Word_Segmentation')

In [0]:
# 读取字典
vocab = open('data/msr/msr_training_words.utf8').read().rstrip('\n').split('\n')#读取词表
vocab = list(''.join(vocab))#提取词表里的字
stat = {}
for v in vocab:
    stat[v] = stat.get(v, 0) + 1
stat = sorted(stat.items(), key=lambda x:x[1], reverse=True)
vocab = [s[0] for s in stat]
# 5167 个字
print(len(vocab))
# 映射
char2id = {c : i + 1 for i, c in enumerate(vocab)}
id2char = {i + 1 : c for i, c in enumerate(vocab)}
tags = {'s': 0, 'b': 1, 'm': 2, 'e': 3, 'x': 4}

5167


In [0]:
embedding_size = 128
maxlen = 32 # 长于32则截断，短于32则填充0
hidden_size = 64
batch_size = 64
epochs = 50

In [0]:
def load_data(path):
    data = open(path).read().rstrip('\n')
    # 按标点符号和换行符分隔
    data = re.split('[，。！？、\n]', data)
    print('共有数据 %d 条' % len(data))
    print('平均长度：', np.mean([len(d.replace(' ', '')) for d in data]))	#replace去掉空格，这里得到的就是原始的未分词过的句子了，同理测试集也是如此
    
    # 准备数据
    X_data = []
    y_data = []
    
    for sentence in data:
        sentence = sentence.split(' ')
        X = []
        y = []
        
        try:
            for s in sentence:
                s = s.strip()
                # 跳过空字符
                if len(s) == 0:
                    continue
                # s
                elif len(s) == 1:
                    X.append(char2id[s])
                    y.append(tags['s'])
                elif len(s) > 1:
                    # b
                    X.append(char2id[s[0]])
                    y.append(tags['b'])
                    # m
                    for i in range(1, len(s) - 1):
                        X.append(char2id[s[i]])
                        y.append(tags['m'])
                    # e
                    X.append(char2id[s[-1]])
                    y.append(tags['e'])
            
            # 统一长度
            if len(X) > maxlen:
                X = X[:maxlen]
                y = y[:maxlen]
            else:
                for i in range(maxlen - len(X)):
                    X.append(0)
                    y.append(tags['x'])#填充的状态，所以最终有5个状态
        except:
            continue
        else:
            if len(X) > 0:
                X_data.append(X)
                y_data.append(y)
    
    X_data = np.array(X_data)
    y_data = np_utils.to_categorical(y_data, 5)#标签总类为5的独热编码，也就是一个句子里的每个字的标签
    
    return X_data, y_data

X_train, y_train = load_data('data/msr/msr_training.utf8')#已经将句子分词好的训练词表
X_test, y_test = load_data('data/msr/msr_test_gold.utf8')#国际标准的分词结果
print('X_train size:', X_train.shape)
print('y_train size:', y_train.shape)
print('X_test size:', X_test.shape)
print('y_test size:', y_test.shape)

共有数据 385152 条
平均长度： 9.742236831173146
共有数据 17961 条
平均长度： 9.48605311508268
X_train size: (385152, 32)
y_train size: (385152, 32, 5)
X_test size: (17917, 32)
y_test size: (17917, 32, 5)


In [0]:
X_train[0]

array([ 338,   29, 2038,  333,  496,   78,  473,  605,   23,   33,  173,
         68,  222,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [0]:
y_train[0].shape

(32, 5)

In [0]:
#对于LSTM以及BiLSTM可以参考https://www.jiqizhixin.com/articles/2018-10-24-13，实际上就是两个LSTM使用，一个前向一个后向
X = Input(shape=(maxlen,), dtype='int32')
#填补的0值在后续的计算中不产生影响, 就可以在初始化Embedding层时指定参数mask_zero为True, 意思就是屏蔽0值, 即填补的0值.
embedding = Embedding(input_dim=len(vocab) + 1, output_dim=embedding_size, input_length=maxlen, mask_zero=True)(X)
#返回全部hidden step，合并模式，也即是将两个LSTM连接起来，hidden_size = 64，隐层状态越多描述越精确但是会越耗费时间
blstm = Bidirectional(LSTM(hidden_size, return_sequences=True), merge_mode='concat')(embedding)
blstm = Dropout(0.6)(blstm)
#hidden_size 隐层状态的维数：（每个LSTM单元或者时间步的输出的ht的维度，单元内部有权重与偏差计算）
blstm = Bidirectional(LSTM(hidden_size, return_sequences=True), merge_mode='concat')(blstm)
blstm = Dropout(0.6)(blstm)
#最后对应的由五种状态
output = TimeDistributed(Dense(5, activation='softmax'))(blstm)

model = Model(X, output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)
model.save('msr_bilstm.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1


In [0]:
print(model.evaluate(X_train, y_train, batch_size=batch_size))
print(model.evaluate(X_test, y_test, batch_size=batch_size))

[0.033360598500879995, 0.9891092663648795]
[0.18042443292314747, 0.9547670693238373]


In [0]:
def viterbi(nodes):
    trans = {'be': 0.5, 'bm': 0.5, 'eb': 0.5, 'es': 0.5, 'me': 0.5, 'mm': 0.5, 'sb': 0.5, 'ss': 0.5}
    paths = {'b': nodes[0]['b'], 's': nodes[0]['s']}
    for l in range(1, len(nodes)):
        paths_ = paths.copy()
        paths = {}
        for i in nodes[l].keys():
            nows = {}
            for j in paths_.keys():
                if j[-1] + i in trans.keys():
                    nows[j + i] = paths_[j] + nodes[l][i] + trans[j[-1] + i]
            nows = sorted(nows.items(), key=lambda x: x[1], reverse=True)
            paths[nows[0][0]] = nows[0][1]
    
    paths = sorted(paths.items(), key=lambda x: x[1], reverse=True)
    return paths[0][0]

In [0]:
def cut_words(data):
    data = re.split('[，。！？、\n]', data)
    sens = []
    Xs = []
    for sentence in data:
        sen = []
        X = []
        sentence = list(sentence)
        for s in sentence:
            s = s.strip()
            if not s == '' and s in char2id:
                sen.append(s)
                X.append(char2id[s])
        if len(X) > maxlen:
            sen = sen[:maxlen]
            X = X[:maxlen]
        else:
            for i in range(maxlen - len(X)):
                X.append(0)
        
        if len(sen) > 0:
            Xs.append(X)
            sens.append(sen)
    
    Xs = np.array(Xs)
    ys = model.predict(Xs)
    
    results = ''
    for i in range(ys.shape[0]):
        nodes = [dict(zip(['s', 'b', 'm', 'e'], d[:4])) for d in ys[i]]
        ts = viterbi(nodes)
        for x in range(len(sens[i])):
            if ts[x] in ['s', 'e']:
                results += sens[i][x] + '/'
            else:
                results += sens[i][x]
        
    return results[:-1]

In [0]:
print(cut_words('中国共产党第十九次全国代表大会，是在全面建成小康社会决胜阶段、中国特色社会主义进入新时代的关键时期召开的一次十分重要的大会。'))
print(cut_words('把这本书推荐给，具有一定编程基础，希望了解数据分析、人工智能等知识领域，进一步提升个人技术能力的社会各界人士。'))
print(cut_words('结婚的和尚未结婚的。'))

中国共产党第十九次全国代表大会/是/在/全面/建成/小康/社会/决胜/阶段/中国/特色/社会主义/进入/新时代/的/关键/时期/召开/的/一次/十分/重要/的/大会
把/这/本/书/推荐/给/具有/一定/编程/基础/希望/了解/数据/分析/人工/智能/等/知识/领域/进一步/提升/个人/技术/能力/的/社会/各界人士
结婚/的/和/尚未/结婚/的
