In [8]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
import numpy as np
import os, time, random
from utils import str2bool, get_logger, get_entity
from data import random_embedding,tag2label
import datapreprocessing as dp
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [9]:
## read corpus and get training data
train_data = dp.readCorpus('D:\\Desktop\\产品\\辅诊\\已标n\\corpus')
train_data[:1]

[(['骨', '关', '节', '炎', '复', '诊', '。'],
  ['B-B', 'I-B', 'I-B', 'I-B', 'B-ZL', 'I-ZL', 'O'])]

In [10]:
X = list()
for x in train_data:
    a = list()
    for word in x[0]:
        if word.isdigit():
            word = 'num'
        elif ('\u0041' <= word <='\u005a') or ('\u0061' <= word <='\u007a'):
            word = 'eng'
        a.append(word)
    X.append(a)
y = [x[1] for x in train_data]

In [11]:
# 一行数据最多的词汇数
max_document_length = max([len(x) for x in X])
max_document_length

350

In [12]:
texts = [' '.join(x) for x in X]

In [116]:
texts[:5]

['骨 关 节 炎 复 诊 。',
 '双 下 肢 疼 痛 畏 冷 num 天 。',
 'eng eng ： num 、 腰 椎 轻 度 退 行 性 变 。',
 'num 、 腰 num / num 、 腰 num / num 、 腰 num / 骶 num 椎 间 盘 膨 出 。',
 '骨 密 度 ： 腰 椎 eng = - num . num ， eng eng eng - num . num ， 股 骨 eng = - num . num 。']

In [13]:
# 实例化分词器，设置字典中最大词汇数为
tokenizer = Tokenizer(num_words=2000, filters='')
# 传入我们的训练数据，建立词典
tokenizer.fit_on_texts(texts) 
# 把词转换为编号，词的编号根据词频设定，频率越大，编号越小
sequences = tokenizer.texts_to_sequences(texts) 
# 把序列设定为max_document_length的长度，超过max_document_length的部分舍弃，不到max_document_length则补0
sequences = pad_sequences(sequences, maxlen=max_document_length)  
# sequences = np.array(sequences)
# sequences.shape

In [14]:
y_seq = [[tag2label[i] for i in x] for x in y]
y_seq[:1], y[:1], X[:1]

([[1, 2, 2, 2, 25, 26, 0]],
 [['B-B', 'I-B', 'I-B', 'I-B', 'B-ZL', 'I-ZL', 'O']],
 [['骨', '关', '节', '炎', '复', '诊', '。']])

In [15]:
y_sequences = pad_sequences(y_seq, maxlen=max_document_length)
# y_sequences = 
y_sequences[:1]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 

In [16]:
y_sequences = np.expand_dims(y_sequences, 2)
y_sequences[0]

array([[ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],
       [ 0],

In [17]:
# 词对应编号的字典
dict_text = tokenizer.word_index
dict_text['糖']

123

In [18]:
len(dict_text)

1458

In [19]:
# 打乱数据
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(train_data)))
x_shuffled = sequences[shuffle_indices]
y_shuffled = y_sequences[shuffle_indices]
# 数据集切分为两部分
test_sample_index = -1 * int(0.1 * float(len(y)))
x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:]
y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:]

In [20]:
x_train[:1]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [88]:
model = Sequential()
model.add(Embedding(len(dict_text), 300, mask_zero=True))  # Random embedding
# model.add(embeddings) 
model.add(Bidirectional(LSTM(150, return_sequences=True)))
crf = CRF(len(tag2label), sparse_target=True)
model.add(crf)
model.summary()
model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 300)         437400    
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 300)         541200    
_________________________________________________________________
crf_4 (CRF)                  (None, None, 209)         107008    
Total params: 1,085,608
Trainable params: 1,085,608
Non-trainable params: 0
_________________________________________________________________


In [89]:
model.fit(x_train, y_train, batch_size=16, epochs=10, validation_data=[x_test, y_test])

Train on 3297 samples, validate on 366 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x212234b6828>

In [90]:
model.save('crf1.h5')

In [91]:
model.fit(x_train, y_train, batch_size=16, epochs=10, validation_data=[x_test, y_test])

Train on 3297 samples, validate on 366 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x212239be438>

In [93]:
model.save('crf2.h5')

In [6]:
from keras.models import load_model
from keras_contrib.layers import CRF
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF

In [49]:
model = Sequential()
model.add(Embedding(len(dict_text), 300, mask_zero=True))  # Random embedding
# model.add(embeddings) 
model.add(Bidirectional(LSTM(150, return_sequences=True)))
crf = CRF(len(tag2label), sparse_target=True)
model.add(crf)
model.summary()
model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 300)         437400    
_________________________________________________________________
bidirectional_6 (Bidirection (None, None, 300)         541200    
_________________________________________________________________
crf_6 (CRF)                  (None, None, 209)         107008    
Total params: 1,085,608
Trainable params: 1,085,608
Non-trainable params: 0
_________________________________________________________________


In [46]:
model = model.load_weights('crf2.h5')

In [50]:
model = load_model('crf2.h5')

ValueError: Unknown layer: CRF

In [31]:
model.save_weights('tt.h5')

In [32]:
model = model.load_weights('tt.h5')

In [39]:
model = Sequential(model)

In [47]:
model

In [48]:
type(model)

NoneType

In [35]:
type(model)

keras.models.Sequential

In [41]:
# 预测
def predict(text):
    str1 = tokenizer.texts_to_sequences([' '.join(list(text)),])
    s = pad_sequences(str1, maxlen=max_document_length)  
    raw = model.predict(s[0][-len(str1[0]):])
    result = [np.argmax(row) for row in raw]
    label2tag = dict(zip(tag2label.values(),tag2label.keys()))
    result_tags = [label2tag[i] for i in result]
    return result_tags, raw

In [42]:
r, raw = predict('咽痛、鼻塞、咳嗽复诊，仍有咽痛、鼻塞、流涕，有咳嗽、咳痰色白。轻度发热。')

TypeError: Sequential model cannot be built: model is empty. Add some layers first.

In [142]:
raw[1]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.]], dtype=float32)

In [143]:
r

['O',
 'B-ZS',
 'O',
 'O',
 'O',
 'O',
 'B-ZS',
 'O',
 'B-ZL',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ZS',
 'O',
 'O',
 'O',
 'O',
 'B-1ZS',
 'O',
 'O',
 'O',
 'B-ZS',
 'O',
 'O',
 'B-ZS',
 'B-ZS',
 'O',
 'B-T',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [112]:
' '.join(list('有糖尿病史多年，目前无口渴，无多尿，无多食等症状。今药服完配药。'))

'有 糖 尿 病 史 多 年 ， 目 前 无 口 渴 ， 无 多 尿 ， 无 多 食 等 症 状 。 今 药 服 完 配 药 。'

In [121]:
s = pad_sequences(str1, maxlen=max_document_length)  

In [128]:
raw = model.predict(s[0][-len(str1[0]):])
result = [np.argmax(row) for row in raw]
result

[0,
 0,
 23,
 0,
 2,
 23,
 0,
 0,
 0,
 0,
 0,
 23,
 23,
 0,
 0,
 23,
 23,
 0,
 0,
 23,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 25,
 0,
 0]

In [130]:
label2tag = dict(zip(tag2label.values(),tag2label.keys()))

result_tags = [label2tag[i] for i in result]
result_tags

## 过去的训练结果

In [74]:
model.fit(x_train, y_train, batch_size=16, epochs=10, validation_data=[x_test, y_test])

Train on 1676 samples, validate on 186 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2118753aa58>