In [8]:
import numpy
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
import pickle
import platform
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF

Using TensorFlow backend.


In [9]:
def load_data(train_path='data/train_data.txt', test_path='data/test_data.txt', min_freq=1):
    """Load the train/test corpora and encode them as padded index arrays.

    Args:
        train_path: Path of the training file. Defaults to the location the
            notebook has always used.
        test_path: Path of the evaluation file.
        min_freq: Minimum number of occurrences in the training data for a
            token to be kept in the vocabulary. The default of 1 keeps every
            token that appears in the training data.

    Returns:
        A tuple ``(train, test, (vocab, chunk_tags))`` where ``train`` and
        ``test`` are the ``(x, y_chunk)`` pairs returned by ``_process_data``.
    """
    # Context managers release the handles even if parsing raises.
    with open(train_path, 'rb') as fh:
        train = _parse_data(fh)
    with open(test_path, 'rb') as fh:
        test = _parse_data(fh)

    # The vocabulary is built from the training split only; tokens are
    # lower-cased, matching the lookup done in _process_data.
    word_counts = Counter(row[0].lower() for sample in train for row in sample)
    vocab = [w for w, f in word_counts.items() if f >= min_freq]
    chunk_tags = ['B-K', 'E-K', 'I-K', 'O']

    # No maxlen is passed, so each split is padded to its own longest sample.
    train = _process_data(train, vocab, chunk_tags)
    test = _process_data(test, vocab, chunk_tags)
    return train, test, (vocab, chunk_tags)


def _parse_data(fh):
    split_text = '\n'
    string = fh.read().decode('utf-8')
    data = [[row.split() for row in sample.split(split_text)] for sample in string.strip().split(split_text + split_text)]
    fh.close()
    return data


def _process_data(data, vocab, chunk_tags, maxlen=None):
    """Encode parsed samples as padded index arrays.

    Args:
        data: List of samples; each sample is a list of rows where ``row[0]``
            is the token and ``row[1]`` is its tag.
        vocab: List of known (lower-cased) tokens; a token's id is its
            position in this list.
        chunk_tags: List of tag names; a tag's id is its position in the list.
        maxlen: Width to pad/truncate every sample to. When ``None`` the
            length of the longest sample in ``data`` is used (0 if ``data``
            is empty).

    Returns:
        ``(x, y_chunk)``: ``x`` holds the token ids, ``y_chunk`` holds the tag
        ids with a trailing axis of size 1 added by ``expand_dims``.

    Raises:
        ValueError: If a row carries a tag that is not in ``chunk_tags``.
    """
    if maxlen is None:
        # default=0 keeps an empty dataset from failing inside max().
        maxlen = max((len(sample) for sample in data), default=0)

    word2idx = {word: idx for idx, word in enumerate(vocab)}
    # NOTE(review): ids start at 0, so vocab[0] shares id 0 with the padding
    # value (pad_sequences pads with 0 unless told otherwise — confirm), and
    # unknown tokens fall back to 1, which is also the id of vocab[1].
    # Reserving ids 0/1 would need matching changes wherever the vocabulary
    # size and ids are consumed, so it is not done here.
    x = [[word2idx.get(row[0].lower(), 1) for row in sample] for sample in data]

    y_chunk = [[chunk_tags.index(row[1]) for row in sample] for sample in data]

    x = pad_sequences(x, maxlen)  # left padding

    # Tag sequences are padded with -1 so padding cannot be read as tag id 0.
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    y_chunk = numpy.expand_dims(y_chunk, 2)

    return x, y_chunk


def process_data(data, vocab, maxlen=100):
    """Encode a single sequence of tokens for prediction.

    Args:
        data: Iterable of tokens; the first element of each token is
            lower-cased and looked up in the vocabulary.
        vocab: List of known tokens; a token's id is its position in the list.
        maxlen: Width the encoded sequence is padded/truncated to.

    Returns:
        ``(x, length)``: ``x`` is the padded batch containing the one encoded
        sequence and ``length`` is the number of tokens before padding.
    """
    lookup = {token: position for position, token in enumerate(vocab)}

    encoded = []
    for item in data:
        # Tokens missing from the vocabulary fall back to id 1.
        encoded.append(lookup.get(item[0].lower(), 1))

    # NOTE(review): the returned length is the unpadded token count and is not
    # clamped to maxlen — confirm callers handle inputs longer than maxlen.
    padded = pad_sequences([encoded], maxlen)  # left padding
    return padded, len(encoded)

In [17]:
# Second argument of the Embedding layer below: width of each token embedding.
EMBED_DIM = 20
# Units passed to the LSTM that is wrapped in Bidirectional below.
BiRNN_UNITS = 100

In [18]:
# Load both splits as (token ids, tag ids) pairs plus the vocabulary and tag list.
(train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()

In [21]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

In [25]:
# Build the tagger: Embedding -> bidirectional LSTM -> CRF output layer.
model = Sequential()
# NOTE(review): mask_zero=True makes index 0 a padding marker, but
# _process_data also assigns id 0 to vocab[0] — confirm this is intended.
model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # input layer, hidden layer
model.add(Bidirectional(LSTM(BiRNN_UNITS, return_sequences=True)))
# sparse_target=True: labels are passed as integer tag ids (see _process_data).
crf = CRF(len(chunk_tags), sparse_target=True)
model.add(crf)
model.summary()
# The CRF layer provides its own loss and accuracy functions.
model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 20)          13300     
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 200)         96800     
_________________________________________________________________
crf_2 (CRF)                  (None, None, 4)           828       
Total params: 110,928
Trainable params: 110,928
Non-trainable params: 0
_________________________________________________________________


In [26]:
# validation_data is passed as an (x, y) tuple, the form documented for
# Model.fit; a list is not unpacked the same way by every Keras version.
model.fit(train_x, train_y, batch_size=16, epochs=15, validation_data=(test_x, test_y))

Train on 556 samples, validate on 166 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x11cbe25f8>

In [27]:
# Persist the vocabulary and tag list next to the model — presumably so the
# same token/tag id mapping can be rebuilt later (confirm against the loader).
with open('model/config.pkl', 'wb') as outp:
    pickle.dump((vocab, chunk_tags), outp)

In [28]:
# Save the trained model to disk.
model.save('model/crf_company.h5')

In [None]:
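# Example target mapping (full company name --> short name):
#   北京美丽屋房产经纪有限公司 --> 美丽屋
# Candidate approaches:
# 1. Regex matching + dictionary
# 2. HMM
# 3. TF-IDF
# 4. LR
# 5. BiLSTM+CRF
# 6. CRF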

In [1]:
import jieba

In [2]:
# Check how jieba segments a sample full company name into words.
jieba.lcut('北京美丽屋房产经纪有限公司')

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/_t/wschnxms2rlgr_txwwx6p1g40000gn/T/jieba.cache
Loading model cost 1.825 seconds.
Prefix dict has been built succesfully.


['北京', '美丽', '屋', '房产', '经纪', '有限公司']

In [None]:
# HMM approach (placeholder)