In [1]:
import tensorflow as tf
import numpy as np
import re

  from ._conv import register_converters as _register_converters


In [2]:
word2idx = {'PAD': 0,'NUM':1,'UNK':2}
tag2idx = {'PAD': 0}
char2idx = {'PAD': 0}
word_idx = 3
tag_idx = 1
char_idx = 1

In [3]:
def process_word(word, lower=False):
    if lower:
        word = word.lower()
    if word.isdigit():
        word = 'NUM'
    else:
        word = re.sub('[^A-Za-z0-9\- ]+', '', word)
    return word

def read_file(file):
    global word_idx, tag_idx, char_idx
    with open(file,'r') as f:
        words, tags, X, Y = [], [], [], []
        for line in f:
            line = line.strip()
            if (len(line) == 0 or line.startswith("-DOCSTART-")):
                continue
            else:
                ls = line.split(' ')
                if len(ls) > 1:
                    word, tag = ls[0],ls[-1]
                else:
                    word = ls[0]
                    tag = 'O'
                for c in word:
                    if c not in char2idx:
                        char2idx[c] = char_idx
                        char_idx += 1
                word = process_word(word,True)
                if len(word) < 1:
                    continue
                words += [word]
                tags += [tag]
                if word not in word2idx:
                    word2idx[word] = word_idx
                    word_idx += 1
                X.append(word2idx[word])
                if tag not in tag2idx:
                    tag2idx[tag] = tag_idx
                    tag_idx += 1
                Y.append(tag2idx[tag])
                        
    return words, tags, X, Y

In [4]:
train_words, train_tags, train_X, train_Y = read_file('eng.train')

In [5]:
test_words, test_tags, test_X, test_Y = read_file('eng.testa')

In [6]:
idx2tag={idx: tag for tag, idx in tag2idx.items()}
idx2word={idx: tag for tag, idx in word2idx.items()}
batch_size = 64
dim_word = 128
dim_char = 32
dropout = 0.8
learning_rate = 1e-3
hidden_size_char = 64
hidden_size_word = 128
num_layers = 2
seq_len = 20
display_step = 50
epoch = 1

In [7]:
class Model:
    def __init__(self, dim_word, dim_char, dropout, learning_rate,
                 hidden_size_char, hidden_size_word, num_layers):
        
        def cells(size, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size,initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        def clip_grads(loss):
            variables = tf.trainable_variables()
            grads = tf.gradients(loss, variables)
            clipped_grads, _ = tf.clip_by_global_norm(grads, 5.0)
            return zip(clipped_grads, variables)
        
        self.word_ids = tf.placeholder(tf.int32, shape=[None, None])
        self.sequence_lengths = tf.placeholder(tf.int32, shape=[None])
        self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None])
        self.word_lengths = tf.placeholder(tf.int32, shape=[None, None])
        self.labels = tf.placeholder(tf.int32, shape=[None, None])
        
        self.word_embeddings = tf.Variable(tf.truncated_normal([len(word2idx), dim_word],
                                                      stddev=1.0 / np.sqrt(dim_word)))
        self.char_embeddings = tf.Variable(tf.truncated_normal([len(char2idx), dim_char],
                                                      stddev=1.0 / np.sqrt(dim_char)))
        word_embedded = tf.nn.embedding_lookup(self.word_embeddings, self.word_ids)
        char_embedded = tf.nn.embedding_lookup(self.char_embeddings, self.char_ids)
        s = tf.shape(char_embedded)
        char_embedded = tf.reshape(char_embedded, shape=[s[0]*s[1], s[-2], dim_char])
        word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]])
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_char),
                cell_bw = cells(hidden_size_char),
                inputs = char_embedded,
                dtype = tf.float32,
                sequence_length=word_lengths,
                scope = 'bidirectional_rnn_char_%d'%(n))
            char_embedded = tf.concat((out_fw, out_bw), 2)
        output = tf.reshape(char_embedded[:,-1], shape=[s[0], s[1], 2*hidden_size_char])
        word_embedded = tf.concat([word_embedded, output], axis=-1)
        word_embedded = tf.nn.dropout(word_embedded, dropout)
        
        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_word),
                cell_bw = cells(hidden_size_word),
                inputs = word_embedded,
                sequence_length=self.sequence_lengths, 
                dtype=tf.float32,
                scope = 'bidirectional_rnn_word_%d'%(n))
            word_embedded = tf.concat((out_fw, out_bw), 2)
        word_embedded = tf.nn.dropout(word_embedded, dropout)
        
        W = tf.get_variable('w',shape=(2*hidden_size_word, len(idx2tag)),
                            initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b',shape=(len(idx2tag)),initializer=tf.zeros_initializer())
        
        nsteps = tf.shape(word_embedded)[1]
        output = tf.reshape(word_embedded, [-1, 2*hidden_size_word])
        pred = tf.matmul(output, W) + b
        self.logits = tf.reshape(pred, [-1, nsteps, len(idx2tag)])
        
        log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
        self.logits, self.labels, tf.count_nonzero(self.word_ids, 1))
        
        self.cost = tf.reduce_mean(-log_likelihood)
        self.global_step = tf.Variable(0, trainable=False)
        
        self.learning_rate = tf.train.exponential_decay(learning_rate,
                                                        self.global_step, len(train_X) // batch_size,
                                                        0.1)
        
        self.crf_decode = tf.contrib.crf.crf_decode(self.logits, 
                                                    trans_params, 
                                                    tf.count_nonzero(self.word_ids, 1))[0]
        
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(clip_grads(self.cost), 
                                                                                    global_step=self.global_step)

In [8]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(dim_word,dim_char,dropout,learning_rate,hidden_size_char,hidden_size_word,num_layers)
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [9]:
def iter_seq(x):
    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

def generate_char_seq(batch):
    x = [[len(idx2word[i]) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((batch.shape[0],batch.shape[1],maxlen),dtype=np.int32)
    for i in range(batch.shape[0]):
        for k in range(batch.shape[1]):
            for no, c in enumerate(idx2word[batch[i,k]]):
                temp[i,k,no] = char2idx[c]
    return temp, np.array(x,dtype=np.int32)

In [10]:
train_X_seq, train_Y_seq = to_train_seq(train_X, train_Y)
test_X_seq, test_Y_seq = to_train_seq(test_X, test_Y)

In [28]:
np.unique(train_Y_seq.ravel(),return_counts=True)

(array([1, 2, 3, 4, 5, 6, 7, 8]),
 array([ 198310, 2908994,   90686,  222459,  165511,     220,     740,
            480]))

In [11]:
for i in range(epoch):
    total_cost = 0
    for k in range(0,(train_X_seq.shape[0] // batch_size)*batch_size,batch_size):
        batch_x = train_X_seq[k:k+batch_size]
        batch_y = train_Y_seq[k:k+batch_size]
        batch_length = [seq_len] * batch_size
        batch_x_char, batch_x_char_length = generate_char_seq(batch_x)
        step, loss, _ = sess.run([model.global_step, model.cost, model.optimizer],
                                 feed_dict={model.word_ids:batch_x,
                                           model.sequence_lengths:batch_length,
                                           model.char_ids:batch_x_char,
                                           model.word_lengths:batch_x_char_length,
                                           model.labels:batch_y})
        if step % display_step == 0 or step == 1:
            print('epoch %d, step %d, loss %f'%(i+1,step,loss))
        total_cost += loss
    total_cost /= (train_X_seq.shape[0] // batch_size)
    print('epoch %d, avg loss %f'%(i+1,total_cost))

epoch 1, step 1, loss 42.354301
epoch 1, step 50, loss 12.833714
epoch 1, step 100, loss 26.714470
epoch 1, step 150, loss 21.240700
epoch 1, step 200, loss 11.442729
epoch 1, step 250, loss 13.302807
epoch 1, step 300, loss 8.469722
epoch 1, step 350, loss 9.016645
epoch 1, step 400, loss 3.415563
epoch 1, step 450, loss 7.920945
epoch 1, step 500, loss 12.634382
epoch 1, step 550, loss 4.335175
epoch 1, step 600, loss 5.281582
epoch 1, step 650, loss 3.294105
epoch 1, step 700, loss 10.223347
epoch 1, step 750, loss 1.398232
epoch 1, step 800, loss 8.702658
epoch 1, step 850, loss 4.399271
epoch 1, step 900, loss 12.357971
epoch 1, step 950, loss 12.835857
epoch 1, step 1000, loss 1.429750
epoch 1, step 1050, loss 5.212813
epoch 1, step 1100, loss 3.905361
epoch 1, step 1150, loss 9.409713
epoch 1, step 1200, loss 5.584057
epoch 1, step 1250, loss 7.990345
epoch 1, step 1300, loss 2.472900
epoch 1, step 1350, loss 1.571310
epoch 1, step 1400, loss 4.074049
epoch 1, step 1450, loss 4.

In [15]:
label_Y, predicted_Y = [], []
for k in range(0,(test_X_seq.shape[0] // batch_size)*batch_size,batch_size):
    batch_x = test_X_seq[k:k+batch_size]
    batch_length = [seq_len] * batch_size
    batch_x_char, batch_x_char_length = generate_char_seq(batch_x)
    batch_y = test_Y_seq[k:k+batch_size]
    Y_pred = sess.run(model.crf_decode,
                  feed_dict={model.word_ids:batch_x,
                             model.sequence_lengths:batch_length,
                            model.char_ids:batch_x_char,
                            model.word_lengths:batch_x_char_length})
    predicted_Y.append(Y_pred)
    label_Y.append(batch_y)

In [23]:
from sklearn.metrics import classification_report
print(classification_report(np.vstack(label_Y).ravel(), np.vstack(predicted_Y).ravel(), target_names=tag2idx.keys()))

             precision    recall  f1-score   support

          O       0.61      0.53      0.57     40883
      I-ORG       0.94      0.99      0.96    731626
     I-MISC       0.85      0.51      0.64     25207
      I-LOC       0.81      0.64      0.72     62733
        PAD       0.81      0.75      0.78     41871
     B-MISC       0.00      0.00      0.00        80

avg / total       0.91      0.92      0.91    902400



  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
