In [1]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import classification_report

  from ._conv import register_converters as _register_converters


In [2]:
word2idx = {'<pad>': 0}
tag2idx = {'<pad>': 0}
word_idx = 1
tag_idx = 1
x_train = []
y_train = []
x_test = []
y_test = []

for line in open('pos_train.txt'):
    line = line.rstrip()
    if line:
        word, tag, _ = line.split()
        if word not in word2idx:
            word2idx[word] = word_idx
            word_idx += 1
        x_train.append(word2idx[word])
        if tag not in tag2idx:
            tag2idx[tag] = tag_idx
            tag_idx += 1
        y_train.append(tag2idx[tag])
        
word2idx['<unknown>'] = word_idx

for line in open('pos_test.txt'):
    line = line.rstrip()
    if line:
        word, tag, _ = line.split()
        if word in word2idx:
            x_test.append(word2idx[word])
        else:
            x_test.append(word_idx)
        y_test.append(tag2idx[tag])

In [3]:
params = {
    'seq_len': 20,
    'batch_size': 128,
    'hidden_dim': 128,
    'clip_norm': 5.0,
    'text_iter_step': 1,
    'lr': {'start': 5e-3, 'end': 5e-4},
    'n_epoch': 2,
    'num_layers':2,
    'display_step': 50,
    'vocab_size': len(word2idx),
    'n_class':tag_idx
}

In [4]:
def iter_seq(x):
    return np.array([x[i: i+params['seq_len']] for i in range(0, len(x)-params['seq_len'], params['text_iter_step'])])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

def to_test_seq(*args):
    return [np.reshape(x[:(len(x)-len(x)%params['seq_len'])],
        [-1,params['seq_len']]) for x in args]

In [5]:
X_train, Y_train = to_train_seq(x_train, y_train)
X_test, Y_test = to_test_seq(x_test, y_test)
params['lr']['steps'] = len(X_train) // params['batch_size']

In [6]:
Y_train.shape

(211707, 20)

In [7]:
class Model:
    def __init__(self):
        
        def clip_grads(loss):
            variables = tf.trainable_variables()
            grads = tf.gradients(loss, variables)
            clipped_grads, _ = tf.clip_by_global_norm(grads, params['clip_norm'])
            return zip(clipped_grads, variables)
        
        def cells(reuse=False):
            return tf.nn.rnn_cell.LSTMCell(params['hidden_dim'],initializer=tf.orthogonal_initializer(),
                                           reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, params['seq_len']])
        self.Y = tf.placeholder(tf.int32, [None, params['seq_len']])
        self.X_seq_len = tf.placeholder(tf.int32, [None])
        self.Y_seq_len = tf.placeholder(tf.int32, [None])
        self.batch_size = tf.placeholder(tf.int32, None)
        
        batch_size = self.batch_size
        encoder_embeddings = tf.Variable(tf.random_uniform([params['vocab_size'], 
                                                            params['hidden_dim']], -1, 1))
        decoder_embeddings = tf.Variable(tf.random_uniform([params['n_class'], params['hidden_dim']], -1, 1))
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        main = tf.strided_slice(self.X, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], 0), main], 1)
        decoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, decoder_input)
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(params['num_layers'])])
        _, last_state = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded,
                                          dtype = tf.float32)
        with tf.variable_scope("decoder"):
            rnn_cells_dec = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(params['num_layers'])])
            outputs, _ = tf.nn.dynamic_rnn(rnn_cells_dec, decoder_embedded, 
                                           initial_state = last_state,
                                           dtype = tf.float32)
        self.logits = tf.layers.dense(outputs,params['n_class'])
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.logits,
                                                     targets = self.Y,
                                                     weights = masks)
        
        self.global_step = tf.Variable(0, trainable=False)
        self.learning_rate = tf.train.exponential_decay(params['lr']['start'],
                                                        self.global_step, params['lr']['steps'],
                                                        params['lr']['end']/params['lr']['start'])
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(clip_grads(self.cost), 
                                                                                    global_step=self.global_step)

In [8]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()
sess.run(tf.global_variables_initializer())

In [9]:
def check_accuracy(logits, Y):
    acc = 0
    for i in range(logits.shape[0]):
        internal_acc = 0
        for k in range(len(Y[i])):
            if Y[i][k] == logits[i][k]:
                internal_acc += 1
        acc += (internal_acc / len(Y[i]))
    return acc / logits.shape[0]

In [10]:
for i in range(params['n_epoch']):
    total_cost,total_accuracy = 0, 0
    for k in range(0,(X_train.shape[0] // params['batch_size'])*params['batch_size'],params['batch_size']):
        batch_x = X_train[k:k+params['batch_size']]
        batch_y = Y_train[k:k+params['batch_size']]
        batch_x_seq = [params['seq_len']]*params['batch_size']
        batch_y_seq = [params['seq_len']]*params['batch_size']
        predicted, step, loss, _ = sess.run([tf.argmax(model.logits,2), model.global_step, 
                                             model.cost, model.optimizer],
                                            feed_dict={model.X:batch_x, model.Y:batch_y,
                                            model.X_seq_len:batch_x_seq,
                                            model.Y_seq_len:batch_y_seq,
                                            model.batch_size:params['batch_size']})
        acc = check_accuracy(predicted,batch_y)
        if step % params['display_step'] == 0 or step == 1:
            print('epoch %d, step %d, loss %f, acc %f'%(i+1,step,loss, acc))
        total_cost += loss
        total_accuracy += acc
    total_cost /= ((X_train.shape[0] // params['batch_size']))
    total_accuracy /= ((X_train.shape[0] // params['batch_size']))
    print('epoch %d, avg loss %f, avg acc %f'%(i+1,total_cost,total_accuracy))

epoch 1, step 1, loss 3.804602, acc 0.023828
epoch 1, step 50, loss 2.641443, acc 0.269141
epoch 1, step 100, loss 2.538152, acc 0.269531
epoch 1, step 150, loss 2.517906, acc 0.257031
epoch 1, step 200, loss 2.560117, acc 0.278125
epoch 1, step 250, loss 2.248977, acc 0.331250
epoch 1, step 300, loss 2.299693, acc 0.306250
epoch 1, step 350, loss 2.440006, acc 0.308203
epoch 1, step 400, loss 2.179614, acc 0.334766
epoch 1, step 450, loss 2.176687, acc 0.348828
epoch 1, step 500, loss 2.124507, acc 0.403906
epoch 1, step 550, loss 2.072956, acc 0.399609
epoch 1, step 600, loss 2.207045, acc 0.350781
epoch 1, step 650, loss 2.013363, acc 0.384375
epoch 1, step 700, loss 1.799941, acc 0.428516
epoch 1, step 750, loss 1.746237, acc 0.464063
epoch 1, step 800, loss 2.087084, acc 0.380078
epoch 1, step 850, loss 1.930929, acc 0.469531
epoch 1, step 900, loss 2.072091, acc 0.439062
epoch 1, step 950, loss 1.820239, acc 0.447656
epoch 1, step 1000, loss 1.131939, acc 0.616797
epoch 1, step 1

In [11]:
Y_pred = sess.run(tf.argmax(model.logits,2),feed_dict={model.X:X_test,model.batch_size:X_test.shape[0],
                                model.X_seq_len:[params['seq_len']]*X_test.shape[0]})
print(classification_report(Y_test.ravel(), Y_pred.ravel(), target_names=tag2idx.keys()))

             precision    recall  f1-score   support

          ,       0.45      0.65      0.53      6639
        VBD       0.62      0.79      0.69      5070
         VB       0.66      0.85      0.74      4020
        RBR       0.36      0.24      0.28       912
         EX       0.32      0.12      0.18      1354
        WP$       0.53      0.45      0.49      1103
        VBP       0.83      0.87      0.85      1177
          #       0.74      0.81      0.77      1269
          )       0.34      0.12      0.18      2962
        POS       0.41      0.24      0.31      3034
         TO       0.47      0.61      0.53      4803
        JJR       0.77      0.93      0.84      2389
      <pad>       0.47      0.40      0.43      1214
         CC       0.59      0.36      0.44       433
          :       0.89      0.92      0.91      1974
         IN       0.48      0.37      0.42       539
         CD       0.26      0.03      0.05       727
        SYM       0.77      0.08      0.15   

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
