In [1]:
import tensorflow as tf
from tqdm import tqdm
import numpy as np

In [2]:
with open('test.conll.txt') as fopen:
    corpus = fopen.read().split('\n')
    
with open('dev.conll.txt') as fopen:
    corpus_test = fopen.read().split('\n')

In [3]:
word2idx = {'PAD': 0,'NUM':1,'UNK':2}
tag2idx = {'PAD': 0}
char2idx = {'PAD': 0,'NUM':1,'UNK':2}
word_idx = 3
tag_idx = 1
char_idx = 3

def process_corpus(corpus, until = None):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    sentences, words, depends, labels = [], [], [], []
    temp_sentence, temp_word, temp_depend, temp_label = [], [], [], []
    for sentence in corpus:
        if len(sentence):
            sentence = sentence.split('\t')
            for c in sentence[1]:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
            if sentence[7] not in tag2idx:
                tag2idx[sentence[7]] = tag_idx
                tag_idx += 1
            if sentence[1] not in word2idx:
                word2idx[sentence[1]] = word_idx
                word_idx += 1
            temp_word.append(word2idx[sentence[1]])
            temp_depend.append(int(sentence[6]))
            temp_label.append(tag2idx[sentence[7]])
            temp_sentence.append(sentence[1])
        else:
            words.append(temp_word)
            depends.append(temp_depend)
            labels.append(temp_label)
            sentences.append(temp_sentence)
            temp_word = []
            temp_depend = []
            temp_label = []
            temp_sentence = []
    return sentences[:-1], words[:-1], depends[:-1], labels[:-1]
        
sentences, words, depends, labels = process_corpus(corpus)
sentences_test, words_test, depends_test, labels_test = process_corpus(corpus_test)

In [4]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [5]:
words = pad_sequences(words,padding='post')
depends = pad_sequences(depends,padding='post')
labels = pad_sequences(labels,padding='post')

words_test = pad_sequences(words_test,padding='post')
depends_test = pad_sequences(depends_test,padding='post')
labels_test = pad_sequences(labels_test,padding='post')

In [6]:
words_test.shape

(1700, 118)

In [7]:
def generate_char_seq(batch, UNK = 2):
    maxlen_c = max([len(k) for k in batch])
    x = [[len(i) for i in k] for k in batch]
    maxlen = max([j for i in x for j in i])
    temp = np.zeros((len(batch),maxlen_c,maxlen),dtype=np.int32)
    for i in range(len(batch)):
        for k in range(len(batch[i])):
            for no, c in enumerate(batch[i][k]):
                temp[i,k,-1-no] = char2idx.get(c, UNK)
    return temp

In [8]:
idx2word = {idx: tag for tag, idx in word2idx.items()}
idx2tag = {i: w for w, i in tag2idx.items()}

train_X = words
train_Y = labels
train_depends = depends
train_char = generate_char_seq(sentences)

test_X = words_test
test_Y = labels_test
test_depends = depends_test
test_char = generate_char_seq(sentences_test)

In [9]:
class Model:
    def __init__(
        self,
        dim_word,
        dim_char,
        dropout,
        learning_rate,
        hidden_size_char,
        hidden_size_word,
        num_layers,
        maxlen
    ):
        def cells(size, reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                output_keep_prob = dropout,
            )

        def bahdanau(embedded, size):
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units = hidden_size_word, memory = embedded
            )
            return tf.contrib.seq2seq.AttentionWrapper(
                cell = cells(hidden_size_word),
                attention_mechanism = attention_mechanism,
                attention_layer_size = hidden_size_word,
            )

        self.word_ids = tf.placeholder(tf.int32, shape = [None, None])
        self.char_ids = tf.placeholder(tf.int32, shape = [None, None, None])
        self.labels = tf.placeholder(tf.int32, shape = [None, None])
        self.depends = tf.placeholder(tf.int32, shape = [None, None])
        self.maxlen = tf.shape(self.word_ids)[1]
        self.lengths = tf.count_nonzero(self.word_ids, 1)

        self.word_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(word2idx), dim_word], stddev = 1.0 / np.sqrt(dim_word)
            )
        )
        self.char_embeddings = tf.Variable(
            tf.truncated_normal(
                [len(char2idx), dim_char], stddev = 1.0 / np.sqrt(dim_char)
            )
        )

        word_embedded = tf.nn.embedding_lookup(
            self.word_embeddings, self.word_ids
        )
        char_embedded = tf.nn.embedding_lookup(
            self.char_embeddings, self.char_ids
        )
        s = tf.shape(char_embedded)
        char_embedded = tf.reshape(
            char_embedded, shape = [s[0] * s[1], s[-2], dim_char]
        )

        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(hidden_size_char),
                cell_bw = cells(hidden_size_char),
                inputs = char_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_char_%d' % (n),
            )
            char_embedded = tf.concat((out_fw, out_bw), 2)
        output = tf.reshape(
            char_embedded[:, -1], shape = [s[0], s[1], 2 * hidden_size_char]
        )
        word_embedded = tf.concat([word_embedded, output], axis = -1)

        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = bahdanau(word_embedded, hidden_size_word),
                cell_bw = bahdanau(word_embedded, hidden_size_word),
                inputs = word_embedded,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_word_%d' % (n),
            )
            word_embedded = tf.concat((out_fw, out_bw), 2)

        logits = tf.layers.dense(word_embedded, len(idx2tag))
        logits_depends = tf.layers.dense(word_embedded, maxlen)
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, self.labels, self.lengths
        )
        with tf.variable_scope("depends"):
            log_likelihood_depends, transition_params_depends = tf.contrib.crf.crf_log_likelihood(
                logits_depends, self.depends, self.lengths
            )
        self.cost = tf.reduce_mean(-log_likelihood) + tf.reduce_mean(-log_likelihood_depends)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        
        self.tags_seq, _ = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        self.tags_seq_depends, _ = tf.contrib.crf.crf_decode(
            logits_depends, transition_params_depends, self.lengths
        )

        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(self.labels, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        self.prediction = tf.boolean_mask(self.tags_seq_depends, mask)
        mask_label = tf.boolean_mask(self.depends, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy_depends = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [10]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

dim_word = 128
dim_char = 256
dropout = 1
learning_rate = 1e-3
hidden_size_char = 64
hidden_size_word = 64
num_layers = 2
batch_size = 32

model = Model(dim_word,dim_char,dropout,learning_rate,hidden_size_char,hidden_size_word,num_layers,
             words.shape[1])
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [11]:
import time

for e in range(20):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss, train_acc_depends, test_acc_depends = 0, 0, 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_char = train_char[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        batch_depends = train_depends[i : min(i + batch_size, train_X.shape[0])]
        acc_depends, acc, cost, _ = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        train_acc_depends += acc_depends
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
        
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_char = test_char[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_depends = test_depends[i : min(i + batch_size, test_X.shape[0])]
        acc_depends, acc, cost = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost],
            feed_dict = {
                model.word_ids: batch_x,
                model.char_ids: batch_char,
                model.labels: batch_y,
                model.depends: batch_depends
            },
        )
        assert not np.isnan(cost)
        test_loss += cost
        test_acc += acc
        test_acc_depends += acc_depends
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    train_acc_depends /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size
    test_acc_depends /= len(test_X) / batch_size

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\n'
        % (e, train_loss, train_acc, train_acc_depends, test_loss, test_acc, test_acc_depends)
    )

train minibatch loop: 100%|██████████| 76/76 [00:43<00:00,  1.90it/s, accuracy=0.123, accuracy_depends=0.116, cost=104]   
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.61it/s, accuracy=0.136, accuracy_depends=0.0273, cost=168]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 60.34970307350159
epoch: 0, training loss: 149.379215, training acc: 0.132985, training depends: 0.079643, valid loss: 144.880309, valid acc: 0.144134, valid depends: 0.090478



train minibatch loop: 100%|██████████| 76/76 [00:43<00:00,  1.92it/s, accuracy=0.233, accuracy_depends=0.137, cost=95.3]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.67it/s, accuracy=0.255, accuracy_depends=0.0909, cost=152]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 59.71914076805115
epoch: 1, training loss: 132.388958, training acc: 0.186336, training depends: 0.126305, valid loss: 132.800756, valid acc: 0.259966, valid depends: 0.107971



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.93it/s, accuracy=0.483, accuracy_depends=0.219, cost=74.4]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.65it/s, accuracy=0.527, accuracy_depends=0.155, cost=123] 
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 59.5471887588501
epoch: 2, training loss: 111.379293, training acc: 0.398462, training depends: 0.154883, valid loss: 106.319645, valid acc: 0.507343, valid depends: 0.159132



train minibatch loop: 100%|██████████| 76/76 [00:43<00:00,  1.89it/s, accuracy=0.637, accuracy_depends=0.226, cost=64.3]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.58it/s, accuracy=0.6, accuracy_depends=0.118, cost=111]   
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 59.70253324508667
epoch: 3, training loss: 93.394260, training acc: 0.580188, training depends: 0.196789, valid loss: 98.123219, valid acc: 0.622438, valid depends: 0.158794



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.91it/s, accuracy=0.661, accuracy_depends=0.295, cost=55]  
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.63it/s, accuracy=0.645, accuracy_depends=0.218, cost=103] 
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 59.64599561691284
epoch: 4, training loss: 81.789298, training acc: 0.674215, training depends: 0.241653, valid loss: 89.319501, valid acc: 0.659905, valid depends: 0.208392



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.94it/s, accuracy=0.76, accuracy_depends=0.397, cost=46.4] 
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.63it/s, accuracy=0.718, accuracy_depends=0.2, cost=92]    
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 59.38800311088562
epoch: 5, training loss: 72.712886, training acc: 0.737575, training depends: 0.290594, valid loss: 82.248631, valid acc: 0.704317, valid depends: 0.227678



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.94it/s, accuracy=0.798, accuracy_depends=0.445, cost=42.7]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.60it/s, accuracy=0.727, accuracy_depends=0.155, cost=93.7]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 59.42586350440979
epoch: 6, training loss: 64.976823, training acc: 0.780481, training depends: 0.352966, valid loss: 81.418071, valid acc: 0.729733, valid depends: 0.216033



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.96it/s, accuracy=0.87, accuracy_depends=0.558, cost=34.8] 
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.65it/s, accuracy=0.782, accuracy_depends=0.145, cost=86.3]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 59.26801133155823
epoch: 7, training loss: 57.875818, training acc: 0.807750, training depends: 0.420946, valid loss: 81.698430, valid acc: 0.744215, valid depends: 0.215680



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.95it/s, accuracy=0.877, accuracy_depends=0.555, cost=30.3]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.67it/s, accuracy=0.782, accuracy_depends=0.227, cost=82.4]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 59.17950391769409
epoch: 8, training loss: 51.506652, training acc: 0.834453, training depends: 0.481950, valid loss: 81.583055, valid acc: 0.754682, valid depends: 0.230223



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.97it/s, accuracy=0.89, accuracy_depends=0.599, cost=29.8] 
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.59it/s, accuracy=0.773, accuracy_depends=0.273, cost=87.6]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 58.80722999572754
epoch: 9, training loss: 47.002005, training acc: 0.853911, training depends: 0.516468, valid loss: 83.256975, valid acc: 0.755200, valid depends: 0.228200



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.97it/s, accuracy=0.925, accuracy_depends=0.682, cost=23.8]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.69it/s, accuracy=0.764, accuracy_depends=0.236, cost=84.4]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 58.78147864341736
epoch: 10, training loss: 42.517333, training acc: 0.874978, training depends: 0.566414, valid loss: 80.450278, valid acc: 0.765323, valid depends: 0.249620



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.94it/s, accuracy=0.945, accuracy_depends=0.678, cost=21.3]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.62it/s, accuracy=0.791, accuracy_depends=0.191, cost=88]  
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 58.979615449905396
epoch: 11, training loss: 38.750908, training acc: 0.887077, training depends: 0.602463, valid loss: 82.313864, valid acc: 0.777407, valid depends: 0.247966



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.97it/s, accuracy=0.945, accuracy_depends=0.702, cost=19]  
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.68it/s, accuracy=0.755, accuracy_depends=0.227, cost=99.1]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 58.794678926467896
epoch: 12, training loss: 35.110486, training acc: 0.897572, training depends: 0.640705, valid loss: 92.571763, valid acc: 0.765969, valid depends: 0.220016



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.97it/s, accuracy=0.952, accuracy_depends=0.771, cost=17.5]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.67it/s, accuracy=0.773, accuracy_depends=0.227, cost=93.7]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 58.75972008705139
epoch: 13, training loss: 33.335737, training acc: 0.906613, training depends: 0.654030, valid loss: 92.200707, valid acc: 0.771328, valid depends: 0.246652



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.97it/s, accuracy=0.945, accuracy_depends=0.812, cost=15.1]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.70it/s, accuracy=0.782, accuracy_depends=0.245, cost=92.2]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 58.76455545425415
epoch: 14, training loss: 29.911946, training acc: 0.915421, training depends: 0.693428, valid loss: 87.522663, valid acc: 0.782953, valid depends: 0.262049



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.96it/s, accuracy=0.966, accuracy_depends=0.812, cost=13]  
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.71it/s, accuracy=0.791, accuracy_depends=0.291, cost=89.7]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 58.7827091217041
epoch: 15, training loss: 27.855397, training acc: 0.924138, training depends: 0.715602, valid loss: 89.768037, valid acc: 0.789311, valid depends: 0.263630



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.97it/s, accuracy=0.935, accuracy_depends=0.788, cost=13.2]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.69it/s, accuracy=0.791, accuracy_depends=0.291, cost=86.2]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 58.92595195770264
epoch: 16, training loss: 26.030449, training acc: 0.934033, training depends: 0.725757, valid loss: 92.311703, valid acc: 0.784540, valid depends: 0.263717



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.95it/s, accuracy=0.966, accuracy_depends=0.818, cost=12.7]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.68it/s, accuracy=0.791, accuracy_depends=0.291, cost=86.6]
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 58.923388719558716
epoch: 17, training loss: 24.009113, training acc: 0.943932, training depends: 0.744273, valid loss: 99.844222, valid acc: 0.776442, valid depends: 0.237053



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.95it/s, accuracy=0.969, accuracy_depends=0.795, cost=12]  
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.65it/s, accuracy=0.8, accuracy_depends=0.218, cost=93.2]  
train minibatch loop:   0%|          | 0/76 [00:00<?, ?it/s]

time taken: 59.0040009021759
epoch: 18, training loss: 21.788654, training acc: 0.950376, training depends: 0.768208, valid loss: 101.069921, valid acc: 0.784996, valid depends: 0.249907



train minibatch loop: 100%|██████████| 76/76 [00:42<00:00,  1.94it/s, accuracy=0.976, accuracy_depends=0.829, cost=11.7]
test minibatch loop: 100%|██████████| 54/54 [00:16<00:00,  3.67it/s, accuracy=0.809, accuracy_depends=0.245, cost=101] 

time taken: 59.10275650024414
epoch: 19, training loss: 20.282661, training acc: 0.956831, training depends: 0.780196, valid loss: 104.031246, valid acc: 0.784366, valid depends: 0.267584






In [12]:
seq, deps = sess.run([model.tags_seq, model.tags_seq_depends],
        feed_dict={model.word_ids:batch_x[:1],
                  model.char_ids:batch_char[:1]})

In [13]:
seq = seq[0]
deps = deps[0]

In [14]:
seq[seq>0]

array([18, 19,  2,  6,  3,  7, 16, 18, 23, 20, 19,  2], dtype=int32)

In [15]:
batch_y[0][seq>0]

array([18, 19,  2,  6,  3,  7, 16, 18, 23, 20, 19,  2], dtype=int32)

In [16]:
deps[seq>0]

array([ 2,  3,  3,  5,  5,  0,  5, 11, 11, 11,  8,  3], dtype=int32)

In [17]:
batch_depends[0][seq>0]

array([ 2,  6,  6,  5,  6,  0,  6, 11, 11, 11,  6,  6], dtype=int32)