In [None]:
# !wget https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-dev.conllu
# !wget https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu
# !wget https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-test.conllu

In [None]:
import malaya
import re
from malaya.texts._text_functions import split_into_sentences
from malaya.texts import _regex
import numpy as np
import itertools

# Short aliases for malaya's tokenizer and sentence splitter.
# NOTE(review): neither alias is referenced anywhere else in the visible notebook.
tokenizer = malaya.preprocessing._tokenizer
splitter = split_into_sentences

In [None]:
def is_number_regex(s):
    """Return True when `s` looks like a plain number.

    A string counts as a number when it is either a decimal of the form
    ``digits.digits`` or consists only of digit characters (``str.isdigit``).

    Parameters
    ----------
    s : str
        Token to test.

    Returns
    -------
    bool
    """
    # Raw string: "\d" and "\." in a normal string literal are invalid escape
    # sequences and trigger a DeprecationWarning/SyntaxWarning on recent Pythons.
    return re.match(r"^\d+?\.\d+?$", s) is not None or s.isdigit()

def preprocessing(w):
    """Normalise one token before it is looked up in the word vocabulary.

    Numbers, money amounts, dates, e-mail addresses and URLs are replaced by a
    placeholder token (checked in that order). Any other token is returned with
    every run of a repeated character shortened to at most two characters.
    """
    if is_number_regex(w):
        return '<NUM>'

    # (pattern, placeholder) pairs, tried in order; the first match wins.
    placeholder_patterns = (
        (_regex._money, '<MONEY>'),
        (_regex._date, '<DATE>'),
        (_regex._expressions['email'], '<EMAIL>'),
        (_regex._expressions['url'], '<URL>'),
    )
    for pattern, placeholder in placeholder_patterns:
        if re.match(pattern, w):
            return placeholder

    # Keep at most two characters from each run of identical characters.
    return ''.join(''.join(itertools.islice(run, 2)) for _, run in itertools.groupby(w))

In [None]:
# Vocabularies start with their reserved entries; the *_idx counters always
# hold the next free id of the matching dictionary.
word2idx = dict(PAD = 0, UNK = 1, _ROOT = 2)
tag2idx = {'PAD': 0, '_<ROOT>': 1}
char2idx = dict(PAD = 0, UNK = 1, _ROOT = 2)
word_idx, tag_idx, char_idx = len(word2idx), len(tag2idx), len(char2idx)

# Placeholder tokens emitted by preprocessing(); each one gets a word id and
# also a single id in the character vocabulary.
special_tokens = ['<NUM>', '<MONEY>', '<DATE>', '<URL>', '<EMAIL>']

for t in special_tokens:
    word2idx[t], char2idx[t] = word_idx, char_idx
    word_idx, char_idx = word_idx + 1, char_idx + 1

word2idx, char2idx

In [None]:
# Sentinel strings for padding, the artificial root and an end marker, each in a
# word / POS / dependency-type / character flavour.
# NOTE(review): in the visible notebook only ROOT and ROOT_POS are used (by
# process_corpus); the vocabulary dictionaries are keyed by the literals 'PAD',
# '_ROOT' and '_<ROOT>' rather than by these constants.
PAD = "_PAD"
PAD_POS = "_PAD_POS"
PAD_TYPE = "_<PAD>"
PAD_CHAR = "_PAD_CHAR"
ROOT = "_ROOT"
ROOT_POS = "_ROOT_POS"
ROOT_TYPE = "_<ROOT>"
ROOT_CHAR = "_ROOT_CHAR"
END = "_END"
END_POS = "_END_POS"
END_TYPE = "_<END>"
END_CHAR = "_END_CHAR"

def process_corpus(corpus, until = None):
    """Turn CoNLL-U lines into parallel per-sentence lists, growing the vocabularies.

    Every token row contributes its FORM (column 1), UPOS (column 3), HEAD
    (column 6) and DEPREL (column 7). Unseen characters, labels and
    (preprocessed) words are added to the module-level ``char2idx``,
    ``tag2idx`` and ``word2idx``. A blank line ends the current sentence; an
    artificial root entry is prepended to every list of a kept sentence.

    Multiword-token rows (ID such as ``1-2``) and empty-node rows (ID such as
    ``8.1``) are skipped: they carry ``_`` instead of a head. Previously such a
    row raised inside ``int()`` after the vocabularies had been touched, which
    registered a spurious ``_`` label and caused the whole sentence to be
    discarded.

    Parameters
    ----------
    corpus : iterable of str
        Lines of a CoNLL-U file, without line terminators.
    until : unused
        Kept for interface compatibility; the function never reads it.

    Returns
    -------
    tuple
        ``(sentences, words, depends, labels, pos, chars)``: preprocessed word
        strings, word ids, head indices, label ids, POS strings and
        per-word lists of character ids, one entry per kept sentence.
    """
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    sentences, words, depends, labels, pos, chars = [], [], [], [], [], []
    temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []
    malformed = False  # set when a row of the current sentence cannot be parsed
    first_time = True
    for sentence in corpus:
        if len(sentence):
            if sentence[0] == '#':
                continue
            if first_time:
                # Echo the first token row once as a quick sanity check.
                print(sentence)
                first_time = False
            sentence = sentence.split('\t')
            if '-' in sentence[0] or '.' in sentence[0]:
                # Multiword-token range or empty node: not part of the basic tree.
                continue
            try:
                # Parse everything before touching any vocabulary so that a bad
                # row cannot leave half-registered entries behind.
                form, upos = sentence[1], sentence[3]
                head, deprel = int(sentence[6]), sentence[7]
            except (IndexError, ValueError) as e:
                print(e, sentence)
                malformed = True
                continue
            # Characters are registered from the raw form, before preprocessing.
            for c in form:
                if c not in char2idx:
                    char2idx[c] = char_idx
                    char_idx += 1
            if deprel not in tag2idx:
                tag2idx[deprel] = tag_idx
                tag_idx += 1
            form = preprocessing(form)
            if form not in word2idx:
                word2idx[form] = word_idx
                word_idx += 1
            temp_word.append(word2idx[form])
            temp_depend.append(head)
            temp_label.append(tag2idx[deprel])
            temp_sentence.append(form)
            temp_pos.append(upos)
        else:
            # Blank line: sentence boundary. Sentences with fewer than two
            # tokens, or containing an unparseable row, are dropped.
            if len(temp_sentence) >= 2 and not malformed:
                words.append([word2idx['_ROOT']] + temp_word)
                depends.append([0] + temp_depend)
                labels.append([tag2idx['_<ROOT>']] + temp_label)
                sentences.append([ROOT] + temp_sentence)
                pos.append([ROOT_POS] + temp_pos)
                char_ = [[char2idx['_ROOT']]]
                for w in temp_sentence:
                    if w in char2idx:
                        # Placeholder tokens have a single id of their own.
                        char_.append([char2idx[w]])
                    else:
                        char_.append([char2idx[c] for c in w])
                chars.append(char_)
            temp_word = []
            temp_depend = []
            temp_label = []
            temp_sentence = []
            temp_pos = []
            malformed = False
    # NOTE(review): as in the original, the last collected sentence is sliced
    # off here, and a final sentence that is not followed by a blank line is
    # never flushed. Confirm whether dropping it is intentional.
    return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1], chars[:-1]

In [None]:
_buckets = [10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80, 90, 100, 140]

def process_data(corpus, batch_size = 32):
    """Parse a corpus and return padded mini-batches grouped by sentence length.

    Each sentence goes into the first bucket whose size is strictly greater
    than its length (root included); sentences that fit no bucket are left
    out. Every bucket is zero-padded to its bucket length (and, for
    characters, to the longest word seen in that bucket) and then cut into
    chunks of at most `batch_size` rows.

    Returns a list of tuples
    ``(word_ids, char_ids, head_ids, masks, label_ids)``.
    """
    sentences, words, depends, labels, pos, chars = process_corpus(corpus)
    print(len(sentences), len(words), len(depends), len(labels), len(chars))

    n_buckets = len(_buckets)
    data = [[] for _ in range(n_buckets)]
    max_char_length = [0] * n_buckets

    # Assign every sentence to its bucket and track the longest word per bucket.
    for i in range(len(depends)):
        n_tokens = len(words[i])
        bucket_id = next((b for b, limit in enumerate(_buckets) if n_tokens < limit), None)
        if bucket_id is None:
            continue
        data[bucket_id].append([words[i], pos[i], depends[i], labels[i], chars[i]])
        longest_word = max(len(char_seq) for char_seq in chars[i])
        max_char_length[bucket_id] = max(max_char_length[bucket_id], longest_word)

    train_X = []
    for bucket_id, instances in enumerate(data):
        bucket_size = len(instances)
        if not bucket_size:
            continue
        bucket_length = _buckets[bucket_id]
        char_length = max_char_length[bucket_id]

        wid_inputs = np.zeros([bucket_size, bucket_length], dtype=np.int32)
        cid_inputs = np.zeros([bucket_size, bucket_length, char_length], dtype=np.int32)
        hid_inputs = np.zeros([bucket_size, bucket_length], dtype=np.int32)
        tid_inputs = np.zeros([bucket_size, bucket_length], dtype=np.int32)
        masks = np.zeros([bucket_size, bucket_length], dtype=np.float32)

        # Copy each sentence into the left part of its row; the rest stays 0.
        for row, (w, p, d, l, ch) in enumerate(instances):
            inst_size = len(w)
            wid_inputs[row, :inst_size] = w
            for col, cids in enumerate(ch):
                cid_inputs[row, col, :len(cids)] = cids
            tid_inputs[row, :inst_size] = l
            hid_inputs[row, :inst_size] = d
            masks[row, :inst_size] = 1.0

        # Slice the padded bucket into mini-batches (the last one may be smaller).
        for k in range(0, bucket_size, batch_size):
            stop = k + batch_size
            train_X.append((wid_inputs[k: stop],
                            cid_inputs[k: stop],
                            hid_inputs[k: stop],
                            masks[k: stop],
                            tid_inputs[k: stop]))

    print('trainable batch size', len(train_X))
    return train_X

In [None]:
# Dev split. The encoding is given explicitly so the file is read the same way
# on every platform instead of with the locale default.
with open('en_ewt-ud-dev.conllu', encoding = 'utf-8') as fopen:
    dev = fopen.read().split('\n')

# The dev batches start the held-out set called `test`.
test = process_data(dev)

In [None]:
# Test split. The encoding is given explicitly so the file is read the same way
# on every platform instead of with the locale default.
# Note that the name `dev` is reused here for the lines of the *test* file.
with open('en_ewt-ud-test.conllu', encoding = 'utf-8') as fopen:
    dev = fopen.read().split('\n')

# Append the test batches to the held-out set built from the dev split.
test.extend(process_data(dev))

In [None]:
# Train split. The encoding is given explicitly so the file is read the same
# way on every platform instead of with the locale default.
with open('en_ewt-ud-train.conllu', encoding = 'utf-8') as fopen:
    train = fopen.read().split('\n')

# `train` is rebound from the raw lines to the list of mini-batches.
train = process_data(train)

In [None]:
# Number of mini-batches in the training set and in the held-out set.
len(train), len(test)

In [None]:
# Reverse lookups (id -> string) for the word and label vocabularies.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))
idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))
len(idx2word)

In [None]:
import tensorflow as tf

In [None]:
class BiAAttention:
    """Bi-affine scorer between a decoder-side and an encoder-side sequence.

    For every label the score of a (decoder position, encoder position) pair is
    ``d^T U e + W_d d + W_e e``.

    Parameters
    ----------
    input_size_encoder : int
        Feature size of the encoder-side input.
    input_size_decoder : int
        Feature size of the decoder-side input.
    num_labels : int
        Number of score maps produced per pair of positions.
    """
    def __init__(self, input_size_encoder, input_size_decoder, num_labels):
        self.input_size_encoder = input_size_encoder
        self.input_size_decoder = input_size_decoder
        self.num_labels = num_labels
        
        self.W_d = tf.get_variable("W_d", shape=[self.num_labels, self.input_size_decoder],
           initializer=tf.contrib.layers.xavier_initializer())
        self.W_e = tf.get_variable("W_e", shape=[self.num_labels, self.input_size_encoder],
           initializer=tf.contrib.layers.xavier_initializer())
        self.U = tf.get_variable("U", shape=[self.num_labels, self.input_size_decoder, self.input_size_encoder],
           initializer=tf.contrib.layers.xavier_initializer())
        
    def forward(self, input_d, input_e, mask_d=None, mask_e=None):
        """Score every decoder position against every encoder position.

        Parameters
        ----------
        input_d : rank-3 tensor, last axis of size ``input_size_decoder``.
        input_e : rank-3 tensor, last axis of size ``input_size_encoder``.
        mask_d, mask_e : optional rank-2 tensors
            Multiplied into the scores along the decoder / encoder axis. Each
            mask is applied on its own, so passing only one of them works.

        Returns
        -------
        Rank-4 tensor laid out as (batch, label, decoder position, encoder position).
        """
        # Linear terms, shaped so they broadcast over the other sequence axis.
        out_d = tf.expand_dims(tf.matmul(self.W_d, tf.transpose(input_d, [0, 2, 1])), 3)
        out_e = tf.expand_dims(tf.matmul(self.W_e, tf.transpose(input_e, [0, 2, 1])), 2)
        # Bilinear term: (input_d U) input_e^T for every label.
        output = tf.matmul(tf.expand_dims(input_d, 1), self.U)
        output = tf.matmul(output, tf.transpose(tf.expand_dims(input_e, 1), [0, 1, 3, 2]))
        
        output = output + out_d + out_e
        
        # Zero the scores of masked positions; the masks are independent so a
        # missing mask_e no longer breaks a call that only passes mask_d.
        if mask_d is not None:
            output = output * tf.expand_dims(tf.expand_dims(mask_d, 1), 3)
        if mask_e is not None:
            output = output * tf.expand_dims(tf.expand_dims(mask_e, 1), 2)
            
        return output
    
class BiLinear:
    """Bilinear layer: ``out[o] = left^T U[o] right + W_l[o] left + W_r[o] right``.

    Parameters
    ----------
    left_features : int
        Size of the last axis of the left input.
    right_features : int
        Size of the last axis of the right input.
    out_features : int
        Number of output features.
    """
    def __init__(self, left_features, right_features, out_features):
        self.left_features = left_features
        self.right_features = right_features
        self.out_features = out_features
        
        self.U = tf.get_variable("U-bi", shape=[out_features, left_features, right_features],
           initializer=tf.contrib.layers.xavier_initializer())
        self.W_l = tf.get_variable("Wl", shape=[out_features, left_features],
           initializer=tf.contrib.layers.xavier_initializer())
        self.W_r = tf.get_variable("Wr", shape=[out_features, right_features],
           initializer=tf.contrib.layers.xavier_initializer())
    
    def forward(self, input_left, input_right):
        """Apply the layer to two tensors that share all leading axes.

        Both inputs are flattened to two axes, combined, and the result is
        reshaped to the leading axes of `input_left` plus ``out_features``.
        """
        left_size = tf.shape(input_left)
        output_shape = tf.concat([left_size[:-1], [self.out_features]], axis = 0)
        batch = tf.cast(tf.reduce_prod(left_size[:-1]), tf.int32)
        input_left = tf.reshape(input_left, (batch, self.left_features))
        input_right = tf.reshape(input_right, (batch, self.right_features))
        tiled = tf.tile(tf.expand_dims(input_left, axis = 0), (self.out_features,1,1))
        # (out_features, batch, right_features): left^T U for every output feature.
        left_u = tf.matmul(tiled, self.U)
        # Contract with the right input. Without this product the term was just
        # a sum over U's last axis and did not depend on input_right at all.
        bilinear = tf.reduce_sum(left_u * tf.expand_dims(input_right, axis = 0), axis = 2)
        output = tf.transpose(bilinear)
        output = output + tf.matmul(input_left, tf.transpose(self.W_l))\
        + tf.matmul(input_right, tf.transpose(self.W_r))
        
        return tf.reshape(output, output_shape)

class Attention:
    """Dependency parser: word + character-CNN embeddings, an LSTM encoder,
    a bi-affine arc scorer and a bilinear label scorer.

    Parameters
    ----------
    word_dim, num_words : size and row count of the word embedding table.
    char_dim, num_chars : size and row count of the character embedding table.
    num_filters, kernel_size : character Conv1D configuration.
    hidden_size, encoder_layers : LSTM size and number of stacked LSTM cells.
    num_labels : number of dependency labels.
    arc_space, type_space : sizes of the arc and label projection spaces.
    """
    def __init__(self, word_dim, num_words, char_dim, num_chars, num_filters, kernel_size,
                 hidden_size, encoder_layers, num_labels, arc_space, type_space):
        
        def cells(size, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size,
                                           initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        self.word_embedd = tf.Variable(tf.random_uniform([num_words, word_dim], -1, 1))
        self.char_embedd = tf.Variable(tf.random_uniform([num_chars, char_dim], -1, 1))
        self.conv1d = tf.layers.Conv1D(num_filters, kernel_size, 1, padding='VALID')
        self.num_labels = num_labels
        self.encoder = tf.nn.rnn_cell.MultiRNNCell([cells(hidden_size) for _ in range(encoder_layers)])

        # Separate projections for a token acting as head (h) and as child (c).
        self.arc_h = tf.layers.Dense(arc_space)
        self.arc_c = tf.layers.Dense(arc_space)
        self.attention = BiAAttention(arc_space, arc_space, 1)

        self.type_h = tf.layers.Dense(type_space)
        self.type_c = tf.layers.Dense(type_space)
        self.bilinear = BiLinear(type_space, type_space, self.num_labels)
        
    def encode(self, input_word, input_char):
        """Embed and encode a batch.

        Returns ``(arc_h, arc_c), (type_h, type_c), hn`` where the four
        projections are ELU-activated dense layers over the LSTM outputs and
        `hn` is the final RNN state.
        """
        word = tf.nn.embedding_lookup(self.word_embedd, input_word)
        char = tf.nn.embedding_lookup(self.char_embedd, input_char)
        b = tf.shape(char)[0]
        wl = tf.shape(char)[1]
        cl = tf.shape(char)[2]
        d = char.shape[3]
        # Fold batch and word axes together so the convolution runs per word,
        # then max-pool over the character axis.
        char = tf.reshape(char, [b * wl, cl, d])
        char = tf.reduce_max(self.conv1d(char), axis = 1)
        char = tf.nn.tanh(char)
        d = char.shape[-1]
        char = tf.reshape(char, [b, wl, d])
        
        src_encoding = tf.concat([word, char], axis=2)
        output, hn = tf.nn.dynamic_rnn(self.encoder, src_encoding, dtype = tf.float32,
                                      scope = 'encoder')
        arc_h = tf.nn.elu(self.arc_h(output))
        arc_c = tf.nn.elu(self.arc_c(output))
        
        type_h = tf.nn.elu(self.type_h(output))
        type_c = tf.nn.elu(self.type_c(output))
        
        return (arc_h, arc_c), (type_h, type_c), hn
    
    def forward(self, input_word, input_char, mask):
        """Return the arc score tensor (label axis squeezed out), the
        ``(type_h, type_c)`` projections and the mask that was passed in."""
        arcs, types, _ = self.encode(input_word, input_char)
        
        out_arc = tf.squeeze(self.attention.forward(arcs[0], arcs[1], mask_d=mask, mask_e=mask), axis = 1)
        return out_arc, types, mask
    
    def loss(self, input_word, input_char, mask, heads, types):
        """Return ``(arc_loss, type_loss)`` for gold `heads` and `types`.

        Both are negative log-likelihoods summed over positions 1.. of every
        row (position 0 is skipped) and divided by the number of unmasked
        tokens minus one per row.
        """
        out_arc, out_type, _ = self.forward(input_word, input_char, mask)
        type_h, type_c = out_type
        batch = tf.shape(out_arc)[0]
        max_len = tf.shape(out_arc)[1]
        batch_index = tf.range(0, batch)
        # Pick, for every token, the type_h vector of its gold head.
        t = tf.transpose(heads)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), 
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        type_h = tf.gather_nd(type_h, concatenated)
        out_type = self.bilinear.forward(type_h, type_c)
        # A large finite negative (not -inf) keeps the later multiplication by
        # the mask from producing NaN.
        minus_inf = -1e8
        minus_mask = (1 - mask) * minus_inf
        out_arc = out_arc + tf.expand_dims(minus_mask, axis = 2) + tf.expand_dims(minus_mask, axis = 1)
        # `axis` replaces the deprecated `dim` keyword; axis 1 of out_arc is
        # the candidate-head axis (see the gather index order below).
        loss_arc = tf.nn.log_softmax(out_arc, axis=1)
        loss_type = tf.nn.log_softmax(out_type, axis=2)
        loss_arc = loss_arc * tf.expand_dims(mask, axis = 2) * tf.expand_dims(mask, axis = 1)
        loss_type = loss_type * tf.expand_dims(mask, axis = 2)
        num = tf.reduce_sum(mask) - tf.cast(batch, tf.float32)
        child_index = tf.tile(tf.expand_dims(tf.range(0, max_len), 1), [1, batch])
        # Log-probability of the gold head of every child: index (batch, head, child).
        t = tf.transpose(heads)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0),
                                               tf.expand_dims(t, axis = 0),
                                               tf.expand_dims(child_index, axis = 0)], axis = 0))
        loss_arc = tf.gather_nd(loss_arc, concatenated)
        loss_arc = tf.transpose(loss_arc, [1, 0])[1:]
        
        # Log-probability of the gold label of every child: index (batch, child, label).
        t = tf.transpose(types)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0),
                                               tf.expand_dims(child_index, axis = 0),
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        loss_type = tf.gather_nd(loss_type, concatenated)
        loss_type = tf.transpose(loss_type, [1, 0])[1:]
        return tf.reduce_sum(-loss_arc) / num, tf.reduce_sum(-loss_type) / num
    
    def decode(self, input_word, input_char, mask, leading_symbolic=0):
        """Greedy decoding: ``(heads, types)`` by independent argmax per token.

        Parameters
        ----------
        leading_symbolic : int
            Number of leading label ids excluded from prediction. The returned
            label ids are in the full label space (the offset is added back).
        """
        out_arc, out_type, _ = self.forward(input_word, input_char, mask)
        batch = tf.shape(out_arc)[0]
        max_len = tf.shape(out_arc)[1]
        sec_max_len = tf.shape(out_arc)[2]
        # A token may not be its own head.
        out_arc = out_arc + tf.linalg.diag(tf.fill([max_len], -np.inf))
        # Masked-out positions may not be chosen as heads.
        minus_mask = tf.expand_dims(tf.cast(1 - mask, tf.bool), axis = 2)
        minus_mask = tf.tile(minus_mask, [1, 1, sec_max_len])
        out_arc = tf.where(minus_mask, tf.fill(tf.shape(out_arc), -np.inf), out_arc)
        heads = tf.argmax(out_arc, axis = 1)
        type_h, type_c = out_type
        batch = tf.shape(type_h)[0]
        max_len = tf.shape(type_h)[1]
        batch_index = tf.range(0, batch)
        # Gather the type_h vector of every predicted head.
        t = tf.cast(tf.transpose(heads), tf.int32)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), 
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        type_h = tf.gather_nd(type_h, concatenated)
        out_type = self.bilinear.forward(type_h, type_c)
        out_type = out_type[:, :, leading_symbolic:]
        # The argmax runs over the sliced label axis, so shift it back;
        # otherwise every id is off by `leading_symbolic`.
        types = tf.argmax(out_type, axis = 2) + leading_symbolic
        return heads, types
    
class Model:
    """Builds the parser graph: placeholders, training loss, optimizer step and decode op.

    Parameters
    ----------
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    cov : float
        Not read anywhere in this constructor.
    """
    def __init__(self, learning_rate = 1e-3, cov = 0.0):
        # Vocabulary and label sizes are taken from the module-level dictionaries,
        # so those must be fully populated before the model is created.
        self.attention = Attention(word_dim = 128, 
                            num_words = len(word2idx), 
                            char_dim = 128, 
                            num_chars = len(char2idx), 
                            num_filters = 128, 
                            kernel_size = 3,
                            hidden_size = 256, 
                            encoder_layers = 1,
                            num_labels = len(tag2idx), 
                            arc_space = 128, 
                            type_space = 128)
        # Inputs: word ids, character ids, gold heads, gold label ids and the padding mask.
        self.words = tf.placeholder(tf.int32, (None, None))
        self.chars = tf.placeholder(tf.int32, (None, None, None))
        self.heads = tf.placeholder(tf.int32, (None, None))
        self.types = tf.placeholder(tf.int32, (None, None))
        self.mask = tf.placeholder(tf.float32, (None, None))
        loss_arc, loss_type = self.attention.loss(self.words, self.chars,
                                                 self.mask, self.heads, self.types)
        # Total cost is the unweighted sum of the arc and label losses.
        self.cost = loss_arc + loss_type
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        # (heads, types) prediction op; only needs words, chars and mask to be fed.
        self.decode = self.attention.decode(self.words, self.chars, self.mask)

In [None]:
# Start from an empty default graph, open an interactive session, build the
# model in that graph and initialise all of its variables.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()
sess.run(tf.global_variables_initializer())

In [None]:
# Unpack the first training mini-batch; the order matches the tuples built by process_data.
batch_w, batch_c, batch_heads, batch_masks, batch_types = train[0]

In [None]:
%%time
# Time the decode op that Model.__init__ already built, feeding the first five
# rows of the current batch through the placeholders.
sess.run(model.decode, feed_dict = {model.words: batch_w[:5],
                                    model.chars: batch_c[:5],
                                    model.mask: batch_masks[:5]})

#model.attention.decode(batch_w[:5], batch_c[:5], batch_masks[:5])

In [None]:
%%time
# Same decoding, but decode() is called directly on the numpy slices, so a new
# set of decode ops is added to the graph before being run.
sess.run(model.attention.decode(batch_w[:5], batch_c[:5], batch_masks[:5]))

#model.attention.decode(batch_w[:5], batch_c[:5], batch_masks[:5])

In [None]:
def evaluate(words, heads_pred, types_pred, heads, types, lengths,
             symbolic_root=False, symbolic_end=False):
    """Count attachment statistics for one batch of predictions.

    Positions ``start .. lengths[i] - end - 1`` of every row are scored, where
    `start` / `end` are 1 when `symbolic_root` / `symbolic_end` is set.

    Returns
    -------
    tuple
        ``(ucorr, lcorr, total, ucomplete_match, lcomplete_match)``: correct
        heads, correct head+label pairs, scored tokens, rows with all heads
        correct, rows with all heads and labels correct;
        ``(corr_root, total_root)``: tokens whose gold head is 0 that were also
        predicted with head 0, and their total;
        ``batch_size``.
    """
    batch_size, _ = words.shape
    start = int(bool(symbolic_root))
    end = int(bool(symbolic_end))

    ucorr = lcorr = total = 0.
    ucomplete_match = lcomplete_match = 0.
    corr_root = total_root = 0.

    for row in range(batch_size):
        all_heads_ok = True
        all_labels_ok = True
        for col in range(start, lengths[row] - end):
            total += 1
            head_ok = heads[row, col] == heads_pred[row, col]
            # A label only counts when the head is right as well.
            label_ok = head_ok and types[row, col] == types_pred[row, col]

            if head_ok:
                ucorr += 1
            else:
                all_heads_ok = False
            if label_ok:
                lcorr += 1
            else:
                all_labels_ok = False

            if heads[row, col] == 0:
                total_root += 1
                if heads_pred[row, col] == 0:
                    corr_root += 1

        ucomplete_match += 1. if all_heads_ok else 0.
        lcomplete_match += 1. if all_labels_ok else 0.

    token_stats = (ucorr, lcorr, total, ucomplete_match, lcomplete_match)
    root_stats = (corr_root, total_root)
    return token_stats, root_stats, batch_size

In [None]:
from tqdm import tqdm
# Number of full passes over the training batches.
epoch = 20

for e in range(epoch):
    train_loss, test_loss = [], []
    # Training pass: one optimizer step per mini-batch, in the stored batch order.
    pbar = tqdm(range(len(train)), desc = 'train minibatch loop')
    for k in pbar:
        batch_w, batch_c, batch_heads, batch_masks, batch_types = train[k]
        feed_dict = {model.words: batch_w,
                     model.chars: batch_c,
                     model.heads: batch_heads,
                     model.mask: batch_masks,
                     model.types: batch_types}
        cost, _ = sess.run([model.cost, model.optimizer], feed_dict = feed_dict)
        train_loss.append(cost)
        pbar.set_postfix(cost = cost)
    
    # Held-out pass: only the cost is evaluated, the optimizer op is not run.
    pbar = tqdm(range(len(test)), desc = 'test minibatch loop')
    for k in pbar:
        batch_w, batch_c, batch_heads, batch_masks, batch_types = test[k]
        feed_dict = {model.words: batch_w,
                     model.chars: batch_c,
                     model.heads: batch_heads,
                     model.mask: batch_masks,
                     model.types: batch_types}
        cost = sess.run(model.cost, feed_dict = feed_dict)
        test_loss.append(cost)
        pbar.set_postfix(cost = cost)
    
    # Unweighted mean over mini-batches (batches may differ in size).
    print('epoch: %d, avg train loss: %f, avg test loss: %f'%(e + 1, np.mean(train_loss),
                                                             np.mean(test_loss)))