In [2]:
%load_ext autoreload
%autoreload 2

# <center> Semantic Relation Classification via Bidirectional LSTM Networks w/ Entity-aware Attention using Latent Entity Typing </center>

# <center> Tensorflow Estimator </center>

This repository contains the official TensorFlow implementation of the following paper:

> **Semantic Relation Classification via Bidirectional LSTM Networks with Entity-aware Attention using Latent Entity Typing**<br>
> Joohong Lee, Sangwoo Seo, Yong Suk Choi<br>
> [https://arxiv.org/abs/1901.08163](https://arxiv.org/abs/1901.08163)
> 
> **Abstract:** *Classifying semantic relations between entity pairs in sentences is an important task in Natural Language Processing (NLP). Most previous models for relation classification rely on the high-level lexical and syntactic features obtained by NLP tools such as WordNet, dependency parser, part-of-speech (POS) tagger, and named entity recognizers (NER). In addition, state-of-the-art neural models based on attention mechanisms do not fully utilize information of entity that may be the most crucial features for relation classification. To address these issues, we propose a novel end-to-end recurrent neural model which incorporates an entity-aware attention mechanism with a latent entity typing (LET) method. Our model not only utilizes entities and their latent types as features effectively but also is more interpretable by visualizing attention mechanisms applied to our model and results of LET. Experimental results on the SemEval-2010 Task 8, one of the most popular relation classification task, demonstrate that our model outperforms existing state-of-the-art models without any high-level features.*

![title](https://user-images.githubusercontent.com/15166794/52579582-c7339100-2e69-11e9-9081-711e7576e717.png)

Code was transformed into an estimator format from the following repository:

> **Entity-aware Attention for Relation Classification**<br>    
> [https://github.com/roomylee/entity-aware-relation-classification](https://github.com/roomylee/entity-aware-relation-classification)

## Required Libraries 

Please install them manually using conda or pip.

In [3]:
import tensorflow as tf
from tensorflow import data
import pandas as pd
import swifter
from datetime import datetime
from keras.preprocessing.text import Tokenizer
import multiprocessing
import shutil
import re
import nltk
from tqdm import tqdm_notebook
import numpy as np

Using TensorFlow backend.


## Global Variables  and FLAGS

In [4]:
MODEL_NAME = 'EARC-model-01'
TRAIN_DATA_FILES_PATTERN = 'data/processed/train-*.tsv'
VALID_DATA_FILES_PATTERN = 'data/processed/valid-*.tsv'
VOCAB_LIST_FILE = 'data/processed/vocab_list.tsv'
POS_LIST_FILE = 'data/processed/posidx_list.tsv'
RESUME_TRAINING = True
MULTI_THREADING = True

CLASS_TO_LABEL = {'Other': 0,
                  'Message-Topic(e1,e2)': 1, 'Message-Topic(e2,e1)': 2,
                  'Product-Producer(e1,e2)': 3, 'Product-Producer(e2,e1)': 4,
                  'Instrument-Agency(e1,e2)': 5, 'Instrument-Agency(e2,e1)': 6,
                  'Entity-Destination(e1,e2)': 7, 'Entity-Destination(e2,e1)': 8,
                  'Cause-Effect(e1,e2)': 9, 'Cause-Effect(e2,e1)': 10,
                  'Component-Whole(e1,e2)': 11, 'Component-Whole(e2,e1)': 12,
                  'Entity-Origin(e1,e2)': 13, 'Entity-Origin(e2,e1)': 14,
                  'Member-Collection(e1,e2)': 15, 'Member-Collection(e2,e1)': 16,
                  'Content-Container(e1,e2)': 17, 'Content-Container(e2,e1)': 18}

TARGET_LABELS = list(CLASS_TO_LABEL.keys())

In [5]:
MAX_DOCUMENT_LENGTH = 90
PAD_WORD = '#=KS=#'
HEADER_DEFAULTS = [['NA'], ['NA'], ['NA'], ['NA'], ['NA'], ['NA']]
TARGET_NAME = 'class'
WEIGHT_COLUNM_NAME = 'weight' #This will not be used for now

## Text Cleaning & Feature Engineering

A text input needs to be transformed, using ```text2features```. We also need to create a tsv file containing the training data and testing data.

```python
x1 = "<e1>Avocados</e1> come from farms in the <e2>south of Mexico</e2>"

text2features(x1, MAX_DOCUMENT_LENGTH)
```

In [6]:
def clean_str(text):
    text = text.lower()
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"that's", "that is ", text)
    text = re.sub(r"there's", "there is ", text)
    text = re.sub(r"it's", "it is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

In [7]:
def text2features(text, max_sentence_length):
    text = text.replace('<e1>', ' _e11_ ')
    text = text.replace('</e1>', ' _e12_ ')
    text = text.replace('<e2>', ' _e21_ ')
    text = text.replace('</e2>', ' _e22_ ')
    text = clean_str(text)
    tokens = nltk.word_tokenize(text)
    e1 = tokens.index("e12") - 1
    e2 = tokens.index("e22") - 1
    p1 = ""
    p2 = ""
    for word_idx in range(len(tokens)):
        p1 += str((max_sentence_length - 1) + word_idx - e1) + " "
        p2 += str((max_sentence_length - 1) + word_idx - e2) + " "
    text = " ".join(tokens)
    features = {
        'text': text, 
        'e1': e1,
        'e2': e2,
        'p1': p1,
        'p2': p2
    }
    return features

In [8]:
df = pd.read_csv('./data/raw-train-data.tsv', sep='\t', header=None, names=['class', 'text'])
s = df.text.swifter.apply(lambda x: text2features(x, MAX_DOCUMENT_LENGTH))
train = pd.DataFrame(s.to_list())
train['class'] = df['class']
HEADER = train.columns

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=8000, style=ProgressStyle(description_widt…




In [9]:
df = pd.read_csv('./data/raw-valid-data.tsv', sep='\t', header=None, names=['class', 'text'])
s = df.text.swifter.apply(lambda x: text2features(x, MAX_DOCUMENT_LENGTH))
test = pd.DataFrame(s.to_list())
test['class'] = df['class']

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=2717, style=ProgressStyle(description_widt…




In [10]:
train.to_csv('./data/processed/train-data.tsv', sep='\t', index=False, header=False)
test.to_csv('./data/processed/valid-data.tsv', sep='\t', index=False, header=False)

In [11]:
print("tokenizing input data...")
tokenizer = Tokenizer(lower=False, char_level=False)
tokenizer.fit_on_texts(train.text.tolist() + test.text.tolist())  #leaky
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

tokenizing input data...
dictionary size:  22382


In [12]:
vocab = list(word_index.keys())
vocab.insert(0, '#=KS=#')
N_WORDS = len(vocab)
with open('./data/processed/vocab_list.tsv', 'w+') as f:
    for w in vocab:
        f.write(w + '\n')

In [13]:
pos_tokenizer = Tokenizer(lower=False, char_level=False)
pos_tokenizer.fit_on_texts(train.p1.tolist() + train.p2.tolist() + test.p1.tolist() + test.p2.tolist())  #leaky

In [14]:
pos_idx = list(pos_tokenizer.word_index.keys())
pos_idx.insert(0, '0')
N_POS = len(pos_idx)
with open('./data/processed/posidx_list.tsv', 'w+') as f:
    for p in pos_idx:
        f.write(p + '\n')

## Build Glove Embedding Matrix

In [15]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm_notebook(f))

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    unknown_words = []
    
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

In [16]:
FAST_TEXT = '/home/luis.magana/embeddings/crawl-300d-2M.vec'
GLOVE = '/home/luis.magana/embeddings/glove.840B.300d.txt'

In [17]:
embedding_matrix, _ = build_matrix(word_index, GLOVE)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




## Estimator Functions

In [18]:
def parse_tsv_row(tsv_row):
    """
    This function assumes that the text has already been cleaned and tokenized. 
    """
    data = tf.decode_csv(tsv_row, record_defaults=HEADER_DEFAULTS, field_delim='\t')
    features = dict(zip(HEADER, data))
    target = features.pop(TARGET_NAME)
    # giving more weight to "spam" records are the are only 13% of the training set
    # features[WEIGHT_COLUNM_NAME] =  tf.cond( tf.equal(target,'spam'), lambda: 6.6, lambda: 1.0 ) 
    features[WEIGHT_COLUNM_NAME] = 1 # set to one for now
    return features, target

In [19]:
def parse_label_2_one_hot(label_string_tensor):
    """
    Takes a tensor string containg the labeled class and returns a one-hot vector representation.
    """
    table = tf.contrib.lookup.index_table_from_tensor(tf.constant(TARGET_LABELS))
    index = table.lookup(label_string_tensor)
    #one_m_depth =  tf.dtypes.cast(tf.keras.backend.max(index), tf.int32) 
    #depth = tf.math.add(tf.constant(1), one_m_depth)
    return tf.one_hot(index, len(TARGET_LABELS))

def input_fn(files_name_pattern, mode=tf.estimator.ModeKeys.EVAL, 
                 skip_header_lines=0, 
                 num_epochs=1,
                 batch_size=200):
    """
    Input Function for tensorflow estimator. Returns the tensor features and 
    one-hot representation of the target class.
    """
    shuffle = True if mode == tf.estimator.ModeKeys.TRAIN else False
    
    num_threads = multiprocessing.cpu_count() if MULTI_THREADING else 1
    
    buffer_size = 2 * batch_size + 1
   
    print("\n", "* data input_fn:")
    print("===========================================")
    print("Input file(s): {}".format(files_name_pattern))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("===========================================", "\n")

    file_names = tf.matching_files(files_name_pattern)
    dataset = data.TextLineDataset(filenames=file_names)
    
    dataset = dataset.skip(skip_header_lines)
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size)
        
    dataset = dataset.map(lambda tsv_row: parse_tsv_row(tsv_row), 
                          num_parallel_calls=num_threads)
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.prefetch(buffer_size)
    
    iterator = dataset.make_one_shot_iterator()
    
    features, target = iterator.get_next()
    return features, parse_label_2_one_hot(target)

def process_text(text_feature): 
    """
    The text features will be transformed into a word id vector. 
    
    in  ---> ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    out ---> [27 39 40 41 42  1 43  0  0 ... 0]
    """
    # Load vocabolary lookup table to map word => word_id
    vocab_table = tf.contrib.lookup.index_table_from_file(vocabulary_file=VOCAB_LIST_FILE, 
                                                          num_oov_buckets=1, default_value=-1)
    # Split text to words -> this will produce sparse tensor with variable-lengthes (word count) entries
    words = tf.string_split(text_feature)
    # Convert sparse tensor to dense tensor by padding each entry to match the longest in the batch
    dense_words = tf.sparse_tensor_to_dense(words, default_value=PAD_WORD)
    # Convert word to word_ids via the vocab lookup table
    word_ids = vocab_table.lookup(dense_words)
    # Create a word_ids padding
    padding = tf.constant([[0,0],[0,MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    word_ids_padded = tf.pad(word_ids, padding)
    word_id_vector = tf.slice(word_ids_padded, [0,0], [-1, MAX_DOCUMENT_LENGTH])
    # Return the final word_id_vector
    return word_id_vector

def process_pos(text_feature):
    """
    The text features will be transformed into a position id vector. See model diagram and paper for more information.
    
    in  ---> [95 96 97 98 99 100 101 999 999 999 ... 999]
    out ---> [11 12 13 14 15  16  21  17  17  17 ...  17]
    """
    # Load vocabolary lookup table to map word => word_id
    pos_table = tf.contrib.lookup.index_table_from_file(vocabulary_file=POS_LIST_FILE, 
                                                          num_oov_buckets=1, default_value=-1)
    # Split text to words -> this will produce sparse tensor with variable-lengthes (word count) entries
    poss = tf.string_split(text_feature)
    # Convert sparse tensor to dense tensor by padding each entry to match the longest in the batch
    dense_poss = tf.sparse_tensor_to_dense(poss, default_value='0')
    # Convert word to word_ids via the vocab lookup table
    pos_ids = pos_table.lookup(dense_poss)
    # Create a word_ids padding
    padding = tf.constant([[0,0],[0, MAX_DOCUMENT_LENGTH]])
    # Pad all the word_ids entries to the maximum document length
    pos_ids_padded = tf.pad(pos_ids, padding)
    pos_id_vector = tf.slice(pos_ids_padded, [0,0], [-1, MAX_DOCUMENT_LENGTH])
    # Return the final word_id_vector
    return pos_id_vector

def process_ent(text_feature):
    """
    The text features will be transformed into tensor of type int32.
    """
    return tf.strings.to_number(text_feature, out_type=tf.dtypes.int32)

## Custom Attention Blocks for RNNs and Entities

These functions were obtained directly from the paper's original code. Please see original author for questions. 

In [20]:
def attention(inputs, e1, e2, p1, p2, attention_size):
    # inputs = (batch, seq_len, hidden)
    # e1, e2 = (batch, seq_len)
    # p1, p2 = (batch, seq_len, dist_emb_size)
    # attention_size = scalar(int)
    def extract_entity(x, e):
        e_idx = tf.concat([tf.expand_dims(tf.range(tf.shape(e)[0]), axis=-1), tf.expand_dims(e, axis=-1)], axis=-1)
        return tf.gather_nd(x, e_idx)  # (batch, hidden)
    seq_len = tf.shape(inputs)[1]  # fixed at run-time
    hidden_size = inputs.shape[2].value  # fixed at compile-time
    latent_size = hidden_size

    # Latent Relation Variable based on Entities
    e1_h = extract_entity(inputs, e1)  # (batch, hidden)
    e2_h = extract_entity(inputs, e2)  # (batch, hidden)
    e1_type, e2_type, e1_alphas, e2_alphas = latent_type_attention(e1_h, e2_h,
                                                                   num_type=3,
                                                                   latent_size=latent_size)  # (batch, hidden)
    e1_h = tf.concat([e1_h, e1_type], axis=-1)  # (batch, hidden+latent)
    e2_h = tf.concat([e2_h, e2_type], axis=-1)  # (batch, hidden+latent)

    # v*tanh(W*[h;p1;p2]+W*[e1;e2]) 85.18%? 84.83% 84.55%
    e_h = tf.layers.dense(tf.concat([e1_h, e2_h], -1), attention_size, use_bias=False, kernel_initializer=initializer())
    e_h = tf.reshape(tf.tile(e_h, [1, seq_len]), [-1, seq_len, attention_size])
    v = tf.layers.dense(tf.concat([inputs, p1, p2], axis=-1), attention_size, use_bias=False, kernel_initializer=initializer())
    v = tf.tanh(tf.add(v, e_h))

    u_omega = tf.get_variable("u_omega", [attention_size], initializer=initializer())
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (batch, seq_len)
    alphas = tf.nn.softmax(vu, name='alphas')  # (batch, seq_len)

    # v*tanh(W*[h;p1;p2;e1;e2]) 85.18% 84.41%
    # e1_h = tf.reshape(tf.tile(e1_h, [1, seq_len]), [-1, seq_len, hidden_size+latent_size])
    # e2_h = tf.reshape(tf.tile(e2_h, [1, seq_len]), [-1, seq_len, hidden_size+latent_size])
    # v = tf.concat([inputs, p1, p2, e1_h, e2_h], axis=-1)
    # v = tf.layers.dense(v, attention_size, activation=tf.tanh, kernel_initializer=initializer())
    #
    # u_omega = tf.get_variable("u_omega", [attention_size], initializer=initializer())
    # vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (batch, seq_len)
    # alphas = tf.nn.softmax(vu, name='alphas')  # (batch, seq_len)

    # output
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)  # (batch, hidden)

    return output, alphas, e1_alphas, e2_alphas


def latent_type_attention(e1, e2, num_type, latent_size):
    # Latent Entity Type Vectors
    latent_type = tf.get_variable("latent_type", shape=[num_type, latent_size], initializer=initializer())

    # e1_h = tf.layers.dense(e1, latent_size, kernel_initializer=initializer())
    # e2_h = tf.layers.dense(e2, latent_size, kernel_initializer=initializer())

    e1_sim = tf.matmul(e1, tf.transpose(latent_type))  # (batch, num_type)
    e1_alphas = tf.nn.softmax(e1_sim, name='e1_alphas')  # (batch, num_type)
    e1_type = tf.matmul(e1_alphas, latent_type, name='e1_type')  # (batch, hidden)

    e2_sim = tf.matmul(e2, tf.transpose(latent_type))  # (batch, num_type)
    e2_alphas = tf.nn.softmax(e2_sim, name='e2_alphas')  # (batch, num_type)
    e2_type = tf.matmul(e2_alphas, latent_type, name='e2_type')  # (batch, hidden)

    return e1_type, e2_type, e1_alphas, e2_alphas


def multihead_attention(queries, keys, num_units, num_heads,
                        dropout_rate=0, scope="multihead_attention", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # Linear projections
        Q = tf.layers.dense(queries, num_units, kernel_initializer=initializer())  # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, kernel_initializer=initializer())  # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, kernel_initializer=initializer())  # (N, T_k, C)

        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)

        # Scale
        outputs /= K_.get_shape().as_list()[-1] ** 0.5

        # Key Masking
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

        paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Activation
        alphas = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

        # Query Masking
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        alphas *= query_masks  # broadcasting. (N, T_q, C)

        # Dropouts
        alphas = tf.layers.dropout(alphas, rate=dropout_rate, training=tf.convert_to_tensor(True))

        # Weighted sum
        outputs = tf.matmul(alphas, V_)  # ( h*N, T_q, C/h)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)

        # Linear
        outputs = tf.layers.dense(outputs, num_units, activation=tf.nn.relu, kernel_initializer=initializer())

        # Residual connection
        outputs += queries

        # Normalize
        outputs = layer_norm(outputs)  # (N, T_q, C)

    return outputs, alphas


def layer_norm(inputs, epsilon=1e-8, scope="layer_norm", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]

        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.Variable(tf.zeros(params_shape))
        gamma = tf.Variable(tf.ones(params_shape))
        normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
        outputs = gamma * normalized + beta

    return outputs

## Tensorflow Estimator 

In [27]:
def initializer():
    return tf.keras.initializers.glorot_normal()

def seq_length(seq):
    relevant = tf.sign(tf.abs(seq))
    length = tf.reduce_sum(relevant, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

def model_fn(features, labels, mode, params):
    embedding_size = params.embedding_size
    pos_vocab_size = params.pos_vocab_size
    pos_embedding_size = params.pos_embedding_size
    num_heads = params.num_heads
    attention_size = params.attention_size
    hidden_size = params.hidden_size
    l2_reg_lambda = params.l2_reg_lambda
    emb_dropout_keep_prob = params.emb_dropout_keep_prob
    rnn_dropout_keep_prob = params.rnn_dropout_keep_prob
    dropout_keep_prob = params.dropout_keep_prob
    num_classes = params.num_classes
    learning_rate = params.learning_rate
    decay_rate = params.decay_rate
    
    input_x = process_text(features['text'])
    input_e1 = process_ent(features['e1'])
    input_e2 = process_ent(features['e2'])
    input_p1 = process_pos(features['p1'])
    input_p2 = process_pos(features['p2'])
    
    # Word Embedding Layer
    with tf.name_scope('word-embeddings'):
        W_text = tf.get_variable("W_text", [N_WORDS, embedding_size], 
                                    initializer=tf.constant_initializer(params.embedding_initializer()))
        embedded_chars = tf.nn.embedding_lookup(W_text, input_x)
        #embedded_chars = tf.contrib.layers.embed_sequence(
        #    input_x, N_WORDS, embedding_size,
        #    initializer=params.embedding_initializer)
        
    # Position Embedding Layer
    with tf.name_scope('position-embeddings'):
        W_pos = tf.get_variable("W_pos", [pos_vocab_size, pos_embedding_size], initializer=initializer())
        p1 = tf.nn.embedding_lookup(W_pos, input_p1)[:, :tf.shape(embedded_chars)[1]]
        p2 = tf.nn.embedding_lookup(W_pos, input_p2)[:, :tf.shape(embedded_chars)[1]]
    
    # Dropout for Word Embedding
    with tf.variable_scope('dropout-embeddings'):
        if mode == tf.estimator.ModeKeys.TRAIN:
            embedded_chars = tf.nn.dropout(embedded_chars,  emb_dropout_keep_prob)
        else:
            embedded_chars = tf.nn.dropout(embedded_chars,  1.0)
    
    # Self Attention
    with tf.variable_scope("self-attention"):
        attn, alphas = multihead_attention(embedded_chars, embedded_chars,
                                                                num_units=embedding_size, num_heads=num_heads)
    # Bidirectional LSTM
    with tf.variable_scope("bi-lstm"):
        #tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell <--- speed up
        #tf.nn.rnn_cell.LSTMCell <--- normal LSTMCell # Must Use Initializer
        if mode == tf.estimator.ModeKeys.TRAIN:
            rnn_dropout_keep_prob = 1.0
        else:
            rnn_dropout_keep_prob = params.rnn_dropout_keep_prob
        _fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(_fw_cell, rnn_dropout_keep_prob)
        _bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(_bw_cell, rnn_dropout_keep_prob)
        rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,
                                                              cell_bw=bw_cell,
                                                              inputs=attn,
                                                              sequence_length=seq_length(input_x),
                                                              dtype=tf.float32)
        rnn_outputs = tf.concat(rnn_outputs, axis=-1)
        
    # Attention
    with tf.variable_scope('attention'):
        attn, alphas, e1_alphas, e2_alphas = attention(rnn_outputs, input_e1, input_e2,
                                                       p1, p2, attention_size=attention_size)
    # Dropout
    with tf.variable_scope('dropout'):
        if mode == tf.estimator.ModeKeys.TRAIN:
            h_drop = tf.nn.dropout(attn, dropout_keep_prob)
        else: 
            h_drop = tf.nn.dropout(attn, 1.0)

    # Fully connected layer
    with tf.variable_scope('output'):
        logits = tf.layers.dense(h_drop, num_classes, kernel_initializer=initializer())
        probabilities = tf.nn.softmax(logits)
        predicted_indices = tf.argmax(logits, 1, name="predictions")
        
    # Provide an estimator spec for `ModeKeys.PREDICT`.
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Convert predicted_indices back into strings
        predictions = {
            'class': tf.gather(TARGET_LABELS, predicted_indices),
            'probabilities': probabilities
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        # Provide an estimator spec for `ModeKeys.PREDICT` modes.
        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)
    
    # weights
    # weights = features[WEIGHT_COLUNM_NAME]
    
    # Calculate mean cross-entropy loss
    losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels)
    l2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
    loss = tf.reduce_mean(losses) + l2_reg_lambda * l2
    loss = tf.identity(loss, name="loss")
    tf.summary.scalar("loss", loss)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        #Calc Train Accuracy
        accuracy = tf.metrics.accuracy(labels=tf.argmax(labels, 1), predictions=predicted_indices)
        # Save accuracy scalar to Tensorboard output.
        tf.summary.scalar('train_accuracy', accuracy[1])
        
        #Create Hook
        summary_hook = tf.train.SummarySaverHook(
            save_steps=20,
            output_dir='/tmp/tf/train',
            summary_op=tf.summary.merge_all())
        
        # Create Optimiser
        optimizer = tf.train.AdadeltaOptimizer(learning_rate, decay_rate, 1e-6)

        # Create training operation
        gvs = optimizer.compute_gradients(loss)
        capped_gvs = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gvs]
        train_op = optimizer.apply_gradients(capped_gvs, global_step=tf.train.get_global_step())

        # Provide an estimator spec for `ModeKeys.TRAIN` modes.
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss, 
                                          train_op=train_op, 
                                          training_hooks=[summary_hook])
          
    if mode == tf.estimator.ModeKeys.EVAL: 
        #Calc Train Accuracy
        eval_accuracy = tf.metrics.accuracy(labels=tf.argmax(labels, 1), predictions=predicted_indices)
        # Save accuracy scalar to Tensorboard output.
        tf.summary.scalar('eval_accuracy', eval_accuracy[1])
        
        eval_metric_ops = {
            'accuracy': eval_accuracy,
            'f1_score': tf.contrib.metrics.f1_score(labels, probabilities)
        }
        # Provide an estimator spec for `ModeKeys.EVAL` modes.
        return tf.estimator.EstimatorSpec(mode, 
                                          loss=loss,
                                          eval_metric_ops=eval_metric_ops)

In [28]:
def embedding_initializer(shape=None, dtype=tf.float32, partition_info=None):
    assert dtype is tf.float32
    return embedding_matrix

def create_estimator(run_config, hparams):
    estimator = tf.estimator.Estimator(model_fn=model_fn, 
                                  params=hparams, 
                                  config=run_config)
    print("Estimator Type: {}".format(type(estimator)))
    return estimator

## Hyper-Parameter Tunning

In [29]:
TRAIN_SIZE = train.shape[0]
NUM_EPOCHS = 100
BATCH_SIZE = 20
EVAL_AFTER_SEC = 600
TOTAL_STEPS = int((TRAIN_SIZE/BATCH_SIZE)*NUM_EPOCHS)

model_dir = 'trained_models/{}'.format(MODEL_NAME)

hparams  = tf.contrib.training.HParams(
    embedding_initializer = embedding_initializer,
    num_epochs = NUM_EPOCHS,
    batch_size = BATCH_SIZE,
    num_classes = len(TARGET_LABELS),
    embedding_size = 300,
    pos_embedding_size = 50,
    pos_vocab_size = len(pos_idx),
    emb_dropout_keep_prob = 0.3,
    hidden_size = 300,
    rnn_dropout_keep_prob = 0.3,
    dropout_keep_prob = 0.5,
    num_heads = 4,
    attention_size = 50,
    learning_rate = 1.0,
    decay_rate = 0.9,
    max_steps = TOTAL_STEPS,
    l2_reg_lambda = 1e-5,
    model_dir = model_dir
)

session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
session_conf.gpu_options.allow_growth = True

run_config = tf.estimator.RunConfig(
    session_config=session_conf,
    log_step_count_steps=300,
    save_checkpoints_steps=5000,
    tf_random_seed=17081992,
    model_dir=model_dir
)

print(hparams, "\n")
print("Model Directory:", run_config.model_dir)
print("Dataset Size:", TRAIN_SIZE)
print("Batch Size:", BATCH_SIZE)
print("Steps per Epoch:",TRAIN_SIZE/BATCH_SIZE)
print("Total Steps:", TOTAL_STEPS)
print("That is 1 evaluation step after each", EVAL_AFTER_SEC, "training seconds")

[('attention_size', 50), ('batch_size', 20), ('decay_rate', 0.9), ('dropout_keep_prob', 0.5), ('emb_dropout_keep_prob', 0.3), ('embedding_initializer', <function embedding_initializer at 0x7ff86d796d08>), ('embedding_size', 300), ('hidden_size', 300), ('l2_reg_lambda', 1e-05), ('learning_rate', 1.0), ('max_steps', 40000), ('model_dir', 'trained_models/EARC-model-01'), ('num_classes', 19), ('num_epochs', 100), ('num_heads', 4), ('pos_embedding_size', 50), ('pos_vocab_size', 162), ('rnn_dropout_keep_prob', 0.3)] 

Model Directory: trained_models/EARC-model-01
Dataset Size: 8000
Batch Size: 20
Steps per Epoch: 400.0
Total Steps: 40000
That is 1 evaluation step after each 600 training seconds


## Estimator Training/Validation

In [30]:
def serving_input_fn():
    """
    Serving input function, should contain a receiver_tensor with the same features as the input during training, 
    this function will also be used during validation.
    """
    receiver_tensor = {
      'text': tf.placeholder(tf.string, [None]),
      'e1': tf.placeholder(tf.string, [None]),
      'e2': tf.placeholder(tf.string, [None]),
      'p1': tf.placeholder(tf.string, [None]),
      'p2': tf.placeholder(tf.string, [None]),
    }
    features = {
      key: tensor
      for key, tensor in receiver_tensor.items()
    }
    return tf.estimator.export.ServingInputReceiver(
        features, receiver_tensor)

In [31]:
train_spec = tf.estimator.TrainSpec(
    input_fn = lambda: input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode = tf.estimator.ModeKeys.TRAIN,
        num_epochs=hparams.num_epochs,
        batch_size=hparams.batch_size
    ),
    max_steps=hparams.max_steps,
    hooks=None
)

eval_spec = tf.estimator.EvalSpec(
    input_fn = lambda: input_fn(
        VALID_DATA_FILES_PATTERN,
        mode=tf.estimator.ModeKeys.EVAL,
        batch_size=hparams.batch_size
    ),
    exporters=[tf.estimator.LatestExporter(
        name="predict", # the name of the folder in which the model will be exported to under export
        serving_input_receiver_fn=serving_input_fn,
        exports_to_keep=1,
        as_text=True)],
    steps=None,
    throttle_secs = EVAL_AFTER_SEC
)

In [None]:
#if not RESUME_TRAINING:
print("Removing previous artifacts...")
shutil.rmtree(model_dir, ignore_errors=True)
#else:
#    print("Resuming training...")
    
tf.logging.set_verbosity(tf.logging.INFO)

time_start = datetime.utcnow() 
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

import logging

# get TF logger
log = logging.getLogger('tensorflow')
log.setLevel(logging.DEBUG)

# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# create file handler which logs even debug messages
fh = logging.FileHandler('tensorflow.log')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
log.addHandler(fh)

estimator = create_estimator(run_config, hparams)

tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec, 
        eval_spec=eval_spec
)

time_end = datetime.utcnow() 
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

Removing previous artifacts...
Experiment started at 16:32:40
.......................................
INFO:tensorflow:Using config: {'_model_dir': 'trained_models/EARC-model-01', '_tf_random_seed': 17081992, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': gpu_options {
  allow_growth: true
}
allow_soft_placement: true
log_device_placement: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 300, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff86d6732e8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Estimator Type: <class 'tensorflow_estimator.python.estimator.estimator.E

## Testing and Inference

In [None]:
TEST_SIZE = 1393
test_input_fn = lambda: input_fn(files_name_pattern= VALID_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TEST_SIZE)
test_results = estimator.evaluate(input_fn=test_input_fn, steps=1)
print("# Test Measures: {}".format(test_results))

In [None]:
myExamples = [
    "This is the sprawling <e1>complex</e1> that is Peru's largest <e2>producer</e2> of silver.", #Other
    "The <e1>dog</e1> knocked over the monitor, and now is <e2>broken</e2>", #what is broken? the dog? 
    "<e1>Avocados</e1> come from farms in the <e2>south of Mexico</e2>" #Entity-Origin
]

In [None]:
s = pd.Series(myExamples).apply(lambda x: text2features(x, MAX_DOCUMENT_LENGTH))
feed = pd.DataFrame(s.tolist()).astype(str).to_dict(orient='list')

In [None]:
import os

export_dir = model_dir + "/export/predict/"

saved_model_dir = export_dir + "/" + os.listdir(path=export_dir)[-1] 

print(saved_model_dir, "\n")

predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir = saved_model_dir,
    signature_def_key="prediction"
)

output = predictor_fn(feed)
print(output)