In [1]:
import tensorflow as tf
import re
import time
import os
import pickle


def BLSTM(input, hidden_state_dimension, initializer, sequence_length=None, output_sequence=True):
    """Build a bidirectional LSTM layer with trainable initial states.

    Args:
        input: Batch-major tensor fed to `tf.nn.bidirectional_dynamic_rnn`
            (axis 0 is the batch, axis 1 is time).
        hidden_state_dimension: Number of units of each directional LSTM cell.
        initializer: Initializer used for the LSTM weights and for the trainable
            initial cell/output states.
        sequence_length: Optional int tensor with one length per batch entry.
            When omitted, `input` is treated as a batch of exactly one sequence
            whose length is the size of its time axis.
        output_sequence: If true, return the per-time-step outputs; otherwise
            return only the final output states.

    Returns:
        If `output_sequence` is true, the forward and backward outputs concatenated
        along the last axis (batch, time, 2 * hidden_state_dimension). Otherwise the
        final forward and backward output (h) states concatenated along axis 1
        (batch, 2 * hidden_state_dimension).
    """
    with tf.variable_scope("bidirectional_LSTM"):
        # Identity test: `sequence_length` may be a tensor, and `==` on tensors is an
        # overloadable operator rather than a reliable "was it provided?" check.
        if sequence_length is None:
            batch_size = 1
            sequence_length = tf.shape(input)[1]
            sequence_length = tf.expand_dims(sequence_length, axis=0, name='sequence_length')
        else:
            batch_size = tf.shape(sequence_length)[0]

        lstm_cell = {}
        initial_state = {}
        for direction in ["forward", "backward"]:
            with tf.variable_scope(direction):
                # LSTM cell
                lstm_cell[direction] = tf.contrib.rnn.CoupledInputForgetGateLSTMCell(hidden_state_dimension,
                                                                                     forget_bias=1.0,
                                                                                     initializer=initializer,
                                                                                     state_is_tuple=True)
                # Trainable initial state, learned once and tiled over the batch:
                # http://stackoverflow.com/questions/38441589/tensorflow-rnn-initial-state
                initial_cell_state = tf.get_variable("initial_cell_state", shape=[1, hidden_state_dimension],
                                                     dtype=tf.float32, initializer=initializer)
                initial_output_state = tf.get_variable("initial_output_state", shape=[1, hidden_state_dimension],
                                                       dtype=tf.float32, initializer=initializer)
                c_states = tf.tile(initial_cell_state, tf.stack([batch_size, 1]))
                h_states = tf.tile(initial_output_state, tf.stack([batch_size, 1]))
                initial_state[direction] = tf.contrib.rnn.LSTMStateTuple(c_states, h_states)

        # sequence_length must be provided for tf.nn.bidirectional_dynamic_rnn due to internal bug
        outputs, final_states = tf.nn.bidirectional_dynamic_rnn(lstm_cell["forward"],
                                                                lstm_cell["backward"],
                                                                input,
                                                                dtype=tf.float32,
                                                                sequence_length=sequence_length,
                                                                initial_state_fw=initial_state["forward"],
                                                                initial_state_bw=initial_state["backward"])
        if output_sequence:
            outputs_forward, outputs_backward = outputs
            output = tf.concat([outputs_forward, outputs_backward], axis=2, name='output_sequence')
        else:
            # "Last" pooling: keep only the final output (h) state of each direction.
            # (Max pooling over the time axis of the concatenated outputs would be the
            # alternative; it is not used.)
            final_states_forward, final_states_backward = final_states
            output = tf.concat([final_states_forward[1], final_states_backward[1]], axis=1, name='output')

    return output

  from ._conv import register_converters as _register_converters


In [2]:
class Char_BLSTM_CRF(object):
    """
    An LSTM architecture for named entity recognition.
    Uses a character embedding layer followed by an LSTM to generate vector representation from characters for each token.
    Then the character vector is concatenated with token embedding vector, which is input to another LSTM  followed by a CRF layer.

    The graph processes one sentence at a time: the token-level placeholders are
    1-D (one entry per token) and a batch axis of size 1 is added internally.

    Placeholders (feed these at run time):
        input_token_indices: int32 [n_tokens], token ids.
        input_token_character_indices: int32 [n_tokens, max_token_length], character ids.
        input_token_lengths: int32 [n_tokens], number of characters of each token.
        input_label_indices_flat: int32 [n_tokens], gold label ids.
        input_label_indices_vector: float32 [n_tokens, number_of_classes]; declared
            but not consumed by the graph built here.
        dropout_keep_prob: float32 scalar, keep probability of the dropout layer.

    Main outputs:
        unary_scores: per-token label scores, augmented with start/end rows and columns.
        predictions: argmax of the un-augmented per-token scores.
        transition_parameters: CRF transition matrix (number_of_classes + 2 squared).
        loss, train_op, global_step, saver, summary_op.
    """

    # Optimizer names accepted by define_training_procedure.
    _SUPPORTED_OPTIMIZERS = ('adam', 'sgd', 'adadelta')

    def __init__(self, dataset,token_embedding_dimension,character_lstm_hidden_state_dimension,
                 token_lstm_hidden_state_dimension,character_embedding_dimension,
                 freeze_token_embeddings=False,
                 learning_rate=0.005, gradient_clipping_value=5.0, optimizer='sgd',maximum_number_of_epochs=30,
                 verbose=True):
        """Build the whole graph (inputs, network, CRF loss, training op, saver).

        Args:
            dataset: Object exposing `number_of_classes`, `alphabet_size` and
                `vocabulary_size`.
            token_embedding_dimension: Size of the token embedding vectors.
            character_lstm_hidden_state_dimension: Units per direction of the character LSTM.
            token_lstm_hidden_state_dimension: Units per direction of the token LSTM.
            character_embedding_dimension: Size of the character embedding vectors.
            freeze_token_embeddings: If true, the token embedding matrix is not trainable.
            learning_rate: Learning rate handed to the optimizer.
            gradient_clipping_value: Gradients are clipped element-wise to
                [-value, value]; a falsy value disables clipping.
            optimizer: One of 'sgd', 'adam' or 'adadelta'.
            maximum_number_of_epochs: Used as `max_to_keep` of the checkpoint saver.
            verbose: If true, print the main intermediate tensors while building.

        Raises:
            ValueError: If `optimizer` is not supported (raised before any op is created).
        """
        # Fail fast: a bad optimizer name would otherwise only surface after the whole
        # network has already been added to the default graph.
        self._check_optimizer(optimizer)

        self.verbose = verbose
        self.input_token_indices = tf.placeholder(tf.int32, [None], name="input_token_indices")
        self.input_label_indices_vector = tf.placeholder(tf.float32, [None, dataset.number_of_classes],
                                                         name="input_label_indices_vector")
        self.input_label_indices_flat = tf.placeholder(tf.int32, [None], name="input_label_indices_flat")
        # Own name instead of the copy-pasted "input_token_indices", which TensorFlow
        # silently de-duplicated to "input_token_indices_1".
        self.input_token_character_indices = tf.placeholder(tf.int32, [None, None],
                                                            name="input_token_character_indices")
        self.input_token_lengths = tf.placeholder(tf.int32, [None], name="input_token_lengths")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Internal parameters
        initializer = tf.contrib.layers.xavier_initializer()

        # Character-level LSTM
        # Idea: reshape so that we have a tensor [number_of_token, max_token_length, token_embeddings_size], which we pass to the LSTM

        # Character embedding layer
        with tf.variable_scope("character_embedding"):
            self.character_embedding_weights = tf.get_variable(
                "character_embedding_weights",
                shape=[dataset.alphabet_size, character_embedding_dimension],
                initializer=initializer)
            embedded_characters = tf.nn.embedding_lookup(self.character_embedding_weights,
                                                         self.input_token_character_indices,
                                                         name='embedded_characters')
            if self.verbose:
                print("embedded_characters: {0}".format(embedded_characters))

        # Character LSTM layer: each token is one "batch" entry, its characters are the time steps.
        with tf.variable_scope('character_lstm') as vs:
            character_lstm_output = BLSTM(embedded_characters,
                                          character_lstm_hidden_state_dimension,
                                          initializer,
                                          sequence_length=self.input_token_lengths,
                                          output_sequence=False)
            self.character_lstm_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

        # Token embedding layer
        with tf.variable_scope("token_embedding"):
            self.token_embedding_weights = tf.get_variable(
                "token_embedding_weights",
                shape=[dataset.vocabulary_size, token_embedding_dimension],
                initializer=initializer,
                trainable=not freeze_token_embeddings)
            embedded_tokens = tf.nn.embedding_lookup(self.token_embedding_weights, self.input_token_indices)

        # Concatenate character LSTM outputs and token embeddings
        with tf.variable_scope("concatenate_token_and_character_vectors"):
            token_lstm_input = tf.concat([character_lstm_output, embedded_tokens], axis=1, name='token_lstm_input')
            if self.verbose:
                print('embedded_tokens: {0}'.format(embedded_tokens))
                print("token_lstm_input: {0}".format(token_lstm_input))

        # Add dropout
        with tf.variable_scope("dropout"):
            token_lstm_input_drop = tf.nn.dropout(token_lstm_input, self.dropout_keep_prob,
                                                  name='token_lstm_input_drop')
            if self.verbose:
                print("token_lstm_input_drop: {0}".format(token_lstm_input_drop))
            # The token LSTM expects a batch axis: turn [n_tokens, dim] into a batch
            # of one sentence, [1, n_tokens, dim].
            token_lstm_input_drop_expanded = tf.expand_dims(token_lstm_input_drop, axis=0,
                                                            name='token_lstm_input_drop_expanded')
            if self.verbose:
                print("token_lstm_input_drop_expanded: {0}".format(token_lstm_input_drop_expanded))

        # Token LSTM layer
        with tf.variable_scope('token_lstm') as vs:
            token_lstm_output = BLSTM(token_lstm_input_drop_expanded,
                                      token_lstm_hidden_state_dimension, initializer,
                                      output_sequence=True)
            # Drop the size-1 batch axis again: back to one row per token.
            token_lstm_output_squeezed = tf.squeeze(token_lstm_output, axis=0)
            self.token_lstm_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

        # Needed only if Bidirectional LSTM is used for token level:
        # projects the concatenated forward/backward outputs back to the hidden size.
        with tf.variable_scope("feedforward_after_lstm") as vs:
            W = tf.get_variable(
                "W",
                shape=[2 * token_lstm_hidden_state_dimension, token_lstm_hidden_state_dimension],
                initializer=initializer)
            b = tf.Variable(tf.constant(0.0, shape=[token_lstm_hidden_state_dimension]), name="bias")
            outputs = tf.nn.xw_plus_b(token_lstm_output_squeezed, W, b, name="output_before_tanh")
            outputs = tf.nn.tanh(outputs, name="output_after_tanh")
            self.token_lstm_variables += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

        with tf.variable_scope("feedforward_before_crf") as vs:
            W = tf.get_variable(
                "W",
                shape=[token_lstm_hidden_state_dimension, dataset.number_of_classes],
                initializer=initializer)
            b = tf.Variable(tf.constant(0.0, shape=[dataset.number_of_classes]), name="bias")
            scores = tf.nn.xw_plus_b(outputs, W, b, name="scores")
            self.unary_scores = scores
            # Greedy per-token prediction, taken before the start/end augmentation below.
            self.predictions = tf.argmax(self.unary_scores, 1, name="predictions")
            self.feedforward_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

        # CRF layer
        with tf.variable_scope("crf") as vs:
            # Add start and end tokens: two extra label columns (start, end) and two
            # extra rows (one before and one after the sentence).
            small_score = -1000.0
            large_score = 0.0
            sequence_length = tf.shape(self.unary_scores)[0]
            unary_scores_with_start_and_end = tf.concat(
                [self.unary_scores, tf.tile(tf.constant(small_score, shape=[1, 2]), [sequence_length, 1])], 1)
            start_unary_scores = [[small_score] * dataset.number_of_classes + [large_score, small_score]]
            end_unary_scores = [[small_score] * dataset.number_of_classes + [small_score, large_score]]
            # From here on self.unary_scores holds the augmented scores.
            self.unary_scores = tf.concat([start_unary_scores, unary_scores_with_start_and_end, end_unary_scores],
                                          0)
            start_index = dataset.number_of_classes
            end_index = dataset.number_of_classes + 1
            input_label_indices_flat_with_start_and_end = tf.concat(
                [tf.constant(start_index, shape=[1]), self.input_label_indices_flat,
                 tf.constant(end_index, shape=[1])], 0)

            # Apply CRF layer (the CRF ops are batched, so add a batch axis of size 1)
            sequence_length = tf.shape(self.unary_scores)[0]
            sequence_lengths = tf.expand_dims(sequence_length, axis=0, name='sequence_lengths')
            unary_scores_expanded = tf.expand_dims(self.unary_scores, axis=0, name='unary_scores_expanded')
            input_label_indices_flat_batch = tf.expand_dims(input_label_indices_flat_with_start_and_end, axis=0,
                                                            name='input_label_indices_flat_batch')
            if self.verbose:
                print('unary_scores_expanded: {0}'.format(unary_scores_expanded))
                print('input_label_indices_flat_batch: {0}'.format(input_label_indices_flat_batch))
                print("sequence_lengths: {0}".format(sequence_lengths))
            # https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/crf
            # Compute the log-likelihood of the gold sequences and keep the transition params for inference at test time.
            self.transition_parameters = tf.get_variable(
                "transitions",
                shape=[dataset.number_of_classes + 2, dataset.number_of_classes + 2],
                initializer=initializer)
            log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
                unary_scores_expanded, input_label_indices_flat_batch, sequence_lengths,
                transition_params=self.transition_parameters)
            self.loss = tf.reduce_mean(-log_likelihood, name='cross_entropy_mean_loss')
            # Constant stand-in: no accuracy is computed in the graph.
            self.accuracy = tf.constant(1)

            self.crf_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

        self.define_training_procedure(learning_rate=learning_rate,
                                       gradient_clipping_value=gradient_clipping_value,
                                       optimizer=optimizer)
        self.summary_op = tf.summary.merge_all()
        self.saver = tf.train.Saver(
            max_to_keep=maximum_number_of_epochs)  # defaults to saving all variables

    def _check_optimizer(self, optimizer):
        """Raise ValueError if `optimizer` is not one of the supported optimizer names."""
        if optimizer not in self._SUPPORTED_OPTIMIZERS:
            raise ValueError('The optimizer parameter must be either adadelta, adam or sgd.')

    def define_training_procedure(self ,learning_rate ,gradient_clipping_value, optimizer='sgd'):
        """Create `global_step`, the optimizer and `train_op` for `self.loss`.

        Args:
            learning_rate: Learning rate handed to the optimizer.
            gradient_clipping_value: Gradients are clipped element-wise to
                [-value, value]; a falsy value disables clipping.
            optimizer: One of 'sgd', 'adam' or 'adadelta'.

        Raises:
            ValueError: If `optimizer` is not supported (raised before any op is created).
        """
        self._check_optimizer(optimizer)

        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        if optimizer == 'adam':
            self.optimizer = tf.train.AdamOptimizer(learning_rate)
        elif optimizer == 'sgd':
            self.optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        else:  # 'adadelta', guaranteed by _check_optimizer
            self.optimizer = tf.train.AdadeltaOptimizer(learning_rate)

        grads_and_vars = self.optimizer.compute_gradients(self.loss)
        if gradient_clipping_value:
            # compute_gradients yields None for variables the loss does not depend on;
            # pass those through untouched instead of crashing in clip_by_value.
            grads_and_vars = [(grad if grad is None
                               else tf.clip_by_value(grad, -gradient_clipping_value, gradient_clipping_value),
                               var)
                              for grad, var in grads_and_vars]
        # By defining a global_step variable and passing it to the optimizer we allow TensorFlow handle the counting of training steps for us.
        # The global step will be automatically incremented by one every time you execute train_op.
        self.train_op = self.optimizer.apply_gradients(grads_and_vars, global_step=self.global_step)

In [3]:
# Experiment configuration. This one mapping is handed to the dataset helpers and
# supplies the model hyperparameters and session settings used further down.
parameters = dict(
    pretrained_model_folder='../model',
    dataset_text_folder='../../../ML_EntityData/data/en',
    character_embedding_dimension=25,
    character_lstm_hidden_state_dimension=25,
    check_for_digits_replaced_with_zeros=True,
    check_for_lowercase=True,
    debug=False,
    dropout_rate=0.5,
    experiment_name='test',
    freeze_token_embeddings=False,
    gradient_clipping_value=5.0,
    learning_rate=0.005,
    load_only_pretrained_token_embeddings=False,
    load_all_pretrained_token_embeddings=False,
    main_evaluation_mode='conll',
    maximum_number_of_epochs=3,
    number_of_cpu_threads=8,
    number_of_gpus=0,
    optimizer='sgd',
    output_folder='../../../ML_EntityData/output',
    patience=10,
    plot_format='pdf',
    reload_character_embeddings=True,
    reload_character_lstm=True,
    reload_crf=True,
    reload_feedforward=True,
    reload_token_embeddings=True,
    reload_token_lstm=True,
    remap_unknown_tokens_to_unk=True,
    spacylanguage='en',
    tagging_format='bioes',
    token_embedding_dimension=100,
    token_lstm_hidden_state_dimension=100,
    token_pretrained_embedding_filepath='../../../ML_EntityData/embedding/glove.6B.100d.txt',
    tokenizer='spacy',
    train_model=True,
    use_character_lstm=True,
    use_crf=True,
    use_pretrained_model=False,
    verbose=False,
)

In [4]:
import utils
import dataset as ds
# Load dataset
# Resolve the dataset locations from the `parameters` configuration. The helper returns
# two values: the file paths and the BRAT folders.
# NOTE(review): the validity-check messages in the cell output presumably come from
# this call - confirm in utils.py.
dataset_filepaths, dataset_brat_folders = utils.get_valid_dataset_filepaths(parameters)
# `dataset` is later passed to the model, which reads its number_of_classes,
# alphabet_size and vocabulary_size attributes.
dataset = ds.Dataset(verbose=False, debug=False)
# NOTE(review): `token_to_vector` is presumably the token -> pretrained embedding
# mapping; it is not used in the cells visible here - confirm against dataset.py.
token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)

Checking the validity of BRAT-formatted train set... Done.
Checking compatibility between CONLL and BRAT for train_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Checking the validity of BRAT-formatted valid set... Done.
Checking compatibility between CONLL and BRAT for valid_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Checking the validity of BRAT-formatted test set... Done.
Checking compatibility between CONLL and BRAT for test_compatible_with_brat set ... Done.
Checking validity of CONLL BIOES format... Done.
Load dataset... done (44.11 seconds)


In [5]:

# Create model lstm+crf
# Session settings come from `parameters`: the same thread count is used for intra-
# and inter-op parallelism, and the GPU count caps the visible GPU devices.
session_conf = tf.ConfigProto(
    device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
    inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
    intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
    # automatically choose an existing and supported device to run the operations
    # in case the specified one doesn't exist
    allow_soft_placement=True,
    log_device_placement=False)
sess = tf.Session(config=session_conf)

with sess.as_default():
    # Instantiate the model: every hyperparameter below is forwarded under the same
    # name it has in `parameters`.
    model = Char_BLSTM_CRF(
        dataset=dataset,
        **{name: parameters[name] for name in (
            'token_embedding_dimension',
            'character_lstm_hidden_state_dimension',
            'token_lstm_hidden_state_dimension',
            'character_embedding_dimension',
            'gradient_clipping_value',
            'learning_rate',
            'freeze_token_embeddings',
            'optimizer',
            'maximum_number_of_epochs')})

# Initialize all variables of the freshly built graph.
sess.run(tf.global_variables_initializer())

embedded_characters: Tensor("character_embedding/embedded_characters:0", shape=(?, ?, 25), dtype=float32)
embedded_tokens: Tensor("token_embedding/embedding_lookup:0", shape=(?, 100), dtype=float32)
token_lstm_input: Tensor("concatenate_token_and_character_vectors/token_lstm_input:0", shape=(?, 150), dtype=float32)
token_lstm_input_drop: Tensor("dropout/token_lstm_input_drop/mul:0", shape=(?, 150), dtype=float32)
token_lstm_input_drop_expanded: Tensor("dropout/token_lstm_input_drop_expanded:0", shape=(1, ?, 150), dtype=float32)
unary_scores_expanded: Tensor("crf/unary_scores_expanded:0", shape=(1, ?, 19), dtype=float32)
input_label_indices_flat_batch: Tensor("crf/input_label_indices_flat_batch:0", shape=(1, ?), dtype=int32)
sequence_lengths: Tensor("crf/sequence_lengths:0", shape=(1,), dtype=int32)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
